In [38]:
from nltk.corpus import movie_reviews

In [39]:
movie_reviews.words(movie_reviews.fileids()[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [40]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids():
        documents.append((movie_reviews.words(fileid),category))
documents[0:5],len(documents)

([(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
  (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
  (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
  (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
  (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')],
 4000)

In [41]:
import random
random.shuffle(documents)
documents[0:5]

[(['ingredients', ':', 'possessed', 'plastic', 'dolls', ...], 'pos'),
 (['for', 'timing', 'reasons', 'having', 'to', 'do', ...], 'pos'),
 (['if', 'this', 'keeps', 'up', ',', 'jane', 'austen', ...], 'pos'),
 (['i', "'", 'm', 'really', 'starting', 'to', 'wonder', ...], 'neg'),
 (['i', 'still', 'can', "'", 't', 'figure', 'out', 'why', ...], 'neg')]

In [42]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [43]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [44]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [45]:
from nltk import pos_tag
def clean_reviews(words):
    output = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag(w)
            clean_word = lemmatizer.lemmatize(w,get_simple_pos(pos[0][1]))
            output.append(clean_word.lower())
    return output

In [46]:
documents = [(clean_reviews(document), category) for document, category in documents]

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
categories = [category for document,category in documents]

In [49]:
text_document = [" ".join(document) for document,category in documents ]

In [50]:
text_document[0]

'ingredient possessed plastic doll love plastic doll sex starring jennifer tilly voice brad dourif katherine heigl nick stabile john ritter synopsis fourth film chucky series debuted late 1980s basically chucky plastic doll walk talk possessed spirit slain murderer bride chucky chucky longtime girlfriend tiffany jennifer tilly dy spirit inhabits female plastic doll voodoo doll tiffany chucky get married embark quest reach cemetery new jersey mystical gem might enable human doll stow away back vehicle driven newly eloped couple side plot couple suspect murderer opinion bride chucky attempt horror humor succeed somehow chucky moaning mid life crisis gotten married make scary chucky tiffany harping mid life crisis gotten married make scary tiffany suspenseless bride chucky relies mostly jennifer tilly cleavage keep attention first half occasional pun keep attention second half best say bride chucky sarcastic'

In [51]:
from sklearn import model_selection

In [52]:
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(text_document,categories)
len(xtrain)

3000

In [56]:
count_vec = CountVectorizer(max_features=2000)
x_train_features = count_vec.fit_transform(xtrain)
x_train_features.todense()

matrix([[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  1,  0,  0],
        ...,
        [ 0,  0,  0, ...,  1,  0,  0],
        [ 0,  0,  0, ...,  2,  0,  0],
        [ 0,  0,  0, ...,  0,  0, 19]], dtype=int64)

In [58]:
x_test_features = count_vec.transform(xtest)
x_test_features

<1000x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 167637 stored elements in Compressed Sparse Row format>

In [55]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '12',
 '13',
 '15',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'added',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agrees',
 'ahead',
 'air',
 'al',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'amusing',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animated',
 'animation',
 'anna',
 'anne',
 'annie',
 'annoying',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apartment',
 'ape',
 'apparent',

# Simply using Count Vectorizer without any other Parameter

In [63]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features,ytrain)
svc.score(x_test_features,ytest)

0.343

# Using min-df, max-df

In [64]:
count_vec = CountVectorizer(max_features=2000,max_df=0.8,min_df=0.05)
x_train_features = count_vec.fit_transform(xtrain)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 2],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [66]:
x_test_features = count_vec.transform(xtest)

In [67]:
svc = SVC()
svc.fit(x_train_features,ytrain)
svc.score(x_test_features,ytest)

0.296

# Using n-grams

In [68]:
count_vec = CountVectorizer(max_features=2000,ngram_range=(1,2))
x_train_features = count_vec.fit_transform(xtrain)
x_train_features.todense()

matrix([[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  1,  0,  0],
        ...,
        [ 0,  0,  0, ...,  1,  0,  0],
        [ 0,  0,  0, ...,  2,  0,  0],
        [ 0,  0,  0, ...,  0,  0, 19]], dtype=int64)

In [69]:
x_test_features = count_vec.transform(xtest)

In [70]:
svc = SVC()
svc.fit(x_train_features,ytrain)
svc.score(x_test_features,ytest)

0.342