In [1]:
import nltk
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.fileids('neg')[:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

In [4]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
import random
random.shuffle(documents)
documents[0:5]

[(['this', 'is', 'the', 'worst', 'movie', 'i', "'", 've', ...], 'neg'),
 (['the', 'love', 'for', 'family', 'is', 'one', 'of', ...], 'pos'),
 (['don', "'", 't', 'let', 'this', 'movie', 'fool', ...], 'neg'),
 (['at', 'one', 'point', 'in', 'this', 'movie', 'there', ...], 'neg'),
 (['aggressive', ',', 'bleak', ',', 'and', 'unrelenting', ...], 'pos')]

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
from nltk import pos_tag


In [11]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)


In [13]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [14]:
documents = [(clean_review(document), category) for document, category in documents]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
categories = [category for document, category in documents]

In [28]:
text_documents = [" ".join(document) for document, category in documents]
text_documents[:3]

['bad movie view far 98 avenger silly man dress bowler hat woman wear tight leather evil scientist dress teddy bear suit great evil sir august de wynter wear kilt question could go wrong potentially great idea big name cast question probably ask last year stinker batman robin feel production get little smug script little smart direction somehow lose chaos random event collide together form movie great criticism rest fact chemistry emma peel john steed thurman fiennes something vital element 60 tv serial name dialogue go tea finer british perk allow much room character development interaction except perhaps grate viewer nerve one wonder dynamic pair bother kiss end except pure english formality connery sir august fair well thurman fiennes dialogue erratic stormy weather mostly embarrass poor quality movie would prefer never see believe avenger would good choice one thing witness product far inferior three high profile name associate title cannot understand something poorly produce could

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [20]:
count_vec = CountVectorizer(max_features = 2000, ngram_range=(1,2))
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 1, 0, ..., 3, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
count_vec.get_feature_names()[20:30]

['able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accompany',
 'accomplish',
 'achieve',
 'across']

In [22]:
x_test_features = count_vec.transform(x_test)

In [23]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 89700 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn.svm import SVC

In [25]:
svc = SVC()
svc.fit(x_train_features, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
svc.score(x_test_features, y_test)

0.822