In [1]:
from nltk.corpus import movie_reviews

In [2]:
len(movie_reviews.fileids())

2000

In [3]:
len(movie_reviews.words(movie_reviews.fileids()[11]))

629

In [4]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:3]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg')]

In [5]:
import random
random.shuffle(documents)
documents[0:3]

[(['plot', ':', 'good', 'ol', "'", 'texan', 'kid', ...], 'pos'),
 (['it', 'rocks', '-', 'actually', ',', 'lots', 'of', ...], 'neg'),
 (['let', 'me', 'open', 'this', 'one', 'with', 'a', ...], 'pos')]

In [6]:
from nltk.corpus import stopwords
import string
stop = stopwords.words('english')
stop = stop + list(string.punctuation)

In [8]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.corpus import wordnet
def simple_pos_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    else:
        return wordnet.ADV

In [10]:
from nltk import pos_tag
def cleaned_reviews(words):
    output__words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            cleaned_output = lemmatizer.lemmatize(w, pos = simple_pos_tag(pos[0][1]))
            output__words.append(cleaned_output.lower())
    return output__words

In [11]:
documents = [(cleaned_reviews(document), category) for document, category in documents]

In [12]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [16]:
all_words = []
for document,category in documents:
    all_words += document

In [13]:
from nltk import FreqDist
freq = FreqDist(all_words)

In [14]:
most_common = freq.most_common(3000)
features = [word for word,frequency in most_common]

In [15]:
def feature_dictionary(words):
    features_present = {}
    words_set = set(words)
    for w in features:
        features_present[w] = w in words_set
    return features_present

In [16]:
training_data = [(feature_dictionary(train_document), category) for train_document,category in training_documents]
testing_data = [(feature_dictionary(test_document), category) for test_document,category in testing_documents]

In [17]:
from nltk import NaiveBayesClassifier

In [18]:
classifier = NaiveBayesClassifier.train(training_data)

In [19]:
import nltk
nltk.classify.accuracy(classifier, testing_data)

0.798

In [20]:
classifier.show_most_informative_features()

Most Informative Features
                  seagal = True              neg : pos    =     11.0 : 1.0
               ludicrous = True              neg : pos    =     10.2 : 1.0
                 freddie = True              neg : pos    =      9.6 : 1.0
                  prinze = True              neg : pos    =      9.6 : 1.0
             outstanding = True              pos : neg    =      8.7 : 1.0
                   damon = True              pos : neg    =      8.6 : 1.0
             wonderfully = True              pos : neg    =      8.4 : 1.0
               stupidity = True              neg : pos    =      8.2 : 1.0
                  sidney = True              pos : neg    =      7.8 : 1.0
              uninspired = True              neg : pos    =      7.6 : 1.0


In [28]:
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [29]:
clf = RandomForestClassifier()

In [31]:
classifier_sklearn = SklearnClassifier(clf)
classifier_sklearn.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [32]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.71

In [34]:
classifier.show_most_informative_features()

Most Informative Features
                  seagal = True              neg : pos    =     11.0 : 1.0
               ludicrous = True              neg : pos    =     10.2 : 1.0
                 freddie = True              neg : pos    =      9.6 : 1.0
                  prinze = True              neg : pos    =      9.6 : 1.0
             outstanding = True              pos : neg    =      8.7 : 1.0
                   damon = True              pos : neg    =      8.6 : 1.0
             wonderfully = True              pos : neg    =      8.4 : 1.0
               stupidity = True              neg : pos    =      8.2 : 1.0
                  sidney = True              pos : neg    =      7.8 : 1.0
              uninspired = True              neg : pos    =      7.6 : 1.0


In [37]:
from sklearn.svm import SVC
svc = SVC()

In [38]:
classifier_sklearn_svc = SklearnClassifier(svc)
classifier_sklearn_svc.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [39]:
nltk.classify.accuracy(classifier_sklearn_svc, testing_data)

0.782

In [42]:
classifier.show_most_informative_features()

Most Informative Features
                  seagal = True              neg : pos    =     11.0 : 1.0
               ludicrous = True              neg : pos    =     10.2 : 1.0
                 freddie = True              neg : pos    =      9.6 : 1.0
                  prinze = True              neg : pos    =      9.6 : 1.0
             outstanding = True              pos : neg    =      8.7 : 1.0
                   damon = True              pos : neg    =      8.6 : 1.0
             wonderfully = True              pos : neg    =      8.4 : 1.0
               stupidity = True              neg : pos    =      8.2 : 1.0
                  sidney = True              pos : neg    =      7.8 : 1.0
              uninspired = True              neg : pos    =      7.6 : 1.0


## Using Count Vectorizer 

In [50]:
all_documents = [" ".join(document) for document, category in documents]

In [47]:
categories = [category for document, category in documents]

In [51]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(all_documents, categories)

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [52]:
count_vec = CountVectorizer(max_features = 2500)
train_features = count_vec.fit_transform(x_train)

In [57]:
test_features = count_vec.transform(x_test)

In [58]:
from sklearn.svm import SVC
svc = SVC()

In [59]:
svc.fit(train_features, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [61]:
svc.score(test_features, y_test)

0.806