In [1]:
import random
import nltk
from nltk.corpus import movie_reviews

In [6]:
# build a list of documents -> categorize it into +ve or -ve
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

#shuffle documents
random.shuffle(documents)

print('Number of Documents {}'.format(len(documents)))
print('First Review {}'.format(documents[0]))

#list of all the words 
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
#sort these words from most common to least common
all_words = nltk.FreqDist(all_words)

print('Most Common Words: {}'.format(all_words.most_common(15)))
print('Frequency of the word happy: {}'.format(all_words["happy"]))

Number of Documents 2000
First Review (['while', 'watching', 'boiler', 'room', ',', 'i', 'was', 'constantly', 'reminded', 'of', 'last', 'year', "'", 's', 'masterpiece', 'fight', 'club', '.', 'both', 'films', 'consist', 'of', 'a', 'predominately', 'male', 'cast', '.', 'both', 'films', 'follow', 'young', 'men', 'as', 'they', 'illicitly', 'fight', 'the', 'traditional', 'system', 'for', 'their', 'own', 'desires', '.', 'and', 'both', 'films', 'are', 'seen', 'through', 'the', 'eyes', 'of', 'one', 'narrator', ',', 'who', 'eventually', 'realizes', 'that', 'these', 'men', 'have', 'to', 'be', 'stopped', '.', 'while', 'boiler', 'room', 'writer', '/', 'director', 'ben', 'younger', 'does', 'not', 'get', 'his', 'point', 'across', 'as', 'well', 'as', 'david', 'fincher', 'does', 'for', 'fight', 'club', ',', 'he', 'does', 'contribute', 'another', 'impressive', 'work', 'to', 'a', 'series', 'of', 'films', 'aiming', 'to', 'represent', 'the', 'new', 'generation', '.', 'a', 'generation', 'which', 'has', 'se

In [7]:
print(len(all_words))

39768


In [11]:
# use 4000 most common words as features
word_features = list(all_words.keys())[:4000]

#a function that determines which of the 4000 features are contained in a given review (1 review = i document)
def find_features(document):
    words = set(document)
    features = {}
    
    # key = word from word_features : value = boolean (true for being present in the given review, false otherwise)
    for w in word_features:
        features[w] = (w in words)
        
    return features

#an example of a negative review
'''

features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
    if value == True:
        print(key)
'''

"\n\nfeatures = find_features(movie_reviews.words('neg/cv000_29416.txt'))\nfor key, value in features.items():\n    if value == True:\n        print(key)\n"

In [12]:
#now for all the documents
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [13]:
#splitting featuresets into test/ train sets
from sklearn import model_selection
#define a seed for reproducibilty
seed = 1
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [14]:
print(len(training))
print(len(testing))

1500
500


In [15]:
#using sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [16]:
model = SklearnClassifier(SVC(kernel = 'linear'))
#training the model
model.train(training)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [17]:
#test on training dataset
accuracy = nltk.classify.accuracy(model, testing)
print('SVC Accuracy: {}'.format(accuracy))

SVC Accuracy: 0.816
