# NLTK Movie Reviews

## Importing libraries

In [41]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/vishwas/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


## inspecting movie reviews

In [42]:
print(type(movie_reviews), movie_reviews)

<class 'nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader'> <CategorizedPlaintextCorpusReader in '/home/vishwas/nltk_data/corpora/movie_reviews'>


In [43]:
readme = movie_reviews.readme()
print(readme)

Sentiment Polarity Dataset Version 2.0
Bo Pang and Lillian Lee

http://www.cs.cornell.edu/people/pabo/movie-review-data/

Distributed with NLTK with permission from the authors.


Introduction

This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .


What's New -- June, 2004

This dataset represents an enhancement of the review corpus v1.0
described in README v1.1: it contains more reviews, and labels were
created with an improved rating-extraction system.


Citation Info 

This data was first used in Bo Pang and Lillian Lee,
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts'',  Proceedings of the ACL, 2004.

@InProceedings{Pang+Lee:04a,
  author =       {Bo Pang and Lillian Lee},
  title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle =    "Proceedings of the ACL",
  year =      

### So this data was scraped by Mr. Bo Pand and Lilliam Lee for their project. Great work!

In [44]:
category = []

for i in movie_reviews.categories():
    category.append(i)

print(category)

['neg', 'pos']


### so 2 categories are there, negative and positive

## Total number of words in the corpus

In [45]:
import string

word_list = []
for i in movie_reviews.words():
    if i not in string.punctuation:
        word_list.append(i.lower())

In [46]:
print(len(word_list))

1338788


### Wow! That's what we expect of a corpus!

In [47]:
word_dict = {}

for i in word_list:
    if i in word_dict.keys():
        word_dict[i] += 1
    else:
        word_dict[i] = 1

In [48]:
len(word_dict)

39737

### Let's inspect now!

In [49]:
print(len(word_dict))

39737


## Frequency of words

In [50]:
print(word_dict)

 'grudges': 2, 'pronunciation': 1, 'berendt': 1, 'horseflies': 1, 'beverages': 1, 'nepotist': 1, 'chablis': 2, 'castilian': 1, 'rupaul': 1, 'indictment': 1, '1934': 1, '65th': 1, 'springy': 1, 'boringly': 1, 'murkily': 1, 'gallo': 9, 'angelica': 1, 'hypnotized': 6, 'witzky': 1, 'localized': 1, 'spookiness': 1, 'zachary': 2, 'loaned': 1, 'coloring': 1, 'uninhabitable': 1, 'uncreative': 1, 'hershman': 3, 'greenfingers': 5, 'croupier': 1, 'edgefield': 2, 'picaresque': 1, 'accommodations': 1, 'violets': 1, 'limestone': 1, 'reprimands': 1, 'pinching': 1, 'sneezing': 1, 'pansies': 1, 'fives': 1, 'wildflowers': 1, 'primrose': 2, 'batinkoff': 1, 'keri': 2, 'rosen': 1, 'crushes': 1, 'katarina': 6, 'projectors': 2, 'doltish': 1, 'groundlings': 1, 'brassieres': 1, 'amended': 1, 'opined': 1, 'polluted': 1, 'falstaff': 1, 'filmcritic': 1, 'schrager': 1, 'whammy': 1, 'dubuque': 1, '2470': 1, 'moot': 1, 'delusion': 1, 'barcode': 1, 'dajani': 1, 'yawk': 1, 'adoring': 1, 'jetee': 1, 'wayward': 2, 'slot

In [51]:
all_words = nltk.FreqDist(word_list)
all_words

FreqDist({'the': 76529, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, 'is': 25195, 'in': 21822, 's': 18513, 'it': 16107, 'that': 15924, ...})

In [52]:
all_words.most_common(20)

[('the', 76529),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('it', 16107),
 ('that', 15924),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961),
 ('his', 9587),
 ('this', 9578),
 ('film', 9517),
 ('i', 8889),
 ('he', 8864),
 ('but', 8634),
 ('on', 7385)]

### We notice that many of the most common words are stop words.

In [53]:
print(f"The frequency of the word 'boring' is {word_dict['boring']}")

boring_frequency = word_dict['boring']

The frequency of the word 'boring' is 270


### Let's crosscheck with nlkt frequency distribution 

In [54]:
assert boring_frequency == all_words['boring']

### No errors, that means everything is correct!

## Let's convert words to features!

In [55]:
len(movie_reviews.fileids('neg'))

1000

In [56]:
movie_reviews.words('neg/cv162_10977.txt')

['"', 'desperate', 'measures', '"', 'is', 'a', ...]

### creating a feature vector

In [57]:
common_words = all_words.most_common(3000)

feature_vector = []
for i,j in common_words:
    feature_vector.append(i)

# here we took 3000 most common words into feature vector    

In [58]:
feature = {}
review = movie_reviews.words('neg/cv162_10977.txt')

for i in range(len(feature_vector)):
    feature[feature_vector[i]] = feature_vector[i] in review

In [72]:
documents = [(movie_reviews.words(fileid), category)
            for fileid in movie_reviews.fileids()
            for category in movie_reviews.categories(fileid)]

import random
random.seed(2020)
random.shuffle(documents)

print(len(documents))

2000


### Let's create a function which will tell us if the words in a document are there in the feature vector or not!

In [73]:
def find_features(document):
    words = set(document)
    features = {}
    for i in feature_vector:
        features[i] = (i in words)
    
    return features

In [74]:
print(find_features(movie_reviews.words('neg/cv162_10977.txt')))



### creating feature sets that will have tuples of category and presence of words from feature vector

In [75]:
feature_sets = [(find_features(words), category) for (words, category) in documents]

## Let's build a classifier to classify if the review is positive or negative

### train test split : arguably best ratio is 80:20 so lets do that!

In [76]:
train = feature_sets[:1800]
test = feature_sets[1800:]

In [77]:
clf = nltk.NaiveBayesClassifier.train(train)

In [78]:
accuracy = nltk.classify.util.accuracy(clf, test)

In [79]:
print(f"The Naive Bayes Classifier accuracy is {accuracy*100}%")

The Naive Bayes Classifier accuracy is 85.0%


In [80]:
nltk.classify.accuracy(clf,test)

0.85

## So, there we go, Naive Bayes Classifier with an 85% accuracy!

### Let's display some valuable words in this classifier!

In [81]:
clf.show_most_informative_features(15)

Most Informative Features
                  seagal = True              neg : pos    =     12.0 : 1.0
             outstanding = True              pos : neg    =     10.8 : 1.0
                   mulan = True              pos : neg    =      9.3 : 1.0
                  finest = True              pos : neg    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.6 : 1.0
             wonderfully = True              pos : neg    =      6.4 : 1.0
              schumacher = True              neg : pos    =      6.4 : 1.0
                  wasted = True              neg : pos    =      6.4 : 1.0
                   damon = True              pos : neg    =      5.8 : 1.0
                   flynt = True              pos : neg    =      5.8 : 1.0
                lebowski = True              pos : neg    =      5.8 : 1.0
                   awful = True              neg : pos    =      5.7 : 1.0
            embarrassing = True              neg : pos    =      5.6 : 1.0

## Let's implement some more Naive Bayes Classifiers  

In [82]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [83]:
MNB_clf = SklearnClassifier(MultinomialNB())
MNB_clf.train(train)
print("Multinomial Naive Bayes Classifier")
print(f"Accuracy : {nltk.classify.accuracy(MNB_clf, test)}")

Multinomial Naive Bayes Classifier
Accuracy : 0.845


In [84]:
BNB_clf = SklearnClassifier(MultinomialNB())
BNB_clf.train(train)
print("Bernoulli Naive Bayes Classifier")
print(f"Accuracy : {nltk.classify.accuracy(BNB_clf, test)}")

Bernoulli Naive Bayes Classifier
Accuracy : 0.845


## We can implement more sklearn classifiers other than Naive Bayes'

### First, let's implement support vector machines

In [85]:
from sklearn.svm import SVC, LinearSVC, NuSVC

In [86]:
SVC_clf = SklearnClassifier(SVC())
SVC_clf.train(train)
print("Support Vector Classifier")
print(f"Accuracy : {nltk.classify.accuracy(SVC_clf, test)}")

Support Vector Classifier
Accuracy : 0.865


In [87]:
LinearSVC_clf = SklearnClassifier(LinearSVC())
LinearSVC_clf.train(train)
print("Linear Support Vector Classifier")
print(f"Accuracy : {nltk.classify.accuracy(LinearSVC_clf, test)}")

Linear Support Vector Classifier
Accuracy : 0.83


In [89]:
NuSVC_clf = SklearnClassifier(NuSVC())
NuSVC_clf.train(train)
print("Nu-supported Vector Classifier")
print(f"Accuracy : {nltk.classify.accuracy(NuSVC_clf, test)}")

Nu-supported Vector Classifier
Accuracy : 0.86


### Now some tree classifiers

In [90]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [91]:
RFClf = SklearnClassifier(RandomForestClassifier())
RFClf.train(train)
print("Random Forest Classifier")
print(f"Accuracy : {nltk.classify.accuracy(RFClf, test)}")

Random Forest Classifier
Accuracy : 0.805


In [92]:
ABClf = SklearnClassifier(AdaBoostClassifier())
ABClf.train(train)
print("Ada Boost Classifier")
print(f"Accuracy : {nltk.classify.accuracy(ABClf, test)}")

Ada Boost Classifier
Accuracy : 0.755


In [93]:
DTClf = SklearnClassifier(DecisionTreeClassifier())
DTClf.train(train)
print("Decision Tree Classifier")
print(f"Accuracy : {nltk.classify.accuracy(DTClf, test)}")

Decision Tree Classifier
Accuracy : 0.605


### A neural network one as well
 

In [94]:
from sklearn.neural_network import MLPClassifier

In [95]:
MLPClf = SklearnClassifier(MLPClassifier())
MLPClf.train(train)
print("Multi layered perceptron Classifier")
print(f"Accuracy : {nltk.classify.accuracy(MLPClf, test)}")

Multi layered perceptron Classifier
Accuracy : 0.85


### and lastly, our simple yet powerful Logistic Regression

In [96]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [97]:
LRClf = SklearnClassifier(LogisticRegression())
LRClf.train(train)
print("Logistic Regression Classifier")
print(f"Accuracy : {nltk.classify.accuracy(LRClf, test)}")

Logistic Regression Classifier
Accuracy : 0.865


In [98]:
SGDClf = SklearnClassifier(SGDClassifier())
SGDClf.train(train)
print("Stochastic Gradient Descent Classifier")
print(f"Accuracy : {nltk.classify.accuracy(SGDClf, test)}")

Stochastic Gradient Descent Classifier
Accuracy : 0.82


### We can see that with default parameters, the classifiers are classifying reviews at a maximum of 86.5% accuracy!

## Conclusion

<table>
    <tr>
        <th>Classifier</th>
        <th>Accuracy</th>
    </tr>
    <tr>
        <td>Naive Bayes'</td>
        <td>0.85</td>
    </tr>
    <tr>
        <td>Multinomial Naive Bayes'</td>
        <td>0.845</td>
    </tr>
    <tr>
        <td>Bernoulli Naive Bayes'</td>
        <td>0.845</td>
    </tr>
    <tr>
        <td>Support Vector</td>
        <td>0.865</td>
    </tr>
    <tr>
        <td>Linearly supported Vector</td>
        <td>0.83</td>
    </tr>
    <tr>
        <td>Nu-supported Vector</td>
        <td>0.86</td>
    </tr>
    <tr>
        <td>Random Forest</td>
        <td>0.805</td>
    </tr>
    <tr>
        <td>Ada Boost</td>
        <td>0.755</td>
    </tr>
    <tr>
        <td>Decision Tree</td>
        <td>0.605</td>
    </tr>
    <tr>
        <td>Multilayered Perceptron</td>
        <td>0.85</td>
    </tr>
    <tr>
        <td>Logistic Regression</td>
        <td>0.865</td>
    </tr>
    <tr>
        <td>Stochastic Gradient Descent</td>
        <td>0.82</td>
    </tr>
</table>

### Thank you!