In [2]:
import nltk
from nltk.corpus import movie_reviews as review
from nltk.probability import FreqDist as fd
import matplotlib.pyplot as plt
import random

In [None]:
# Fetch only the corpus this notebook uses. A bare nltk.download() opens the
# interactive downloader, which stalls "Restart & Run All".
nltk.download('movie_reviews')

### Basics ###

<font color=red>Print the categories of the movie review corpus</font>

In [None]:
# List the category labels defined by the movie_reviews corpus reader.
review.categories()

<font color=red>All the words in the movie reviews</font>

In [None]:
# No file id is passed, so this is the word sequence for the corpus as a whole.
review.words()

<font color=red>Frequency distribution of the words</font>

In [None]:
# Count how often each token occurs across the corpus (fd is nltk's FreqDist).
freq_dist=fd(review.words())
freq_dist  # bare expression so the notebook displays the distribution

In [None]:
# Wide, short canvas for the frequency plot.
plt.figure(figsize=(20,5))
plt.title('Frequency distribution curve')
# Plot only 100 samples of the distribution — presumably the most frequent
# ones; confirm against the FreqDist.plot documentation.
freq_dist.plot(100)

In [None]:
# Frequency of the word 'happy' (the lookup is an exact, case-sensitive match).
# Reuse the distribution built earlier instead of recounting the whole corpus.
freq_dist['happy']

In [None]:
# The x most common words in the movie reviews, together with their counts.
# Reuse freq_dist rather than rebuilding the distribution from the corpus.
x=20
freq_dist.most_common(x)

<font color=red>Each movie review has a file id associated with it. The file id uniquely identifies the review.</font>

In [None]:
# Collect file ids: the whole corpus first, then one list per sentiment category.
total_reviews = review.fileids()
pos_review, neg_review = (review.fileids(label) for label in ('pos', 'neg'))

# Report how many reviews fall into each group.
print(
    'Total reviews {} Positive: {} Negative: {}'.format(
        len(total_reviews), len(pos_review), len(neg_review)
    )
)

<font color=red>Print all the words of the movie review with the given file id</font>

In [None]:
# Take the first negative-review file id and show the words of just that file.
fileid=neg_review[0]
review.words(fileid)

### Sentiment analysis ###

In [3]:
# Pair each review's word sequence with its category label.
# The loop variable is named file_id so the builtin id() is not shadowed.
data = [
    (review.words(file_id), cat)
    for cat in review.categories()
    for file_id in review.fileids(cat)
]

# Shuffle with a dedicated, seeded generator so the ordering — and therefore
# the train/test slices taken from it further down — is the same on every run.
SEED = 42
random.Random(SEED).shuffle(data)

In [4]:
# Frequency distribution over the lower-cased tokens of the movie reviews.
words_freq_dist = fd(token.lower() for token in review.words())

In [5]:
# Keep just the words of the 2000 most frequent entries in words_freq_dist;
# the counts returned by most_common() are discarded.
common_words = [entry[0] for entry in words_freq_dist.most_common(2000)]

In [6]:
def extract_features(data, vocabulary=None):
    """Build a bag-of-words presence feature dict for one document.

    Args:
        data: Iterable of the document's tokens (a sequence of words, not a
            single string). It is consumed exactly once.
        vocabulary: Words to test for. Defaults to the notebook-level
            ``common_words`` list when omitted.

    Returns:
        dict mapping every vocabulary word to True if it occurs in ``data``
        and to False otherwise.
    """
    if vocabulary is None:
        vocabulary = common_words

    # Hash the tokens once: membership tests against a set are O(1), whereas
    # testing against the raw token sequence rescans it for every vocabulary
    # word. This also makes one-shot iterators safe to pass in.
    present = set(data)
    return {word: word in present for word in vocabulary}

In [10]:
# Number of shuffled reviews to featurize; kept small to bound run time.
N_SAMPLES = 50
# Share of the featurized reviews used for training, in percent.
TRAIN_PERCENT = 80

# Turn each (words, label) pair into a (feature dict, label) pair.
dataset = [(extract_features(doc), label) for doc, label in data[:N_SAMPLES]]

total_len = len(dataset)
# Integer arithmetic keeps the split size exact (no float rounding).
train_len = total_len * TRAIN_PERCENT // 100
test_len = total_len - train_len

print('Total datapoint: {} Train datapoints {} Test datapoints {}'.format(total_len,train_len,test_len))

Total datapoint: 50 Train datapoints 40 Test datapoints 10


In [12]:
# The first train_len examples are used for training; the rest is held out for testing.
train_data=dataset[:train_len]
test_data=dataset[train_len:]

### Classifier: 1 ###

In [13]:
# Fit NLTK's Naive Bayes classifier on the training examples, then report its
# accuracy (scaled to a percentage) on the data it was fit on and on the
# held-out test examples.
classifier=nltk.NaiveBayesClassifier.train(train_data)
print('Training accuracy :',nltk.classify.accuracy(classifier,train_data)*100)
print('Testing accuracy :',nltk.classify.accuracy(classifier,test_data)*100)

Training accuracy : 97.5
Testing accuracy : 50.0


### Classifier: 2 ###

In [14]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection

In [18]:
# Hold out 20% of the examples for testing. random_state pins the shuffle so
# the split — and the accuracies computed from it — are reproducible.
training_dataset,test_dataset=model_selection.train_test_split(dataset,test_size = 0.20,random_state=42)
print('Train datapoints {} Test datapoints {}'.format(len(training_dataset),len(test_dataset)))

Train datapoints 40 Test datapoints 10


In [26]:
# Wrap scikit-learn's linear-kernel SVC in NLTK's SklearnClassifier so it can
# be trained on the (feature dict, label) pairs in training_dataset.
model = SklearnClassifier(SVC(kernel = 'linear'))
model.train(training_dataset)
# Accuracy as a percentage on the training split and on the held-out split.
train_accuracy = nltk.classify.accuracy(model, training_dataset)*100
accuracy = nltk.classify.accuracy(model, test_dataset)*100
print('SVC Train Accuracy: {} Test Accuracy : {}'.format(train_accuracy,accuracy))

SVC Train Accuracy: 100.0 Test Accuracy : 60.0


### Classifier: 3 ###

In [28]:
# Same setup as the linear SVC, but with a polynomial kernel; trained and
# evaluated on the same training_dataset / test_dataset split.
poly_svc_model = SklearnClassifier(SVC(kernel = 'poly'))
poly_svc_model.train(training_dataset)
# Accuracy as a percentage on the training split and on the held-out split.
train_poly_accuracy = nltk.classify.accuracy(poly_svc_model, training_dataset)*100
test_poly_accuracy = nltk.classify.accuracy(poly_svc_model, test_dataset)*100
print('SVC using poly kernel Train Accuracy: {} Test Accuracy : {}'.format(train_poly_accuracy,test_poly_accuracy))

SVC using poly kernel Train Accuracy: 60.0 Test Accuracy : 50.0
