In [2]:
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import *
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg
import matplotlib.pyplot as plt

nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('treebank')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\atind\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atind\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atind\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atind\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atind\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\atind\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset 

True

In [3]:
len(movie_reviews.fileids())

2000

In [4]:
movie_reviews.fileids()[:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

In [5]:
movie_reviews.raw("neg/cv000_29416.txt")

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [6]:
len(movie_reviews.raw("neg/cv000_29416.txt"))

4043

In [7]:
movie_reviews.words("neg/cv000_29416.txt")

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [8]:
movie_reviews.categories()

['neg', 'pos']

In [9]:
f_dist = nltk.FreqDist(movie_reviews.words())
f_dist

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [10]:
word_tokenize(movie_reviews.raw("neg/cv000_29416.txt"))[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [87]:
def basic_processing(text):
    words = word_tokenize(text)
    stop_words = stopwords.words("english")
    words=[i.lower() for i in words if i.isalpha() and i.lower() not in stop_words]
    return words

In [80]:
data = nltk.FreqDist(basic_processing(movie_reviews.raw("neg/cv000_29416.txt")))
word_freq_data=pd.DataFrame()
word_freq_data["Words"]=data.keys()
word_freq_data["Frequency"]=data.values()
word_freq_data.head(10)

Unnamed: 0,Words,Frequency
0,plot,1
1,two,2
2,teen,4
3,couples,1
4,go,2
5,to,16
6,a,14
7,church,1
8,party,1
9,drink,1


In [81]:
len(word_freq_data)

326

In [88]:
def word_freq(words):
    data = nltk.FreqDist(words)
    word_freq_data=pd.DataFrame()
    word_freq_data["Words"]=data.keys()
    word_freq_data["Frequency"]=data.values()
    return data.most_common(300)

In [89]:
# function to vectorize the text
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [90]:
negative_docs_training = [basic_processing(movie_reviews.raw(i)) for i in movie_reviews.fileids("neg")[:1600]][0]
positive_docs_training = [basic_processing(movie_reviews.raw(i)) for i in movie_reviews.fileids("pos")[:1600]][0]

In [96]:
negative_docs_fdist = word_freq(negative_docs)
negative_docs_fdist[:10]

[('movie', 6),
 ('film', 6),
 ('pretty', 5),
 ('make', 5),
 ('teen', 4),
 ('get', 3),
 ('one', 3),
 ('even', 3),
 ('like', 3),
 ('two', 2)]

In [97]:
positive_docs_fdist = word_freq(positive_docs)
positive_docs_fdist[:10]

[('comic', 5),
 ('hell', 5),
 ('film', 5),
 ('like', 4),
 ('say', 4),
 ('book', 3),
 ('moore', 3),
 ('campbell', 3),
 ('ripper', 3),
 ('even', 3)]

In [125]:
doc_words = nltk.FreqDist(basic_processing(movie_reviews.raw()))
feature_vector = list(doc_words)[:3000]

In [126]:
document = [(basic_processing(movie_reviews.raw(file_id)),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
document[:1]

[(['plot',
   'two',
   'teen',
   'couples',
   'go',
   'church',
   'party',
   'drink',
   'drive',
   'get',
   'accident',
   'one',
   'guys',
   'dies',
   'girlfriend',
   'continues',
   'see',
   'life',
   'nightmares',
   'deal',
   'watch',
   'movie',
   'sorta',
   'find',
   'critique',
   'movie',
   'teen',
   'generation',
   'touches',
   'cool',
   'idea',
   'presents',
   'bad',
   'package',
   'makes',
   'review',
   'even',
   'harder',
   'one',
   'write',
   'since',
   'generally',
   'applaud',
   'films',
   'attempt',
   'break',
   'mold',
   'mess',
   'head',
   'lost',
   'highway',
   'memento',
   'good',
   'bad',
   'ways',
   'making',
   'types',
   'films',
   'folks',
   'snag',
   'one',
   'correctly',
   'seem',
   'taken',
   'pretty',
   'neat',
   'concept',
   'executed',
   'terribly',
   'problems',
   'movie',
   'well',
   'main',
   'problem',
   'simply',
   'jumbled',
   'starts',
   'normal',
   'downshifts',
   'fantasy',
 

In [128]:
def find_feature(word_list):
    feature = {}
    for x in feature_vector:
        feature[x] = x in doc_words
    return feature

In [129]:
feature_sets = [(find_feature(doc_words),category) for (doc_words,category) in document]

In [130]:
feature_sets[0][0]

{'film': True,
 'movie': True,
 'one': True,
 'like': True,
 'even': True,
 'good': True,
 'time': True,
 'would': True,
 'story': True,
 'much': True,
 'character': True,
 'also': True,
 'get': True,
 'characters': True,
 'two': True,
 'first': True,
 'see': True,
 'way': True,
 'well': True,
 'could': True,
 'make': True,
 'really': True,
 'films': True,
 'little': True,
 'life': True,
 'plot': True,
 'people': True,
 'scene': True,
 'bad': True,
 'never': True,
 'man': True,
 'best': True,
 'new': True,
 'many': True,
 'scenes': True,
 'know': True,
 'movies': True,
 'great': True,
 'another': True,
 'director': True,
 'love': True,
 'go': True,
 'action': True,
 'us': True,
 'something': True,
 'end': True,
 'still': True,
 'back': True,
 'seems': True,
 'made': True,
 'work': True,
 'world': True,
 'makes': True,
 'however': True,
 'big': True,
 'every': True,
 'though': True,
 'better': True,
 'audience': True,
 'enough': True,
 'seen': True,
 'around': True,
 'take': True,
 'per

In [135]:
train_set, test_set = train_test_split(feature_sets,test_size = 0.2)

In [136]:
print(len(train_set))
print(len(test_set))

1600
400


In [137]:
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

In [138]:
from nltk import classify 

accuracy = classify.accuracy(classifier, test_set)
print (accuracy)

0.485
