## IMPORTING MODULES

In [1]:
import nltk

In [2]:
from nltk.corpus import movie_reviews

In [3]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/tanishqsaluja/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

## DATA ANALYSIS

In [4]:
movie_reviews.categories() # Documents can be classified into negative and positive

['neg', 'pos']

In [5]:
movie_reviews.fileids() # We have total 2000 documents

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [6]:
len(movie_reviews.fileids("neg")) # We have 1000 documents each of negative and positive each

1000

In [7]:
movie_reviews.words(movie_reviews.fileids()[5]) # List of words in the 5th review documents

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

## FORM A DOCUMENT LIST OF TUPLES OF (LIST_OF_WORDS,CATEGORY)

In [8]:
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:4]
# We have 1000 negative documents , followed by 1000 positive documents

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg')]

In [9]:
# You may consider to set based on hard maximum limit instead of predefined hard-coded limit 
# (It will throws ValueError if you try yo set hard+1):
import resource
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

In [10]:
# Randomly shuffling the documents
import random
random.shuffle(documents)

In [11]:
# Now the documents are shuffled
documents[0:4]

[(['synopsis', ':', 'sullen', 'julie', 'james', ',', ...], 'neg'),
 (['one', 'of', 'kyle', 'mclachlan', "'", 's', 'earlier', ...], 'pos'),
 (['the', 'caveman', "'", 's', 'valentine', 'starring', ...], 'pos'),
 (['arye', 'cross', 'and', 'courteney', 'cox', 'star', ...], 'neg')]

## IMPORT LEMMATIZER TO FIND ROOT WORD FOR EACH WORD OF REVIEW

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [13]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ # a
    if tag.startswith('V'): 
        return wordnet.VERB # v
    if tag.startswith('N'): 
        return wordnet.NOUN # n
    if tag.startswith('R'): 
        return wordnet.ADV # r
    return wordnet.NOUN # n

## CLEAN EACH REVIEW SUCH THAT IT DOESN'T CONTAIN ANY STOP WORD

In [14]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stop_words:
            # pos = pos_tag(word_tokenize(w))
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [15]:
from nltk import pos_tag # pos_tag tells us about type of word

In [None]:
print(pos_tag(word_tokenize("Eiffel Tower")))
print(pos_tag(word_tokenize("Big")))

In [None]:
from nltk import word_tokenize

In [None]:
print(pos_tag(["sd"]))
tag=pos_tag(["hello"])
tag[0][1].startswith('h')

## FIND LIST OF ALL STOP WORDS INCLUDING PUNCTUATIONS

In [16]:
from nltk.corpus import stopwords
stop_words=list(stopwords.words('english'))

In [17]:
import string
puncts=list(string.punctuation)

In [19]:
stop_words=puncts+stop_words

## MODIFYING THE CURRENT DOCUMENT BY CLEANING EACH REVIEW

In [20]:
documents=[(clean_review(document),category) for document,category in documents]

In [21]:
print(documents[0][1])
print(documents[0][0])

neg
['synopsis', 'sullen', 'julie', 'james', 'still', 'haunt', 'nightmare', 'killer', 'ben', 'willis', 'know', 'last', 'summer', 'perk', 'new', 'best', 'friend', 'karla', 'wilson', 'win', 'trip', 'four', 'bahamas', 'arrive', 'start', 'hurricane', 'season', 'julie', 'karla', 'run', 'around', 'tight', 'clothing', 'realize', 'walk', 'trap', 'set', 'rainslickered', 'slasher', 'huge', 'hook', 'hand', 'comment', 'may', 'show', 'pop', 'culture', 'illiteracy', 'never', 'see', 'episode', 'fox', 'tv', 'series', 'party', 'five', 'vaguely', 'aware', 'show', 'premise', 'knowledge', 'come', 'brief', 'commercial', 'half', 'paid', 'attention', 'watch', 'tube', 'party', 'five', 'know', 'however', 'seem', 'start', 'ground', 'actress', 'teen', 'horror', 'genre', 'neve', 'campbell', 'scream', 'queen', '90', 'star', 'scream', 'scream', '2', 'craft', 'regular', 'show', 'jennifer', 'love', 'hewitt', 'lo', 'behold', 'follow', 'footstep', 'campbell', 'hewitt', 'horror', 'franchise', 'know', 'last', 'summer', '

## SPLIT THE DATA

In [22]:
training_documents = documents[0:1500]
testing_documents = documents[1500:2000]

## FORM A LIST OF ALL THE WORDS IN ALL THE CLEANED REVIEWS

In [23]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

## Using all_words list , select 3000 most frequently occuring words

In [24]:
freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)
features=[i[0] for i in common] # We just want the word, not the number of occurences

In [25]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'bad',
 'much',
 'come',
 'also',
 'give',
 'life',
 'two',
 'look',
 'way',
 'first',
 'know',
 '--',
 'end',
 'seem',
 'year',
 'work',
 'thing',
 'say',
 'really',
 'plot',
 'play',
 'show',
 'little',
 'people',
 'star',
 'love',
 'never',
 'could',
 'director',
 'man',
 'new',
 'try',
 'best',
 'great',
 'performance',
 'action',
 'actor',
 'many',
 'big',
 'want',
 'watch',
 'find',
 'u',
 'think',
 'role',
 'act',
 'another',
 'something',
 'audience',
 'world',
 'back',
 'turn',
 'still',
 'old',
 'set',
 'day',
 'however',
 'use',
 'guy',
 'every',
 'feel',
 'though',
 'enough',
 'cast',
 'around',
 'point',
 'part',
 'begin',
 'interest',
 'comedy',
 'last',
 'run',
 'young',
 'write',
 'real',
 'actually',
 'long',
 'right',
 'fact',
 'woman',
 'may',
 'script',
 'name',
 'funny',
 'effect',
 'line',
 'almost',
 'lot',


In [36]:
def get_feature_dict(words):
    current_features = {} # this will be a dict
    word_set = set(words)
    for w in features:
        current_features[w] = w in word_set
    return current_features

#### TRAINING DATA WILL BE A TUPLE OF DICTIONARY OF FEATURES(True/False) AND CATEGORY FOR EACH REVIEW

In [27]:
training_data = [(get_feature_dict(doc),category) for doc,category in training_documents]

In [28]:
get_feature_dict(training_documents[10][0])

{'film': True,
 'movie': False,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'see': True,
 'go': False,
 'time': True,
 'well': True,
 'scene': True,
 'even': False,
 'good': True,
 'story': True,
 'take': True,
 'would': False,
 'bad': False,
 'much': True,
 'come': False,
 'also': True,
 'give': False,
 'life': True,
 'two': True,
 'look': False,
 'way': True,
 'first': False,
 'know': True,
 '--': True,
 'end': True,
 'seem': False,
 'year': True,
 'work': True,
 'thing': False,
 'say': False,
 'really': False,
 'plot': True,
 'play': True,
 'show': True,
 'little': False,
 'people': False,
 'star': True,
 'love': True,
 'never': False,
 'could': True,
 'director': True,
 'man': True,
 'new': True,
 'try': False,
 'best': True,
 'great': True,
 'performance': True,
 'action': False,
 'actor': True,
 'many': False,
 'big': True,
 'want': True,
 'watch': False,
 'find': True,
 'u': True,
 'think': False,
 'role': True,
 'act': True,
 'another': False,

In [29]:
testing_data = [(get_feature_dict(doc),category) for doc,category in testing_documents]

In [31]:
# training_data[0]

## Use Naive Bayes Classifier for training

In [32]:
from nltk import NaiveBayesClassifier

In [33]:
classifier = NaiveBayesClassifier.train(training_data)

In [34]:
nltk.classify.accuracy(classifier,testing_data)

0.788

In [35]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     21.5 : 1.0
                  turkey = True              neg : pos    =     18.9 : 1.0
            breathtaking = True              pos : neg    =     11.1 : 1.0
               stupidity = True              neg : pos    =     10.9 : 1.0
              schumacher = True              neg : pos    =     10.3 : 1.0
                   damon = True              pos : neg    =      9.9 : 1.0
                   inept = True              neg : pos    =      9.2 : 1.0
               ludicrous = True              neg : pos    =      8.4 : 1.0
                  seagal = True              neg : pos    =      8.3 : 1.0
              dreamworks = True              pos : neg    =      6.6 : 1.0
                  alicia = True              neg : pos    =      6.2 : 1.0
                   anger = True              pos : neg    =      5.9 : 1.0
             wonderfully = True              pos : neg    =      5.9 : 1.0