In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

# Using Movie Reviews dataset of nltk library

## Importing the moview_reviews from nltk

In [3]:
import random
from nltk.corpus import movie_reviews
help(movie_reviews)

Help on LazyCorpusLoader in module nltk.corpus.util object:

movie_reviews = class LazyCorpusLoader(builtins.object)
 |  movie_reviews(name, reader_cls, *args, **kwargs)
 |  
 |  To see the API documentation for this lazily loaded corpus, first
 |  run corpus.ensure_loaded(), and then run help(this_corpus).
 |  
 |  LazyCorpusLoader is a proxy object which is used to stand in for a
 |  corpus object before the corpus is loaded.  This allows NLTK to
 |  create an object for each corpus, but defer the costs associated
 |  with loading those corpora until the first time that they're
 |  actually accessed.
 |  
 |  The first time this object is accessed in any way, it will load
 |  the corresponding corpus, and transform itself into that corpus
 |  (by modifying its own ``__class__`` and ``__dict__`` attributes).
 |  
 |  If the corpus can not be found, then accessing this object will
 |  raise an exception, displaying installation instructions for the
 |  NLTK data package.  Once they've 

## Preparing the document  

In [16]:
#loading moview_reviews
#import nltk
#nltk.download('movie_reviews')
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
print("Number of Reviews:",len(documents))

#This loaded document is a list of tokens eg-['don',''','t',i,movie,was,good]

#For shuffling the document (not so important ,just to increase reproductibility)
random.seed
random.shuffle(documents)

#list to store all review text and label
text_data=[]
label=[]
for i in range(len(documents)):
    text_data.append(' '.join(documents[i][0]))
    label.append(0 if documents[i][1]=='neg' else 1)
    
print("Number of Positive review:",label.count(1))
print("Number of Negative review:",label.count(0))



Number of Reviews: 2000
Number of Positive review: 1000
Number of Negative review: 1000


## Splitting the Dataset

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(text_data,label,test_size=0.25,random_state=23)
print(X_train[3])
print()
print(y_train[34])

eddie murphy has a lot riding on harlem nights . as the movie ' s writer , director , executive producer , and star , murphy will shoulder all of the blame if harlem nights fails . but at the same time , he ' ll receive all of the credit if it succeeds . should you sacrifice your hard - earned cash to support murphy ' s risky gamble ? well , that depends on whom you trust more : me or eddie murphy . here ' s what murphy thinks : " i think the audience is expecting a good time . they gonna get sexy . they gonna get funny . they gonna get drama . they gonna get all of that . i think it ' s the best movie i ' ve done " ( paramount radio network ) . here ' s what i think : harlem nights is charmless , unoriginal , disappointing , and almost without question , the worst film of the actor ' s career ( i haven ' t seen best defense ) . and guess who ' s to blame ? ! the movie ' s problem is not murphy ' s direction : harlem nights is a fairly good looking film . no , the project was probably 

## Preparing the Bag of words(DTM) ,Fitting the model , Calculating the Score

In [54]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

tf_cv=TfidfVectorizer(stop_words='english')
train_dtm_tf=tf_cv.fit_transform(X_train)
test_dtm_tf=tf_cv.transform(X_test)

nb=MultinomialNB()
nb=nb.fit(train_dtm_tf,y_train)
predicted=nb.predict(test_dtm_tf)
score=100.0* nb.score(test_dtm_tf,y_test)
print("The accuracy of the Naive bayes:",score)
print("Classification Report:")
report=metrics.classification_report(y_test,predicted, target_names = ['Negative', 'Positive'])
print(report)

The accuracy of the Naive bayes: 78.8
Classification Report:
              precision    recall  f1-score   support

    Negative       0.72      0.89      0.80       238
    Positive       0.88      0.69      0.77       262

   micro avg       0.79      0.79      0.79       500
   macro avg       0.80      0.79      0.79       500
weighted avg       0.81      0.79      0.79       500



In [46]:
metrics.confusion_matrix(y_test,predicted)
#confusion(y_test, y_pred, ['Negative', 'Positive'], 'Naive Bayes Model')

array([[213,  25],
       [ 81, 181]], dtype=int64)

## Top 20 positive words 

In [47]:
all_words=np.array(tf_cv.get_feature_names())
top_word_index=np.argsort(nb.coef_[0])[-20:]
tn_lst=[word for word in all_words[top_word_index]]
tn_lst.reverse()
print(tn_lst)

['film', 'movie', 'like', 'life', 'story', 'good', 'just', 'time', 'character', 'characters', 'films', 'great', 'way', 'people', 'best', 'really', 'does', 'love', 'man', 'world']


## Using Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(C=1000)

lr=lr.fit(train_dtm_tf,y_train)
predicted=lr.predict(test_dtm_tf)
scr = 100.0 * lr.score(test_dtm_tf, y_test)
print("Accuracy of Logistic Regression:",scr)

Accuracy of Logistic Regression: 84.39999999999999


### Top 20 Positive Word 

In [55]:
top_word_index=np.argsort(lr.coef_[0])[-20:]
tn_lst=[word for word in all_words[top_word_index]]
tn_lst.reverse()
print(tn_lst)

['great', 'fun', 'overall', 'life', 'memorable', 'definitely', 'quite', 'frank', 'performance', 'seen', 'excellent', 'hilarious', 'titanic', 'terrific', 'enjoyed', 'job', 'rob', 'family', 'different', 'performances']


### Top 20 Negative Word

In [56]:
y_train_reverse = [0 if y==1 else 1 for y in y_train]
lr = lr.fit(train_dtm_tf, y_train_reverse)

top_word_index = np.argsort(lr.coef_[0])[-20:]
tn_lst = [word for word in all_words[top_word_index]]
tn_lst.reverse()
print(tn_lst)



['bad', 'plot', 'unfortunately', 'boring', 'worst', 'reason', 'supposed', 'awful', 'poor', 'waste', 'stupid', 'script', 'ridiculous', 'fails', 'harry', 'dull', 'carpenter', 'terrible', 'mess', 'poorly']


# Now using Stemming 

## As Countvectorizer and TF-IDF dont do stemming 

In [5]:
import string,nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def tokenize(text):
    tokens=nltk.word_tokenize(text)
    tokens=[token for token in tokens if token not in string.punctuation]
    ps=PorterStemmer()
    stems=map(stemmer.stem,tokens)
    return stems

tf_cv=TfidfVectorizer(tokenizer=tokenize)
train_dtm_tf=tf_cv.fit_transform(X_train)
test_dtm_tf=tf_cv.transform(X_test)

lr=LogisticRegression(C=1000)
lr=lr.fit(train_dtm_tf,y_train)
predicted=lr.predict(test_dtm_tf)

scr=100.0 * lr.score(test_dtm_tf,y_test)
print("Accuracy after applying stemming:",scr)

NameError: name 'X_train' is not defined