# SENTIMENT ANALYSER USING NAIVE BAYES

In [6]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer     
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score


# Reading file and Processing it.

In [9]:
df=pd.read_csv('REVIEWS.txt',sep='\t',names=['liked','txt'])

In [10]:
df.head()


Unnamed: 0,liked,txt
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


# TF-IDF Vectorization 

In [2]:
stopset = set(stopwords.words("english"))
print (stopset)

{'doing', 'she', 'weren', 'it', 'having', 'did', 'd', 'her', 'is', 'whom', 'should', 'y', 'ain', 'me', 'own', 'themselves', 'these', 'my', "isn't", 'between', "needn't", 'at', "shan't", 'hadn', 'what', 'about', 'theirs', 'again', 'each', 'wouldn', 'wasn', 'mustn', 'over', "wouldn't", 'any', 'same', 'where', "you're", 'isn', 'have', 'that', 'as', 'herself', 'when', 'does', 'once', 'in', 'while', 'few', 'ma', "you've", "that'll", 'here', 'only', 'before', 'doesn', "doesn't", 'most', 'i', 'his', 'he', 'been', 'through', "haven't", 'shouldn', 't', 'won', 'which', 'because', 'hasn', 'those', 'yourself', 'off', 'further', 'ourselves', 'who', 'be', 'they', 'or', 'all', "weren't", 'for', 'then', 'needn', 'after', "don't", 'them', 'how', 'your', 'can', 'm', 'but', 'to', 'down', 'don', "hasn't", "shouldn't", 'mightn', 'by', 'yours', 'you', 'why', 'some', 'our', 'other', 's', 'didn', 'itself', 'into', "didn't", "she's", 'are', 'do', "should've", 'than', 'myself', 'no', 're', "wasn't", 'now', 'out

In [13]:
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True,strip_accents='ascii',stop_words=stopset)

# In this case, our dependent variable will be liked as 0(didn't like the movie) or 1(liked the movie)

In [11]:
y = df.liked
print(y)

0       1
1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
6888    0
6889    0
6890    0
6891    0
6892    0
6893    0
6894    0
6895    0
6896    0
6897    0
6898    0
6899    0
6900    0
6901    0
6902    0
6903    0
6904    0
6905    0
6906    0
6907    0
6908    0
6909    0
6910    0
6911    0
6912    0
6913    0
6914    0
6915    0
6916    0
6917    0
Name: liked, Length: 6918, dtype: int64


In [12]:
# convert df.txt from text to features
x = vectorizer.fit_transform(df.txt)
print (x)

  (0, 418)	0.3263290976383834
  (0, 1906)	0.3263290976383834
  (0, 331)	0.3262559657747791
  (0, 216)	0.7161106376265326
  (0, 137)	0.4095866691477198
  (1, 418)	0.09796474160548119
  (1, 1906)	0.09796474160548119
  (1, 331)	0.09794278725273431
  (1, 674)	0.22592190468842513
  (1, 321)	0.398401288038176
  (1, 414)	0.398401288038176
  (1, 584)	0.2298241967707875
  (1, 1446)	0.2238961227930013
  (1, 582)	0.28506337161183415
  (1, 217)	0.28195158685585064
  (1, 1075)	0.14687245685027675
  (1, 1467)	0.398401288038176
  (1, 1360)	0.398401288038176
  (2, 418)	0.23703245675471377
  (2, 1906)	0.23703245675471377
  (2, 331)	0.23697933668230028
  (2, 1076)	0.5507259552895343
  (2, 1109)	0.7267550791330575
  (3, 418)	0.23703245675471377
  (3, 1906)	0.23703245675471377
  :	:
  (6913, 220)	0.855108915546091
  (6914, 1454)	0.5724264186623724
  (6914, 238)	0.3318987022087672
  (6914, 1211)	0.3318987022087672
  (6914, 463)	0.6723202348298334
  (6915, 1214)	0.4220713593028687
  (6915, 1222)	0.238639918

In [10]:
#6918 observations x 2011 unique words.
print (y.shape)
print (x.shape)

(6918,)
(6918, 2011)


# Test Train Split as usual

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y, random_state = 42)

# Training a Naive Bayes classifier

In [17]:
clf = naive_bayes.MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Accuracy with Test Data

In [23]:
roc_auc_score(y_test, clf.predict_proba(x_test)[:,1])

0.9979292333245913

# Running Our Model for any random review

In [47]:
movie_reviews_array1 = np.array(["Jupiter Ascending was a disappointing and terrible movie"])
movie_review_vector = vectorizer.transform(movie_reviews_array1)
print (clf.predict(movie_review_vector))

[0]


In [48]:
movie_reviews_array2 = np.array(["saand ki ankh was an interesting story"])
movie_review_vector = vectorizer.transform(movie_reviews_array2)
print (clf.predict(movie_review_vector))

[1]


In [49]:
movie_reviews_array3 = np.array(["It was an amazing stuff"])
movie_review_vector = vectorizer.transform(movie_reviews_array3)
print (clf.predict(movie_review_vector))

[1]


In [50]:
movie_reviews_array5 = np.array(["That was a horrible one"])
movie_review_vector = vectorizer.transform(movie_reviews_array5)
print (clf.predict(movie_review_vector))

[0]
