# Sentiment analysis for Netflix movie reviews

### Importing all the necessary libraries

In [11]:
import numpy as np
import pandas as pd
# import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn import svm
from sklearn.metrics import classification_report
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
# from keras.utils.np_utils import to_categorical
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import string
from nltk.corpus import stopwords

In [12]:
# English stop words from NLTK, held in a set so the per-word `in` checks used
# by the review-cleaning cells are constant-time instead of scanning a list.
sw = set(stopwords.words('english'))

### reading the data

In [13]:
# Load the positive reviews: one review per line, each labelled with mood = 1.0.
# The file is read line by line because pd.read_csv(sep="\n") is rejected by
# recent pandas versions, and plain line reading also keeps quote characters in
# the review text from being interpreted as CSV quoting.
with open("pos.txt", encoding='latin-1') as pos_file:
    # Blank (empty or whitespace-only) lines are skipped so they do not become reviews.
    pos_lines = [line.rstrip("\n") for line in pos_file if line.strip()]
# mood stays a float column (1.0) so the classification-report keys remain '1.0' / '0.0'.
pos_rev = pd.DataFrame({'review': pos_lines, 'mood': np.ones(len(pos_lines))})

In [14]:
# Show the labelled positive-review frame as the cell's output.
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1.0
1,"the gorgeously elaborate continuation of "" the...",1.0
2,effective but too-tepid biopic,1.0
3,if you sometimes like to go to the movies to h...,1.0
4,"emerges as something rare , an issue movie tha...",1.0
...,...,...
5326,both exuberantly romantic and serenely melanch...,1.0
5327,mazel tov to a film about a family's joyous li...,1.0
5328,standing in the shadows of motown is the best ...,1.0
5329,it's nice to see piscopo again after all these...,1.0


In [15]:
# Quick sanity check: (rows, columns) followed by the first five positive reviews.
print(pos_rev.shape, pos_rev.head(), sep="\n")

(5331, 2)
                                              review  mood
0  the rock is destined to be the 21st century's ...   1.0
1  the gorgeously elaborate continuation of " the...   1.0
2                     effective but too-tepid biopic   1.0
3  if you sometimes like to go to the movies to h...   1.0
4  emerges as something rare , an issue movie tha...   1.0


In [16]:
# Load the negative reviews: one review per line, each labelled with mood = 0.0.
# The label column is sized from the negative file itself; sizing it from
# pos_rev only works when both files have exactly the same number of lines,
# otherwise the concat pads the shorter side with NaN.
# The file is read line by line because pd.read_csv(sep="\n") is rejected by
# recent pandas versions, and plain line reading also keeps quote characters in
# the review text from being interpreted as CSV quoting.
with open("negative.txt", encoding='latin-1') as neg_file:
    # Blank (empty or whitespace-only) lines are skipped so they do not become reviews.
    neg_lines = [line.rstrip("\n") for line in neg_file if line.strip()]
# mood stays a float column (0.0) so the classification-report keys remain '1.0' / '0.0'.
neg_rev = pd.DataFrame({'review': neg_lines, 'mood': np.zeros(len(neg_lines))})
print(neg_rev.head())


                                              review  mood
0                   simplistic , silly and tedious.    0.0
1  it's so laddish and juvenile , only teenage bo...   0.0
2  exploitative and largely devoid of the depth o...   0.0
3  [garbus] discards the potential for pathologic...   0.0
4  a visually flashy but narratively opaque and e...   0.0


### preprocessing

In [17]:
# Clean the positive reviews in a single pass over the column:
# lower-case, drop @-prefixed tokens, strip punctuation, then remove stop words.
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(
    lambda text: ' '.join(
        token
        for token in (
            re.sub(r"@\S+", "", text.lower())
            .translate(str.maketrans(dict.fromkeys(string.punctuation)))
            .split()
        )
        if token not in sw
    )
)

In [18]:
# Clean the negative reviews in a single pass over the column:
# lower-case, drop @-prefixed tokens, strip punctuation, then remove stop words.
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(
    lambda text: ' '.join(
        token
        for token in (
            re.sub(r"@\S+", "", text.lower())
            .translate(str.maketrans(dict.fromkeys(string.punctuation)))
            .split()
        )
        if token not in sw
    )
)

### Combining the positive and negative reviews

In [19]:
# Stack the positive and negative reviews into one frame with a fresh 0..n-1 index.
# ignore_index=True is used instead of a bare reset_index(), which would keep the
# old per-file row numbers as an extra 'index' column; the cells that follow only
# read the 'review' and 'mood' columns.
com_rev = pd.concat([pos_rev, neg_rev], axis =0, ignore_index=True)

### Splitting the data into training and testing sets

In [20]:
# Hold out 33% of the combined reviews for testing; the fixed random_state keeps
# the split the same on every run.
review_texts = com_rev['review'].values
mood_labels = com_rev['mood'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    review_texts,
    mood_labels,
    test_size = 0.33,
    random_state = 42,
)

In [21]:
# Wrap each split back into a two-column DataFrame (review text + mood label).
train_data, test_data = (
    pd.DataFrame({'review': texts, 'mood': labels})
    for texts, labels in ((X_train, Y_train), (X_test, Y_test))
)

In [22]:
#The TfidfVectorizer will tokenize documents, learn the vocabulary and inverse document frequency weightings, 
#and allow you to encode new documents

from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# TF-IDF vectorizer created with no arguments, i.e. the library defaults.
vectorizer = TfidfVectorizer()

In [24]:
# Fit the vectorizer on the training reviews only, then apply that same fitted
# vectorizer to the test reviews (the test set is transformed, never fitted on).
train_vectors = vectorizer.fit_transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])

### using SVM

In [25]:
# Train a linear-kernel SVM on the TF-IDF training vectors (fit returns the
# estimator itself), then predict a mood for every held-out review.
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, train_data['mood'])
prediction_linear = classifier_linear.predict(test_vectors)

In [26]:
# Per-class recall of the SVM on the test set. The report is keyed by the string
# form of the float mood labels ('1.0' = positive, '0.0' = negative).
report = classification_report(test_data['mood'], prediction_linear, output_dict=True)
for caption, label_key in (('positive:', '1.0'), ('negative:', '0.0')):
    print(caption, report[label_key]['recall'])

positive: 0.7429696287964005
negative: 0.7512923607122344


In [27]:
# Persist the fitted TF-IDF vectorizer. The with-block closes the file handle as
# soon as the dump finishes, instead of leaving it open.
# The file name 'tranform.pkl' is kept as-is because the loading cell reads that exact name.
with open('tranform.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [28]:
# Persist the trained linear SVM. The with-block closes the file handle as soon
# as the dump finishes, instead of leaving it open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier_linear, model_file)

### pickling file for deployment

In [29]:
import pickle

# Round-trip check: reload both pickles from disk and classify one sample review.
data = ["best movie"] # excellent movie # worst movie # bad movie # best movie

# NOTE(review): pickle.load can execute arbitrary code while loading; only load
# pickle files that come from a trusted source.
# loading the transform model (with-block closes the file handle after reading)
with open('tranform.pkl','rb') as transform_file:
    tfidf = pickle.load(transform_file)

# loading the model
with open('model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

# Vectorize the sample with the reloaded vectorizer and predict its mood.
vect = tfidf.transform(data).toarray()
my_prediction = clf.predict(vect)
print(my_prediction)

[1.]


### Naive Bayes

In [30]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the same TF-IDF training vectors.
# Note: this rebinds `clf`, which previously held the SVM reloaded from model.pkl.
clf = MultinomialNB().fit(train_vectors, train_data['mood'])

# Mean accuracy on the held-out reviews.
print(clf.score(test_vectors, test_data['mood']))

clf_predicted = clf.predict(test_vectors)

# Per-class recall, keyed by the string form of the float mood labels.
report = classification_report(test_data['mood'], clf_predicted, output_dict=True)
for caption, label_key in (('positive:', '1.0'), ('negative:', '0.0')):
    print(caption, report[label_key]['recall'])

0.7649900539926116
positive: 0.766029246344207
negative: 0.7639287765651924
