In [1]:
# importing the libraires
import re
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer

In [31]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [59]:
# postive sentiment

pos_rev = pd.read_csv('datasets/pos.txt' , sep='\n' ,encoding='latin-1', header = None)
pos_rev['mood'] = 1
# renaming the column
pos_rev.rename(columns = {0:'review'}, inplace = True)

In [60]:
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [61]:
# negative sentiment

neg_rev = pd.read_csv('datasets/negative.txt' , sep='\n' ,encoding='latin-1', header = None)
neg_rev['mood'] = 0
# renaming the column
neg_rev.rename(columns = {0:'review'}, inplace = True)

In [62]:
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


### pipeline
--->lower cases
--->tokenization
--->remove stopwords
--->remove punct
--->lemma / stem
--->bow/tfid
--->train test split
--->Naiave bayes , Svm
--->evaluate model
--->saving the model
--->testing it

In [64]:
lemma = WordNetLemmatizer()
stopwords = spacy.lang.en.stop_words.STOP_WORDS
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : x.lower())
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : re.sub(r'\d+' , "" , x))
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in stopwords]))

In [65]:
lemma = WordNetLemmatizer()
stopwords = spacy.lang.en.stop_words.STOP_WORDS
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : x.lower())
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : re.sub(r'\d+' , "" , x))
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in stopwords]))

In [66]:
pos_rev

Unnamed: 0,review,mood
0,rock destine st century new `` conan `` go spl...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,like movies fun wasabi good place start,1
4,emerge rare issue movie honest keenly observe ...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family joyous life act yiddish ...,1
5328,stand shadow motown best kind documentary make...,1
5329,nice piscopo years chaykin headly priceless,1


In [67]:
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boys possibly find funny,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people find move,0
5327,definitions 'time waster movie surely,0
5328,stand crocodile hunter hurry badly cobble look...,0
5329,thing look like made-for-home-video quickie,0


In [68]:
com_rev = pd.concat([pos_rev , neg_rev],axis = 0).reset_index()
com_rev

Unnamed: 0,index,review,mood
0,0,rock destine st century new `` conan `` go spl...,1
1,1,gorgeously elaborate continuation `` lord ring...,1
2,2,effective too-tepid biopic,1
3,3,like movies fun wasabi good place start,1
4,4,emerge rare issue movie honest keenly observe ...,1
...,...,...,...
10657,5326,terrible movie people find move,0
10658,5327,definitions 'time waster movie surely,0
10659,5328,stand crocodile hunter hurry badly cobble look...,0
10660,5329,thing look like made-for-home-video quickie,0


In [69]:
# train test split
X_train , X_test , y_train , y_test = train_test_split(com_rev['review'].values , com_rev['mood'].values , test_size = 0.2, random_state = 101)

In [70]:
train_data = pd.DataFrame({'review':X_train , 'mood':y_train})
test_data = pd.DataFrame({'review':X_test , 'mood':y_test})

In [71]:
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(train_data['review'])
test_vector = vectorizer.transform(test_data['review'])

In [72]:
train_vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [73]:
test_vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
# so see the vocbulary

vectorizer.get_feature_names()

['aaa',
 'aaliyah',
 'abagnale',
 'abandon',
 'abandone',
 'abandono',
 'abbass',
 'abbott',
 'abbreviate',
 'abc',
 'abderrahmane',
 'abel',
 'aberration',
 'abhor',
 'abhorrent',
 'abide',
 'abilities',
 'ability',
 'abject',
 'able',
 'ably',
 'abomination',
 'aborbing',
 'aboriginal',
 'aboul',
 'abound',
 'about',
 'above',
 'abrahams',
 'abrams',
 'abridge',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolutamente',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorption',
 'abstract',
 'absurd',
 'absurdist',
 'absurdities',
 'absurdity',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'aburrido',
 'abuse',
 'aby',
 'abysmal',
 'abysmally',
 'acaba',
 'acabamos',
 'academic',
 'academy',
 'accelerate',
 'accent',
 'accentuate',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidental',
 'acclaim',
 'accomodates',
 'accompany',
 'accomplish',
 'accomplishment',
 'accomplishments',
 'accord',
 

In [75]:
# using svm
from sklearn import svm
from sklearn.metrics import classification_report,accuracy_score

In [76]:
classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vector, train_data['mood'])

SVC(kernel='linear')

In [77]:
pred = classifier.predict(test_vector)

In [78]:
accuracy_score(pred , test_data['mood'])

0.7416783872480075

In [80]:
import joblib
joblib.dump(classifier , 'classifier.netflix.pkl')
joblib.dump(vectorizer , 'transform.pkl')

['transform.pkl']

In [86]:
model = joblib.load('classifier.netflix.pkl')
vector = joblib.load('transform.pkl')

review = input('please give me the review:- ')

    
tfidf = vector.transform([review]).toarray()
my_pred = model.predict(tfidf)
if my_pred == 1:
    print('postive review')
else:
    print('negative review')

please give me the review:- movie is great 
postive review
