In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# positive
# Load the positive reviews: one review per line of 'pos.txt', stored in a
# single column named 0.
# The lines are read directly because recent pandas releases (1.3+) raise a
# ValueError for read_csv(..., sep='\n'): the line terminator is not allowed
# as a separator. Blank lines are skipped.
with open('pos.txt', encoding = 'latin-1') as pos_file:
    pos_rev = pd.DataFrame([line.rstrip('\n') for line in pos_file if line.strip()])
print(pos_rev)

                                                      0
0     the rock is destined to be the 21st century's ...
1     the gorgeously elaborate continuation of " the...
2                        effective but too-tepid biopic
3     if you sometimes like to go to the movies to h...
4     emerges as something rare , an issue movie tha...
...                                                 ...
5326  both exuberantly romantic and serenely melanch...
5327  mazel tov to a film about a family's joyous li...
5328  standing in the shadows of motown is the best ...
5329  it's nice to see piscopo again after all these...
5330  provides a porthole into that noble , tremblin...

[5331 rows x 1 columns]


In [3]:
# Name the text column 'review' and tag every positive review with class 1.
pos_rev = pos_rev.rename(columns = {0: 'review'}).assign(target = 1)

In [4]:
# Display the positive reviews frame to check the 'review' and 'target' columns.
pos_rev

Unnamed: 0,review,target
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [5]:
# negative
# Load the negative reviews: one review per line of 'negative.txt', put in a
# 'review' column and labelled with class 0.
# The lines are read directly because recent pandas releases (1.3+) raise a
# ValueError for read_csv(..., sep='\n'): the line terminator is not allowed
# as a separator. Blank lines are skipped.
with open('negative.txt', encoding = 'latin-1') as neg_file:
    neg_rev = pd.DataFrame({'review': [line.rstrip('\n') for line in neg_file if line.strip()]})
neg_rev['target'] = 0
neg_rev

Unnamed: 0,review,target
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [6]:
# lower, number, punctuation, stopwords (lemmatization is applied in a later cell)

# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension re-creates the whole list for every single token.
english_stopwords = set(stopwords.words('english'))

pos_rev['review'] = pos_rev['review'].apply(lambda x : x.lower())
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
pos_rev['review'] = pos_rev['review'].apply(lambda x : re.sub(r'\d+', "", x ))
# `in string.punctuation` is a substring test against the punctuation string,
# so multi-character tokens such as `` are kept.
pos_rev['review'] = pos_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x)
                                                        if word not in string.punctuation]))
# Second tokenization pass over the re-joined text, this time dropping stop words.
pos_rev['review'] = pos_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x)
                                                        if word not in english_stopwords]))

In [7]:
# Same cleaning as for the positive reviews: lower-case, drop digits, drop
# punctuation tokens, drop English stop words.

# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension re-creates the whole list for every single token.
english_stopwords = set(stopwords.words('english'))

neg_rev['review'] = neg_rev['review'].apply(lambda x : x.lower())
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
neg_rev['review'] = neg_rev['review'].apply(lambda x : re.sub(r'\d+', "", x ))
# `in string.punctuation` is a substring test against the punctuation string,
# so multi-character tokens such as `` are kept.
neg_rev['review'] = neg_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x)
                                                        if word not in string.punctuation]))
# Second tokenization pass over the re-joined text, this time dropping stop words.
neg_rev['review'] = neg_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x)
                                                        if word not in english_stopwords]))


In [8]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()
# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension re-creates the whole list for every single token.
english_stopwords = set(stopwords.words('english'))


def lemmatize_review(text):
    """Tokenize `text`, drop English stop words and re-join the lemmatized tokens.

    Each kept token is passed to `lemma.lemmatize` without a part-of-speech
    argument. The stop-word test is applied to the token before lemmatization.
    """
    return " ".join([lemma.lemmatize(word) for word in nltk.word_tokenize(text)
                     if word not in english_stopwords])


# One helper for both frames instead of two copies of the same lambda.
pos_rev['review'] = pos_rev['review'].apply(lemmatize_review)
neg_rev['review'] = neg_rev['review'].apply(lemmatize_review)

print(pos_rev)
print(neg_rev)

                                                 review  target
0     rock destined st century 's new `` conan `` 's...       1
1     gorgeously elaborate continuation `` lord ring...       1
2                            effective too-tepid biopic       1
3     sometimes like go movie fun wasabi good place ...       1
4     emerges something rare issue movie 's honest k...       1
...                                                 ...     ...
5326  exuberantly romantic serenely melancholy time ...       1
5327  mazel tov film family 's joyous life acting yi...       1
5328  standing shadow motown best kind documentary o...       1
5329  's nice see piscopo year chaykin headly priceless       1
5330  provides porthole noble trembling incoherence ...       1

[5331 rows x 2 columns]
                                                 review  target
0                              simplistic silly tedious       0
1     's laddish juvenile teenage boy could possibly...       0
2     exploitat

In [9]:
# Display the positive reviews after cleaning and lemmatization.
pos_rev

Unnamed: 0,review,target
0,rock destined st century 's new `` conan `` 's...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerges something rare issue movie 's honest k...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family 's joyous life acting yi...,1
5328,standing shadow motown best kind documentary o...,1
5329,'s nice see piscopo year chaykin headly priceless,1


In [10]:
# Display the negative reviews after cleaning and lemmatization.
neg_rev

Unnamed: 0,review,target
0,simplistic silly tedious,0
1,'s laddish juvenile teenage boy could possibly...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find moving,0
5327,many definition 'time waster movie must surely...,0
5328,stand crocodile hunter hurried badly cobbled l...,0
5329,thing look like made-for-home-video quickie,0


In [11]:
# concat the data

# Stack the negative reviews on top of the positive ones (row-wise is the
# default axis) and renumber the rows from 0.
com_rev = pd.concat([neg_rev, pos_rev], ignore_index = True)
com_rev

Unnamed: 0,review,target
0,simplistic silly tedious,0
1,'s laddish juvenile teenage boy could possibly...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
10657,exuberantly romantic serenely melancholy time ...,1
10658,mazel tov film family 's joyous life acting yi...,1
10659,standing shadow motown best kind documentary o...,1
10660,'s nice see piscopo year chaykin headly priceless,1


In [12]:
# train test split

# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    com_rev['review'].values,
    com_rev['target'].values,
    test_size = 0.2,
    random_state = 101,
)

In [13]:
# Keep each split's review text and label side by side.
train_data = pd.DataFrame({'review':X_train , 'mood' : y_train})
test_data = pd.DataFrame({'review':X_test, 'mood' : y_test})

# Fit the TF-IDF vocabulary and weights on the training reviews only, then
# apply the fitted vectorizer unchanged to the test reviews.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])


# to check the vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Listing the terms of vocabulary_ (term -> column index) ordered by index
# gives the same list of feature names on every scikit-learn version.
sorted(vectorizer.vocabulary_, key = vectorizer.vocabulary_.get)

['aaliyah',
 'abagnale',
 'abandon',
 'abandone',
 'abandoned',
 'abandono',
 'abbas',
 'abbass',
 'abbott',
 'abbreviated',
 'abc',
 'abderrahmane',
 'abel',
 'aberration',
 'abhorrent',
 'abiding',
 'ability',
 'able',
 'ably',
 'abomination',
 'aboriginal',
 'aborted',
 'aboul',
 'abound',
 'above',
 'abrahams',
 'abrams',
 'abrasive',
 'abridged',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolutamente',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'absorption',
 'abstract',
 'absurd',
 'absurdist',
 'absurdity',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'aburrido',
 'abuse',
 'abused',
 'abuser',
 'abysmal',
 'abysmally',
 'abyss',
 'acaba',
 'acabamos',
 'academic',
 'academy',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepting',
 'accepts',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidental',
 'acclaim',
 'acclaimed',
 'accommodate',
 'accompanied',
 'accompanies',
 'accompany',
 'a

In [14]:
from sklearn import svm
from sklearn.metrics import classification_report

In [15]:
# Fit a linear-kernel support vector classifier on the vectorized training
# reviews, using the 'mood' column as the label.
classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vectors, train_data['mood'])

SVC(kernel='linear')

In [16]:
# Predict a label for every vectorized test review.
pred = classifier.predict(test_vectors)

In [17]:
# Compare the predictions with the true test labels; output_dict = True
# returns the metrics as a nested dict instead of a formatted string.
report = classification_report(test_data['mood'], pred, output_dict = True)

In [18]:
# Display the per-class and averaged precision / recall / f1-score / support.
report

{'0': {'precision': 0.7477148080438757,
  'recall': 0.7574074074074074,
  'f1-score': 0.7525298988040479,
  'support': 1080},
 '1': {'precision': 0.7478344562078922,
  'recall': 0.7378917378917379,
  'f1-score': 0.7428298279158699,
  'support': 1053},
 'accuracy': 0.7477730895452415,
 'macro avg': {'precision': 0.747774632125884,
  'recall': 0.7476495726495727,
  'f1-score': 0.7476798633599588,
  'support': 2133},
 'weighted avg': {'precision': 0.747773874859023,
  'recall': 0.7477730895452415,
  'f1-score': 0.7477412562136816,
  'support': 2133}}

In [19]:
import joblib
# Persist the fitted classifier and the fitted vectorizer to disk; new text
# must be transformed with this same vectorizer before it is scored.
joblib.dump(classifier, 'netflix.pkl')
joblib.dump(vectorizer, 'vector_transform.pkl')

['vector_transform.pkl']

In [20]:
# Reload the saved vectorizer and classifier from disk.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load files from a trusted source.
vector = joblib.load('vector_transform.pkl')
model = joblib.load('netflix.pkl')

In [24]:
# Score one new review with the reloaded vectorizer and model.
# NOTE(review): the training reviews were lower-cased, stripped of digits,
# punctuation tokens and stop words, and lemmatized before vectorizing; new
# text is passed here without those steps - confirm this is intended.
data = ['i loved that movie']
# Keep the transformed row sparse: the model was fitted on the vectorizer's
# sparse output and predict() accepts sparse input, so .toarray() only
# allocates a dense row as wide as the whole vocabulary for nothing.
tfidf = vector.transform(data)
model.predict(tfidf)

array([1], dtype=int64)