In [1]:
# Step 1___ Load the dataset

import numpy as np
import pandas as pd

df = pd.read_csv('moviereviews.tsv', sep = '\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [2]:
len(df)

2000

In [3]:
# Step 2___ check for null values present in the dataset

df.isnull().sum()

label      0
review    35
dtype: int64

In [None]:
## We can tell that 35 records show NaN ()  This can be easily removed using the .dropna() pandas function

In [4]:
df.dropna(inplace = True)

len(df)

1965

In [5]:
# Step 3_________Whitespace handling is necessary, like we need to remove blank spaces

blanks = [] # Start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the Dataframe
    if type(rv) == str:          # avoid NaN values
        if rv.isspace():         # test 'review' for whitespaces 
            blanks.append(i)     #  add matching index numbers to the list
            
print(len(blanks), 'blanks: ', blanks)

27 blanks:  [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [6]:
## Next we'll pass our list of index numbers to the .drop() method and set inplace= True to make the change permanent

df.drop(blanks, inplace = True)

len(df)

1938

In [7]:
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [9]:
###Step 4 _________Split the data into train and test sets

from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train , y_test = train_test_split(X, y, test_size = 0.33, random_state = 42 )

In [10]:
### Step 5_________ We use Naive bayes and Linear SVC Classifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# naive Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),('clf', MultinomialNB()),])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),('clf',LinearSVC()),])

In [11]:
###Step 6___________Feed the training data thru the first pipeline

## We'll run NB first:

text_clf_nb.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [12]:
# Form a prediciton set

predictions = text_clf_nb.predict(X_test)

In [18]:
# Print the overall accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test,predictions))

0.7640625


In [19]:
### Step 7 ___ Next Run LSVC

text_clf_lsvc.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [22]:
# Form a prediciton set

predictions1 = text_clf_lsvc.predict(X_test)

In [23]:
# Print the overall accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test,predictions1))

0.846875


In [None]:
## You may also add stop words

from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

In [24]:
# RUN this cell to add stopwords to the linear SVC Pipeline


# Linear SVC:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer()),('clf',LinearSVC()),])
text_clf_lsvc2.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [25]:
print(metrics.accuracy_score(y_test,predictions1))

0.846875


In [27]:
myreview = "I loved the movie. It was amazing!"

In [28]:
## Now check whether the review is postive or negative using the model

print(text_clf_nb.predict([myreview]))

['pos']


In [None]:
## Same thing with Linear SVC

In [29]:
print(text_clf_lsvc.predict([myreview]))

['pos']
