In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("moviereviews.tsv", sep='\t')

In [3]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


**We'll predict negative or positive movie review from classification of text**

In [4]:
len(df)

2000

In [6]:
print(df['review'][0])

how do films like mouse hunt get into theatres ? 
isn't there a law or something ? 
this diabolical load of claptrap from steven speilberg's dreamworks studio is hollywood family fare at its deadly worst . 
mouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . 
writer adam rifkin and director gore verbinski are the names chiefly responsible for this swill . 
the plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . 
deciding to check out the long-abandoned house , they soon learn that it's worth a fortune and set about selling it in auction to the highest bidder . 
but battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . 
the story alternate

In [7]:
df.isnull().sum()

label      0
review    35
dtype: int64

So, we're not missing any labels but we have some empty moview reviews. So, we'll remove them.

In [8]:
df.dropna(inplace=True) #inplace True means permananet drop

In [9]:
df.isnull().sum()

label     0
review    0
dtype: int64

Sometimes a movie review will be empty string " ". But for computer single space is not null. So, we'll check all the reviews that there is actual review in all the rows in data.

In [10]:
mystring = 'hello'
empty = ' '

In [11]:
mystring.isspace()

False

In [12]:
empty.isspace()

True

In [13]:
blanks = [] #setting up list of blanks

# (index, label, review text)
for i,lb,rv in df.itertuples():
    if rv.isspace():
        blanks.append(i)

In [14]:
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [16]:
len(blanks)

27

In [18]:
df.drop(blanks, inplace=True)

In [19]:
len(df)

1938

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X = df['review']
y = df['label']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.3, random_state=42)

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [25]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                    ('clf',LinearSVC())])

In [26]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [27]:
predictions = text_clf.predict(X_test)

In [28]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [29]:
print(confusion_matrix(y_test, predictions))

[[235  47]
 [ 41 259]]


In [30]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

    accuracy                           0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582



In [31]:
print(accuracy_score(y_test, predictions))

0.8487972508591065


In [35]:
Dummy_text =['text_clf.predict("Dangal is an emotionally made biographical tale that talks about a farmer National level wrestler It is not the first time we are witnessing a film that speaks about a wrestler’s life, but no matter how many films you might have watched, Dangal stands unique in a lot of ways. Director Nitesh Tiwari has to be credited for the way he has treated the film with not much force fitted theatrics yet grabbing the attention of the audience throughout the film. Yes, there were one or two dramatic moments")']

In [43]:
x = pd.Series(Dummy_text)

In [44]:
x

0    text_clf.predict("Dangal is an emotionally mad...
dtype: object

In [45]:
text_clf.predict(x)

array(['pos'], dtype=object)