In [1]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
# Load the tab-separated movie review file into a DataFrame.
data = pd.read_csv("moviereviews.tsv", sep="\t")

In [3]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
# Count missing values in each column.
data.isnull().sum()

label      0
review    35
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
data = data.dropna(how="any")

In [6]:
# Re-count missing values to confirm dropna() removed them all.
data.isnull().sum()

label     0
review    0
dtype: int64

### Checking for blank reviews

In [12]:
# Flag reviews that consist solely of whitespace and collect their index labels.
# A vectorised string check replaces the row-by-row itertuples() loop, whose
# three-way unpacking only works while the frame has exactly two columns.
is_blank = data["review"].str.isspace()
blanks = data.loc[is_blank].index.tolist()

In [13]:
# Display the index labels collected in `blanks`.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

**We should remove the blank reviews listed above as well.**

In [14]:
# Remove the whitespace-only reviews whose index labels are stored in `blanks`.
# errors="ignore" skips labels that are already gone, so re-running this cell
# on the cleaned frame does not raise KeyError.
data = data.drop(index=blanks, errors="ignore")

In [15]:
# Row and column counts remaining after the clean-up steps.
data.shape

(1938, 2)

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Features are the raw review text; targets are the neg/pos labels.
X, y = data["review"], data["label"]

In [18]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [19]:
from sklearn.pipeline import Pipeline

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
from sklearn.svm import LinearSVC

In [22]:
# Chain TF-IDF text vectorisation with a linear support vector classifier.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("model", LinearSVC()),
    ]
)

In [23]:
# Fit the vectoriser and the classifier on the training split.
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('model', LinearSVC())])

In [24]:
# Predict labels for the held-out reviews.
y_pred = pipe.predict(X_test)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

    accuracy                           0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582



In [27]:
# Counts of true vs. predicted labels; each row sums to that class's support
# in the classification report.
confusion_matrix(y_test, y_pred)

array([[235,  47],
       [ 41, 259]], dtype=int64)