In [111]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## Importing and loading the dataset

In [112]:
# Load the tab-separated reviews file; the path is relative to the working directory.
df = pd.read_csv("UPDATED_NLP_COURSE/TextFiles/moviereviews2.tsv", sep='\t')

In [113]:
# Preview the first rows to confirm the label/review layout.
df.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [114]:
# Number of rows as loaded, before any cleaning.
len(df)

6000

## Missing value treatment

In [115]:
# Count missing values in each column.
df.isnull().sum()

label      0
review    20
dtype: int64

In [116]:
# Discard every row that has a missing value in any column; rebinding the
# name instead of mutating in place leaves df with the same contents.
df = df.dropna()

In [117]:
# Row count after removing rows with missing values.
len(df)

5980

In [118]:
# Collect the index labels of reviews that consist solely of whitespace.
# The column is selected by name, so the check no longer depends on the frame
# having exactly two columns, and it runs vectorized instead of row by row.
# This step only detects such rows; nothing is removed from df here.
whitespace_only = df['review'].str.isspace()
blanks = df.loc[whitespace_only].index.tolist()

In [119]:
# Show the index labels of whitespace-only reviews (an empty list means none were found).
blanks

[]

## Splitting the data into train and test sets

In [120]:
# Features: the raw review text.
X = df['review']

In [121]:
# Target: the label column for each review.
y = df['label']

In [122]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=89,
)

## Building a pipeline to vectorize the data, then fitting a model

In [123]:
# Two-stage pipeline: raw text is turned into TF-IDF features, which are then
# passed to a linear support vector classifier. Both stages use their defaults.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps=pipeline_steps)

In [124]:
# Fit the vectorizer and the classifier on the training split only.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

## Making predictions and analyzing the results

In [125]:
# Predict labels for the held-out reviews using the fitted pipeline.
predictions = text_clf.predict(X_test)

In [126]:
# Confusion matrix: rows are the true labels, columns are the predicted labels.
print(confusion_matrix(y_test, predictions))

[[897  76]
 [ 66 935]]


In [127]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         neg       0.93      0.92      0.93       973
         pos       0.92      0.93      0.93      1001

   micro avg       0.93      0.93      0.93      1974
   macro avg       0.93      0.93      0.93      1974
weighted avg       0.93      0.93      0.93      1974



In [128]:
# Overall test accuracy expressed as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
print('Accuracy : ' + str(accuracy_pct) + '%')

Accuracy : 92.81%
