In [32]:
import numpy as np
import pandas as pd

In [33]:
# Load the tab-separated movie review file; the cells below rely on its
# `label` and `review` columns.
df = pd.read_csv(
    'dataset/moviereviews.tsv',
    sep='\t',
)

In [34]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [35]:
len(df)

2000

In [36]:
df['label'].value_counts()

pos    1000
neg    1000
Name: label, dtype: int64

In [37]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [38]:
# Discard every row that has a missing value (NaN/None) in any column;
# per the null counts above, only `review` has any.
df = df.dropna(axis=0, how='any')

In [39]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [40]:
len(df)

1965

In [41]:
def detect_blanks(df, column=None):
    """Find the rows whose text is empty or consists only of whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    column : label, optional
        Column that holds the text. When omitted, the last column is used,
        which matches the two-column (label, review) layout of this notebook.

    Returns
    -------
    list
        Index labels (not positions) of the blank rows, in frame order.
        Values that are not strings (e.g. NaN) are skipped.

    Also prints a one-line summary with the number of blanks found.
    """
    if df.empty:
        # Nothing to scan; also avoids selecting a column that may not exist.
        contains_blank = []
    else:
        texts = df.iloc[:, -1] if column is None else df[column]
        # strip() leaves nothing behind for both "" and "   "; isspace() alone
        # is False for the empty string, so empty reviews would be missed.
        contains_blank = [
            index
            for index, review in texts.items()
            if isinstance(review, str) and not review.strip()
        ]
    print(f"This df contains {len(contains_blank)} blanks")
    return contains_blank

In [42]:
contain_blanks = detect_blanks(df)

This df contains 27 blanks


In [43]:
contain_blanks[:5]

[57, 71, 147, 151, 283]

In [44]:
# `contain_blanks` stores index labels, and the index still has gaps left by
# the earlier dropna(), so look the row up by label (.loc) rather than by
# position (.iloc).
df.loc[57]

label     neg
review       
Name: 57, dtype: object

In [45]:
# `contain_blanks` holds index labels, so drop those rows by label.
df = df.drop(index=contain_blanks)

In [46]:
len(df)

1938

In [47]:
detect_blanks(df)

This df contains 0 blanks


[]

In [48]:
df['label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# Model input is the raw review text; the value to predict is its pos/neg label.
X, y = df['review'], df['label']

In [54]:
X_train.shape

(1356,)

In [55]:
X_test.shape

(582,)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).e

In [57]:
def create_model(vectorizer, model, X_train, y_train):
    """Build a two-step text classification pipeline and fit it.

    The pipeline runs ``vectorizer`` (step name ``'vectorizer'``) and then
    ``model`` (step name ``'clf'``).

    Parameters
    ----------
    vectorizer : estimator
        First pipeline step, e.g. ``TfidfVectorizer()``.
    model : estimator
        Final pipeline step, e.g. ``MultinomialNB()`` or ``LinearSVC()``.
    X_train, y_train
        Training inputs and labels passed to ``Pipeline.fit``.

    Returns
    -------
    Pipeline
        The fitted pipeline.
    """
    steps = [('vectorizer', vectorizer), ('clf', model)]
    fitted_pipeline = Pipeline(steps=steps)
    fitted_pipeline.fit(X_train, y_train)
    return fitted_pipeline

In [58]:
text_clf_nb = create_model(TfidfVectorizer(), MultinomialNB(),X_train,y_train)

In [60]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [61]:
def test_model(model, X_test, y_test):
    """Print evaluation metrics for ``model`` on a held-out set.

    Predicts on ``X_test`` and prints, in order, the confusion matrix, the
    classification report and the accuracy as a percentage with two
    decimals. Each metric is called as ``(y_test, prediction)``.

    Parameters
    ----------
    model
        Fitted object exposing ``predict``.
    X_test, y_test
        Held-out inputs and their true labels.

    Returns
    -------
    None
        Results are only printed.
    """
    predicted = model.predict(X_test)
    print(confusion_matrix(y_test, predicted))
    print(classification_report(y_test, predicted))
    accuracy_pct = accuracy_score(y_test, predicted) * 100
    print(f"Accuracy {accuracy_pct:.2f}%")

In [63]:
test_model(text_clf_nb,X_test, y_test)

[[259  23]
 [102 198]]
              precision    recall  f1-score   support

         neg       0.72      0.92      0.81       282
         pos       0.90      0.66      0.76       300

   micro avg       0.79      0.79      0.79       582
   macro avg       0.81      0.79      0.78       582
weighted avg       0.81      0.79      0.78       582

Accuracy 78.52%


In [64]:
text_clf_lsvc = create_model(TfidfVectorizer(), LinearSVC(), X_train, y_train)
test_model(text_clf_lsvc, X_test, y_test)

[[235  47]
 [ 41 259]]
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

   micro avg       0.85      0.85      0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582

Accuracy 84.88%


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


In [65]:
from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

frozenset({'enough', 'are', 'and', 'nobody', 'hereafter', 'yourself', 'then', 'yourselves', 'ourselves', 'never', 'three', 'namely', 'however', 'every', 'system', 'her', 'mostly', 'of', 'me', 'where', 'seem', 'side', 'yet', 'becoming', 'whole', 'it', 'mine', 'couldnt', 'some', 'only', 'how', 'more', 'along', 'somewhere', 'up', 'though', 'my', 'himself', 'most', 'next', 'you', 'any', 'therefore', 'into', 'by', 'there', 'out', 'such', 'forty', 'keep', 'sixty', 'although', 'yours', 'thus', 'sometimes', 'be', 'she', 'thereafter', 'he', 'give', 'full', 'un', 'hundred', 'sincere', 'their', 'please', 'might', 'within', 'herself', 'between', 'hence', 'call', 'again', 'him', 'always', 'can', 'fifty', 'our', 'empty', 'hereupon', 'from', 'around', 'another', 'eight', 'via', 'together', 'top', 'wherever', 'would', 'once', 'eleven', 'had', 'off', 'name', 'what', 'whose', 'being', 'for', 'before', 'nevertheless', 'toward', 'during', 'seeming', 'other', 'they', 'beyond', 'twenty', 'against', 'due', '

In [66]:
# Custom stop-word list handed to TfidfVectorizer(stop_words=...) further down.
# Line continuations are implicit inside brackets, so no backslashes are
# needed; entries are grouped by first letter for easier scanning.
stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at',
    'be', 'been', 'but', 'by',
    'can',
    'even', 'ever',
    'for', 'from',
    'get',
    'had', 'has', 'have', 'he', 'her', 'hers', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'me', 'my',
    'of', 'on', 'or',
    'see', 'seen', 'she', 'so',
    'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with',
    'you',
]

In [73]:
import time

In [76]:
def train_test_model(vectorizer, model, X_train, y_train, X_test, y_test):
    """Fit a pipeline, print its evaluation and report how long both took.

    Fits via ``create_model``, prints the metrics via ``test_model`` and then
    prints the elapsed time, which covers both training and evaluation.

    Parameters
    ----------
    vectorizer, model
        Pipeline steps forwarded to ``create_model``.
    X_train, y_train
        Training inputs and labels.
    X_test, y_test
        Held-out inputs and labels used for the printed evaluation.

    Returns
    -------
    The fitted pipeline returned by ``create_model``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    text_clf = create_model(vectorizer, model, X_train, y_train)
    test_model(text_clf, X_test, y_test)
    print(f"Time taken: {time.perf_counter()-started:.2f} seconds")
    return text_clf

In [77]:
text_clf = train_test_model(TfidfVectorizer(stop_words=stopwords), MultinomialNB(),X_train, y_train, X_test, y_test)

[[249  33]
 [ 78 222]]
              precision    recall  f1-score   support

         neg       0.76      0.88      0.82       282
         pos       0.87      0.74      0.80       300

   micro avg       0.81      0.81      0.81       582
   macro avg       0.82      0.81      0.81       582
weighted avg       0.82      0.81      0.81       582

Accuracy 80.93%
Time taken: 0.80 seconds


In [81]:
text_clf = train_test_model(TfidfVectorizer(), MultinomialNB(),X_train, y_train, X_test, y_test)

[[259  23]
 [102 198]]
              precision    recall  f1-score   support

         neg       0.72      0.92      0.81       282
         pos       0.90      0.66      0.76       300

   micro avg       0.79      0.79      0.79       582
   macro avg       0.81      0.79      0.78       582
weighted avg       0.81      0.79      0.78       582

Accuracy 78.52%
Time taken: 0.82 seconds


In [82]:
text_clf = train_test_model(TfidfVectorizer(stop_words=stopwords), LinearSVC(),X_train, y_train, X_test, y_test)

[[233  49]
 [ 43 257]]
              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       282
         pos       0.84      0.86      0.85       300

   micro avg       0.84      0.84      0.84       582
   macro avg       0.84      0.84      0.84       582
weighted avg       0.84      0.84      0.84       582

Accuracy 84.19%
Time taken: 0.84 seconds


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


In [80]:
text_clf = train_test_model(TfidfVectorizer(), LinearSVC(),X_train, y_train, X_test, y_test)

[[235  47]
 [ 41 259]]
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

   micro avg       0.85      0.85      0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582

Accuracy 84.88%
Time taken: 0.85 seconds


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)
