In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
# Load Data
# Read the movie-review CSV (path is relative to the notebook's working directory)
# into a DataFrame; the later cells show it has a `label` and a `review` column.
df = pd.read_csv("../Data/moviereviews.csv")

In [3]:
# Summarise the frame: row count, column dtypes and non-null counts per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [4]:
# Preview the first rows to see what the labels and review texts look like
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [5]:
# List the distinct target classes present in the label column
df['label'].unique()

array(['neg', 'pos'], dtype=object)

In [6]:
# Number of missing (NaN) entries in each column of the raw data
df.isna().sum()

label      0
review    35
dtype: int64

In [7]:
# Check the class balance: how many reviews carry each label
df['label'].value_counts()

label
neg    1000
pos    1000
Name: count, dtype: int64

## Cleaning and Preprocessing Data

In [8]:
# Show the rows whose review text is missing (NaN)
df[df['review'].isna()]

Unnamed: 0,label,review
140,pos,
208,pos,
270,neg,
334,neg,
448,neg,
522,neg,
606,pos,
696,neg,
728,pos,
738,neg,


In [9]:
# Discard every row that contains a missing value; per the null counts above,
# only the `review` column has NaN entries, so this removes the NaN reviews.
df = df.dropna(axis="index", how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1965 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1965 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 46.1+ KB


In [10]:
# Confirm that no missing (NaN) values remain in any column
df.isna().sum()

label     0
review    0
dtype: int64

In [11]:
# Count reviews that carry no text: the empty string "" as well as strings made
# up only of whitespace (" ", "\n", ...). `str.isspace()` alone returns False
# for "", so strip the whitespace first and compare against the empty string.
df['review'].str.strip().eq('').sum()

np.int64(27)

In [12]:
# Remove reviews with no text. Stripping and comparing with '' drops both ""
# and whitespace-only strings; `str.isspace()` would let "" through because
# "".isspace() is False.
df = df[df['review'].str.strip().ne('')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1938 non-null   object
 1   review  1938 non-null   object
dtypes: object(2)
memory usage: 45.4+ KB


## Train/Test Split and Model Training and Evaluation

In [14]:
# Train/Test Split
from sklearn.model_selection import train_test_split

# Features are the raw review texts; the target is the sentiment label
X, y = df['review'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [25]:
# Import Basic ML models to train and evaluate data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Import evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [20]:
# Turn the review texts into TF-IDF (term frequency - inverse document frequency)
# features, ignoring English stop words.
tf_idf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the test reviews.
X_train_tfidf = tf_idf.fit_transform(raw_documents=X_train)
X_test_tfidf = tf_idf.transform(raw_documents=X_test)

In [21]:
# 1. MultinomialNB
# Create a Multinomial Naive Bayes classifier with its default settings and
# fit it on the TF-IDF training features and training labels.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

In [None]:
# Predict labels with the Multinomial Naive Bayes model for the train and test
# features; the metrics are computed from these predictions in the next cells.
y_train_nb_pred, y_test_nb_pred = (
    nb_model.predict(X_train_tfidf),
    nb_model.predict(X_test_tfidf),
)

In [27]:
# Naive Bayes on the training split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_train, y_pred=y_train_nb_pred)
conf_metrix = confusion_matrix(y_true=y_train, y_pred=y_train_nb_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_train, y_pred=y_train_nb_pred),
)

Accuracy Score is:  0.9722580645161291
Confusion Metrics is: 
 [[767  11]
 [ 32 740]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.96      0.99      0.97       778
         pos       0.99      0.96      0.97       772

    accuracy                           0.97      1550
   macro avg       0.97      0.97      0.97      1550
weighted avg       0.97      0.97      0.97      1550



In [28]:
# Naive Bayes on the held-out test split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_test, y_pred=y_test_nb_pred)
conf_metrix = confusion_matrix(y_true=y_test, y_pred=y_test_nb_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_test, y_pred=y_test_nb_pred),
)

Accuracy Score is:  0.8118556701030928
Confusion Metrics is: 
 [[166  25]
 [ 48 149]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.78      0.87      0.82       191
         pos       0.86      0.76      0.80       197

    accuracy                           0.81       388
   macro avg       0.82      0.81      0.81       388
weighted avg       0.82      0.81      0.81       388



In [29]:
# 2. LogisticRegression
# Logistic regression with a fixed random_state (otherwise default settings).
log_model = LogisticRegression(random_state=101)

# Fit on the TF-IDF training features and training labels
log_model.fit(X_train_tfidf, y_train)

In [47]:
# Logistic Regression predictions for the train and test features
y_train_log_pred, y_test_log_pred = (
    log_model.predict(X_train_tfidf),
    log_model.predict(X_test_tfidf),
)

In [48]:
# Logistic Regression on the training split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_train, y_pred=y_train_log_pred)
conf_metrix = confusion_matrix(y_true=y_train, y_pred=y_train_log_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_train, y_pred=y_train_log_pred),
)

Accuracy Score is:  0.983225806451613
Confusion Metrics is: 
 [[768  10]
 [ 16 756]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.98      0.99      0.98       778
         pos       0.99      0.98      0.98       772

    accuracy                           0.98      1550
   macro avg       0.98      0.98      0.98      1550
weighted avg       0.98      0.98      0.98      1550



In [49]:
# Logistic Regression on the held-out test split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_test, y_pred=y_test_log_pred)
conf_metrix = confusion_matrix(y_true=y_test, y_pred=y_test_log_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_test, y_pred=y_test_log_pred),
)

Accuracy Score is:  0.8221649484536082
Confusion Metrics is: 
 [[163  28]
 [ 41 156]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.80      0.85      0.83       191
         pos       0.85      0.79      0.82       197

    accuracy                           0.82       388
   macro avg       0.82      0.82      0.82       388
weighted avg       0.82      0.82      0.82       388



In [33]:
# 3. DecisionTree
# Decision tree classifier with a fixed random_state (otherwise default settings).
decision_tree = DecisionTreeClassifier(random_state=101)

# Fit on the TF-IDF training features and training labels
decision_tree.fit(X_train_tfidf, y_train)

In [50]:
# Decision Tree predictions for the train and test features
y_train_decision_pred, y_test_decision_pred = (
    decision_tree.predict(X_train_tfidf),
    decision_tree.predict(X_test_tfidf),
)

In [51]:
# Decision Tree on the training split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_train, y_pred=y_train_decision_pred)
conf_metrix = confusion_matrix(y_true=y_train, y_pred=y_train_decision_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_train, y_pred=y_train_decision_pred),
)

Accuracy Score is:  1.0
Confusion Metrics is: 
 [[778   0]
 [  0 772]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       778
         pos       1.00      1.00      1.00       772

    accuracy                           1.00      1550
   macro avg       1.00      1.00      1.00      1550
weighted avg       1.00      1.00      1.00      1550



In [52]:
# Decision Tree on the held-out test split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_test, y_pred=y_test_decision_pred)
conf_metrix = confusion_matrix(y_true=y_test, y_pred=y_test_decision_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_test, y_pred=y_test_decision_pred),
)

Accuracy Score is:  0.6417525773195877
Confusion Metrics is: 
 [[125  66]
 [ 73 124]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.63      0.65      0.64       191
         pos       0.65      0.63      0.64       197

    accuracy                           0.64       388
   macro avg       0.64      0.64      0.64       388
weighted avg       0.64      0.64      0.64       388



* Based on the above evaluation results from Naive Bayes, Logistic Regression, and Decision Tree, it can be concluded that each model performs well on the training data but does not achieve a comparably good accuracy score on the test data (e.g., the Decision Tree scores 1.00 on the training data but only 0.64 on the test data), which indicates overfitting. Hence, I'll further explore advanced models for training and testing.

In [37]:
# Import advanced models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [38]:
# 1. SVC
# Support vector classifier with a fixed random_state (otherwise default settings).
svc_model = SVC(random_state=101)

# Fit on the TF-IDF training features and training labels
svc_model.fit(X_train_tfidf, y_train)

In [53]:
# SVC predictions for the train and test features
y_train_svc_pred, y_test_svc_pred = (
    svc_model.predict(X_train_tfidf),
    svc_model.predict(X_test_tfidf),
)

In [54]:
# SVC on the training split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_train, y_pred=y_train_svc_pred)
conf_metrix = confusion_matrix(y_true=y_train, y_pred=y_train_svc_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_train, y_pred=y_train_svc_pred),
)

Accuracy Score is:  1.0
Confusion Metrics is: 
 [[778   0]
 [  0 772]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       778
         pos       1.00      1.00      1.00       772

    accuracy                           1.00      1550
   macro avg       1.00      1.00      1.00      1550
weighted avg       1.00      1.00      1.00      1550



In [55]:
# SVC on the held-out test split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_test, y_pred=y_test_svc_pred)
conf_metrix = confusion_matrix(y_true=y_test, y_pred=y_test_svc_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_test, y_pred=y_test_svc_pred),
)

Accuracy Score is:  0.8273195876288659
Confusion Metrics is: 
 [[161  30]
 [ 37 160]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.81      0.84      0.83       191
         pos       0.84      0.81      0.83       197

    accuracy                           0.83       388
   macro avg       0.83      0.83      0.83       388
weighted avg       0.83      0.83      0.83       388



In [42]:
# 2. RandomForest
# Random forest classifier with a fixed random_state (otherwise default settings).
randomForest_model = RandomForestClassifier(random_state=101)

# Fit on the TF-IDF training features and training labels
randomForest_model.fit(X_train_tfidf, y_train)

In [56]:
# Random Forest predictions for the train and test features
y_train_randomForest_pred, y_test_randomForest_pred = (
    randomForest_model.predict(X_train_tfidf),
    randomForest_model.predict(X_test_tfidf),
)

In [57]:
# Random Forest on the training split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_train, y_pred=y_train_randomForest_pred)
conf_metrix = confusion_matrix(y_true=y_train, y_pred=y_train_randomForest_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_train, y_pred=y_train_randomForest_pred),
)

Accuracy Score is:  1.0
Confusion Metrics is: 
 [[778   0]
 [  0 772]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       778
         pos       1.00      1.00      1.00       772

    accuracy                           1.00      1550
   macro avg       1.00      1.00      1.00      1550
weighted avg       1.00      1.00      1.00      1550



In [58]:
# Random Forest on the held-out test split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_test, y_pred=y_test_randomForest_pred)
conf_metrix = confusion_matrix(y_true=y_test, y_pred=y_test_randomForest_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_test, y_pred=y_test_randomForest_pred),
)

Accuracy Score is:  0.7860824742268041
Confusion Metrics is: 
 [[165  26]
 [ 57 140]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.74      0.86      0.80       191
         pos       0.84      0.71      0.77       197

    accuracy                           0.79       388
   macro avg       0.79      0.79      0.79       388
weighted avg       0.79      0.79      0.78       388



In [46]:
# 3. KNN
# K-nearest-neighbours classifier with its default settings.
knn_model = KNeighborsClassifier()

# Fit on the TF-IDF training features and training labels
knn_model.fit(X_train_tfidf, y_train)

In [61]:
# KNN predictions for the train and test features
y_train_knn_pred, y_test_knn_pred = (
    knn_model.predict(X_train_tfidf),
    knn_model.predict(X_test_tfidf),
)

In [63]:
# KNN on the training split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_train, y_pred=y_train_knn_pred)
conf_metrix = confusion_matrix(y_true=y_train, y_pred=y_train_knn_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_train, y_pred=y_train_knn_pred),
)

Accuracy Score is:  0.8032258064516129
Confusion Metrics is: 
 [[597 181]
 [124 648]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.83      0.77      0.80       778
         pos       0.78      0.84      0.81       772

    accuracy                           0.80      1550
   macro avg       0.80      0.80      0.80      1550
weighted avg       0.80      0.80      0.80      1550



In [64]:
# KNN on the held-out test split: accuracy, confusion matrix, per-class report
accu_score = accuracy_score(y_true=y_test, y_pred=y_test_knn_pred)
conf_metrix = confusion_matrix(y_true=y_test, y_pred=y_test_knn_pred)

print("Accuracy Score is: ", accu_score)
print("Confusion Metrics is: \n", conf_metrix)
print(
    "Classification Report is: \n",
    classification_report(y_true=y_test, y_pred=y_test_knn_pred),
)

Accuracy Score is:  0.6726804123711341
Confusion Metrics is: 
 [[119  72]
 [ 55 142]]
Classification Report is: 
               precision    recall  f1-score   support

         neg       0.68      0.62      0.65       191
         pos       0.66      0.72      0.69       197

    accuracy                           0.67       388
   macro avg       0.67      0.67      0.67       388
weighted avg       0.67      0.67      0.67       388



### Based on the training and evaluation with different models, it can be concluded that SVC has the best test accuracy, followed by Logistic Regression and Naive Bayes. The test accuracy (in %, rounded) for each model is as below

1. SVC = 83%
2. Logistic Regression = 82%
3. Naive Bayes (Multinomial) = 81%