## Data Loading

In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
review = pd.read_csv('Movie_Reviews_100.csv')

In [4]:
#label data
review['sentiment']= 'negative'
review.loc[review['Rating (Out of 10)']>5,'sentiment'] ='positive'

In [5]:
review['sentiment'].value_counts()

positive    113714
negative     13935
Name: sentiment, dtype: int64

## Data Cleaning 

In [None]:
%%time
#remove special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
review['Review']=review['Review'].apply(remove_special_characters)

In [None]:
%%time
#stemming
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

review['stems']=review['Review'].apply(simple_stemmer)

In [None]:
%%time
from nltk.corpus import stopwords
review['stems'] = review['stems'].str.split().apply(lambda x: [word for word in x if word not in stopwords.words('english')])

In [None]:
review['stems'] = review['stems'].apply(lambda x: ' '.join(x))

## Feature Engineering - Run everything in this section

### Undersampling

In [6]:
##Undersampling
from imblearn.under_sampling import NearMiss, RandomUnderSampler
naive_under_sample = RandomUnderSampler(sampling_strategy = 'majority')

In [7]:
%%time
#BOW
from sklearn.model_selection import train_test_split
cv=CountVectorizer(min_df=2,max_df=0.95,binary=False,ngram_range=(1,3))
cv_reviews=cv.fit_transform(review['stems'].values.astype('U'))
cv_X, cv_y = naive_under_sample.fit_resample(cv_reviews, review['sentiment'])
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(cv_X, cv_y, test_size=0.2, random_state=42)


CPU times: user 51.9 s, sys: 4.44 s, total: 56.3 s
Wall time: 56.8 s


In [8]:
%%time
#tfidf
naive_under_sample2 = RandomUnderSampler(sampling_strategy = 'majority')
tv=TfidfVectorizer(min_df=2,max_df=0.95,use_idf=True,ngram_range=(1,3))
tv_reviews=tv.fit_transform(review['stems'].values.astype('U'))
tv_X, tv_y = naive_under_sample2.fit_resample(tv_reviews, review['sentiment'])
X_train_tv, X_test_tv, y_train_tv, y_test_tv = train_test_split(tv_X, tv_y, test_size=0.2, random_state=42)

CPU times: user 59.4 s, sys: 5.96 s, total: 1min 5s
Wall time: 1min 7s


### Oversampling

In [9]:
%%time
#BOW
from imblearn.over_sampling import SMOTE
cv_smote_over_sample = SMOTE(sampling_strategy='minority')
cv_over_X, cv_over_y = cv_smote_over_sample.fit_resample(cv_reviews, review['sentiment'])
X_train_cv_o, X_test_cv_o, y_train_cv_o, y_test_cv_o = train_test_split(cv_over_X, cv_over_y, test_size=0.2, random_state=42)


CPU times: user 12.4 s, sys: 2.23 s, total: 14.6 s
Wall time: 14.7 s


In [10]:
%%time
#tdidf
tv_smote_over_sample = SMOTE(sampling_strategy='minority')
tv_over_X, tv_over_y = tv_smote_over_sample.fit_resample(tv_reviews, review['sentiment'])
X_train_tv_o, X_test_tv_o, y_train_tv_o, y_test_tv_o = train_test_split(tv_over_X, tv_over_y, test_size=0.2, random_state=42)

CPU times: user 15.1 s, sys: 3.82 s, total: 18.9 s
Wall time: 19.2 s


## Model Building

### Logistic Regression

In [11]:
%%time
lr_cv_u=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_cv_o=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_tv_u=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_tv_o=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

#Fitting the model for Bag of words
lr_bow_u=lr_cv_u.fit(X_train_cv,y_train_cv)
lr_bow_o=lr_cv_o.fit(X_train_cv_o,y_train_cv_o)

#Fitting the model for tfidf features
lr_tfidf_u=lr_tv_u.fit(X_train_tv,y_train_tv)
lr_tfidf_o=lr_tv_o.fit(X_train_tv_o,y_train_tv_o)


CPU times: user 38min 31s, sys: 31.3 s, total: 39min 2s
Wall time: 7min 40s


In [12]:
#Predicting the model for bag of words

lr_bow_u_predict=lr_cv_u.predict(X_test_cv)
lr_bow_o_predict=lr_cv_o.predict(X_test_cv_o)

##Predicting the model for tfidf features
lr_tfidf_u_predict=lr_tv_u.predict(X_test_tv)
lr_tfidf_o_predict=lr_tv_o.predict(X_test_tv_o)


In [13]:
print("lr_bow_u_score :",accuracy_score(y_test_cv,lr_bow_u_predict))
print("lr_bow_o_score :",accuracy_score(y_test_cv_o,lr_bow_o_predict))
print("lr_tfidf_u_score :",accuracy_score(y_test_tv,lr_tfidf_u_predict))
print("lr_tfidf_o_score :",accuracy_score(y_test_tv_o,lr_tfidf_o_predict))

lr_bow_u_score : 0.8754933620380337
lr_bow_o_score : 0.9401354262850108
lr_tfidf_u_score : 0.8742375313957661
lr_tfidf_o_score : 0.9408609242404257


In [14]:
lr_bow_u_report=classification_report(y_test_cv,lr_bow_u_predict,target_names=['positive','negative'])
print(lr_bow_u_report)

lr_bow_o_report=classification_report(y_test_cv_o,lr_bow_o_predict,target_names=['positive','negative'])
print(lr_bow_o_report)

lr_tfidf_u_report=classification_report(y_test_tv,lr_tfidf_u_predict,target_names=['positive','negative'])
print(lr_tfidf_u_report)

lr_tfidf_o_report=classification_report(y_test_tv_o,lr_tfidf_o_predict,target_names=['positive','negative'])
print(lr_tfidf_o_report)

              precision    recall  f1-score   support

    positive       0.88      0.87      0.88      2796
    negative       0.87      0.88      0.88      2778

    accuracy                           0.88      5574
   macro avg       0.88      0.88      0.88      5574
weighted avg       0.88      0.88      0.88      5574

              precision    recall  f1-score   support

    positive       0.93      0.96      0.94     22881
    negative       0.95      0.92      0.94     22605

    accuracy                           0.94     45486
   macro avg       0.94      0.94      0.94     45486
weighted avg       0.94      0.94      0.94     45486

              precision    recall  f1-score   support

    positive       0.88      0.87      0.87      2796
    negative       0.87      0.88      0.87      2778

    accuracy                           0.87      5574
   macro avg       0.87      0.87      0.87      5574
weighted avg       0.87      0.87      0.87      5574

              preci

### Support Vector Machine

In [15]:
%%time
svm_cv_u=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
svm_cv_o=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
svm_tv_u=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
svm_tv_o=SGDClassifier(loss='hinge',max_iter=500,random_state=42)

#Fitting the model for Bag of words
svm_bow_u=svm_cv_u.fit(X_train_cv,y_train_cv)
svm_bow_o=svm_cv_o.fit(X_train_cv_o,y_train_cv_o)

#Fitting the model for tfidf features
svm_tfidf_u=svm_tv_u.fit(X_train_tv,y_train_tv)
svm_tfidf_o=svm_tv_o.fit(X_train_tv_o,y_train_tv_o)


CPU times: user 14.3 s, sys: 820 ms, total: 15.2 s
Wall time: 9.48 s


In [16]:
#Predicting the model for bag of words

svm_bow_u_predict=svm_cv_u.predict(X_test_cv)
svm_bow_o_predict=svm_cv_o.predict(X_test_cv_o)

##Predicting the model for tfidf features
svm_tfidf_u_predict=svm_tv_u.predict(X_test_tv)
svm_tfidf_o_predict=svm_tv_o.predict(X_test_tv_o)


In [17]:
print("svm_bow_u_score :",accuracy_score(y_test_cv,svm_bow_u_predict))
print("svm_bow_o_score :",accuracy_score(y_test_cv_o,svm_bow_o_predict))
print("svm_tfidf_u_score :",accuracy_score(y_test_tv,svm_tfidf_u_predict))
print("svm_tfidf_o_score :",accuracy_score(y_test_tv_o,svm_tfidf_o_predict))

svm_bow_u_score : 0.8697524219590959
svm_bow_o_score : 0.9383766433627929
svm_tfidf_u_score : 0.877108001435235
svm_tfidf_o_score : 0.8970892142637295


In [18]:
svm_bow_u_report=classification_report(y_test_cv,svm_bow_u_predict,target_names=['positive','negative'])
print(svm_bow_u_report)

svm_bow_o_report=classification_report(y_test_cv_o,svm_bow_o_predict,target_names=['positive','negative'])
print(svm_bow_o_report)

svm_tfidf_u_report=classification_report(y_test_tv,svm_tfidf_u_predict,target_names=['positive','negative'])
print(svm_tfidf_u_report)

svm_tfidf_o_report=classification_report(y_test_tv_o,svm_tfidf_o_predict,target_names=['positive','negative'])
print(svm_tfidf_o_report)

              precision    recall  f1-score   support

    positive       0.88      0.86      0.87      2796
    negative       0.86      0.88      0.87      2778

    accuracy                           0.87      5574
   macro avg       0.87      0.87      0.87      5574
weighted avg       0.87      0.87      0.87      5574

              precision    recall  f1-score   support

    positive       0.92      0.96      0.94     22881
    negative       0.96      0.92      0.94     22605

    accuracy                           0.94     45486
   macro avg       0.94      0.94      0.94     45486
weighted avg       0.94      0.94      0.94     45486

              precision    recall  f1-score   support

    positive       0.88      0.88      0.88      2796
    negative       0.87      0.88      0.88      2778

    accuracy                           0.88      5574
   macro avg       0.88      0.88      0.88      5574
weighted avg       0.88      0.88      0.88      5574

              preci

### Multinomial Naive Bayes

In [19]:
%%time
mnb_cv_u=MultinomialNB()
mnb_cv_o=MultinomialNB()
mnb_tv_u=MultinomialNB()
mnb_tv_o=MultinomialNB()

#Fitting the model for Bag of words
mnb_bow_u=mnb_cv_u.fit(X_train_cv,y_train_cv)
mnb_bow_o=mnb_cv_o.fit(X_train_cv_o,y_train_cv_o)

#Fitting the model for tfidf features
mnb_tfidf_u=mnb_tv_u.fit(X_train_tv,y_train_tv)
mnb_tfidf_o=mnb_tv_o.fit(X_train_tv_o,y_train_tv_o)

CPU times: user 2.65 s, sys: 144 ms, total: 2.79 s
Wall time: 2.8 s


In [20]:
#Predicting the model for bag of words

mnb_bow_u_predict=mnb_cv_u.predict(X_test_cv)
mnb_bow_o_predict=mnb_cv_o.predict(X_test_cv_o)

##Predicting the model for tfidf features
mnb_tfidf_u_predict=mnb_tv_u.predict(X_test_tv)
mnb_tfidf_o_predict=mnb_tv_o.predict(X_test_tv_o)


In [21]:
print("mnb_bow_u_score :",accuracy_score(y_test_cv,mnb_bow_u_predict))
print("mnb_bow_o_score :",accuracy_score(y_test_cv_o,mnb_bow_o_predict))
print("mnb_tfidf_u_score :",accuracy_score(y_test_tv,mnb_tfidf_u_predict))
print("mnb_tfidf_o_score :",accuracy_score(y_test_tv_o,mnb_tfidf_o_predict))

mnb_bow_u_score : 0.8711876569788303
mnb_bow_o_score : 0.8585938530536869
mnb_tfidf_u_score : 0.8783638320775027
mnb_tfidf_o_score : 0.9601415820252386


In [22]:
mnb_bow_u_report=classification_report(y_test_cv,mnb_bow_u_predict,target_names=['positive','negative'])
print(mnb_bow_u_report)

mnb_bow_o_report=classification_report(y_test_cv_o,mnb_bow_o_predict,target_names=['positive','negative'])
print(mnb_bow_o_report)

mnb_tfidf_u_report=classification_report(y_test_tv,mnb_tfidf_u_predict,target_names=['positive','negative'])
print(mnb_tfidf_u_report)

mnb_tfidf_o_report=classification_report(y_test_tv_o,mnb_tfidf_o_predict,target_names=['positive','negative'])
print(mnb_tfidf_o_report)

              precision    recall  f1-score   support

    positive       0.85      0.90      0.87      2796
    negative       0.89      0.85      0.87      2778

    accuracy                           0.87      5574
   macro avg       0.87      0.87      0.87      5574
weighted avg       0.87      0.87      0.87      5574

              precision    recall  f1-score   support

    positive       0.93      0.77      0.85     22881
    negative       0.81      0.94      0.87     22605

    accuracy                           0.86     45486
   macro avg       0.87      0.86      0.86     45486
weighted avg       0.87      0.86      0.86     45486

              precision    recall  f1-score   support

    positive       0.85      0.92      0.88      2796
    negative       0.91      0.84      0.87      2778

    accuracy                           0.88      5574
   macro avg       0.88      0.88      0.88      5574
weighted avg       0.88      0.88      0.88      5574

              preci

## XGBoost

In [18]:
from xgboost import XGBClassifier

In [19]:
%%time
xbg_cv_u=XGBClassifier(random_state=42,learning_rate=0.9)
xbg_cv_o=XGBClassifier(random_state=42,learning_rate=0.9)
xbg_tv_u=XGBClassifier(random_state=42,learning_rate=0.9)
xbg_tv_o=XGBClassifier(random_state=42,learning_rate=0.9)

#Fitting the model for Bag of words
xbg_bow_u=xbg_cv_u.fit(X_train_cv,y_train_cv)
xbg_bow_o=xbg_cv_o.fit(X_train_cv_o,y_train_cv_o)

#Fitting the model for tfidf features
xbg_tfidf_u=xbg_tv_u.fit(X_train_tv,y_train_tv)
xbg_tfidf_o=xbg_tv_o.fit(X_train_tv_o,y_train_tv_o)

CPU times: user 1h 1min 58s, sys: 1min 16s, total: 1h 3min 15s
Wall time: 9min 2s


In [20]:
#Predicting the model for bag of words
xbg_bow_u_predict=xbg_cv_u.predict(X_test_cv)
xbg_bow_o_predict=xbg_cv_o.predict(X_test_cv_o)

##Predicting the model for tfidf features
xbg_tfidf_u_predict=xbg_tv_u.predict(X_test_tv)
xbg_tfidf_o_predict=xbg_tv_o.predict(X_test_tv_o)

In [21]:
print("xbg_bow_u_score :",accuracy_score(y_test_cv,xbg_bow_u_predict))
print("xbg_bow_o_score :",accuracy_score(y_test_cv_o,xbg_bow_o_predict))
print("xbg_tfidf_u_score :",accuracy_score(y_test_tv,xbg_tfidf_u_predict))
print("xbg_tfidf_o_score :",accuracy_score(y_test_tv_o,xbg_tfidf_o_predict))

xbg_bow_u_score : 0.826157158234661
xbg_bow_o_score : 0.9535681308534494
xbg_tfidf_u_score : 0.8252601363473269
xbg_tfidf_o_score : 0.9331442641691949


In [22]:
xbg_bow_u_report=classification_report(y_test_cv,xbg_bow_u_predict,target_names=['positive','negative'])
print(xbg_bow_u_report)

xbg_bow_o_report=classification_report(y_test_cv_o,xbg_bow_o_predict,target_names=['positive','negative'])
print(xbg_bow_o_report)

xbg_tfidf_u_report=classification_report(y_test_tv,xbg_tfidf_u_predict,target_names=['positive','negative'])
print(xbg_tfidf_u_report)

xbg_tfidf_o_report=classification_report(y_test_tv_o,xbg_tfidf_o_predict,target_names=['positive','negative'])
print(xbg_tfidf_o_report)

              precision    recall  f1-score   support

    positive       0.83      0.82      0.83      2796
    negative       0.82      0.83      0.83      2778

    accuracy                           0.83      5574
   macro avg       0.83      0.83      0.83      5574
weighted avg       0.83      0.83      0.83      5574

              precision    recall  f1-score   support

    positive       0.97      0.94      0.95     22881
    negative       0.94      0.97      0.95     22605

    accuracy                           0.95     45486
   macro avg       0.95      0.95      0.95     45486
weighted avg       0.95      0.95      0.95     45486

              precision    recall  f1-score   support

    positive       0.83      0.82      0.83      2796
    negative       0.82      0.83      0.83      2778

    accuracy                           0.83      5574
   macro avg       0.83      0.83      0.83      5574
weighted avg       0.83      0.83      0.83      5574

              preci

## Decision Trees

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
%%time
dct_cv_u=DecisionTreeClassifier(random_state=42,criterion='entropy')
dct_cv_o=DecisionTreeClassifier(random_state=42,criterion='entropy')
dct_tv_u=DecisionTreeClassifier(random_state=42,criterion='entropy')
dct_tv_o=DecisionTreeClassifier(random_state=42,criterion='entropy')

#Fitting the model for Bag of words
dct_bow_u=dct_cv_u.fit(X_train_cv,y_train_cv)
dct_bow_o=dct_cv_o.fit(X_train_cv_o,y_train_cv_o)

#Fitting the model for tfidf features
dct_tfidf_u=dct_tv_u.fit(X_train_tv,y_train_tv)
dct_tfidf_o=dct_tv_o.fit(X_train_tv_o,y_train_tv_o)

CPU times: user 53min 13s, sys: 12.1 s, total: 53min 25s
Wall time: 53min 39s


In [25]:
#Predicting the model for bag of words
dct_bow_u_predict=dct_cv_u.predict(X_test_cv)
dct_bow_o_predict=dct_cv_o.predict(X_test_cv_o)

##Predicting the model for tfidf features
dct_tfidf_u_predict=dct_tv_u.predict(X_test_tv)
dct_tfidf_o_predict=dct_tv_o.predict(X_test_tv_o)

In [27]:
print("dct_bow_u_score :",accuracy_score(y_test_cv,dct_bow_u_predict))
print("dct_bow_o_score :",accuracy_score(y_test_cv_o,dct_bow_o_predict))
print("dct_tfidf_u_score :",accuracy_score(y_test_tv,dct_tfidf_u_predict))
print("dct_tfidf_o_score :",accuracy_score(y_test_tv_o,dct_tfidf_o_predict))

dct_bow_u_score : 0.7054180121994976
dct_bow_o_score : 0.8821175746383503
dct_tfidf_u_score : 0.7206673842841765
dct_tfidf_o_score : 0.8930440135426285


In [28]:
dct_bow_u_report=classification_report(y_test_cv,dct_bow_u_predict,target_names=['positive','negative'])
print(dct_bow_u_report)

dct_bow_o_report=classification_report(y_test_cv_o,dct_bow_o_predict,target_names=['positive','negative'])
print(dct_bow_o_report)

dct_tfidf_u_report=classification_report(y_test_tv,dct_tfidf_u_predict,target_names=['positive','negative'])
print(dct_tfidf_u_report)

dct_tfidf_o_report=classification_report(y_test_tv_o,dct_tfidf_o_predict,target_names=['positive','negative'])
print(dct_tfidf_o_report)

              precision    recall  f1-score   support

    positive       0.71      0.70      0.70      2796
    negative       0.70      0.71      0.71      2778

    accuracy                           0.71      5574
   macro avg       0.71      0.71      0.71      5574
weighted avg       0.71      0.71      0.71      5574

              precision    recall  f1-score   support

    positive       0.87      0.90      0.88     22881
    negative       0.90      0.86      0.88     22605

    accuracy                           0.88     45486
   macro avg       0.88      0.88      0.88     45486
weighted avg       0.88      0.88      0.88     45486

              precision    recall  f1-score   support

    positive       0.72      0.72      0.72      2796
    negative       0.72      0.72      0.72      2778

    accuracy                           0.72      5574
   macro avg       0.72      0.72      0.72      5574
weighted avg       0.72      0.72      0.72      5574

              preci