**Sentiment Analysis of Movie Reviews**

**Problem Statement:**

In this, we have to predict the number of positive and negative reviews based on sentiments by using different classification models.

**Import necessary libraries**

In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from tqdm import tqdm 
from spacy.lang.en.stop_words import STOP_WORDS
import os
import warnings
import multiprocessing

cpus = multiprocessing.cpu_count()
nlp = spacy.load('en_core_web_lg')
pool = multiprocessing.Pool(multiprocessing.cpu_count())

warnings.filterwarnings('ignore')
print(f'Using {multiprocessing.cpu_count()} CPU cores.')

Using 8 CPU cores.


**Import the training dataset**

In [2]:
#importing the training data
# imdb_data=pd.read_csv('data/train.csv')
imdb_data=pd.read_csv('data/processed_train.csv')
print(imdb_data.shape)
imdb_data.head(10)

(29341, 3)


Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0
5,24198,this isn't 'Bonnie and Clyde' or 'Thelma and L...,1
6,37654,I have to say that I really liked UNDER SIEGE ...,0
7,45340,Kramer Vs. Kramer is a near-heartening drama a...,1
8,9745,"Like the other comments says, this might be su...",0
9,55813,The tunes are the best aspect of this televisi...,1


In [3]:
#importing the testing data
# imdb_test_data=pd.read_csv('data/test.csv')
imdb_test_data=pd.read_csv('data/processed_test.csv')
print(imdb_test_data.shape)
imdb_test_data.head(10)

(29341, 2)


Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...
5,13418,"Why did they change the cute, Rugrats televisi..."
6,50674,Who will love my children has changed my heart...
7,33704,Im gonna make this short and sweet because i d...
8,21242,Though I never like to be the sort of person w...
9,40345,The main aspect about the Superstar's movies a...


**Exploratery data analysis**

In [4]:
#Summary of the dataset
imdb_data.describe()

Unnamed: 0,ID,sentiment
count,29341.0,29341.0
mean,29348.411097,0.509662
std,17002.074346,0.499915
min,4.0,0.0
25%,14564.0,0.0
50%,29348.0,1.0
75%,44162.0,1.0
max,58681.0,1.0


**Sentiment count**

In [5]:
#sentiment count
imdb_data['sentiment'].value_counts()

1    14954
0    14387
Name: sentiment, dtype: int64

We can see that the dataset is balanced.

**Spliting the training dataset**

In [6]:
#split the dataset  
#train dataset
train_reviews=imdb_data.review
train_sentiments=imdb_data.sentiment
#test dataset
test_reviews=imdb_test_data.review
# test_sentiments=imdb_test_data.sentiment
print(train_reviews.shape,train_sentiments.shape)
# print(test_reviews.shape,test_sentiments.shape)

(29341,) (29341,)


**Text normalization**

**Removing html strips and noise text**

In [7]:
class text_processing_pipeline():

    try:
        nlp = spacy.load('en_core_web_lg') # spacy model
    except:
        raise ValueError('spaCy en_core_web_lg model not found.')

    def __init__(self, text):
        self.text = text
        # self.text = " ".join([w.lemma_ for w in self.nlp(text)])
        self.nlp = nlp 
        self.tokens = None
        self.rm_tokens = None
        self.rm_text = None

     # Removing the html strips
    def strip_html(self):
        soup = BeautifulSoup(self.text, "html.parser")
        self.text = soup.get_text().lower()
        return self

    def text_preprocessor(self, remove_stopwords=True):
        """
        - Lowercase the sentence
        - Change "'t" to "not"
        - Remove "@name"
        - Isolate and remove punctuations except "?"
        - Remove other special characters
        - Remove stop words
        - Remove trailing whitespace
        """

        s = self.text

        # Change 't to 'not'
        s = re.sub(r"\'t", " not", s)
        # Remove @name
        s = re.sub(r'(@.*?)[\s]', ' ', s)
        # Isolate and remove punctuations except '?'
        s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
        s = re.sub(r'[^\w\s\?]', ' ', s)
        # Removing the square brackets
        s = re.sub('\[[^]]*\]', '', s)
        # Remove some special characters
        s = re.sub(r'([\;\:\|•«\n])', ' ', s)
        s = re.sub(r'[^a-zA-z0-9\s]','',s)
        # Remove trailing whitespace
        s = re.sub(r'\s+', ' ', s).strip()

        self.tokens = [str(t) for t in self.nlp(s)]

        if remove_stopwords:
            self.rm_tokens = [w for w in self.tokens if w not in STOP_WORDS]
            self.rm_text = " ".join([t for t in self.rm_tokens])
        else:
            pass

        return self

    # Stemming the text
    def simple_stemmer(self):
        ps = nltk.porter.PorterStemmer()
        self.text= ' '.join([ps.stem(word) for word in self.text.split()])
        return self

    @staticmethod
    def lemmatize(text, nlp):
        return (" ".join([w.lemma_ for w in nlp(text)]))

    def main_pipeline(self):
        return self.lemmatize(self.strip_html().text_preprocessor().get_rm_text(), nlp)

    def get_text(self):
        return self.text

    def get_tokens(self):
        return self.tokens

    def get_rm_text(self):
        return self.rm_text

    def __repr__(self):
        return f"text_processing_pipeline([text = '{self.text}'])"

In [9]:
print(imdb_data.loc[7,'review'])

Kramer Vs. Kramer is a near-heartening drama about shocking, drastic augmentations of the two subjects of a failed married couple. Meryl Streep, in the throes of her trademark maternal sensitivity, plays an unhappy stay-at-home mother who feels confined to such a role and within the first five minutes of the film leaves her inattentive husband, in a fantastic performance by Dustin Hoffman, to find another role for herself. Hoffman is dumbstruck, having absolutely no idea what to do with himself, having taken so much for granted that he doesn't know the first thing about getting his son to school in the morning.<br /><br />Hoffman seamlessly characterizes this husband as such a juicy load of setbacks. He is restless, relentless and impatient, but even though the positive side to those three adjectives should include just the opposite, he is unremittingly fixated on whatever he turns his head to. He's been focused on his career in advertising, and when he is left to raise his son Billy a

In [10]:
"""Usage Example"""
nlp = spacy.load('en_core_web_lg')

# text_processing_pipeline.lemmatize(text_processing_pipeline(imdb_data.loc[7,'review']).strip_html().text_preprocessor().get_rm_text(), nlp)
text_processing_pipeline(imdb_data.loc[7,'review']).main_pipeline()

'kramer vs kramer near hearten drama shock drastic augmentation subject fail marry couple meryl streep throe trademark maternal sensitivity play unhappy stay home mother feels confine role minute film leave inattentive husband fantastic performance dustin hoffman find role hoffman dumbstruck have absolutely idea having taken grant doesn know thing get son school morning hoffman seamlessly characterize husband juicy load setback restless relentless impatient positive adjective include opposite unremittingly fixated turn head s focused career advertising left raise son billy chaos usher immediately s throw temper tantrum quit angrily halfway activity awhile befriend neighbor joanna s friend play sexy jane alexander hoffman cool jet understand wife leave meantime boundless energy redirect raise billy lose job custody battle title brilliantly grey circumstance end little unmotivated subjectified audience line shoot witty screen write touch diminish magical 1970'

In [12]:
%%time
processed_text = []

for text in tqdm(imdb_data['review']):
    tmp = text_processing_pipeline(text).main_pipeline()
    processed_text.append(tmp)

imdb_data['processed_review'] = processed_text

# def mp_wrapper(text):
#     return text_processing_pipeline(text).main_pipeline()

# processed_text = list(map(mp_wrapper, imdb_data['review']))
# mp_wrapper(imdb_data.loc[7,'review'])
# processed_text = list(tqdm(pool.imap(mp_wrapper, imdb_data['review']), total=len(imdb_data['review'])))

# processed_text = list(tqdm(pool.imap(mp_wrapper, list(imdb_data.loc[0:3,'review'])), total=len(list(imdb_data.loc[0:3,'review']))))

100%|██████████| 29341/29341 [23:56<00:00, 20.42it/s]CPU times: user 23min 44s, sys: 3.36 s, total: 23min 47s
Wall time: 23min 56s



In [16]:
# imdb_data.to_csv('./data/processed_train.csv',index=0)
imdb_data

Unnamed: 0,ID,review,sentiment,processed_review
0,41411,I watched this film because I'm a big fan of R...,0,watch film m big fan river phoenix joaquin pho...
1,37586,It does not seem that this movie managed to pl...,1,movie manage lot people see place bump acciden...
2,6017,"Enough is not a bad movie , just mediocre .",0,bad movie mediocre
3,44656,my friend and i rented this one a few nights a...,0,friend rent night ago single good movie see me...
4,38711,"Just about everything in this movie is wrong, ...",0,movie wrong wrong wrong mike myers example s r...
...,...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1,s honest film hollywood
29337,453,An absorbing and unsettling psychological drama .,1,absorb unsettling psychological drama
29338,13097,"Soylent Green IS...a really good movie, actual...",1,soylent green good movie actually ve think don...
29339,26896,There just isn't enough here. There a few funn...,0,isn funny spot disappoint love stupid movie ex...


In [18]:
%%time
processed_text = []

for text in tqdm(imdb_test_data['review']):
    tmp = text_processing_pipeline(text).main_pipeline()
    processed_text.append(tmp)

100%|██████████| 29341/29341 [19:50<00:00, 24.65it/s]CPU times: user 19min 50s, sys: 2.13 s, total: 19min 52s
Wall time: 19min 50s



In [20]:
# imdb_test_data.to_csv('./data/processed_test.csv',index=0)
imdb_test_data['processed_review'] = processed_text

Unnamed: 0,ID,review,processed_review
0,22622,Robert Lansing plays a scientist experimenting...,robert lansing play scientist experiment pass ...
1,10162,"Well I've enjoy this movie, even though someti...",ve enjoy movie turn stereotypical situation nt...
2,17468,First things first - though I believe Joel Sch...,thing believe joel schumacher well mediocre di...
3,42579,I watched this movie on the grounds that Amber...,watch movie ground amber benson rock nick stah...
4,701,A certain sexiness underlines even the dullest...,certain sexiness underline dull tangent
...,...,...,...
29336,30370,It is difficult to rate a writer/director's fi...,difficult rate writer director s effort movie ...
29337,18654,"After watching this movie once, it quickly bec...",watch movie quickly favorite different event h...
29338,47985,"Even though i sat and watched the whole thing,...",sit watch thing good place big chunk informati...
29339,9866,Warning Spoilers following. Superb recreation ...,warn spoiler follow superb recreation base ant...


In [21]:
norm_train_reviews = imdb_data.review
norm_test_reviews=imdb_test_data.review

print(norm_train_reviews.shape)
print(norm_test_reviews.shape)

(29341,)
(29341,)


In [None]:
# https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671

**Bags of words model **

It is used to convert text documents to numerical vectors or bag of words.

In [22]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (29341, 4272351)
BOW_cv_test: (29341, 4272351)


**Term Frequency-Inverse Document Frequency model (TFIDF)**

It is used to convert text documents to  matrix of  tfidf features.

In [23]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (29341, 4272351)
Tfidf_test: (29341, 4272351)


**Labeling the sentiment text**

In [None]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)

**Split the sentiment tdata**

In [24]:
#Spliting the sentiment data
train_sentiments = imdb_data['sentiment']
test_sentiments = imdb_test_data['sentiment']
print(train_sentiments)
print(test_sentiments)

KeyError: 'sentiment'

**Modelling the dataset**

Let us build logistic regression model for both bag of words and tfidf features

In [25]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


**Logistic regression model performane on test dataset**

In [26]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[1 1 0 ... 1 1 0]
[1 1 0 ... 1 1 0]


**Accuracy of the model**

In [27]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

NameError: name 'test_sentiments' is not defined

**Print the classification report**

In [None]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

**Confusion matrix**

In [None]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

**Stochastic gradient descent or Linear support vector machines for bag of words and tfidf features**

In [None]:
#training the linear svm
svm=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)

**Model performance on test data**

In [None]:
#Predicting the model for bag of words
svm_bow_predict=svm.predict(cv_test_reviews)
print(svm_bow_predict)
#Predicting the model for tfidf features
svm_tfidf_predict=svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

**Accuracy of the model**

In [None]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(test_sentiments,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)
#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(test_sentiments,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

**Print the classification report**

In [None]:
#Classification report for bag of words 
svm_bow_report=classification_report(test_sentiments,svm_bow_predict,target_names=['Positive','Negative'])
print(svm_bow_report)
#Classification report for tfidf features
svm_tfidf_report=classification_report(test_sentiments,svm_tfidf_predict,target_names=['Positive','Negative'])
print(svm_tfidf_report)

**Plot the confusion matrix**

In [None]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,svm_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,svm_tfidf_predict,labels=[1,0])
print(cm_tfidf)

**Multinomial Naive Bayes for bag of words and tfidf features**

In [None]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_reviews,train_sentiments)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments)
print(mnb_tfidf)

**Model performance on test data**

In [None]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

**Accuracy of the model**

In [None]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

**Print the classification report**

In [None]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)
#Classification report for tfidf features
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

**Plot the confusion matrix**

In [None]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,mnb_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,mnb_tfidf_predict,labels=[1,0])
print(cm_tfidf)

**Conclusion:**
* We can observed that both logistic regression and multinomial naive bayes model performing well compared to linear support vector  machines.
* Still we can improve the accuracy of the models by preprocessing data and by using lexicon models like Textblob.