# Sentiment Analysis on IMDB Reviews: Comparing Logistic Regression and Naive Bayes

Import all the libraries.

In [None]:
pip install kaggle



In [None]:
import os
import re

import pandas as pd
import numpy as np
import zipfile
import gdown

# Text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# For Classification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, auc
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

## Import Dataset : IMDB Dataset of 50K Movie Reviews
Ref: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download

In [None]:
# Download the dataset archive with the Kaggle CLI. No -p option is given, so the
# CLI stores the zip in its default download location.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
download_status = os.system("kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
# os.system reports failure only through its return value; stop here instead of
# failing later with a confusing "file not found" when the archive is opened.
if download_status != 0:
    raise RuntimeError(f"kaggle dataset download failed (status {download_status})")
download_status

0

In [None]:
# The Kaggle CLI saves the archive in the current working directory by default, so
# look for it there instead of assuming the Colab-specific /content folder.
zip_file = 'imdb-dataset-of-50k-movie-reviews.zip'

# Use a descriptive handle name: `_` is IPython's "last output" variable and
# should not be rebound by notebook code.
with zipfile.ZipFile(zip_file, 'r') as archive:
  print(archive.namelist())

  # Read the CSV straight from the archive (no extraction to disk) and close the
  # member file handle as soon as pandas has consumed it.
  with archive.open('IMDB Dataset.csv') as csv_file:
    imdb_data = pd.read_csv(csv_file)

imdb_data.head(10)

['IMDB Dataset.csv']


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
# Summary of Dataset: per column, the row count, number of unique values,
# most frequent value and how often it occurs
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
# Check the class balance: number of reviews per sentiment label
imdb_data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


## Text Pre-processing

Remove HTML tags and special characters using regular expressions.

In [None]:
def remove_speacial_characters(text):
  """Lower-case ``text`` and blank out HTML tags and non-letter characters.

  Each HTML tag (anything between ``<`` and the next ``>``) is replaced by a
  single space, then every remaining character that is not an ASCII letter is
  replaced by a space as well. Runs of spaces are not collapsed.

  Args:
    text: Raw review text.

  Returns:
    The lower-cased text containing only ASCII letters and spaces.
  """
  lowered = text.lower()
  # Strip the tags first so tag names such as "br" do not survive as words
  without_tags = re.sub(r'<.*?>', ' ', lowered)
  return re.sub(r'[^a-zA-Z]', ' ', without_tags)

# test = imdb_data['review'].loc[:5].to_string()
# test = remove_speacial_characters(test)
# print(test)

Download the NLTK resources (stop words and WordNet) used for text normalization.

In [None]:
# Fetch the NLTK corpora used below: the stop-word lists and WordNet
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource)

# Build the English stop-word lookup set and the WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Show the English stop words that the normalization step filters out
# (stop_words is a set, so the printed order is arbitrary)
print(stop_words)

{'ain', 'has', 'our', 'y', "hasn't", 'what', 'were', 'myself', 'above', "needn't", 'hers', 'hadn', 'how', 'over', 'which', 'and', 'between', 'only', 'mustn', "haven't", 'to', 'am', 'yours', 'few', 'than', 'shouldn', "won't", 'some', 'by', 'now', 'weren', "shouldn't", 'she', 'of', 'these', 'mightn', 'it', "you're", 'own', 'both', 'doing', "didn't", 'd', 'my', 'this', 'nor', 'on', 'as', 'no', 's', "shan't", 'why', 'most', 'couldn', 'did', 'm', 'all', "it's", 'll', "wouldn't", 'him', 'can', 'whom', 'her', 'about', 'wasn', 'yourselves', 'after', 're', 'a', "that'll", 'doesn', 't', 'just', 'yourself', "you've", 'here', 'who', 'through', 'any', 'from', 'ours', "doesn't", 'then', 'during', 'because', 'o', "mustn't", 'aren', 'once', 'himself', 'we', 'does', 'they', 'into', 'haven', 'he', 'should', 'not', 'there', 'don', 'so', 'i', 'you', 'before', 'isn', "wasn't", 'while', 'theirs', 'will', 'its', 'are', 'was', "she's", 'if', 'ma', 'having', 've', "couldn't", 'herself', 'your', 'down', 'didn',

Normalize the text by removing stop words (insignificant words) and lemmatizing the remaining words.

In [None]:
def normalizing_text(text):
  """Drop stop words from ``text`` and lemmatize the words that remain.

  The text is split on whitespace; every token found in the module-level
  ``stop_words`` set is discarded and each kept token is passed through the
  module-level ``lemmatizer``.

  Args:
    text: Already cleaned (lower-cased, letters-only) review text.

  Returns:
    The kept, lemmatized tokens joined by single spaces.
  """
  kept_words = []
  for token in text.split():
    if token in stop_words:
      continue
    kept_words.append(lemmatizer.lemmatize(token))
  return ' '.join(kept_words)

# normalizing_text(test)

In [None]:
# Clean each raw review, then normalize the cleaned text, and store the result
imdb_data['cleaned_review'] = (
    imdb_data['review']
    .apply(remove_speacial_characters)
    .apply(normalizing_text)
)
imdb_data.head(10)

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...
5,"Probably my all-time favorite movie, a story o...",positive,probably time favorite movie story selflessnes...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,Encouraged by the positive comments about this...,negative,encouraged positive comment film looking forwa...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


Encode the sentiment labels as 1 (positive) or 0 (negative).

In [None]:
# Binarizer used to encode the sentiment labels as numbers
lb=LabelBinarizer()
# Fit on the sentiment column and transform it into a single-column array
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
# NOTE(review): the train/test split later in the notebook passes
# imdb_data['sentiment'] (the string labels), not sentiment_data — confirm
# whether this encoded array is meant to be used.
print(sentiment_data.shape)

(50000, 1)


## Modeling
This project compares two methods for representing the text:
1. Bag of Words
2. Word2Vec

In [None]:
# Hold out 20% of the cleaned reviews (and their sentiment labels) for testing;
# the fixed random_state keeps the split reproducible
train_review, test_review, train_sentiment, test_sentiment = train_test_split(
    imdb_data['cleaned_review'],
    imdb_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

### Bag of Word

Using `CountVectorizer` for Bag of Words by counting the occurrences of each word in the text.

In [None]:
# Build the Bag of Words vectorizer: raw counts (binary=False) over 1- to 3-word
# n-grams, with the document-frequency bounds left wide open (min_df=0.0, max_df=1.0)
bow = CountVectorizer(min_df=0.0, max_df=1.0, binary=False, ngram_range=(1,3))
# Learn the vocabulary from the training reviews only and vectorize them
bow_train_review = bow.fit_transform(train_review)
# Vectorize the test reviews with the vocabulary learned from the training set
bow_test_review = bow.transform(test_review)

# Both matrices must have the same number of columns (one per vocabulary entry)
print('Bag of Word train shape: ',bow_train_review.shape)
print('Bag of Word test shape: ',bow_test_review.shape)

Bag of Word train shape:  (40000, 6785610)
Bag of Word test shape:  (10000, 6785610)


In [None]:
# Inspect the Bag of Words row at index 2 of the training matrix (note: index 2,
# despite the "_1" in the variable name); printing the sparse row lists the
# (row, feature index) position and count of each non-zero entry
word_count_row_1 = bow_train_review[2]
print(word_count_row_1)


  (0, 3944212)	1
  (0, 3836448)	1
  (0, 3936954)	1
  (0, 5698535)	4
  (0, 2054001)	1
  (0, 6130265)	1
  (0, 3492303)	2
  (0, 4905495)	1
  (0, 1590096)	1
  (0, 2704003)	1
  (0, 963955)	1
  (0, 4531281)	1
  (0, 6329962)	1
  (0, 1650978)	1
  (0, 4376805)	1
  (0, 1098230)	1
  (0, 2237370)	1
  (0, 5156123)	1
  (0, 6440463)	1
  (0, 3809232)	1
  (0, 5908507)	1
  (0, 6532041)	1
  (0, 2502448)	1
  (0, 2860629)	1
  (0, 2026262)	1
  :	:
  (0, 2238727)	1
  (0, 5156159)	1
  (0, 6445122)	1
  (0, 3954224)	1
  (0, 3810050)	1
  (0, 5909584)	1
  (0, 6539069)	1
  (0, 2511463)	1
  (0, 2861460)	1
  (0, 2029906)	1
  (0, 5270207)	1
  (0, 5749593)	1
  (0, 382343)	1
  (0, 5719230)	1
  (0, 6284284)	1
  (0, 5210565)	1
  (0, 1710859)	1
  (0, 5828751)	1
  (0, 6746324)	1
  (0, 3641100)	1
  (0, 2554275)	1
  (0, 3840453)	1
  (0, 1482967)	1
  (0, 1954979)	1
  (0, 502286)	1


### Word2Vec

In [None]:
pip install gdown



Pre-trained model from : https://www.geeksforgeeks.org/pre-trained-word-embedding-in-nlp/#word2vec

In [None]:
url = 'https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM'
output = 'GoogleNews-vectors-negative300.bin.gz'

# The archive is about 1.65 GB, so download it only when it is not already on
# disk; this keeps "Restart & Run All" from re-fetching it on every run.
if not os.path.exists(output):
    downloaded_path = gdown.download(url, output, quiet=False)
    # NOTE(review): some gdown versions return None on a failed download instead
    # of raising — confirm for the installed version.
    if downloaded_path is None:
        raise RuntimeError(f"Could not download {url}")

output

Downloading...
From (original): https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
From (redirected): https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&confirm=t&uuid=e2e4dee3-b520-4369-8d03-5f9e70374ae7
To: /content/GoogleNews-vectors-negative300.bin.gz
100%|██████████| 1.65G/1.65G [00:20<00:00, 79.7MB/s]


'GoogleNews-vectors-negative300.bin.gz'

In [None]:
# gensim classes for word vectors (the same two names are also imported in the
# notebook's first import cell)
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# Path of the downloaded pre-trained GoogleNews vectors; replace it with your own
# download location if it differs.
pretrained_model_path = '/content/GoogleNews-vectors-negative300.bin.gz'
# Load the pre-trained vectors from the binary word2vec file (binary=True)
# using gensim's load_word2vec_format.
word2vectors = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

In [None]:
def get_word2vec(text, vectors=None):
    """Embed ``text`` as the mean of the vectors of its in-vocabulary words.

    Args:
        text: Whitespace-separated words (an already preprocessed review).
        vectors: Optional word-vector lookup supporting ``word in vectors``,
            ``vectors[word]`` and a ``vector_size`` attribute. Defaults to the
            module-level ``word2vectors`` model.

    Returns:
        A 1-D array of length ``vectors.vector_size``. When none of the words
        is in the vocabulary (including empty text) a zero vector is returned
        instead of the scalar NaN that ``np.mean`` yields for an empty list, so
        the results can always be stacked into a 2-D feature matrix.
    """
    if vectors is None:
        vectors = word2vectors

    found = [vectors[word] for word in text.split() if word in vectors]
    if not found:
        # No known word: fall back to a neutral all-zero embedding
        return np.zeros(vectors.vector_size, dtype=np.float32)
    return np.mean(found, axis=0)

In [None]:
# Embed every training review and stack the vectors into one matrix
w2v_train_review = np.array(list(map(get_word2vec, train_review)))

# Same embedding for every review of the test set
w2v_test_review = np.array(list(map(get_word2vec, test_review)))

# Report the dimensions of both embedded sets
print("Word2Vec training data shape:", w2v_train_review.shape)
print("Word2Vec testing data shape:", w2v_test_review.shape)

Word2Vec training data shape: (40000, 300)
Word2Vec testing data shape: (10000, 300)


In [None]:
# Sanity check: show the embedding vector of the training review at position 1
print(w2v_train_review[1])

[ 0.10351159  0.02554063 -0.00521907  0.11084097 -0.09341091  0.02059218
  0.07865704 -0.03566159  0.09043921  0.06562093  0.00194129 -0.10943693
 -0.01935565  0.02964334 -0.12808788  0.10449463  0.03912126  0.12189169
  0.01740035 -0.11670842  0.00119106  0.07203894  0.05373416  0.00232719
  0.05367818 -0.07382595 -0.03070988  0.09050223  0.04971371  0.03525964
 -0.09388385 -0.01777267  0.01382033 -0.00432396  0.01790703 -0.01586347
  0.10108241  0.0031276   0.02797514  0.0689873   0.12953958 -0.05523536
  0.11759847 -0.02546605 -0.05369478 -0.04017718 -0.06310587  0.00726554
  0.07432264  0.01126693 -0.08917326  0.03194346 -0.03305621  0.00257674
  0.05785846 -0.00454566 -0.01535937 -0.08695193  0.0253292  -0.0588561
 -0.01174108  0.11653878 -0.04420393 -0.06865042 -0.0410025   0.00339901
 -0.05622252  0.0379255  -0.02910076  0.02747031  0.02961854 -0.00440351
  0.01695403  0.00596372 -0.16116394 -0.03496327  0.01981909  0.03202012
  0.08110814  0.05648097  0.01366728  0.00641178  0.

## Classification
This project compares two classification techniques: **Logistic Regression** and **Naive Bayes**.
For the Naive Bayes classifier, I use two variants, each chosen to match the form of the features:


*   Multinomial Naive Bayes: Suitable for discrete data like word counts or frequencies, which aligns well with the Bag of Words representation.
*   Gaussian Naive Bayes: Suitable for continuous data, making it a better fit for the Word2Vec representation where the features are real-valued.



### Bag of Word : Logistic Regression vs Multinomial Naive Bayes

In [None]:
# Logistic Regression : Bag of Word
# L2-penalised model fitted on the BoW training matrix
logistic_bow = LogisticRegression(penalty='l2', max_iter=1000).fit(bow_train_review, train_sentiment)
# Predicted labels for the BoW test matrix
pred_logistic_bow = logistic_bow.predict(bow_test_review)

# Multinomial Naive Bayes : Bag of Word
# Fitted on the same BoW training matrix
nb_bow = MultinomialNB().fit(bow_train_review, train_sentiment)
# Predicted labels for the BoW test matrix
pred_nb_bow = nb_bow.predict(bow_test_review)

### Word2Vec: Logistic Regression vs Gaussian Naive Bayes

In [None]:
# Logistic Regression : Word2Vec
# L2-penalised model fitted on the averaged Word2Vec training features
logistic_w2v = LogisticRegression(penalty='l2', max_iter=1000).fit(w2v_train_review, train_sentiment)
# Predicted labels for the Word2Vec test features
pred_logistic_w2v = logistic_w2v.predict(w2v_test_review)

# Gaussian Naive Bayes : Word2Vec
# Fitted on the same Word2Vec training features
nb_w2v = GaussianNB().fit(w2v_train_review, train_sentiment)
# Predicted labels for the Word2Vec test features
pred_nb_w2v = nb_w2v.predict(w2v_test_review)

## Evaluation

In [None]:
def _print_evaluation(title, predictions, scores):
    """Print the classification report, accuracy and AUROC of one model.

    All metrics are computed against the held-out ``test_sentiment`` labels.

    Args:
        title: Heading printed above the report.
        predictions: Labels predicted by the model for the test set.
        scores: Continuous scores for the test set (decision-function values or
            the probability column passed by the caller), used for the AUROC.
    """
    print(title)
    print(classification_report(test_sentiment, predictions))
    print("Accuracy:", accuracy_score(test_sentiment, predictions))
    print("AUROC:", roc_auc_score(test_sentiment, scores))


# Logistic Regression with BoW
_print_evaluation(
    "Logistic Regression with BoW Classification Report:",
    pred_logistic_bow,
    logistic_bow.decision_function(bow_test_review),
)

# Multinomial Naive Bayes with BoW
_print_evaluation(
    "\nMultinomial Naive Bayes with BoW Classification Report:",
    pred_nb_bow,
    nb_bow.predict_proba(bow_test_review)[:, 1],
)

print("\n-----------------------------------------------------------")
# Logistic Regression with Word2Vec
_print_evaluation(
    "\nLogistic Regression with Word2Vec Classification Report:",
    pred_logistic_w2v,
    logistic_w2v.decision_function(w2v_test_review),
)

# Gaussian Naive Bayes with Word2Vec
_print_evaluation(
    "\nGaussian Naive Bayes with Word2Vec Classification Report:",
    pred_nb_w2v,
    nb_w2v.predict_proba(w2v_test_review)[:, 1],
)


Logistic Regression with BoW Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 0.8987
AUROC: 0.9619498050261379

Multinomial Naive Bayes with BoW Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.89      0.89      4961
    positive       0.89      0.88      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Accuracy: 0.8864
AUROC: 0.9498238472828686

-----------------------------------------------------------

Logistic Regression with Word2Vec Classification Report:
              precision    recal

In [None]:


# Helper: print the predictions of all four trained models for one text
def _print_model_predictions(heading, text):
    """Vectorize ``text`` and print what each trained model predicts for it.

    Args:
        heading: Section heading printed before the predictions.
        text: Already preprocessed review text.
    """
    print(heading)
    print(f"Text: {text}\n")

    # Both Bag of Words models share the same count vector
    bow_vector = bow.transform([text])
    print(f"Logistic Regression (BoW): {logistic_bow.predict(bow_vector)[0]}")
    print(f"Multinomial Naive Bayes (BoW): {nb_bow.predict(bow_vector)[0]}")

    # A text without any in-vocabulary word has no usable averaged embedding
    # (e.g. a scalar NaN); predicting on it would raise, so skip those models.
    embedding = np.asarray(get_word2vec(text))
    if embedding.ndim != 1 or np.isnan(embedding).any():
        print("Word2Vec models skipped: no word of the text is in the Word2Vec vocabulary.")
        return

    # The models expect a 2-D (1, n_features) input for a single sample
    word2vec_vector = embedding.reshape(1, -1)
    print(f"Logistic Regression (Word2Vec): {logistic_w2v.predict(word2vec_vector)[0]}")
    print(f"Gaussian Naive Bayes (Word2Vec): {nb_w2v.predict(word2vec_vector)[0]}")


# Function to take user input, preprocess it, and get predictions from the models
def get_prediction_from_input():
    """Prompt for a movie review and print the predictions of every model.

    The review is read from standard input and classified twice: once after
    cleaning only (lower-casing, tag and special-character removal) and once
    after cleaning plus normalization (stop-word removal and lemmatization), so
    the effect of normalization on each model can be compared.
    """
    # Get user input
    user_input = input("Enter a movie review: ")

    # Preprocess the input text
    cleaned_text = remove_speacial_characters(user_input)
    cleaned_text_2 = normalizing_text(cleaned_text)

    print("\nPredictions based on your input:")
    _print_model_predictions("\n-------Cleaned text-------------", cleaned_text)
    _print_model_predictions("\n-----Cleaned and Normalize text----", cleaned_text_2)


# Run the interactive demo: prompts for one review on standard input and prints
# the predictions of the trained models
get_prediction_from_input()
