**COMSC-341NL: Natural Language Processing** 

Urvi Suwal 

*Final Project: Sentiment Analysis using Word2Vec and Naive Bayes*

Completed: 4/30/2023


# Data Collection

In [None]:
import random
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Fetch the NLTK data used later in the notebook: the 'punkt' tokenizer
# package and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be
# read (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the CSV of 50,000 IMDb movie reviews from the mounted Drive into a DataFrame
DATASET_PATH = '/content/drive/MyDrive/SPRING 2023/COMSC-341NL Natural Language Processing/final project/Colab Notebooks/IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)
df


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Summarize the frame: number of entries, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Encode the sentiment labels in the dataframe as integers:
#   1 --> positive
#   0 --> negative
# `isin` matches both the raw label and its already-encoded value, so running
# this cell a second time leaves the column unchanged instead of turning every
# label into 0.
df['sentiment'] = df['sentiment'].isin(['positive', 1]).astype(int)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


# Data Cleaning

In [None]:
# Data cleaning steps:
  # remove punctuation marks
  # remove HTML tags
  # remove URLs
  # replace characters which are not letters or digits with a space
  # collapse successive whitespace into a single space
  # convert the text to lower case
  # strip whitespace from the beginning and the end of the reviews

def process(x):
    """Normalize one raw review string before tokenization.

    Steps, in order: delete the punctuation characters , . ! ? : ( ) and ",
    replace HTML tags and http... URLs with a space, replace every remaining
    character that is not an ASCII letter or digit with a space, collapse
    whitespace runs into one space, then lower-case and strip the result.

    Args:
        x: Raw review text.

    Returns:
        The cleaned, lower-cased review text.
    """
    # Raw strings keep `\S` and `\s` as regex escapes; in a plain string
    # literal they are invalid Python escape sequences and emit a warning.
    x = re.sub(r'[,.!?:()"]', '', x)
    x = re.sub(r'<.*?>', ' ', x)
    x = re.sub(r'http\S+', ' ', x)
    x = re.sub(r'[^a-zA-Z0-9]', ' ', x)
    x = re.sub(r'\s+', ' ', x)
    return x.lower().strip()

# Clean every review in place by passing `process` directly to apply
df['review'] = df['review'].apply(process)

In [None]:
# Drop English stop words from every cleaned review
stopWords_set = set(nltk.corpus.stopwords.words('english'))

def sw_remove(x):
    """Return `x` with every token that appears in `stopWords_set` removed.

    The text is tokenized with NLTK's `word_tokenize`; the surviving tokens
    are joined back together with single spaces.
    """
    kept_tokens = []
    for token in nltk.tokenize.word_tokenize(x):
        if token not in stopWords_set:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

df['review'] = df['review'].apply(sw_remove)

# Training Word2Vec and Naive Bayes

## vector_size = 300

In [None]:
# Train 300-dimensional Word2Vec embeddings on the whitespace-tokenized reviews
sentences = [review.split() for review in df['review']]
w2v_model = Word2Vec(
    sentences,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# from tqdm import tqdm 

def vectorize(sentence, model=None, vector_size=None):
  """Embed a review as the mean of its in-vocabulary word vectors.

  Args:
    sentence: Whitespace-separated, pre-cleaned review text.
    model: Object exposing a `.wv` word-to-vector lookup (a trained Word2Vec
      model). Defaults to the notebook-level `w2v_model`, so existing
      one-argument calls behave as before.
    vector_size: Length of the zero vector returned when no word of
      `sentence` is in the vocabulary. Defaults to `model.wv.vector_size`.

  Returns:
    A 1-D numpy array: the element-wise mean of the word vectors, or all
    zeros when none of the words are known to the model.
  """
  if model is None:
    model = w2v_model  # one-argument calls keep using the 300-d model
  word_vectors = model.wv
  words_vecs = [word_vectors[word] for word in sentence.split() if word in word_vectors]
  if not words_vecs:
    # Nothing to average: fall back to a zero vector of the embedding width.
    if vector_size is None:
      vector_size = word_vectors.vector_size
    return np.zeros(vector_size)
  return np.array(words_vecs).mean(axis=0)


In [None]:
# Hold out 20% of the reviews for testing; random_state is fixed so the split repeats
train_rev, test_rev, train_sent, test_sent = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)
test_rev_copy = test_rev  # keeps the raw test text; test_rev is rebound to vectors below

# Replace each review with its averaged word vector
train_rev = np.array(list(map(vectorize, train_rev)))
test_rev = np.array(list(map(vectorize, test_rev)))

In [None]:
# Fit Gaussian Naive Bayes on the training vectors, then score the held-out reviews
gnb = GaussianNB().fit(train_rev, train_sent)
pred = gnb.predict(test_rev)
y_pred_prob = gnb.predict_proba(test_rev)
accNB = accuracy_score(test_sent, pred)

print("Confusion matrix")
print(confusion_matrix(test_sent, pred))
print(classification_report(test_sent, pred))
print("accuracy: ", accNB)

# # printing out wrong predictions for error analysis
# for input, prediction, label in zip(test_rev_copy, pred, test_sent):
#   if prediction != label:
#     print(input, 'has been classified as ', prediction, 'and should be ', label) 

Confusion matrix
[[3749 1212]
 [1067 3972]]
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      4961
           1       0.77      0.79      0.78      5039

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

accuracy:  0.7721


In [None]:
# Precision, recall, F1 and accuracy for the current predictions in `pred`.
# The metric functions are imported once, in the import cell at the top of the
# notebook, instead of being re-imported in every results cell.
print(f"Precision score: {precision_score(test_sent, pred)}")
print(f"Recall score: {recall_score(test_sent, pred)}")
print(f"f1 score: {f1_score(test_sent, pred)}")
print(f"accuracy: {accuracy_score(test_sent, pred)}")



Precision score: 0.7662037037037037
Recall score: 0.788251637229609
f1 score: 0.7770713097916463
accuracy: 0.7721


## vector_size = 400

In [None]:
# Train 400-dimensional Word2Vec embeddings on the whitespace-tokenized reviews
sentences = [review.split() for review in df['review']]
w2v_model_400 = Word2Vec(
    sentences,
    vector_size=400,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Re-create the 80/20 split (same random_state as the 300-d run) and embed it with the 400-d model
train_rev, test_rev, train_sent, test_sent = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)
test_rev_copy = test_rev  # raw test text, kept alongside the vectorized arrays

train_rev_400 = np.array([vectorize(review, w2v_model_400, 400) for review in train_rev])
test_rev_400 = np.array([vectorize(review, w2v_model_400, 400) for review in test_rev])

In [None]:
# Fit Gaussian Naive Bayes on the 400-d training vectors, then score the held-out reviews
gnb = GaussianNB().fit(train_rev_400, train_sent)
pred = gnb.predict(test_rev_400)
y_pred_prob = gnb.predict_proba(test_rev_400)
accNB = accuracy_score(test_sent, pred)

print("Confusion matrix")
print(confusion_matrix(test_sent, pred))
print(classification_report(test_sent, pred))
print("accuracy: ", accNB)

Confusion matrix
[[3769 1192]
 [1080 3959]]
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      4961
           1       0.77      0.79      0.78      5039

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

accuracy:  0.7728


In [None]:
# Precision, recall, F1 and accuracy for the current predictions in `pred`.
# The metric functions are imported once, in the import cell at the top of the
# notebook, instead of being re-imported in every results cell.
print(f"Precision score: {precision_score(test_sent, pred)}")
print(f"Recall score: {recall_score(test_sent, pred)}")
print(f"f1 score: {f1_score(test_sent, pred)}")
print(f"accuracy: {accuracy_score(test_sent, pred)}")



Precision score: 0.7685886235682392
Recall score: 0.7856717602698948
f1 score: 0.777036310107949
accuracy: 0.7728


## vector_size = 500

In [None]:
# Train 500-dimensional Word2Vec embeddings on the whitespace-tokenized reviews
sentences = [review.split() for review in df['review']]
w2v_model_500 = Word2Vec(
    sentences,
    vector_size=500,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Re-create the 80/20 split (same random_state as the earlier runs) and embed it with the 500-d model
train_rev, test_rev, train_sent, test_sent = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)
test_rev_copy = test_rev  # raw test text, kept alongside the vectorized arrays

train_rev_500 = np.array([vectorize(review, w2v_model_500, 500) for review in train_rev])
test_rev_500 = np.array([vectorize(review, w2v_model_500, 500) for review in test_rev])

In [None]:
# Fit Gaussian Naive Bayes on the 500-d training vectors, then score the held-out reviews
gnb = GaussianNB().fit(train_rev_500, train_sent)
pred = gnb.predict(test_rev_500)
y_pred_prob = gnb.predict_proba(test_rev_500)
accNB = accuracy_score(test_sent, pred)

print("Confusion matrix")
print(confusion_matrix(test_sent, pred))
print(classification_report(test_sent, pred))
print("accuracy: ", accNB)

Confusion matrix
[[3776 1185]
 [1059 3980]]
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      4961
           1       0.77      0.79      0.78      5039

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000

accuracy:  0.7756


In [None]:
# Precision, recall, F1 and accuracy for the current predictions in `pred`.
# The metric functions are imported once, in the import cell at the top of the
# notebook, instead of being re-imported in every results cell.
print(f"Precision score: {precision_score(test_sent, pred)}")
print(f"Recall score: {recall_score(test_sent, pred)}")
print(f"f1 score: {f1_score(test_sent, pred)}")
print(f"accuracy: {accuracy_score(test_sent, pred)}")


Precision score: 0.7705711519845111
Recall score: 0.7898392538202024
f1 score: 0.7800862406899255
accuracy: 0.7756
