**COMSC-341NL: Natural Language Processing** 

Urvi Suwal 

*Final Project: Sentiment Analysis using Word2Vec and CNN*

Completed: 4/30/2023


# Data Collection

In [None]:
import re
import nltk
import random
import numpy as np
import pandas as pd
import gensim 
from gensim.models import Word2Vec 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,accuracy_score,recall_score
from sklearn.metrics import classification_report, confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Fetch the NLTK data used by the stop-word removal step further down:
# the 'punkt' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive under /content/drive so the dataset CSV can be read
# (this import is only available when running inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the IMDb movie-review CSV (columns: review, sentiment) from the mounted Drive
IMDB_CSV_PATH = '/content/drive/MyDrive/SPRING 2023/COMSC-341NL Natural Language Processing/final project/Colab Notebooks/IMDB Dataset.csv'
df = pd.read_csv(IMDB_CSV_PATH)
df


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Print column names, non-null counts and dtypes as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Encode the sentiment labels as integers:
  # 1 --> positive
  # 0 --> negative (any value other than 'positive')
# Labels that are already 1 stay 1, so running this cell a second time keeps the
# encoding intact instead of turning every label into 0.
is_positive = (df['sentiment'] == 'positive') | (df['sentiment'] == 1)
df['sentiment'] = is_positive.astype('int64')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


# Data Cleaning

In [None]:
# Data cleaning, applied to each review in this order:
  # 1. delete the punctuation marks , . ! ? : ( ) "
  # 2. replace HTML tags with a space
  # 3. replace URLs (any run of non-space characters starting with "http") with a space
  # 4. replace every remaining character that is not a letter or digit with a space
  # 5. collapse successive whitespace into a single space
  # 6. convert the text to lower case and strip whitespace from both ends

def process(x):
    """Return a cleaned, lower-cased version of the review text `x`.

    Parameters
    ----------
    x : str
        Raw review text, possibly containing HTML tags, URLs and punctuation.

    Returns
    -------
    str
        Text made only of lower-case letters, digits and single spaces.

    Notes
    -----
    The punctuation in step 1 is deleted rather than replaced by a space, so
    words separated only by those marks are fused ("many..Aryans" becomes
    "manyaryans"), while abbreviations such as "U.S.A." collapse to "usa".
    """
    # Raw string literals keep the regex escapes (\. \S \s) from being parsed as
    # (invalid) Python string escapes; the patterns themselves are unchanged.
    x = re.sub(r'[,\.!?:()"]', '', x)
    x = re.sub(r'<.*?>', ' ', x)
    x = re.sub(r'http\S+', ' ', x)
    x = re.sub(r'[^a-zA-Z0-9]', ' ', x)
    x = re.sub(r'\s+', ' ', x)
    return x.lower().strip()

# Clean every review in place with process()
df['review'] = df['review'].apply(process)

In [None]:
# English stop words from NLTK, stored in a set for membership tests
stopWords_set = set(nltk.corpus.stopwords.words('english'))

def sw_remove(x):
    """Tokenize `x` with NLTK and return it without English stop words.

    The remaining tokens are joined back together with single spaces.
    """
    kept = []
    for tok in nltk.tokenize.word_tokenize(x):
        if tok in stopWords_set:
            continue
        kept.append(tok)
    return ' '.join(kept)

# Strip the stop words from every cleaned review
df['review'] = df['review'].apply(sw_remove)

In [None]:
# Preview the first rows after cleaning and stop-word removal.
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1


# Training Word2Vec

In [None]:
# Split every cleaned review on whitespace; the resulting list of token lists
# is what gets passed to Word2Vec as its training sentences.
reviews = [review.split() for review in df['review']]
print(reviews[:2])
# Number of tokenized reviews (the whole list, not the two-item preview slice).
len(reviews)

[['one', 'reviewers', 'mentioned', 'watching', '1', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches', 'regards', 'drugs', 'sex', 'violence', 'hardcore', 'classic', 'use', 'word', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focuses', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'manyaryans', 'muslims', 'gangstas', 'latinos', 'christians', 'italians', 'irish', 'moreso', 'scuffles', 'death', 'stares', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'away', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', 'dare', 'forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audiences', 'forget', '

2

In [None]:
# Word2Vec embeddings with 300 dimensions, trained on the tokenized reviews
word2vec_model = Word2Vec(
    sentences=reviews,
    vector_size=300,
    window=3,
    min_count=1,
    workers=16,
)
print(word2vec_model)

Word2Vec<vocab=125791, vector_size=300, alpha=0.025>


In [None]:
# Word2Vec embeddings with 400 dimensions, trained on the tokenized reviews
word2vec_model_400 = Word2Vec(
    sentences=reviews,
    vector_size=400,
    window=3,
    min_count=1,
    workers=16,
)
print(word2vec_model_400)

Word2Vec<vocab=125791, vector_size=400, alpha=0.025>


In [None]:
# Word2Vec embeddings with 500 dimensions, trained on the tokenized reviews
word2vec_model_500 = Word2Vec(
    sentences=reviews,
    vector_size=500,
    window=3,
    min_count=1,
    workers=16,
)
print(word2vec_model_500)

Word2Vec<vocab=125791, vector_size=500, alpha=0.025>


# Pre-Processing: Tokenizing and Padding 

In [None]:

# Cap the tokenizer at the Word2Vec vocabulary size instead of a hard-coded
# number. Keras word ids start at 1 and only ids below `num_words` are emitted,
# so every id stays within the rows of an embedding matrix built from
# `word2vec_model.wv`, even if the cleaning steps (and thus the vocabulary) change.
token = Tokenizer(num_words=len(word2vec_model.wv))
token.fit_on_texts(df['review'])
# Convert each review to a sequence of word ids, then pad with zeros so all
# sequences share the length of the longest review.
text = pad_sequences(token.texts_to_sequences(df['review']))
print(text[:2])

[[   0    0    0 ... 1069 3943  359]
 [   0    0    0 ... 1823   12  114]]


In [None]:

from sklearn import preprocessing
from keras.utils import to_categorical

# Map the sentiment column to class ids, then one-hot encode them
# (one column per class) to match the network's two-unit softmax output.
le = preprocessing.LabelEncoder()
le.fit(df['sentiment'])
y = to_categorical(le.transform(df['sentiment']))
y[:2]

array([[0., 1.],
       [0., 1.]], dtype=float32)

In [None]:
# Hold out 20% of the reviews for testing, stratified by label. The fixed
# random_state makes the split, and therefore the reported metrics, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(text), y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
from tensorflow.keras.layers import Embedding

def gensim_to_keras_embedding(model, train_embeddings=False, word_index=None):
    """Get a Keras 'Embedding' layer with weights set from Word2Vec model's learned word embeddings.

    Parameters
    ----------
    model : gensim.models.Word2Vec
        Trained model whose `wv` vectors initialise the layer.
    train_embeddings : bool
        If False, the returned weights are frozen and stopped from being updated.
        If True, the weights can / will be further updated in Keras.
    word_index : dict or None
        Optional mapping ``word -> integer id`` describing the ids the network
        will actually receive (for example ``Tokenizer.word_index``).
        If given, row ``id`` of the layer holds the vector of that word; ids
        without a word (such as the padding id 0) and words unknown to the
        Word2Vec model keep an all-zero row.
        If None (default), the rows follow gensim's own ordering
        (``model.wv.index_to_key``, starting at 0). That only matches the input
        ids when they were produced from that same ordering; ids from a Keras
        ``Tokenizer`` start at 1 and do not line up with it.

    Returns
    -------
    `keras.layers.Embedding`
        Embedding layer, to be used as input to deeper network layers.

    """
    keyed_vectors = model.wv  # structure holding the result of training

    if word_index is None:
        # vectors themselves, a 2D numpy array in gensim's row order
        weights = keyed_vectors.vectors
    else:
        # One row per id from 0 up to the largest id in the mapping.
        n_rows = max(word_index.values(), default=0) + 1
        weights = np.zeros(
            (n_rows, keyed_vectors.vector_size),
            dtype=keyed_vectors.vectors.dtype,
        )
        for word, idx in word_index.items():
            if word in keyed_vectors.key_to_index:
                weights[idx] = keyed_vectors[word]

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

# Training CNN

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation


# CNN classifier on top of the 300-dimensional Word2Vec embeddings (fine-tuned
# during training): three stages of two Conv1D layers (50, 100, 200 filters)
# followed by a dense head with a two-unit softmax output.
keras_model = Sequential([
    gensim_to_keras_embedding(word2vec_model, train_embeddings=True),
    Dropout(0.2),
    Conv1D(50, 3, strides=1, padding='same', activation='relu'),
    Conv1D(50, 3, strides=1, padding='same', activation='relu'),
    MaxPool1D(),
    Dropout(0.2),
    Conv1D(100, 3, strides=1, padding='same', activation='relu'),
    Conv1D(100, 3, strides=1, padding='same', activation='relu'),
    MaxPool1D(),
    Dropout(0.2),
    Conv1D(200, 3, strides=1, padding='same', activation='relu'),
    Conv1D(200, 3, strides=1, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(200),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])
keras_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
keras_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=16)

In [None]:
# vector_size = 400 

from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation


# CNN classifier on top of the word2vec_model_400 embeddings (fine-tuned during
# training): three stages of two Conv1D layers (50, 100, 200 filters) followed
# by a dense head with a two-unit softmax output.
keras_model = Sequential([
    gensim_to_keras_embedding(word2vec_model_400, train_embeddings=True),
    Dropout(0.2),
    Conv1D(50, 3, strides=1, padding='same', activation='relu'),
    Conv1D(50, 3, strides=1, padding='same', activation='relu'),
    MaxPool1D(),
    Dropout(0.2),
    Conv1D(100, 3, strides=1, padding='same', activation='relu'),
    Conv1D(100, 3, strides=1, padding='same', activation='relu'),
    MaxPool1D(),
    Dropout(0.2),
    Conv1D(200, 3, strides=1, padding='same', activation='relu'),
    Conv1D(200, 3, strides=1, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(200),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])
keras_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
keras_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=16)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f22bfec24a0>

In [None]:
# vector_size = 500

from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation


# CNN classifier on top of the 500-dimensional Word2Vec embeddings (fine-tuned
# during training): three stages of two Conv1D layers (50, 100, 200 filters)
# followed by a dense head with a two-unit softmax output.
# This is the last model bound to `keras_model`, so it is the one used by the
# prediction and evaluation cells that follow.
keras_model = Sequential([
    gensim_to_keras_embedding(word2vec_model_500, train_embeddings=True),
    Dropout(0.2),
    Conv1D(50, 3, strides=1, padding='same', activation='relu'),
    Conv1D(50, 3, strides=1, padding='same', activation='relu'),
    MaxPool1D(),
    Dropout(0.2),
    Conv1D(100, 3, strides=1, padding='same', activation='relu'),
    Conv1D(100, 3, strides=1, padding='same', activation='relu'),
    MaxPool1D(),
    Dropout(0.2),
    Conv1D(200, 3, strides=1, padding='same', activation='relu'),
    Conv1D(200, 3, strides=1, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(200),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])
keras_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
keras_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=16)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4a60f02680>

In [None]:
# Class probabilities for the held-out reviews, from the most recently trained
# `keras_model` (one row per review, one column per class).
prediction = keras_model.predict(x_test)



# Evaluation

In [None]:
# Turn the softmax probabilities into one-hot class predictions by taking the
# arg-max of each row. Unlike `prediction.round()`, this always selects exactly
# one class, even when the two probabilities are tied at 0.5 (where rounding
# would give an all-zero row).
p = np.eye(prediction.shape[1], dtype=prediction.dtype)[prediction.argmax(axis=1)]

In [None]:
# Print the layer-by-layer architecture and parameter counts of the evaluated model.
keras_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 500)         62895500  
                                                                 
 dropout (Dropout)           (None, None, 500)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 50)          75050     
                                                                 
 conv1d_1 (Conv1D)           (None, None, 50)          7550      
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 50)         0         
 )                                                               
                                                                 
 dropout_1 (Dropout)         (None, None, 50)          0         
                                                        

In [None]:
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
# y_test and p are one-hot matrices; scikit-learn scores those as a multilabel
# problem (which is where "micro avg" / "samples avg" rows come from). Convert
# both to class ids so the report is a standard two-class report.
y_true_labels = y_test.argmax(axis=1)
y_pred_labels = p.argmax(axis=1)
acc = accuracy_score(y_true_labels, y_pred_labels)
print(classification_report(y_true_labels, y_pred_labels))
print ("accuracy: ", acc)

              precision    recall  f1-score   support

           0       0.94      0.85      0.89      5000
           1       0.87      0.94      0.90      5000

   micro avg       0.90      0.90      0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000
 samples avg       0.90      0.90      0.90     10000

accuracy:  0.8979


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Macro-averaged precision / recall / F1 plus accuracy on the held-out set
macro_precision = precision_score(y_test, p, average='macro')
macro_recall = recall_score(y_test, p, average='macro')
macro_f1 = f1_score(y_test, p, average='macro')
print(f"Precision score: {macro_precision}")
print(f"Recall score: {macro_recall}")
print(f"f1 score: {macro_f1}")
print(f"accuracy: {accuracy_score(y_test, p)}")



Precision score: 0.9011347668049055
Recall score: 0.8978999999999999
f1 score: 0.8976937495760828
accuracy: 0.8979
