In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import contractions
import re
from bs4 import BeautifulSoup

from sklearn.metrics import accuracy_score
import tensorflow as tf
import gensim
from sklearn.model_selection import train_test_split
from tensorflow.keras import utils

import warnings
warnings.filterwarnings("ignore")

## Versions
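##### gensim : 1.22.4 (NOTE: this duplicates the numpy version below and is probably a copy-paste slip; the code relies on the gensim 3.x API, e.g. `size=` and `wv.vocab` — verify the installed version)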
##### tensorflow: 2.11.0
##### numpy: 1.22.4

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the pre-trained Google News word2vec vectors (300-dimensional) through gensim's downloader.
# NOTE(review): presumably this returns gensim KeyedVectors — later cells index it by word
# and call .similarity / .most_similar on it; confirm against the gensim downloader docs.
import gensim.downloader as api

google_news_w2v = api.load('word2vec-google-news-300')



## 1. Dataset Generation

In [None]:
# Read the raw tab-separated review dump; lines pandas cannot parse are skipped instead of raising.
# NOTE(review): date-like strings show up in `star_rating` further down, which suggests some rows
# were split into the wrong columns — possibly because of unbalanced quote characters in the
# review text. Consider passing quoting=csv.QUOTE_NONE here; confirm against the raw file.
df = pd.read_csv('drive/MyDrive/hw3/data3.tsv', sep='\t', on_bad_lines = 'skip')

In [None]:
df = df[['star_rating','review_body']]

In [None]:
# A little bit cleaning before we split the classes

df['star_rating'].unique()

In [None]:
# Drop rows whose star_rating holds a date string (YYYY-MM-DD) instead of a rating.
# The rows are matched by pattern rather than by an explicit list of dates, so the cell
# keeps working when the file contains date values other than the ones seen so far.
# Missing ratings stringify to 'nan', do not match, and are left for the dropna step.
is_date_like = df['star_rating'].astype(str).str.fullmatch(r'\d{4}-\d{2}-\d{2}')
df = df[~is_date_like]

In [None]:
#dropping missing values

df = df.dropna()

In [None]:
#converting data type of star rating to int

df['star_rating'] = df['star_rating'].astype('int')

In [None]:
#Converting reviews to lower case
# Uses the vectorised .str accessor; .astype(str) first so that a cell pandas parsed as a
# number does not raise TypeError the way str.lower(<non-str>) would.

df['review_body'] = df['review_body'].astype(str).str.lower()

In [None]:
#Removing HTML lines from reviews

df['review_body'] = df['review_body'].apply(lambda row:BeautifulSoup(row, 'html.parser').get_text())

In [None]:
#Removing URLs from reviews

df['review_body'] = df['review_body'].apply(lambda row:re.sub(r'\s*(https?://|www\.)+\S+(\s+|$)', ' ', row, flags = re.UNICODE))


In [None]:
#removing non-alphabetical characters

df['review_body'] = df['review_body'].apply(lambda text:re.sub(r"[^a-zA-Z' ]", ' ', text))

In [None]:
# Expand contractions in every review with the `contractions` package
# (apostrophes were deliberately kept by the previous cleaning step so this can work).

df['review_body'] = df['review_body'].map(contractions.fix)

In [None]:
#creating a new column according to specified labels (Star Rating 1 and 2 = Class 1, 3 = Class 2, 4 and 5 = Class 3)

def labeler(x):
    """Map a star rating to its class label.

    Ratings 1 and 2 give 'Class 1', rating 3 gives 'Class 2', ratings 4 and 5
    give 'Class 3'. Any other value gives None.
    """
    if x in (1, 2):
        return 'Class 1'
    if x == 3:
        return 'Class 2'
    if x in (4, 5):
        return 'Class 3'
    return None
    
#applying the labeler function to 'star_rating' column 
df['class'] = df['star_rating'].apply(labeler)

In [None]:
# Grouping by class and taking 20000 samples from each class

df = df.groupby('class').sample(n=20000, random_state=42)

In [None]:
df.to_csv('drive/MyDrive/hw3/df_colab.csv')

Half of the dataset cleaning was performed outside colab in a jupyter notebook since google colab was not allowing me to upload .tsv file. The code for cleaning is given above and then it was stored as 'df_colab.csv' which contains the 60k reviews

In [4]:
df = pd.read_csv('drive/MyDrive/hw3/df_colab.csv')

In [5]:
df

Unnamed: 0.1,Unnamed: 0,star_rating,review_body,class
0,4132182,1,really really thin and small extension it does...,Class 1
1,3704580,2,i ordered a reconditioned remington ms ap...,Class 1
2,2794197,2,the product is not as pictured i was wanting...,Class 1
3,3556189,2,the colour is much darker then i thought it wo...,Class 1
4,4307159,1,this item could not be made any worse yet stil...,Class 1
...,...,...,...,...
63502,3510151,4,i have now had this mirror for two years or so...,Class 3
63503,4325318,4,it was delivered faster than expected and abo...,Class 3
63504,2800938,5,great product i tried many other products and...,Class 3
63505,2065657,5,great stuff works great and both water and no...,Class 3


In [6]:
# Encode the string labels as integer class ids 1..3.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# selection df['class'] modifies a temporary object under pandas copy-on-write and would
# leave df unchanged. Re-running the cell is safe (already-encoded values do not match).
df['class'] = df['class'].replace({'Class 1': 1, 'Class 2': 2, 'Class 3': 3}).astype(int)

In [8]:
all_words = [row.split(' ') for row in df['review_body']]

In [9]:
all_words = [[item for item in sub_list if item != ''] for sub_list in all_words]

## 2. Word Embeddings

In [10]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenised reviews: 300-dimensional vectors,
# a context window of 11, and words seen fewer than 10 times are left out of the vocabulary.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`).
# NOTE(review): no seed is passed, so the similarity scores printed below will differ between runs.
word2vec_model = gensim.models.Word2Vec(all_words, size=300, window=11, min_count=10, sg=1)

In [11]:
word2vec_model.save('word2vec_model.bin')

In [12]:
saved_word2vec_model = gensim.models.Word2Vec.load('word2vec_model.bin')

In [86]:
#find similarity of the words using custom model
# Every query goes through `.wv` (the trained word vectors); calling most_similar/similarity
# directly on the Word2Vec model object is deprecated in gensim 3.x and removed in gensim 4.
# The printed labels say "custom model" so this output cannot be confused with the
# pre-trained Google News results.

#Example 1 - given

print(f"Similarity of the words using custom model - excellent and outstanding: {saved_word2vec_model.wv.similarity('excellent', 'outstanding')}")

#Example 2 - given

print(f"Most similar word using custom model: {saved_word2vec_model.wv.most_similar(positive = ['king','woman'], negative = ['man'], topn=1)}")

#Example 3 - my own

print(f"Most similar word using custom model: {saved_word2vec_model.wv.most_similar(positive = ['lipstick','red'], negative = ['lips'], topn=1)}")

#Example 4 - my own

print(f"Most similar word using custom model: {saved_word2vec_model.wv.most_similar(positive=['hair','brown'], negative = ['color'], topn=1)}")

#Example 5 - my own

print(f"Similarity of the words using custom model - pretty and cute: {saved_word2vec_model.wv.similarity('pretty', 'cute')}")


Similarity of the words using pre-trained model - excellent and outstanding: 0.6325795650482178
Most similar word using pre-trained model: [('unavailable', 0.5172719955444336)]
Most similar word using pre-trained model: [('orangey', 0.5602223873138428)]
Most similar word using pre-trained model: [('highlighted', 0.5245702266693115)]
Similarity of the words using pre-trained model - pretty and cute: 0.4033522307872772


In [85]:
#find similarity of the words using pre-trained model

#Example 1 - given

print(f"Similarity of the words using pre-trained model - excellent and outstanding: {google_news_w2v.similarity('excellent', 'outstanding')}")

#Example 2 - given 

print(f"Most similar word using pre-trained model: {google_news_w2v.most_similar(positive = ['king','woman'], negative = ['man'], topn=1)}")

#Example 3 - my own

print(f"Most similar word using pre-trained model: {google_news_w2v.most_similar(positive=['lipstick','red'], negative = ['lips'], topn=1)}")

#Example 4 - my own

print(f"Most similar word using pre-trained model: {google_news_w2v.most_similar(positive=['hair','brown'], negative = ['color'], topn=1)}")

#Example 5 - my own

print(f"Similarity of the words using pre-trained model - pretty and cute: {google_news_w2v.similarity('pretty', 'cute')}")




Similarity of the words using pre-trained model - excellent and outstanding: 0.5567485690116882
Most similar word using pre-trained model: [('queen', 0.7118192911148071)]
Most similar word using pre-trained model: [('yellow', 0.5638219118118286)]
Most similar word using pre-trained model: [('curly_hair', 0.6184735298156738)]
Similarity of the words using pre-trained model - pretty and cute: 0.3201156556606293


*What do you conclude from comparing vectors generated by yourself and the pretrained model? Which of the Word2Vec models seems to encode semantic similarities between words better?*

From the above results, the word vectors generated by my Word2Vec model perform better than the vectors from the pre-trained Google News model in example 1, the similarity between 'excellent' and 'outstanding' (my vectors: 0.6325795650482178; Google's vectors: 0.5567485690116882).

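For example 2, which is king + woman - man, the Google Word2Vec model performs much better than my model (Google's vectors: [('queen', 0.7118192911148071)], my vectors: [('unavailable', 0.5172719955444336)]). This is most likely because 'king' and 'queen' are rare in a beauty-reviews corpus, so my model has little evidence for their relationship. (The query words themselves must be in my vocabulary: gensim raises a KeyError for an out-of-vocabulary word instead of returning a result.)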

For one of my own examples (example 3) which is 'lipstick'+'red'-'lips,' my model performs better as it gives out the result 'orangey,' which is much closer to the words than the word predicted by google, which was 'yellow.' This might be the case because my model was trained on vectors generated on a 'Beauty' reviews dataset which might lead it to perform better when words from that dataset are used, even when the google model was trained on a vast corpus.

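In conclusion, my model seems to encode semantic similarity better than Google's for words from the beauty-review domain, while Google's model does better on general-vocabulary analogies such as example 2.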

## 3. Simple Models

In [87]:
def average_of_word_vectors(review, w2v_model):
    """Return the element-wise mean of the word vectors of a review.

    Parameters
    ----------
    review : str
        Review text; tokens are obtained by splitting on single spaces.
    w2v_model : mapping-like
        Object supporting ``w2v_model[word]`` that raises ``KeyError`` for
        unknown words (e.g. gensim KeyedVectors, or a plain dict).

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all known tokens. If no token is known, a zero
        vector whose length is ``w2v_model.vector_size`` when the model exposes
        that attribute, and 300 otherwise.
    """
    review_words_vectors = []
    for word in review.split(' '):
        try:
            review_words_vectors.append(w2v_model[word])
        except KeyError:
            # Out-of-vocabulary token (this includes the '' tokens produced by
            # consecutive spaces). Only KeyError is skipped, so genuine failures
            # such as a wrong model object are no longer silently hidden.
            continue

    if review_words_vectors:
        return np.mean(review_words_vectors, axis=0)

    # No known token: fall back to a zero vector of the model's dimensionality.
    return np.zeros(getattr(w2v_model, 'vector_size', 300))

In [88]:
df['average_features'] = df['review_body'].apply(lambda review: average_of_word_vectors(review, google_news_w2v))

In [109]:
X = df['average_features'] #data
y = df['class']   #class


# Split the dataset into 80% train set and 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.to_list()
X_test  = X_test.to_list()

In [93]:
from sklearn.linear_model import Perceptron

# training Perceptron model on the average vectors
# (fixed random_state so the reported accuracy is repeatable)
perceptron_clf = Perceptron(n_jobs=-1, random_state=123)
perceptron_clf.fit(X_train, y_train)

# predicting the labels on test split
y_pred_test = perceptron_clf.predict(X_test)

print("----------Perceptron----------\n")
print(f"Accuracy score of Perceptron model: {accuracy_score(y_test, y_pred_test)}")

----------Perceptron----------

Accuracy score of perceptron model: 0.5517241379310345


In [95]:
from sklearn import svm
# training SVM model on the average vectors
svm_clf = svm.LinearSVC(random_state=27)
svm_clf.fit(X_train, y_train)

# predicting the labels on test split
y_pred_test = svm_clf.predict(X_test)

print("----------SVM----------\n")
print(f"Accuracy score of SVM model: {accuracy_score(y_test, y_pred_test)}")

----------SVM----------

Accuracy score of SVM model: 0.6557235081089592


What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained Word2Vec features)?


In [2]:
# One (features, model, accuracy) row per experiment. Accuracies are kept as the
# 4-decimal strings being reported.
# NOTE(review): the TF-IDF figures are hard-coded; they are not computed in the visible cells.
report_rows = [
    ("Google-W2V", "Perceptron", "0.5517"),
    ("TF-IDF", "Perceptron", "0.6900"),
    ("Google-W2V", "SVM", "0.6557"),
    ("TF-IDF", "SVM", "0.7134"),
]

# Transpose the rows into the column-oriented dict the report frame is built from.
features_col, model_col, accuracy_col = (list(column) for column in zip(*report_rows))
accuracies = {"Features": features_col, "Model": model_col, "Accuracy": accuracy_col}

df_report = pd.DataFrame(accuracies)
df_report

Unnamed: 0,Features,Model,Accuracy
0,Google-W2V,Perceptron,0.5517
1,TF-IDF,Perceptron,0.69
2,Google-W2V,SVM,0.6557
3,TF-IDF,SVM,0.7134


By comparing the accuracies here, we can conclude that TF-IDF features perform much better than Word2Vec features for our dataset

## 4. Feed-Forward Neural Networks

### 4A: Average Word2Vec Vectors


In [117]:
X = df['average_features'] #data
y = df['class']   #class


# Split the dataset into 80% train set and 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.to_list()
X_test  = X_test.to_list()

In [118]:
 X_train_array_fnn = np.array(X_train)
 X_test_array_fnn = np.array(X_test)
 y_train_array_fnn = np.array(y_train)
 y_test_array_fnn = np.array(y_test)

In [119]:
y_train_array_int_fnn = utils.to_categorical(y_train_array_fnn - 1 , num_classes=3)
y_test_array_int_fnn = utils.to_categorical(y_test_array_fnn - 1 , num_classes=3)

In [120]:
model_fnn = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu', input_dim=300),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])
model_fnn.summary()


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 100)               30100     
                                                                 
 dense_12 (Dense)            (None, 10)                1010      
                                                                 
 dense_13 (Dense)            (None, 3)                 33        
                                                                 
Total params: 31,143
Trainable params: 31,143
Non-trainable params: 0
_________________________________________________________________


In [121]:
model_fnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [122]:
model_fnn.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 100)               30100     
                                                                 
 dense_12 (Dense)            (None, 10)                1010      
                                                                 
 dense_13 (Dense)            (None, 3)                 33        
                                                                 
Total params: 31,143
Trainable params: 31,143
Non-trainable params: 0
_________________________________________________________________


In [123]:
# Train for 8 epochs, monitoring loss/accuracy on the held-out split after every epoch.
# Only `validation_data` is passed: Keras ignores `validation_split` whenever
# `validation_data` is given, so combining the two is misleading.
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# val_accuracy is the test accuracy.
model_fnn.fit(X_train_array_fnn, y_train_array_int_fnn, epochs=8, verbose=1,
          validation_data=(X_test_array_fnn, y_test_array_int_fnn))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7ef84cf9b3d0>

FNN Accuracy (Average vectors): 67.19% 

### 4B: Concatenating the first 10 Word2Vec Vectors

In [100]:
def length_10(review, word2vec_model):
    tokens = review.split()
    vectors = [word2vec_model.wv[t] for t in tokens if t in word2vec_model.wv.vocab]
    padded_vectors = vectors[:10] + [np.zeros(300)] * max(0, 10 - len(vectors))

    return np.concatenate(padded_vectors)

In [101]:
# Build the 3000-dimensional "first 10 words" feature vector for every review.
# NOTE(review): this passes the custom-trained `word2vec_model`, whereas the averaged
# features used in section 4A were computed with `google_news_w2v` — confirm which
# embedding is intended, since the 4A/4B accuracies are compared with each other.
X = [length_10(review,word2vec_model) for review in df['review_body']]

In [102]:
y = df['class']

In [103]:
# Split the dataset into 80% train set and 20% test set
X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
X_train_array_fnn_10 = np.array(X_train_10)
X_test_array_fnn_10  = np.array(X_test_10)
y_train_array_fnn_10  = np.array(y_train_10)
y_test_array_fnn_10  = np.array(y_test_10)

In [105]:
y_train_array_int_fnn_10 = utils.to_categorical(y_train_array_fnn_10 - 1 , num_classes=3)
y_test_array_int_fnn_10 = utils.to_categorical(y_test_array_fnn_10 - 1 , num_classes=3)

In [106]:
model_fnn_10 = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu', input_dim=3000),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])
model_fnn_10.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 100)               300100    
                                                                 
 dense_9 (Dense)             (None, 10)                1010      
                                                                 
 dense_10 (Dense)            (None, 3)                 33        
                                                                 
Total params: 301,143
Trainable params: 301,143
Non-trainable params: 0
_________________________________________________________________


In [107]:
model_fnn_10.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [108]:
# Train for 8 epochs, monitoring loss/accuracy on the held-out split after every epoch.
# Only `validation_data` is passed: Keras ignores `validation_split` whenever
# `validation_data` is given, so combining the two is misleading.
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# val_accuracy is the test accuracy.
model_fnn_10.fit(X_train_array_fnn_10, y_train_array_int_fnn_10, epochs=8, verbose=1,
          validation_data=(X_test_array_fnn_10, y_test_array_int_fnn_10))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7ef84d2787f0>

FNN Accuracy (Concatenating first 10 words): 57.72% 

What do you conclude by comparing accuracy values you obtain with
those obtained in the “’Simple Models” section.

The accuracy results of the feedforward neural network models with different input features show that the model with the input feature as the average of all word vectors outperforms or performs similarly to the perceptron and SVM models. However, the feedforward neural network model with the input feature as the concatenation of the first ten word vectors performs worse than the SVM and slightly better than Perceptron models. This indicates that using the average of all word vectors as the input feature is a better choice than concatenating the first ten word vectors. Additionally, it can be concluded that the feedforward neural network model is more robust and slightly more accurate when the average of word vectors is used as the input feature.

## 5. Recurrent Neural Networks

### 5A: Simple RNN

In [50]:
def length_20(review, word2vec_model, max_len=20, pad_index=0):
  """Encode a review as a fixed-length array of vocabulary indices.

  Parameters
  ----------
  review : str
      Review text; tokens are obtained with ``str.split()``.
  word2vec_model : object
      Object exposing ``.wv.vocab``, a mapping from token to an entry with
      an ``.index`` attribute (the gensim 3.x interface).
  max_len : int, optional
      Length of the returned sequence (default 20).
  pad_index : int, optional
      Index used to right-pad short reviews (default 0).

  Returns
  -------
  numpy.ndarray
      Indices of the first ``max_len`` in-vocabulary tokens, right-padded
      with ``pad_index``.

  NOTE(review): the default pad index 0 is also an ordinary vocabulary index,
  so padded positions cannot be told apart from the token stored at row 0 of
  the embedding matrix — confirm whether a dedicated padding row is needed.
  """
  from itertools import islice

  vocab = word2vec_model.wv.vocab
  known_tokens = (tok for tok in review.split() if tok in vocab)
  # Stop at max_len hits instead of indexing every token and truncating afterwards.
  vector = [vocab[tok].index for tok in islice(known_tokens, max_len)]
  vector.extend([pad_index] * (max_len - len(vector)))

  return np.array(vector)

In [51]:
X_rnn = [length_20(review,google_news_w2v) for review in df['review_body']]

In [46]:
y_rnn = df['class']

In [47]:
# Select the first CUDA GPU when one is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any of the visible cells — the models in
# this section are built with tf.keras — so this torch import looks unused; confirm before removing.
import torch 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [52]:
# Split the dataset into 80% train set and 20% test set
X_rnn_train, X_rnn_test, y_rnn_train, y_rnn_test = train_test_split(X_rnn , y_rnn, test_size=0.2, random_state=42)

In [53]:
 X_rnn_train_array = np.array(X_rnn_train)
 X_rnn_test_array  = np.array(X_rnn_test)
 y_rnn_train_array  = np.array(y_rnn_train)
 y_rnn_test_array  = np.array(y_rnn_test)

In [54]:
y_train_array_int_rnn = utils.to_categorical(y_rnn_train_array - 1 , num_classes=3)
y_test_array_int_rnn = utils.to_categorical(y_rnn_test_array - 1 , num_classes=3)

In [55]:
max_review_length= 20

In [56]:
# Simple RNN classifier: frozen embedding lookup initialised with the pre-trained
# Google News vectors -> SimpleRNN with 20 units -> 3-way softmax.
# The embedding is not trainable, so only the RNN and Dense weights are learned.
# NOTE(review): the input sequences are padded with index 0, which is also a regular row
# of the embedding matrix; no masking is configured here, so padded steps are fed to the
# RNN like real tokens — confirm this is intended.
model_srnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(google_news_w2v.vocab), output_dim=300, weights=[google_news_w2v.wv.vectors], input_length=max_review_length, trainable=False),
    tf.keras.layers.SimpleRNN(20), #SimpleRNN layer
    tf.keras.layers.Dense(3, activation='softmax')
])

model_srnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 300)           900000000 
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 20)                6420      
                                                                 
 dense_2 (Dense)             (None, 3)                 63        
                                                                 
Total params: 900,006,483
Trainable params: 6,483
Non-trainable params: 900,000,000
_________________________________________________________________


In [57]:
model_srnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [58]:
# Train for 8 epochs, monitoring loss/accuracy on the held-out split after every epoch.
# Only `validation_data` is passed: Keras ignores `validation_split` whenever
# `validation_data` is given, so combining the two is misleading.
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# val_accuracy is the test accuracy.
model_srnn.fit(X_rnn_train_array, y_train_array_int_rnn, epochs=8, verbose=1,
          validation_data=(X_rnn_test_array, y_test_array_int_rnn))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7efadc179eb0>

Simple RNN Accuracy (first 20 words of each review): 60.81%

### 5B: Gated Recurrent Unit 

In [60]:
model_grnn = tf.keras.Sequential([    
    tf.keras.layers.Embedding(input_dim=len(google_news_w2v.vocab), output_dim=300, weights=[google_news_w2v.wv.vectors], input_length=max_review_length, trainable=False),
    tf.keras.layers.GRU(20), #GRU layer
    tf.keras.layers.Dense(3, activation='softmax')
])

model_grnn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 300)           900000000 
                                                                 
 gru (GRU)                   (None, 20)                19320     
                                                                 
 dense_3 (Dense)             (None, 3)                 63        
                                                                 
Total params: 900,019,383
Trainable params: 19,383
Non-trainable params: 900,000,000
_________________________________________________________________


In [61]:
model_grnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [63]:
# Train for 8 epochs, monitoring loss/accuracy on the held-out split after every epoch.
# Only `validation_data` is passed: Keras ignores `validation_split` whenever
# `validation_data` is given, so combining the two is misleading.
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# val_accuracy is the test accuracy.
model_grnn.fit(X_rnn_train_array, y_train_array_int_rnn, epochs=8, verbose=1,
          validation_data=(X_rnn_test_array, y_test_array_int_rnn))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7efab635d940>

GRU RNN Accuracy: 67.30%

### 5C: LSTM

In [64]:
model_lrnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(google_news_w2v.vocab), output_dim=300, weights=[google_news_w2v.wv.vectors], input_length=max_review_length, trainable=False),
    tf.keras.layers.LSTM(20),
    tf.keras.layers.Dense(3, activation='softmax')
])

model_lrnn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 20, 300)           900000000 
                                                                 
 lstm (LSTM)                 (None, 20)                25680     
                                                                 
 dense_4 (Dense)             (None, 3)                 63        
                                                                 
Total params: 900,025,743
Trainable params: 25,743
Non-trainable params: 900,000,000
_________________________________________________________________


In [65]:
model_lrnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [66]:
# Train for 8 epochs, monitoring loss/accuracy on the held-out split after every epoch.
# Only `validation_data` is passed: Keras ignores `validation_split` whenever
# `validation_data` is given, so combining the two is misleading.
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# val_accuracy is the test accuracy.
model_lrnn.fit(X_rnn_train_array, y_train_array_int_rnn, epochs=8, verbose=1,
          validation_data=(X_rnn_test_array, y_test_array_int_rnn))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7efa646866a0>

LSTM RNN Accuracy: 66.23% 

What do you conclude by comparing accuracy values you obtain by GRU,
LSTM, and simple RNN?

Based on the accuracy results obtained for the SimpleRNN, GRU, and LSTM models, it can be inferred that the GRU model outperforms both the SimpleRNN and LSTM models. The superiority of the GRU model is attributed to its gated units, which enable it to retain long-term dependencies among words and thus improve prediction accuracy. In contrast, the RNN model lacks long-term memory and can only facilitate basic sequence prediction based on its short-term memory.