## Text Classification on Amazon Shoe reviews Dataset with Word2Vec Word Embeddings in Gensim and training using LSTM In Keras.

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### IMPORTING THE MODULES

In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# sets matplotlib to inline and displays graphs below the corressponding cell.
% matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk
!pip install nltk
nltk.download('punkt')
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
import requests
from bs4 import BeautifulSoup
# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer


import re # regex

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix



 
#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras import optimizers

from gensim.models import Word2Vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### LOADING THE DATASET

In [None]:
# Read the tab-separated review dump. Rows with an unexpected number of fields
# are skipped with a warning instead of aborting the load.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
reviews = pd.read_csv(
    '/content/drive/MyDrive/amazon_reviews_us_Shoes_v1_00.tsv',
    sep='\t',
    on_bad_lines='warn',
)


b'Skipping line 54101: expected 15 fields, saw 22\nSkipping line 55857: expected 15 fields, saw 22\nSkipping line 60448: expected 15 fields, saw 22\n'
b'Skipping line 76918: expected 15 fields, saw 22\nSkipping line 87925: expected 15 fields, saw 22\nSkipping line 88500: expected 15 fields, saw 22\nSkipping line 114276: expected 15 fields, saw 22\nSkipping line 128751: expected 15 fields, saw 22\n'
b'Skipping line 136095: expected 15 fields, saw 22\nSkipping line 140007: expected 15 fields, saw 22\nSkipping line 177148: expected 15 fields, saw 22\nSkipping line 180087: expected 15 fields, saw 22\nSkipping line 183010: expected 15 fields, saw 22\nSkipping line 183949: expected 15 fields, saw 22\nSkipping line 192879: expected 15 fields, saw 22\n'
b'Skipping line 223261: expected 15 fields, saw 22\nSkipping line 240588: expected 15 fields, saw 22\nSkipping line 247955: expected 15 fields, saw 22\nSkipping line 249336: expected 15 fields, saw 22\nSkipping line 259363: expected 15 fields, 

In [None]:
# Column names, dtypes and memory footprint of the raw frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4358820 entries, 0 to 4358819
Data columns (total 15 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   marketplace        object
 1   customer_id        int64 
 2   review_id          object
 3   product_id         object
 4   product_parent     int64 
 5   product_title      object
 6   product_category   object
 7   star_rating        int64 
 8   helpful_votes      int64 
 9   total_votes        int64 
 10  vine               object
 11  verified_purchase  object
 12  review_headline    object
 13  review_body        object
 14  review_date        object
dtypes: int64(5), object(10)
memory usage: 498.8+ MB


In [None]:
# Peek at the first five raw rows.
reviews.head(5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,18069663,R3P2HIOQCIN5ZU,B000XB31C0,265024781,Minnetonka Men's Double Deerskin Softsole Mocc...,Shoes,1,0,0,N,Y,.,Do not buy: really didn't start to wear them u...,2015-08-31
1,US,16251825,R12VVR0WH5Q24V,B00CFYZH5W,259035853,Teva Men's Pajaro Flip-Flop,Shoes,5,0,0,N,Y,super flip flop,provides great cushion as well as archsupport,2015-08-31
2,US,20381037,RNCCKB6TV5EEF,B00S8JNN3Q,666066660,Anne Klein Perfect Pair Wristlet,Shoes,4,0,0,N,Y,Great clutch purse!,It's perfect if you need something small for c...,2015-08-31
3,US,108364,R2NZXYIVCGB13W,B00XFBPOQG,448483263,adidas Men's 10K Lifestyle Runner Sneaker,Shoes,5,0,6,N,Y,Badass,Getting what u see,2015-08-31
4,US,45449350,R2EQ1TG9IT3OEQ,B00SW64Y9W,7853171,OverBling Sneakers for Men Casual Men Shoes Ge...,Shoes,3,0,0,N,Y,Three Stars,small,2015-08-31


#### A brief description of the dataset

Data includes:
- 4358820 reviews

### DATA CLEANING AND PRE-PROCESSING

#### The task is sentiment analysis, so only `review_body` and `star_rating` are kept as the working dataset.

In [None]:
# Drop the rows that have no review text.
# Assigning `reviews['review_body'].dropna()` back to the column does nothing:
# pandas re-aligns the shorter Series on the index and the NaNs reappear.
# Filtering the rows of the frame is what actually removes them.
reviews = reviews.dropna(subset=['review_body'])

In [None]:
# Keep only the review text and the rating. `.copy()` makes `data` an
# independent frame, so adding columns to it later is unambiguous and does not
# trigger pandas' SettingWithCopyWarning.
data = reviews[['review_body', 'star_rating']].copy()


In [None]:
# First rows of the two-column working frame.
data.head()

Unnamed: 0,review_body,star_rating
0,Do not buy: really didn't start to wear them u...,1
1,provides great cushion as well as archsupport,5
2,It's perfect if you need something small for c...,4
3,Getting what u see,5
4,small,3


In [None]:
def sentiment(i):
    """Map a star rating to a binary label.

    Returns 0 (negative) when the rating is 2 or lower and 1 (positive)
    for every other rating.
    """
    return 0 if i <= 2 else 1
    
# Derive the binary label from the star rating (rating <= 2 -> 0, otherwise 1).
data['sentiment'] = data['star_rating'].apply(sentiment)

In [None]:
# Check that the new `sentiment` column was added as expected.
data.head(2)

Unnamed: 0,review_body,star_rating,sentiment
0,Do not buy: really didn't start to wear them u...,1,0
1,provides great cushion as well as archsupport,5,1


In [None]:
# Class balance of the derived label.
data['sentiment'].value_counts()

1    3884692
0     474128
Name: sentiment, dtype: int64

In [None]:
# Number of rows that repeat an earlier row in every column.
data.duplicated().sum()

335789

In [None]:
# Collapse rows that are identical in every column, keeping one instance of each.
# A new frame is returned; `data` itself is left untouched.
df = data.drop_duplicates()

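#### Note that there is no point in keeping several rows for the same review, so only one instance of each duplicate is kept and the rest are dropped. As called above, `drop_duplicates()` compares all columns, so it removes rows that are identical in review text, star rating and sentiment; the same review text with a different rating is still kept (pass `subset=['review_body']` to collapse those as well).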

In [None]:
# Cast every review to `str` so the text helpers used later always receive a string
# (any non-string entry, e.g. a missing value, is replaced by its string representation).
df['review_body']=df['review_body'].apply(str)


In [None]:
# Check the shape: the row count is lower than before, which shows that duplicate rows were present.
print(df.shape)
df.head()

(4023031, 3)


Unnamed: 0,review_body,star_rating,sentiment
0,Do not buy: really didn't start to wear them u...,1,0
1,provides great cushion as well as archsupport,5,1
2,It's perfect if you need something small for c...,4,1
3,Getting what u see,5,1
4,small,3,1


#### Pre-processing steps :

1 ) First **removing punctuation and html tags** if any. note that the html tags may be present.

2) **Tokenize** the reviews into tokens or words .

3) Next **remove the stop words and shorter words** as they cause noise.

4) **Stem or lemmatize** the words, depending on which works better. Here I have used a lemmatizer.

In [None]:
# function to clean and pre-process the text.
_CLEANING_RESOURCES = {}


def _cleaning_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use only."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _CLEANING_RESOURCES['stop_words'] = frozenset(stopwords.words("english"))
    return _CLEANING_RESOURCES['lemmatizer'], _CLEANING_RESOURCES['stop_words']


def clean_reviews(review):
    """Normalise one piece of review text.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, drop English stop words and lemmatize the
    remaining tokens.

    Parameters
    ----------
    review : str
        Raw text (a whole review or a single sentence).

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    # The lemmatizer and the stop-word list do not depend on the input, so they
    # are created once and reused; this function is called for every sentence.
    lemmatizer, stop_words = _cleaning_resources()

    # 1. Removing html tags
    review_text = BeautifulSoup(review,"lxml").get_text()

    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)

    # 3. Converting to lower case and splitting
    word_tokens = review_text.lower().split()

    # 4. Remove stopwords and lemmatize what is left
    word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words]

    return " ".join(word_tokens)

#### Since the dataset is very large, preprocessing and training on all of it was a tedious task; hence, to proceed, a sample of 50k reviews (in roughly the same proportion of negative and positive reviews as the original data) is considered.

In [None]:
# Take the first 45,000 positive and the first 5,000 negative reviews (a 9:1 split).
pos_df = df[df['sentiment'] == 1].iloc[:45000]
neg_df = df[df['sentiment'] == 0].iloc[:5000]

In [None]:
# First rows of the positive sample.
pos_df.head()

Unnamed: 0,review_body,star_rating,sentiment
1,provides great cushion as well as archsupport,5,1
2,It's perfect if you need something small for c...,4,1
3,Getting what u see,5,1
4,small,3,1
5,My 13 year old son loved these shoes excellent...,5,1


In [None]:
# First rows of the negative sample.
neg_df.head()

Unnamed: 0,review_body,star_rating,sentiment
0,Do not buy: really didn't start to wear them u...,1,0
14,It's okay .,2,0
19,Tissue paper is this thicker than this. It ri...,1,0
30,This pair of Sketchers is now the second pair ...,2,0
33,Not what I expected would not order again,1,0


#### We can now combine reviews of each sentiment and shuffle them so that their order doesn't make any sense.

In [None]:
# Stack the positive and negative samples into one frame with a fresh 0..n-1 index.
# Note: this rebinds `df`, which until now held the full de-duplicated data.
df=pd.concat([pos_df,neg_df],ignore_index=True)

In [None]:
# Shape and first rows of the combined sample (positives come first at this point).
print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,review_body,star_rating,sentiment
0,provides great cushion as well as archsupport,5,1
1,It's perfect if you need something small for c...,4,1
2,Getting what u see,5,1
3,small,3,1
4,My 13 year old son loved these shoes excellent...,5,1


In [None]:
# Shuffle the rows so that positive and negative reviews are interleaved.
# A fixed random_state makes the shuffle, and everything computed from it, reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)  # shuffling only reorders rows, so the shape is unchanged
df.head()


(50000, 3)


Unnamed: 0,review_body,star_rating,sentiment
0,I was not able to fit them in my shoes as the ...,1,0
1,Comfy and light,5,1
2,Perfect size with phone inside. Very stylish!,4,1
3,Perfect fit and they are comfy enough to wear ...,5,1
4,Very cute and stylish sneakers for a 2 year ol...,4,1


### CREATING GOOGLE WORD2VEC WORD EMBEDDINGS IN GENSIM

Word embeddings of the sample are generated using Gensim. 

####  Word2Vec in Gensim. 
This process involves tokenizing the words and then converting each word into a vector as follows.

In [None]:

# The other NLTK resources were already downloaded in the import cell; only
# 'omw-1.4' is new here (newer NLTK releases need it for the WordNet lemmatizer).
nltk.download('omw-1.4')

# Split every review into sentences, clean each sentence and keep it as a list of tokens.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
n_raw_sentences = 0  # sentences produced by the tokenizer; named so the built-in sum() is not shadowed
for review_body in df['review_body']:
  sents = tokenizer.tokenize(review_body.strip())
  n_raw_sentences += len(sents)
  # One token list per sentence (possibly empty when nothing survives cleaning).
  sentences.extend(clean_reviews(sent).split() for sent in sents)  # can use word_tokenize also.
print(n_raw_sentences)
print(len(sentences))  # total no of sentences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


134802
134802


#### The first 5 sentences after tokenization

In [None]:
# Show the first five cleaned, tokenised sentences, each followed by a blank line.
for tokens in sentences[:5]:
  print(tokens, "\n")

['able', 'fit', 'shoe', 'heel', 'part', 'wide'] 

['comfy', 'light'] 

['perfect', 'size', 'phone', 'inside'] 

['stylish'] 

['perfect', 'fit', 'comfy', 'enough', 'wear', 'day'] 



#### Creation of the Word2Vec embeddings.

In [None]:
# Build a Word2Vec model over the tokenised sentences: 100-dimensional vectors,
# a context window of 10 words, and every word kept (min_count=1).
# NOTE(review): gensim appears to train the model already when `sentences` is
# passed to the constructor — confirm against the gensim documentation.
# NOTE(review): `size` is the gensim 3.x argument name (`vector_size` in gensim 4) — confirm the installed version.
import gensim
w2v_model=gensim.models.Word2Vec(sentences=sentences,size=100,window=10,min_count=1)

#### Parameters: -

sentences :  The sentences we have obtained.

size : The dimensions of the vector used to represent each word.

window : The number of words around any word to see the context.

min_count : The minimum number of times a word should appear for its embedding to be formed or learnt.


In [None]:
# Train the embeddings for 10 epochs over the same sentences; the value returned by train() is shown below.
w2v_model.train(sentences,epochs=10,total_examples=len(sentences))

(6102144, 7360400)

In [None]:
# Look up the learned embedding of one particular word.
w2v_model.wv.get_vector('like')

array([ 0.07367925,  0.7264668 ,  2.5144527 ,  3.411206  , -0.57569915,
       -0.34555066, -0.2521098 ,  0.52374077,  0.68020594, -1.1104329 ,
        0.16678157,  0.1622212 , -0.94836414,  0.26322594, -0.5710929 ,
       -1.1914994 ,  0.6835419 , -0.3816949 , -0.7597397 , -1.4954362 ,
        1.6410891 , -1.9118441 , -0.57412183, -0.32931828, -1.2157779 ,
        0.83956826,  0.53381723,  1.4196945 ,  2.0628216 ,  0.25225857,
       -0.9118603 ,  0.17952287,  0.1686097 ,  1.2065403 , -0.42273128,
        0.90328974, -0.03724175, -0.45577332,  1.1973754 , -0.5416251 ,
        0.60922384,  1.7128277 , -0.9014182 , -0.90069443,  0.13453196,
       -0.40893528, -0.79901254, -0.310973  ,  2.2714424 , -0.8470457 ,
        0.65688616, -0.11749965, -2.1804264 ,  0.15849583,  0.10332197,
       -2.246705  , -1.0243639 ,  1.6441951 ,  0.2823556 , -0.38164017,
       -0.5126825 , -0.2880718 ,  0.65396625,  1.4210078 ,  0.42719245,
        0.12415409, -0.11529443, -1.0243325 , -2.609697  , -0.94

In [None]:
# Total number of words in the learned vocabulary.
vocab=w2v_model.wv.vocab
print("The total number of words are : ",len(vocab))

The total number of words are :  17642


In [None]:
# Words whose embeddings are most similar to that of a given word, with their similarity scores.
w2v_model.wv.most_similar('great')

[('good', 0.8103077411651611),
 ('fantastic', 0.726043701171875),
 ('amazing', 0.7166142463684082),
 ('excellent', 0.7101442813873291),
 ('awesome', 0.6996778845787048),
 ('nice', 0.6668009757995605),
 ('wonderful', 0.6541703939437866),
 ('perfect', 0.5843102931976318),
 ('decent', 0.5571184158325195),
 ('fabulous', 0.5174053907394409)]

In [None]:
# Similarity between the embeddings of two words.
w2v_model.wv.similarity('good','like')

0.4165073

#### Now creating a dictionary with words in vocab and their embeddings. This will be used when we will be creating embedding matrix (for feeding to keras embedding layer).

In [None]:
# Number of entries in the vocabulary mapping obtained above.
print("The no of words :",len(vocab))

The no of words : 17642


In [None]:
# List of the vocabulary words. It is built from the model rather than from the
# previous value of `vocab`, so re-running this cell still works (a list has no .keys()).
vocab = list(w2v_model.wv.vocab.keys())

In [None]:
# Map every vocabulary word to its learned vector.
word_vec_dict = {word: w2v_model.wv.get_vector(word) for word in vocab}
print("The no of key-value pairs : ",len(word_vec_dict)) # should come equal to vocab size
  

The no of key-value pairs :  17642


In [None]:
# Sanity check: print the vectors stored for the first five vocabulary words.
for word in vocab[:5]:
  print(word_vec_dict[word])

[ 1.8681308  -1.5907913   0.7092128  -2.0597997   0.17542903  2.6452513
 -0.13219646  0.38573468  1.5134158  -0.5503074   0.90431553  1.6712047
 -1.0414796   1.8485112  -0.1069838   0.19475853  0.92763615 -1.7791939
  1.5459243   0.7807732  -1.0991529   0.19378278  1.2601229   0.60766494
  0.9202211   0.87655133  2.020083    0.5514928   0.12454478  1.6737947
  1.3164634  -0.33275178  0.7862966  -0.39058927  0.9576454   0.3757094
 -0.30306423 -2.220082    0.06182353 -0.49734315  0.01309261  1.0931726
 -0.17895778 -1.0237908   1.5208414   1.7499568   0.6011901  -1.3033823
  0.29578832  0.36418635  2.679266   -1.840971    0.86608136 -1.9595351
  0.6005827  -0.9309796  -0.4863807   0.39657393 -0.64415056  0.9428907
 -0.00879765  0.39329818 -1.2173469   0.92669064  0.1219023  -0.55398625
  0.5210502  -1.4682353   0.57874274  1.2332221  -0.7556639  -1.6617166
  0.3195188   0.17394839 -0.53246033  1.7535421  -0.81922716  1.32934
  2.6050766   0.21209076 -0.41728774  1.5244435   1.5414956  -0.

### PREPARING THE DATA FOR KERAS EMBEDDING LAYER.

Now we have obtained the w2v embeddings. But there are a couple of steps required by Keras embedding layer before we can move on.

**Also note that since w2v embeddings have been made now ; we can preprocess our review column by using the function that we saw above.**

In [None]:
# Apply the cleaning function to each whole review and store the result in a new column.
df['clean_review']=df['review_body'].apply(clean_reviews)

#### We need to find the maximum length of any document (a review, in our case). We will pad all reviews to this same length, as required by the Keras embedding layer. See [this](https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer) kernel on Kaggle for a detailed explanation of the Keras embedding layer.

In [None]:
# Length, in tokens, of the longest cleaned review; all reviews are padded to
# this length later on. Falls back to -1 when there are no reviews.
maxi = max((len(rev.split()) for rev in df['clean_review']), default=-1)
print(maxi)

864


#### Now we integer encode the words in the reviews using Keras tokenizer. 

**Note that there are two important variables: `vocab_size`, the total number of unique words (plus one), and `max_rev_len`, the length of every document after padding. Both of these are required by the Keras embedding layer.**

In [None]:
# Fit a Keras tokenizer on the cleaned reviews and integer-encode every review.
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
# One more than the number of distinct words found by the tokenizer.
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(df['clean_review'])

In [None]:
max_rev_len = maxi  # length of the longest cleaned review, computed above (not hard-coded)
vocab_size = len(tok.word_index) + 1  # total no of words
embed_dim = 100  # embedding dimension; must equal the `size` given to the Word2Vec constructor

In [None]:
# Pad every encoded review at the end ('post') so that all have length max_rev_len.
pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape   # (number of reviews, max_rev_len)

(50000, 864)

### CREATING THE EMBEDDING MATRIX

In [None]:
# Row i of the matrix holds the Word2Vec vector of the word that the Keras
# tokenizer mapped to index i. Rows for words without a Word2Vec vector are
# never written and therefore stay all-zero.
embed_matrix = np.zeros((vocab_size, embed_dim))
for word, idx in tok.word_index.items():
  if word in word_vec_dict:
    embed_matrix[idx] = word_vec_dict[word]

In [None]:
# Spot-check: print the embedding row stored at index 14.
print(embed_matrix[14])

[-1.02270544  1.68903255  0.96197712  0.22039273  0.54488081 -1.28520119
  0.30548918 -0.18557262  0.59502357  0.52351195  2.59328961  0.72548217
  0.74023163  1.9755162   0.72038507  0.5469622  -0.52381659  2.08125043
  0.49530736  0.00963028  0.16873631  0.21225256 -0.23121022  2.3845005
  0.31284866  0.66081506  2.37831259  0.12089077 -1.02842796 -0.02299441
 -0.19593173 -2.5768683   1.35332429  0.71290058  1.59513903 -1.23864543
 -2.68241358 -2.01168776 -1.33537412  0.18032724  2.70774913  1.0714767
  0.15928094  1.2433852  -2.01404238 -0.31898686  1.84889877  0.62318939
  1.68551302  0.94095796 -1.94330335 -0.25611097 -0.36068547  0.73992944
  0.77674252  1.40886128  0.93105477 -0.06325792  0.98345608  2.25248981
  0.37534621  0.105055    0.14191245  0.12529299  0.16005531  0.46031779
  1.24156034 -1.87979949  2.74796367 -1.27429664  0.11646506  2.5239327
 -1.9948411   1.97571027 -0.59736043 -0.91857111 -0.87826413 -1.06080234
  0.62121588 -1.13254499 -1.61669827  0.14996479  1.09

### PREPARING TRAIN AND VALIDATION SETS.

In [None]:
# Hold out 10% of the padded reviews for evaluation; random_state fixes the split.
# The 0/1 `sentiment` column is used directly as the target (no one-hot encoding).
x_train,x_test,y_train,y_test=train_test_split(pad_rev,df['sentiment'],test_size=0.10,random_state=42)

### Modelling RNN model 

In [None]:
import tensorflow as tf
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout
from tensorflow.keras import optimizers

layers=64     # number of units in each LSTM layer
dropout=0.5   # dropout rate applied after each LSTM layer

model = tf.keras.models.Sequential()
# Initialise the embedding layer from the Word2Vec matrix built above, so the
# pretrained vectors are actually used. The sizes come from vocab_size,
# embed_dim and max_rev_len rather than hard-coded numbers, which keeps the
# layer consistent with the tokenizer, the embedding matrix and the padding.
model.add(tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embed_matrix),
    input_length=max_rev_len,
))
# Two stacked LSTM layers; the first returns the full sequence so the second can consume it.
model.add(tf.keras.layers.LSTM(layers,return_sequences=True))
model.add(tf.keras.layers.Dropout(dropout))
model.add(tf.keras.layers.LSTM(layers))
model.add(tf.keras.layers.Dropout(dropout))
# A single sigmoid unit: the output is the probability of the positive class.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 864, 128)          2258304   
                                                                 
 lstm_7 (LSTM)               (None, 864, 64)           49408     
                                                                 
 dropout_6 (Dropout)         (None, 864, 64)           0         
                                                                 
 lstm_8 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,340,801
Trainable params: 2,340,801
No

#### Let us now print a summary of the model.

In [None]:
# Compile the model with an explicitly constructed Adam optimizer, binary cross-entropy loss and accuracy metric.
# NOTE(review): the model-definition cell already compiles the model with 'adam' and the same loss/metric, so this call looks redundant.
import tensorflow as tf
model.compile(optimizer=tf.keras.optimizers.Adam(),loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Number of passes over the training data and number of samples per gradient update.
epochs=2
batch_size=32

In [None]:

# Fit the model; the held-out split is passed as validation data.
# The returned History object is kept in `history`.
history=model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test))

Epoch 1/2
Epoch 2/2


In [None]:
# Model output for each held-out review: the value of the single sigmoid unit, i.e. one column per row.
y_pred= model.predict(x_test)


In [None]:
# Convert the sigmoid probabilities into 0/1 labels by thresholding at 0.5.
# np.argmax(..., axis=1) is wrong here: the model has a single output column,
# so argmax over that axis is always 0 and every review is labelled negative.
y_pred = (np.asarray(y_pred) > 0.5).astype(int).ravel()

In [None]:
# Fraction of held-out reviews whose predicted label equals the true label.
# Arguments follow scikit-learn's (y_true, y_pred) order.
import sklearn
sklearn.metrics.accuracy_score(y_test, y_pred)

0.0968

In [None]:
# Confusion matrix with true labels on the rows and predicted labels on the columns.
# scikit-learn expects (y_true, y_pred); swapping them transposes the matrix.
sklearn.metrics.confusion_matrix(y_test, y_pred)

array([[ 484, 4516],
       [   0,    0]])

In [None]:
# Per-class precision, recall and F1 on the held-out set (true labels first, predictions second).
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.10      1.00      0.18       484
           1       0.00      0.00      0.00      4516

    accuracy                           0.10      5000
   macro avg       0.05      0.50      0.09      5000
weighted avg       0.01      0.10      0.02      5000





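The final accuracy after 2 epochs was expected to be about 90%, which would be pretty decent. Note, however, that the evaluation cells above report an accuracy of 0.0968, with every held-out review predicted as class 0, so the evaluation should be re-checked before relying on this figure.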

### Challenges faced

1) The dataset provided has around 4.36 million rows and 15 columns, so preprocessing the data took quite a long time.

2) After preprocessing I saw that it was an imbalanced dataset.
 

3) Due to the unavailability of high-performance computers, I was only able to perform the modelling on a much smaller sample of 50,000 reviews (roughly 1 in 80 of the de-duplicated data).

4) Due to the time constraint, I couldn't complete proper hyperparameter tuning to improve the performance of the LSTM model developed.

5) Although the validation accuracy of the model developed is on higher side, still quite a good amount of underlying study of models can be done to build an ideal model.

 