### "Amazon-Alexa" text classification using custom trained Word Embeddings ( w/ gensim )

#### https://www.kaggle.com/datasets/sid321axn/amazon-alexa-reviews

### import the required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

### Load the input data ( "amazon alexa reviews data")

In [None]:
# Read the tab-separated Alexa reviews file into a DataFrame
df_amazon = pd.read_csv(filepath_or_buffer="amazon_alexa.tsv", delimiter="\t")

In [None]:
# Work on a small sample: the first five rows of the loaded data
df_amazon_samp = df_amazon.iloc[:5]

In [None]:
# Keep only the review text and its label. The explicit .copy() makes this an
# independent DataFrame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning (the frame is otherwise derived from a slice).
df_amazon_mod = df_amazon_samp[['verified_reviews','feedback']].copy()
df_amazon_mod

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


### Create the tokens directly

In [None]:
# Print every raw review text, one per line
for review_text in df_amazon_mod['verified_reviews']:
    print(review_text)

Love my Echo!
Loved it!
Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.
I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.
Music


### Cleaning the Data 1 : Removing Punctuation, and Numerics

In [None]:
import re
# Matches every run of characters that is not an ASCII letter.
_NON_LETTERS = re.compile('[^A-Za-z]+')


def clean(string):
    """Replace each run of non-letter characters with a single space.

    Digits, punctuation and whitespace all count as non-letters, so
    ``"My 4 yr old"`` becomes ``"My yr old"``.

    Parameters
    ----------
    string : object
        The value to clean. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) yield an empty string rather
        than the literal words "None" / "nan", which would otherwise end up
        as tokens.

    Returns
    -------
    str
        The text with only ASCII letters and single-space separators. Leading
        or trailing non-letters leave a leading or trailing space.
    """
    # NaN is the only float that is not equal to itself.
    if string is None or (isinstance(string, float) and string != string):
        return ''
    return _NON_LETTERS.sub(' ', str(string))

# Store the letters-only version of every review in its own column, then list it
df_amazon_mod['reviews_non_numeric'] = [clean(review) for review in df_amazon_mod['verified_reviews']]
for cleaned_review in df_amazon_mod['reviews_non_numeric']:
    print(cleaned_review)

Love my Echo 
Loved it 
Sometimes while playing a game you can answer a question correctly but Alexa says you got it wrong and answers the same as you I like being able to turn lights on and off while away from home 
I have had a lot of fun with this thing My yr old learns about dinosaurs i control the lights and play games like categories Has nice sound when playing music as well 
Music


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_mod['reviews_non_numeric'] = df_amazon_mod['verified_reviews'].map(clean)


### Cleaning the Data 2 : Keeping words with length > 2

In [None]:
# Drop every word shorter than three characters and re-join the rest with spaces
df_amazon_mod['reviews_len_trim'] = df_amazon_mod['reviews_non_numeric'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) >= 3)
)
for trimmed_review in df_amazon_mod['reviews_len_trim']:
    print(trimmed_review)

Love Echo
Loved
Sometimes while playing game you can answer question correctly but Alexa says you got wrong and answers the same you like being able turn lights and off while away from home
have had lot fun with this thing old learns about dinosaurs control the lights and play games like categories Has nice sound when playing music well
Music


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_mod['reviews_len_trim'] = df_amazon_mod['reviews_non_numeric'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))


### Cleaning the Data 3 : Upper to Lower case conversion and Tokenization

In [None]:
# Lowercase each review and split it on whitespace into a list of tokens
df_amazon_mod['reviews_Tokenized'] = df_amazon_mod['reviews_len_trim'].str.lower().str.split()
for token_list in df_amazon_mod['reviews_Tokenized']:
    print(token_list)

['love', 'echo']
['loved']
['sometimes', 'while', 'playing', 'game', 'you', 'can', 'answer', 'question', 'correctly', 'but', 'alexa', 'says', 'you', 'got', 'wrong', 'and', 'answers', 'the', 'same', 'you', 'like', 'being', 'able', 'turn', 'lights', 'and', 'off', 'while', 'away', 'from', 'home']
['have', 'had', 'lot', 'fun', 'with', 'this', 'thing', 'old', 'learns', 'about', 'dinosaurs', 'control', 'the', 'lights', 'and', 'play', 'games', 'like', 'categories', 'has', 'nice', 'sound', 'when', 'playing', 'music', 'well']
['music']


In [None]:
# Display the tokenized column as a Series (one list of lowercase tokens per review)
df_amazon_mod['reviews_Tokenized']

0                                         [love, echo]
1                                              [loved]
2    [sometimes, while, playing, game, you, can, an...
3    [have, had, lot, fun, with, this, thing, old, ...
4                                              [music]
Name: reviews_Tokenized, dtype: object

### Cleaning the Data 4 : Removal of Stopwords (NLTK based)

In [None]:
import nltk
nltk.download('stopwords')  # fetch the NLTK stopword corpus
from nltk.corpus import stopwords
# A set gives constant-time membership tests while filtering
stop_words=set(stopwords.words('english'))

# Filter each token list directly. The tokens are already split, so there is no
# need to join them into a string and split that string again.
df_amazon_mod['reviews_Tokenized_stop'] = df_amazon_mod['reviews_Tokenized'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
for sent in df_amazon_mod.reviews_Tokenized_stop:
    print(sent)

['love', 'echo']
['loved']
['sometimes', 'playing', 'game', 'answer', 'question', 'correctly', 'alexa', 'says', 'got', 'wrong', 'answers', 'like', 'able', 'turn', 'lights', 'away', 'home']
['lot', 'fun', 'thing', 'old', 'learns', 'dinosaurs', 'control', 'lights', 'play', 'games', 'like', 'categories', 'nice', 'sound', 'playing', 'music', 'well']
['music']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Display the working frame with every intermediate cleaning column added so far
df_amazon_mod

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]"
1,Loved it!,1,Loved it,Loved,[loved],[loved]
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont..."
4,Music,1,Music,Music,[music],[music]


In [None]:
# Train custom word embeddings on the stopword-filtered token lists
gensim_custom_model = Word2Vec(
    sentences=df_amazon_mod['reviews_Tokenized_stop'],
    vector_size=50,  # dimensionality of each word vector
    window=5,        # context window for words during training
    min_count=1,     # keep every word, even those that occur only once
    workers=1,       # number of worker threads
    epochs=30,       # number of training passes over the corpus
)
print(gensim_custom_model)

# Vocabulary learned by the model
words = gensim_custom_model.wv.index_to_key

# Persist the trained model, then load it back to confirm the round trip.
# The file is written with gensim's own save(), so it must be read with Word2Vec.load().
gensim_custom_model.save('gensim_embed_model.bin')
new_model = Word2Vec.load('gensim_embed_model.bin')
print(new_model)

Word2Vec<vocab=34, vector_size=50, alpha=0.025>
Word2Vec<vocab=34, vector_size=50, alpha=0.025>


In [None]:
# Look up the embedding of one word and show its length followed by its values
playing_vector = new_model.wv['playing']
print(len(playing_vector))
print(playing_vector)

50
[ 0.0156784  -0.0190993  -0.00044369  0.0069885  -0.00200676  0.01649684
  0.01824158  0.01323869 -0.00155643  0.01526161 -0.01696927  0.00618912
 -0.00920567 -0.01018283  0.00708348  0.0109418   0.01584405 -0.01129287
  0.01467431  0.01292428 -0.00736534 -0.01732778  0.01128697  0.01308822
 -0.00155207 -0.01325175 -0.01420046 -0.00483971  0.01019067 -0.00739097
 -0.0187839   0.00748899  0.00988669 -0.01288648  0.00225701 -0.00418658
  0.00026838 -0.02001667  0.00522902 -0.00942097  0.00234981 -0.00311385
  0.00438211 -0.01578781 -0.00508244  0.00558605  0.01073515 -0.00498928
 -0.01893339  0.00919462]


In [None]:
# Ten vocabulary words whose vectors are closest to 'playing', with their similarity scores
new_model.wv.most_similar('playing', topn=10)

[('old', 0.22794558107852936),
 ('games', 0.2042125016450882),
 ('answer', 0.19420550763607025),
 ('nice', 0.1669164001941681),
 ('control', 0.1307593137025833),
 ('learns', 0.06198083981871605),
 ('thing', 0.05265916883945465),
 ('fun', 0.05002260208129883),
 ('got', 0.047969937324523926),
 ('music', -0.007605755235999823)]

### Aggregated sentence vector for each sentence based on the word vectors

In [None]:
# Set of all words known to the loaded model, used for membership tests
words = set(new_model.wv.key_to_index.keys())

In [None]:
# Build one matrix per review holding the vectors of its in-vocabulary tokens
# (one row per token). A plain list is assigned instead of
# np.array(..., dtype=object) because NumPy stacks equally-shaped matrices into
# a single 3-D array, which cannot be stored as a column when every review
# happens to contain the same number of tokens.
df_amazon_mod['reviews_vect'] = [
    np.array([new_model.wv[token] for token in tokens if token in words])
    for tokens in df_amazon_mod['reviews_Tokenized_stop']
]

In [None]:
# Preview the frame, now including the per-review token-vector matrices
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop,reviews_vect
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]","[[-0.010473806, -0.014793425, 0.0015434895, 0...."
1,Loved it!,1,Loved it,Loved,[loved],[loved],"[[-0.017356802, -0.002893958, 0.018958932, -0...."
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c...","[[0.016329026, -0.008903075, 0.017943563, 0.01..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont...","[[-0.010635383, -0.0025674026, -0.014737026, 0..."
4,Music,1,Music,Music,[music],[music],"[[-0.0010714707, 0.0004607762, 0.010204348, 0...."


In [None]:
# Read the embedding dimensionality from the loaded model instead of repeating
# the training hyperparameter, so the zero-vector fallback can never drift out
# of sync with the model.
vector_size_n_w2v = new_model.wv.vector_size

# Sentence vector = mean of the review's token vectors. A review with no
# in-vocabulary tokens gets an all-zero vector of the same length.
text_vect_avg = [
    v.mean(axis=0) if v.size else np.zeros(vector_size_n_w2v, dtype=float)
    for v in df_amazon_mod['reviews_vect']
]

df_amazon_mod['reviews_vect_avg'] = text_vect_avg
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop,reviews_vect,reviews_vect_avg
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]","[[-0.010473806, -0.014793425, 0.0015434895, 0....","[-0.01482069, 0.00154335, 0.0049357023, 0.0127..."
1,Loved it!,1,Loved it,Loved,[loved],[loved],"[[-0.017356802, -0.002893958, 0.018958932, -0....","[-0.017356802, -0.002893958, 0.018958932, -0.0..."
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c...","[[0.016329026, -0.008903075, 0.017943563, 0.01...","[-0.00045847645, -0.0017047827, -0.0029925797,..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont...","[[-0.010635383, -0.0025674026, -0.014737026, 0...","[-0.00034754357, 0.0009433863, 0.003989959, 0...."
4,Music,1,Music,Music,[music],[music],"[[-0.0010714707, 0.0004607762, 0.010204348, 0....","[-0.0010714707, 0.0004607762, 0.010204348, 0.0..."


In [None]:
# Inspect the token-vector matrix stored for the review at index label 0
df_amazon_mod['reviews_vect'][0]

array([[-0.01047381, -0.01479342,  0.00154349,  0.00692605,  0.00415308,
         0.00617851, -0.01122894, -0.01977135, -0.01405053,  0.00044408,
         0.00924498,  0.00903766,  0.00375922,  0.0103482 , -0.0002284 ,
         0.00824733, -0.01823576,  0.01541012,  0.01228415,  0.01022369,
         0.01442174,  0.01688822,  0.00150832, -0.0034021 ,  0.00103306,
        -0.01862676,  0.01681271, -0.01275744,  0.0168468 , -0.00848969,
         0.00129533, -0.01833198, -0.01910515, -0.01567747, -0.0154668 ,
         0.00074795, -0.01444187, -0.00991971, -0.01056027, -0.00858074,
         0.01403418,  0.00965582,  0.01736757,  0.01418634, -0.01137109,
         0.01449615, -0.01858839, -0.00518595, -0.01550732,  0.00839872],
       [-0.01916757,  0.01788012,  0.00832792,  0.01848166,  0.01328081,
         0.00583667,  0.01960989, -0.00884255, -0.01362813,  0.00845348,
         0.00745707, -0.01134817,  0.01940375, -0.00711265,  0.01909865,
         0.001689  , -0.01266289, -0.00393822, -0.

In [None]:
# Sanity check of the averaging: recompute component 0 of the first review's
# sentence vector directly from its token vectors. Computing it from the data
# (rather than typing in numbers copied from an earlier run) keeps the check
# valid after the model is retrained.
first_review_vectors = df_amazon_mod['reviews_vect'].iloc[0]
first_review_vectors[:, 0].sum() / len(first_review_vectors)

-0.01482069

In [None]:
# Same sanity check for component 1 of the first review's sentence vector,
# computed from the stored token vectors instead of hand-copied numbers.
first_review_vectors = df_amazon_mod['reviews_vect'].iloc[0]
first_review_vectors[:, 1].sum() / len(first_review_vectors)

0.0015433499999999998

In [None]:
# Feature matrix: one row per review, one column per embedding dimension
df_Machine_Learning = pd.DataFrame(data=text_vect_avg)
df_Machine_Learning

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.014821,0.001543,0.004936,0.012704,0.008717,0.006008,0.00419,-0.014307,-0.013839,0.004449,...,0.004621,0.008455,0.008577,0.005895,-0.006737,0.005583,-0.008687,0.001568,-0.012002,0.00037
1,-0.017357,-0.002894,0.018959,-0.015099,-0.010716,0.018633,-0.017947,0.007652,0.001331,0.013321,...,0.004116,-0.008007,-0.016483,0.012556,-0.003898,-0.001332,-0.003543,-0.009071,0.008123,-0.00854
2,-0.000458,-0.001705,-0.002993,6e-05,-0.002196,-0.003602,0.003324,0.00175,-0.004716,-0.005003,...,0.002352,0.000673,-0.001607,-0.00029,0.008668,0.002922,8.6e-05,-0.003052,0.001597,-0.000946
3,-0.000348,0.000943,0.00399,0.001106,0.000606,-0.003924,0.001465,0.004346,0.001724,0.001815,...,0.004069,-0.001282,-0.001508,0.001134,0.000646,-0.000351,0.002489,-0.001007,0.000842,0.006857
4,-0.001071,0.000461,0.010204,0.018039,-0.018623,-0.014289,0.012961,0.017959,-0.010069,-0.007561,...,-0.019176,0.010014,-0.017516,-0.008797,-3.3e-05,-0.000544,-0.015306,0.019199,0.009976,0.018518


In [None]:
# Give the feature columns readable 1-based names: Col_1, Col_2, ...
n_features = df_Machine_Learning.shape[1]
df_Machine_Learning.columns = [f'Col_{position}' for position in range(1, n_features + 1)]
df_Machine_Learning

Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,...,Col_41,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50
0,-0.014821,0.001543,0.004936,0.012704,0.008717,0.006008,0.00419,-0.014307,-0.013839,0.004449,...,0.004621,0.008455,0.008577,0.005895,-0.006737,0.005583,-0.008687,0.001568,-0.012002,0.00037
1,-0.017357,-0.002894,0.018959,-0.015099,-0.010716,0.018633,-0.017947,0.007652,0.001331,0.013321,...,0.004116,-0.008007,-0.016483,0.012556,-0.003898,-0.001332,-0.003543,-0.009071,0.008123,-0.00854
2,-0.000458,-0.001705,-0.002993,6e-05,-0.002196,-0.003602,0.003324,0.00175,-0.004716,-0.005003,...,0.002352,0.000673,-0.001607,-0.00029,0.008668,0.002922,8.6e-05,-0.003052,0.001597,-0.000946
3,-0.000348,0.000943,0.00399,0.001106,0.000606,-0.003924,0.001465,0.004346,0.001724,0.001815,...,0.004069,-0.001282,-0.001508,0.001134,0.000646,-0.000351,0.002489,-0.001007,0.000842,0.006857
4,-0.001071,0.000461,0.010204,0.018039,-0.018623,-0.014289,0.012961,0.017959,-0.010069,-0.007561,...,-0.019176,0.010014,-0.017516,-0.008797,-3.3e-05,-0.000544,-0.015306,0.019199,0.009976,0.018518


In [None]:
# Place the original text and label next to the embedding features, aligned on the row index
text_and_label = df_amazon_mod[['verified_reviews', 'feedback']]
final_df = pd.concat([text_and_label, df_Machine_Learning], axis='columns', sort=False)
final_df

Unnamed: 0,verified_reviews,feedback,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,...,Col_41,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50
0,Love my Echo!,1,-0.014821,0.001543,0.004936,0.012704,0.008717,0.006008,0.00419,-0.014307,...,0.004621,0.008455,0.008577,0.005895,-0.006737,0.005583,-0.008687,0.001568,-0.012002,0.00037
1,Loved it!,1,-0.017357,-0.002894,0.018959,-0.015099,-0.010716,0.018633,-0.017947,0.007652,...,0.004116,-0.008007,-0.016483,0.012556,-0.003898,-0.001332,-0.003543,-0.009071,0.008123,-0.00854
2,"Sometimes while playing a game, you can answer...",1,-0.000458,-0.001705,-0.002993,6e-05,-0.002196,-0.003602,0.003324,0.00175,...,0.002352,0.000673,-0.001607,-0.00029,0.008668,0.002922,8.6e-05,-0.003052,0.001597,-0.000946
3,I have had a lot of fun with this thing. My 4 ...,1,-0.000348,0.000943,0.00399,0.001106,0.000606,-0.003924,0.001465,0.004346,...,0.004069,-0.001282,-0.001508,0.001134,0.000646,-0.000351,0.002489,-0.001007,0.000842,0.006857
4,Music,1,-0.001071,0.000461,0.010204,0.018039,-0.018623,-0.014289,0.012961,0.017959,...,-0.019176,0.010014,-0.017516,-0.008797,-3.3e-05,-0.000544,-0.015306,0.019199,0.009976,0.018518
