# WORD EMBEDDDING

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

reviews = ['nice food',
        'amazing restaurant',
        'too good',
        'just loved it!',
        'will go again',
        'horrible food',
        'never go there',
        'poor service',
        'poor quality',
        'needs improvement']

sentiment = np.array([1,1,1,1,1,0,0,0,0,0])

In [None]:
one_hot("amazing restaurant",30)

[2, 23]

In [None]:
vocab_size = 30
encoded_reviews = [one_hot(d, vocab_size) for d in reviews]
print(encoded_reviews)

[[17, 24], [2, 23], [24, 10], [6, 26, 28], [9, 11, 3], [12, 24], [28, 11, 26], [21, 22], [21, 19], [16, 20]]


In [None]:
max_length = 4
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
print(padded_reviews)

[[17 24  0  0]
 [ 2 23  0  0]
 [24 10  0  0]
 [ 6 26 28  0]
 [ 9 11  3  0]
 [12 24  0  0]
 [28 11 26  0]
 [21 22  0  0]
 [21 19  0  0]
 [16 20  0  0]]


In [None]:
embeded_vector_size = 5

model = Sequential()
model.add(Embedding(vocab_size, embeded_vector_size, input_length=max_length,name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [None]:
X = padded_reviews
y = sentiment

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 5)              150       
_________________________________________________________________
flatten_1 (Flatten)          (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 21        
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x1b9675e61d0>

In [None]:
# evaluate the model
loss, accuracy = model.evaluate(X, y)
accuracy



0.8999999761581421

In [None]:
weights = model.get_layer('embedding').get_weights()[0]
len(weights)

30

In [None]:
weights[13]

array([-0.00979682,  0.04166451,  0.02031307,  0.00557742,  0.0394435 ],
      dtype=float32)

In [None]:
weights[4]

array([ 0.00436239,  0.01971683,  0.01808694,  0.04933788, -0.0415208 ],
      dtype=float32)

In [None]:
weights[16]

array([-0.01950807,  0.02621825,  0.05658838, -0.09349167,  0.09969497],
      dtype=float32)

# WORD TO VECTOR

In [2]:
import gensim
import pandas as pd
import json

In [8]:

# Load JSON data into a DataFrame
df = pd.read_json("Cell_Phones_and_Accessories_10.json", lines=True)

# Display the first 2 rows
df.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3CW0ZLUO5X2B1,B006YJBITS,"35-year Technology Consumer ""8-tracks to 802.11""","[0, 0]","First, the disclosure:The vendor provided me t...",4,Good product for specialized uses...,1327449600,"01 25, 2012"
1,A1W415JP5WEAJK,B006YJBITS,Alex S,"[5, 6]",This wonderful little charger (about the size ...,5,Emergency backup ...,1332979200,"03 29, 2012"


In [9]:
df.shape

(1854, 9)

# Simple Preprocessing & Tokenization
The first thing to do for any data science task is to clean the data. For NLP, we apply various processing like converting all the words to lower case, trimming spaces, removing punctuations. This is something we will do over here too.

Additionally, we can also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms like 'running' to 'run'.

In [11]:
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text

0       [first, the, disclosure, the, vendor, provided...
1       [this, wonderful, little, charger, about, the,...
2       [these, are, an, absolutely, great, item, to, ...
3       [this, incredicharge, works, very, well, and, ...
4       [this, battery, charges, three, devices, at, t...
                              ...                        
1849    [this, is, more, of, convenience, than, tool, ...
1850    [if, your, cell, phone, battery, seems, to, be...
1851    [useful, emergency, power, backup, for, your, ...
1852    [lifecharge, juicypack, mah, portable, charger...
1853    [received, this, at, no, cost, in, exchange, f...
Name: reviewText, Length: 1854, dtype: object

In [12]:
review_text.loc[0]

['first',
 'the',
 'disclosure',
 'the',
 'vendor',
 'provided',
 'me',
 'this',
 'product',
 'free',
 'of',
 'charge',
 'now',
 'the',
 'review',
 'this',
 'is',
 'good',
 'product',
 'for',
 'user',
 'with',
 'specialized',
 'needs',
 'which',
 'ones',
 'if',
 'you',
 'use',
 'portable',
 'gaming',
 'device',
 'or',
 'often',
 'view',
 'entertainment',
 'content',
 'on',
 'your',
 'tablet',
 'or',
 'smart',
 'phone',
 'then',
 'you',
 'probably',
 'have',
 'greater',
 'reliance',
 'on',
 'proximity',
 'to',
 'household',
 'ac',
 'or',
 'automotive',
 'dc',
 'outlet',
 'than',
 'users',
 'involved',
 'in',
 'less',
 'power',
 'intensive',
 'activities',
 'the',
 'incredicharge',
 'offers',
 'additional',
 'operation',
 'time',
 'or',
 'charging',
 'capability',
 'when',
 'you',
 'are',
 'will',
 'not',
 'have',
 'ready',
 'access',
 'to',
 'household',
 'or',
 'automotive',
 'outlet',
 'out',
 'of',
 'the',
 'box',
 'mine',
 'took',
 'about',
 'an',
 'hour',
 'to',
 'reach',
 'its',
 

In [13]:
df.reviewText.loc[0]

"First, the disclosure:The vendor provided me this product free of charge.Now the review:This is good product for a user with specialized needs.Which ones?If you use a portable gaming device or often view entertainment content on your tablet or smart phone, then you probably have greater reliance on proximity to a household AC or automotive DC outlet than users involved in less power-intensive activities.The Incredicharge offers additional operation time (or a charging capability) when you are will not have ready access to a household or automotive outlet.Out of the box, mine took about an hour to reach its charged state (from an approximately 25% charge according to the indicating LEDs. Charging took place from a laptop USB port (via an included adapter).Once charged, power is delivered to supported devices by one of two avaialable USB ports that connect via six different charging adapters. It charged two nearly depleted phones (one with an extended life battery) in about the same tim

# Training the Word2Vec Model
Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead. A sentence with at least 2 words should only be considered, configure this using min_count parameter.

Workers define how many CPU threads to be used.

Initialize the model

In [14]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

Build the model

In [15]:
model.build_vocab(review_text, progress_per=1000)

Train the model

In [16]:
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(1717299, 2344395)

SAVE THE MODEL FOR FUTURE USE

In [17]:
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

FINDING THE SIMILAR WORDS

In [18]:
model.wv.most_similar("bad")

[('deal', 0.9185417294502258),
 ('waste', 0.9115172028541565),
 ('care', 0.9045664668083191),
 ('feeling', 0.9039129614830017),
 ('forever', 0.9034271836280823),
 ('though', 0.9011083841323853),
 ('however', 0.901063084602356),
 ('crazy', 0.8988466858863831),
 ('said', 0.8968404531478882),
 ('wouldn', 0.896065354347229)]

In [19]:
model.wv.similarity(w1="cheap", w2="inexpensive")

0.7710698

In [20]:
model.wv.similarity(w1="great", w2="good")

0.73020387