In [None]:
# Optional one-time environment setup. The lines are left commented out so that
# re-running the notebook does not reinstall packages; uncomment only if needed.
# NOTE(review): the scipy pin presumably works around a gensim/scipy
# incompatibility — confirm before relying on it.
# !pip install scipy==1.10.1
# !pip install python-Levenshtein
# import scipy

In [1]:
import os
from pathlib import Path

import gensim
import numpy as np
import pandas as pd

### Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: 
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [2]:
# Resolve the dataset location from the environment so the notebook is not tied
# to a single machine. Set AMAZON_REVIEWS_DIR to the folder that contains the
# extracted JSON file; the original author's folder is kept as the fallback so
# existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("AMAZON_REVIEWS_DIR", "C:/Users/ys726/Desktop/DL/Data"))
DATA_FILE = DATA_DIR / "Cell_Phones_and_Accessories_5.json"

# Fail early with an actionable message rather than a confusing parser error.
if not DATA_FILE.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {DATA_FILE}. Download and extract it, then set "
        "the AMAZON_REVIEWS_DIR environment variable to its folder."
    )

# The file is in JSON Lines format (one review object per line), hence lines=True.
df = pd.read_json(DATA_FILE, lines=True)

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(194439, 9)

In [4]:
# Preview the first rows to see which columns are available; the model below
# only uses `reviewText`.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [5]:
# Show the full, untruncated text of the first review (index label 0).
df["reviewText"][0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

### Simple Preprocessing & Tokenization


The first step in most data science tasks is to clean the data. For NLP, typical preprocessing includes converting all words to lower case, trimming whitespace, and removing punctuation. We apply these steps here as well.

We could additionally remove stop words such as 'and', 'or', 'is', 'the', 'a', 'an' and reduce words to their root forms (for example, 'running' to 'run'). These two steps are not applied in this notebook.

In [6]:
# Demonstrate the tokenizer on a single raw review before applying it to the
# whole corpus: the result is a list of lower-cased tokens without punctuation.
sample_review = "They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"
sample_tokens = gensim.utils.simple_preprocess(sample_review)
print(sample_tokens)

['they', 'look', 'good', 'and', 'stick', 'good', 'just', 'don', 'like', 'the', 'rounded', 'shape', 'because', 'was', 'always', 'bumping', 'it', 'and', 'siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating', 'just', 'won', 'buy', 'product', 'like', 'this', 'again']


In [7]:
# Tokenize every review; each element of `review_text` becomes a list of tokens.
tokenize = gensim.utils.simple_preprocess
review_text = df["reviewText"].map(tokenize)

# Number of tokenized reviews (should equal the number of rows in `df`).
review_text.shape[0]

194439

In [8]:
# Peek at the first five tokenized reviews.
review_text.head(5)

0    [they, look, good, and, stick, good, just, don...
1    [these, stickers, work, like, the, review, say...
2    [these, are, awesome, and, make, my, phone, lo...
3    [item, arrived, in, great, time, and, was, in,...
4    [awesome, stays, on, and, looks, great, can, b...
Name: reviewText, dtype: object

In [9]:
# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
# No corpus is passed here, so the model is created untrained; the vocabulary
# and the training are handled explicitly in the following cells.
w2v_params = {
    "window": 10,     # context window size
    "min_count": 2,   # minimum word frequency threshold
    "workers": 4,     # number of worker threads
}
model = gensim.models.Word2Vec(**w2v_params)

In [10]:
# Scan the tokenized reviews once to build the model's vocabulary before
# training; progress_per sets how often progress is reported.
model.build_vocab(review_text, progress_per=1000)

In [11]:
# Train on the tokenized reviews, reusing the corpus size recorded by
# build_vocab and the model's configured number of epochs.
# NOTE(review): the returned pair is presumably (trained word count,
# raw word count) — confirm against the gensim docs.
model.train(
    review_text,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

(61505488, 83868975)

In [12]:
# Persist the trained model to the working directory so it can be reloaded
# later without retraining.
MODEL_PATH = './word2vec-amazon-cell-accessories-reviews-short.model'
model.save(MODEL_PATH)

In [13]:
# Ten nearest neighbours of "bad" in the learned embedding space.
# NOTE(review): "good" ranking high here is presumably because antonyms occur
# in similar contexts — worth mentioning when interpreting the list.
model.wv.most_similar(positive=["bad"], topn=10)

[('shabby', 0.6801645755767822),
 ('terrible', 0.6614121794700623),
 ('good', 0.5853710174560547),
 ('horrible', 0.5821133852005005),
 ('pathetic', 0.5253623127937317),
 ('poor', 0.5218164920806885),
 ('funny', 0.5077080726623535),
 ('disappointing', 0.5044824481010437),
 ('crappy', 0.503677487373352),
 ('cheap', 0.5028417706489563)]

In [14]:
# Similarity score between two near-synonyms.
model.wv.similarity("cheap", "inexpensive")

0.54111195

In [15]:
# Similarity score between two positive-sentiment adjectives.
model.wv.similarity("great", "good")

0.79101133