In [1]:
# %pip install gensim  # uncomment to install into the running kernel's environment

In [2]:
# %pip install python-levenshtein  # uncomment to install into the running kernel's environment

In [3]:
import gensim
import pandas as pd

# The file is in JSON Lines format: lines=True makes pandas parse every line
# as its own JSON object, i.e. one review per row.
df = pd.read_json(path_or_buf=r"Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [4]:
# Raw text of the first review (index label 0).
df["reviewText"][0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [5]:
# Tokenize one review to see what simple_preprocess produces: a list of
# lower-cased tokens with punctuation and one-letter tokens dropped.
first_review = df["reviewText"][0]
gensim.utils.simple_preprocess(first_review)

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

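<h2 style="color:darkblue;">Simple Preprocessing</h2>
<p style="color: blue;">Preprocessing is an important first step in any NLP problem. Text preprocessing typically involves removing stop words (to, the, a, an, I, etc.), removing punctuation marks, trimming spaces and converting words to lower case. Note that <code>gensim.utils.simple_preprocess</code>, used here, lower-cases the text and drops punctuation and single-character tokens, but it does not remove stop words (e.g. 'the' and 'and' are still present in the output above).</p>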

In [6]:
# (number of reviews, number of columns) in the loaded frame.
df.shape

(194439, 9)

In [7]:
# Tokenize every review; the result is a Series holding one list of tokens
# per review, which is the corpus used to train Word2Vec below.
reviewText = df["reviewText"].apply(gensim.utils.simple_preprocess)
reviewText

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [8]:
# Token list of the first review; same result as calling simple_preprocess on it directly.
reviewText[0]

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

<h2 style="color:darkblue;">Training the Word2Vec model</h2>

In [9]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and the model is trained explicitly in the next cells.
model = gensim.models.Word2Vec(
    window=10, # max distance between the current word and the surrounding context words
    min_count=2, # ignore words whose total frequency in the whole corpus is lower than 2
    workers=4, # no. of worker threads used to train the model
)

In [10]:
# Scan the tokenized reviews once to build the model's vocabulary;
# progress_per controls how often (in sentences) progress is logged.
model.build_vocab(reviewText,progress_per=1000)

In [12]:
# Number of training passes the model will make; no value was passed to the
# constructor, so this is gensim's default.
model.epochs

5

In [13]:
# Number of documents seen by build_vocab; should equal the number of reviews in df.
model.corpus_count

194439

In [14]:
# Train on the same corpus the vocabulary was built from. total_examples is the
# number of documents per pass and epochs the number of passes over the corpus.
# The returned tuple is (words effectively trained on, raw words in the corpus across all epochs).
model.train(reviewText,total_examples=model.corpus_count,epochs=model.epochs)

(61512622, 83868975)

In [15]:
# Persist the trained model next to the notebook so it can be reloaded without retraining.
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

<h2 style="color:darkblue;">Finding similar words and similarity between words</h2>

In [16]:
# Ten nearest neighbours of 'bad' in the learned vector space, as (word, cosine similarity) pairs.
model.wv.most_similar('bad', topn=10)

[('terrible', 0.6558495163917542),
 ('shabby', 0.6464138627052307),
 ('good', 0.6000820994377136),
 ('horrible', 0.5973426103591919),
 ('awful', 0.5644568800926208),
 ('legit', 0.5478417873382568),
 ('mad', 0.5300626158714294),
 ('disappointing', 0.5297746658325195),
 ('okay', 0.5265395045280457),
 ('crappy', 0.5143276453018188)]

In [18]:
# Cosine similarity between the vectors of the two words.
model.wv.similarity('cheap', 'inexpensive')

0.5315012

In [20]:
# Cosine similarity between the vectors of the two words.
model.wv.similarity('good', 'great')

0.76794505

<h3>Further Reading</h3>
<p>You can read more about gensim's Word2Vec at https://radimrehurek.com/gensim/models/word2vec.html</p>

<p>Explore other Datasets related to Amazon Reviews: http://jmcauley.ucsd.edu/data/amazon/</p>

<h1>Exercise</h1>

In [23]:
# Exercise dataset: Sports & Outdoors reviews, again one JSON object per line.
data = pd.read_json(path_or_buf=r"Sports_and_Outdoors_5.json", lines=True)
data

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"
...,...,...,...,...,...,...,...,...,...
296332,A2XX2A4OJCDNLZ,B00LFPS0CY,RatherLiveInKeyWest,"[2, 3]",This is a water bottle done right. It is a ver...,5,Hydracentials Sporty 25 Oz Stainless Steel Wat...,1405036800,"07 11, 2014"
296333,A3LGT6UZL99IW1,B00LFPS0CY,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",If you're looking for an insulated water bottl...,5,"Large, incredibly well made water bottle!",1405641600,"07 18, 2014"
296334,ASKZO80Z1RKTR,B00LFPS0CY,Robin Lee,"[0, 0]","This Hydracentials Sporty 25 OZ, double insula...",5,"""Great Water Bottle For Hot Day""......",1405900800,"07 21, 2014"
296335,APRNS6DB68LLV,B00LFPS0CY,"Rob Slaven ""slavenrm@gmail. com""","[1, 1]",As usual I received this item free in exchange...,5,A pretty impressive water bottle. Best I've s...,1405900800,"07 21, 2014"


In [25]:
# Raw text of the first Sports & Outdoors review (index label 0).
data["reviewText"][0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

In [26]:
# Tokenize a single sports review as a sanity check before processing the whole column.
first_sports_review = data["reviewText"][0]
gensim.utils.simple_preprocess(first_sports_review)

['this',
 'came',
 'in',
 'on',
 'time',
 'and',
 'am',
 'veru',
 'happy',
 'with',
 'it',
 'haved',
 'used',
 'it',
 'already',
 'and',
 'it',
 'makes',
 'taking',
 'out',
 'the',
 'pins',
 'in',
 'my',
 'glock',
 'very',
 'easy']

In [28]:
# Tokenize every Sports & Outdoors review: one list of tokens per review.
# This Series (not reviewText) is the corpus for the exercise model.
reviewText_1 = data["reviewText"].apply(gensim.utils.simple_preprocess)
reviewText_1

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [29]:
# Fresh, untrained Word2Vec for the exercise, with the same hyper-parameters as
# the first model. Rebinding `model` drops the in-memory cell-phone model,
# which was already saved to disk above.
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [30]:
model.build_vocab(reviewText,progress_per=100)

In [31]:
model.corpus_count

194439

In [32]:
model.epochs

5

In [33]:
model.train(reviewText,total_examples=model.corpus_count,epochs=model.epochs)

(61503612, 83868975)

In [34]:
# Persist the exercise model next to the notebook.
model.save('./Sports_and_Outdoors_5.model')

In [35]:
# Ten nearest neighbours of 'awful' as (word, cosine similarity) pairs.
model.wv.most_similar('awful', topn=10)

[('horrible', 0.8219799995422363),
 ('terrible', 0.8087835311889648),
 ('crappy', 0.60399329662323),
 ('poor', 0.5983189344406128),
 ('atrocious', 0.571763813495636),
 ('amazing', 0.5637309551239014),
 ('pathetic', 0.549805760383606),
 ('mediocre', 0.5488786697387695),
 ('bad', 0.5483220815658569),
 ('horrendous', 0.5478130578994751)]

In [36]:
# Cosine similarity between the vectors of the two words.
model.wv.similarity('good', 'great')

0.7827706

In [37]:
# Cosine similarity between the vectors of the two words.
model.wv.similarity('slow', 'steady')

0.15797198