In [1]:
import pandas as pd
import gensim

### Reading and Exploring the dataset
The dataset we are using here is a subset of Amazon reviews from the Sports & Outdoors category. The data is stored as a gzip-compressed, line-delimited JSON file (one review per line) and can be read directly with pandas.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz


In [3]:
# lines=True parses the file as one JSON record per line; pandas infers
# gzip decompression from the ".gz" suffix, so no manual unzip is needed.
df = pd.read_json(
    'reviews_Sports_and_Outdoors_5.json.gz',
    lines=True,
)

In [4]:
# Peek at the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [5]:
# (rows, columns): one row per review.
df.shape

(296337, 9)

### Simple Preprocessing & Tokenization
The first step in any data science task is to clean the data. For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation. We will do the same here.

Optionally, we could also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and reduce words to their root forms (e.g. 'running' to 'run'); this notebook skips both of those steps.

In [6]:
# The free-text review body is the only column used to train the embeddings.
df['reviewText']

0         This came in on time and I am veru happy with ...
1         I had a factory Glock tool that I was using fo...
2         If you don't have a 3/32 punch or would like t...
3         This works no better than any 3/32 punch you w...
4         I purchased this thinking maybe I need a speci...
                                ...                        
296332    This is a water bottle done right. It is a ver...
296333    If you're looking for an insulated water bottl...
296334    This Hydracentials Sporty 25 OZ, double insula...
296335    As usual I received this item free in exchange...
296336    Hydracentials insulated 25 oz water bottle.Thi...
Name: reviewText, Length: 296337, dtype: object

In [7]:
# Show the first review in full (the Series repr above truncates long strings).
df['reviewText'].iloc[0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

In [8]:
# simple_preprocess lower-cases each review, splits it into alphabetic tokens
# and, with its default settings, discards tokens shorter than 2 or longer
# than 15 characters. Stop words are NOT removed and no stemming is applied.
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [9]:
# Configure the model only: no corpus is passed here, so the vocabulary is
# built and the weights are trained in the cells that follow.
# NOTE(review): with workers > 1 gensim's training is not fully deterministic,
# so similarity scores will differ slightly from run to run.
model = gensim.models.Word2Vec(
    window=10,    # max distance between the target word and a context word
    min_count=2,  # ignore words whose total frequency is below this
    workers=4,    # number of training threads
)

In [10]:
# One pass over the tokenised reviews to collect the vocabulary before any
# training happens; progress_per controls how often gensim logs scan progress.
model.build_vocab(
    review_text,
    progress_per=1000,
)

In [11]:
# Number of training passes over the corpus. The constructor above did not
# set it, so this is the library default.
model.epochs

5

In [12]:
# Number of documents (reviews) counted while building the vocabulary;
# passed to train() below as total_examples.
model.corpus_count

296337

In [13]:
# Train the embeddings. The call returns a pair of word counts:
# (words actually trained on, raw words in the corpus), totalled over all epochs.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91337429, 121496535)

In [14]:
# Persist the trained model in the current working directory so it can be
# reloaded later (gensim.models.Word2Vec.load) instead of being retrained.
model.save("./word2vec-amazon-sports-outdoor-reviews-short.model")

In [15]:
# Ten nearest neighbours of "bad" by cosine similarity. Word2Vec groups words
# that occur in similar contexts, so antonyms can rank highly alongside synonyms.
model.wv.most_similar("bad", topn=10)

[('terrible', 0.6722434163093567),
 ('shabby', 0.6529035568237305),
 ('horrible', 0.6356949806213379),
 ('funny', 0.5628359913825989),
 ('greatest', 0.5309087038040161),
 ('strange', 0.5172533988952637),
 ('stupid', 0.5110173225402832),
 ('crappy', 0.505344808101654),
 ('good', 0.492776095867157),
 ('darn', 0.48854827880859375)]

In [16]:
# Cosine similarity between two near-synonyms; a clearly positive score is expected.
model.wv.similarity('cheap', 'inexpensive')

0.5147049

In [17]:
# Contrast case: these words often appear together but are not interchangeable,
# so the score should be low (presumably near zero or negative).
model.wv.similarity('great', 'product')

-0.11933756