In [1]:
pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
     ---------------------------------------- 0.0/67.1 kB ? eta -:--:--
     ------------------ --------------------- 30.7/67.1 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 67.1/67.1 kB 1.2 MB/s eta 0:00:00
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for simpful from https://files.pythonhosted.org/packages/8d/93/8448d3f1aa9d2911b8cba2602aaa1af85eb31a26d28b7b737f1fa5b40c02/simpful-2.11.1-py3-none-any.whl.metadata
  Downloading simpful-2.11.1-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting miniful (from fst-pso->pyfume->FuzzyTM>=

In [1]:
import os

import gensim
import pandas as pd

## Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON Lines file (one review object per line) and can be read with pandas by passing `lines=True`.
- From Codebasics.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [3]:
# Location of the reviews file (JSON Lines: one review object per line).
# Set the AMAZON_REVIEWS_PATH environment variable to run this notebook on
# another machine; when it is unset the original local path is used.
DATA_PATH = os.environ.get(
    'AMAZON_REVIEWS_PATH',
    'C:\\Users\\User\\Desktop\\Datasets\\Amazon Reviews\\Cell_Phones_and_Accessories_5.json',
)

# lines=True tells pandas to parse each line of the file as a separate record.
df = pd.read_json(DATA_PATH, lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [4]:
# (number of rows, number of columns) of the loaded reviews frame.
df.shape

(194439, 9)

In [5]:
# Peek at the raw text of the review stored under row label 0.
df.loc[0, 'reviewText']

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [7]:
# simple_preprocess lowercases the text, splits it into alphabetic tokens (so
# punctuation is dropped) and, with its default settings, discards tokens
# shorter than 2 or longer than 15 characters. That is why "I" disappears and
# "don't" becomes "don". It does NOT remove stop words: "they", "and", "the"
# and "it" are all kept in the result.

gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again")

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [9]:
# Tokenize every review with simple_preprocess; the result is a Series holding
# one list of tokens per review.
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [10]:
# Create the Word2Vec model without a corpus: the vocabulary is built and the
# model is trained explicitly in the following cells.
model = gensim.models.Word2Vec(
    window = 10,                # maximum distance between the target word and a context word within a sentence
    min_count = 2,              # ignore words whose total count across the whole corpus is below 2 (this filters rare words, not short sentences)
    workers = 4                 # number of worker threads used for training
)

#### Build vocabulary

In [11]:
# Scan the tokenized reviews once to build the model's vocabulary.
# NOTE(review): progress_per controls how often progress is reported during the
# scan. The report presumably goes through Python logging (this cell printed
# nothing), and whether the unit is words or sentences should be confirmed
# against the installed gensim version.

model.build_vocab(review_text, progress_per=1000)

In [12]:
# Number of training epochs; it was not set in the constructor, so this is the library default.
model.epochs

5

In [13]:
# Number of documents counted by build_vocab; it equals the number of reviews in df.
model.corpus_count

194439

#### Train the model

In [14]:
# Train on the tokenized reviews, reusing the document count gathered by
# build_vocab and the model's own epoch setting. The call returns a pair of
# word counts (per the gensim docs: effective words trained, raw words seen).
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61509179, 83868975)

#### Save the model

In [16]:
# Persist the trained model to a file in the current working directory.
# NOTE(review): the file name spells "amozon"; it is left unchanged here in case
# other code loads the model by this exact path.
model.save('./word2vec-amozon-cell-accessories-reviews-short.model')

In [17]:
# Sanity-check the embeddings: list the words whose vectors are most similar to 'bad'.
# Antonyms such as 'good' can rank highly because the vectors reflect shared
# contexts ("a ___ product"), not meaning.

model.wv.most_similar('bad')

[('terrible', 0.6624701619148254),
 ('shabby', 0.6371552348136902),
 ('horrible', 0.6134973168373108),
 ('good', 0.5950566530227661),
 ('awful', 0.570065438747406),
 ('okay', 0.5504992604255676),
 ('crappy', 0.5308010578155518),
 ('poor', 0.5252400636672974),
 ('funny', 0.5236604809761047),
 ('ok', 0.5187947154045105)]

In [18]:
# Cosine similarity between the vectors of two near-synonyms.

model.wv.similarity('cheap', 'inexpensive')

0.54357314

In [19]:
# Cosine similarity between two positive adjectives that appear in similar contexts.
model.wv.similarity('great', 'good')

0.7796236

In [20]:
# Cosine similarity between an adjective and a noun; a value near zero means the vectors are essentially unrelated.
model.wv.similarity('great', 'product')

-0.046373796