In [12]:
import logging
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import numpy as np
from tqdm import tqdm
import yaml

tqdm.pandas()

# Load Data

In [2]:
# Notebook-wide configuration: the directory holding the evaluation splits and
# the product category whose training split is loaded below.
DATA_PATH = 'data/evaluation'
CATEGORY = 'Grocery_and_Gourmet_Food'

# `asin` is read explicitly as a string. Some ASINs in this data are made up
# only of digits (e.g. 9742356831) while others are alphanumeric (e.g.
# B001ELL67A); leaving the dtype to inference risks all-digit values being
# parsed as integers. The ASINs are later used as Doc2Vec tags, where -- per the
# gensim documentation -- integer tags are interpreted as positional indices
# rather than as labels.
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv", dtype={"asin": str})

In [3]:
# Display the first rows of the training frame as a quick look at its columns.
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...


# Generate Training Corpus

In [4]:
def _tokenize(text):
    """Turn one pre-processed review into a list of tokens.

    Parameters
    ----------
    text : str | list | Any
        A whitespace-delimited review string, an already tokenised review, or a
        missing value.

    Returns
    -------
    list
        ``text.split()`` for strings; ``text`` itself when it is already a list
        (so re-running this cell is harmless); an empty list for anything else
        (e.g. NaN), which has no ``.split``.
    """
    if isinstance(text, list):
        return text
    if isinstance(text, str):
        return text.split()
    return []


# The column is overwritten in place, so the helper above must tolerate being
# applied to its own output. `progress_apply` is provided by `tqdm.pandas()`.
train["processedReviewText"] = train["processedReviewText"].progress_apply(_tokenize)

100%|█████████████████████████████████████████████████████| 47774/47774 [00:00<00:00, 140669.82it/s]


In [5]:
# Build one TaggedDocument per review row. The tag is the row's ASIN, so every
# review of the same item carries the same tag.
train_corpus = [
    TaggedDocument(words=tokens, tags=[item_id])
    for item_id, tokens in zip(train["asin"], train["processedReviewText"])
]

In [6]:
# Display the first five tagged documents to check the words/tags layout.
train_corpus[:5]

[TaggedDocument(words=['curry', 'paste', 'delicious', 'curry', 'fry', 'chicken', 'vegetable', 'add', 'coconut', 'milk', 'delicious', 'dish', 'spicy', 'look', 'mild', 'look'], tags=['9742356831']),
 TaggedDocument(words=['purchase', 'different', 'curry', 'grocery', 'store', 'completely', 'happy', 'flavor', 'read', 'recipe', 'recommend', 'brand', 'use', 'recommend', 'specific', 'brand', 'great', 'flavor', 'purchase', 'green', 'red', 'yellow', 'curry', 'paste', 'enjoy', 'type'], tags=['9742356831']),
 TaggedDocument(words=['start', 'new', 'diet', 'restrict', 'added', 'sugar', 'brand', 'sugar', 'add', 'ingredient', 'taste', 'great', 'boot', 'sure', 'finish', 'tub', 'long'], tags=['9742356831']),
 TaggedDocument(words=['flavor', 'begin', 'tell', 'love', 'mae', 'ploy', 'curry', 'ask', 'recipe', 'basically', 'plastic', 'lidded', 'container', 'size', 'want', 'half', 'yogurts', 'bag', 'inside', 'pull', 'inside', 'clear', 'bag', 'curry', 'mash', 'cut', 'corner', 'little', 'time', 'spice', 'mixed

# Training D2V Model

In [7]:
# Read the "generate_vectors" section of params.yaml. The file is opened in a
# `with` block so the handle is closed as soon as parsing finishes instead of
# being left open for the lifetime of the kernel.
with open("params.yaml") as params_file:
    params = yaml.safe_load(params_file)["generate_vectors"]

# Keyword arguments that are passed straight to the Doc2Vec constructor.
MODEL_PARAMS = params["d2v_params"]

# Root logger at DEBUG with timestamped messages, so the library's progress
# messages show up in the cell output while the model is built and trained.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s')

In [8]:
# Create the (still untrained) Doc2Vec model from the configured parameters,
# then build its vocabulary from the tagged training corpus.
model = Doc2Vec(**MODEL_PARAMS)
model.build_vocab(corpus_iterable=train_corpus)

2021-08-21 16:57:57,232 starting a new internal lifecycle event log for Doc2Vec
2021-08-21 16:57:57,235 Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc5,s1e-05,t8)', 'datetime': '2021-08-21T16:57:57.211637', 'gensim': '4.0.1', 'python': '3.9.6 (default, Jun 29 2021, 05:25:02) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'created'}
2021-08-21 16:57:57,236 collecting all words and their counts
2021-08-21 16:57:57,236 PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-08-21 16:57:57,315 PROGRESS: at example #10000, processed 364241 words (4674196/s), 14898 word types, 904 tags
2021-08-21 16:57:57,396 PROGRESS: at example #20000, processed 744636 words (4737820/s), 21785 word types, 1940 tags
2021-08-21 16:57:57,485 PROGRESS: at example #30000, processed 1142910 words (4522375/s), 27143 word types, 2955 tags
2021-08-21 16:57:57,577 PROGRESS: at example #40000, processed 1580118 words (4809888/s), 31844 wo

In [9]:
# Train on the tagged corpus, reusing the example count recorded on the model
# by the vocabulary build and the epoch count the model was configured with.
model.train(
    corpus_iterable=train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2021-08-21 16:57:57,881 Doc2Vec lifecycle event {'msg': 'training model with 8 workers on 10527 vocabulary and 50 features, using sg=0 hs=0 sample=1e-05 negative=5 window=5', 'datetime': '2021-08-21T16:57:57.881613', 'gensim': '4.0.1', 'python': '3.9.6 (default, Jun 29 2021, 05:25:02) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'train'}
2021-08-21 16:57:58,889 EPOCH 1 - PROGRESS: at 31.35% examples, 140990 words/s, in_qsize 16, out_qsize 0
2021-08-21 16:57:59,891 EPOCH 1 - PROGRESS: at 64.68% examples, 145754 words/s, in_qsize 15, out_qsize 0
2021-08-21 16:58:00,751 job loop exiting, total 196 jobs
2021-08-21 16:58:00,886 worker exiting, processed 23 jobs
2021-08-21 16:58:00,887 worker thread finished; awaiting finish of 7 more threads
2021-08-21 16:58:00,898 worker exiting, processed 23 jobs
2021-08-21 16:58:00,899 EPOCH 1 - PROGRESS: at 97.71% examples, 151629 words/s, in_qsize 6, out_qsize 1
2021-08-21 16:58:00,900 worker thread finish

2021-08-21 16:58:13,273 worker exiting, processed 24 jobs
2021-08-21 16:58:13,274 worker thread finished; awaiting finish of 0 more threads
2021-08-21 16:58:13,275 EPOCH - 5 : training on 1942617 raw words (469404 effective words) took 3.1s, 149828 effective words/s
2021-08-21 16:58:14,282 EPOCH 6 - PROGRESS: at 31.85% examples, 143240 words/s, in_qsize 15, out_qsize 0
2021-08-21 16:58:15,291 EPOCH 6 - PROGRESS: at 64.68% examples, 145239 words/s, in_qsize 15, out_qsize 0
2021-08-21 16:58:16,139 job loop exiting, total 196 jobs
2021-08-21 16:58:16,274 worker exiting, processed 24 jobs
2021-08-21 16:58:16,274 worker thread finished; awaiting finish of 7 more threads
2021-08-21 16:58:16,284 worker exiting, processed 25 jobs
2021-08-21 16:58:16,284 worker thread finished; awaiting finish of 6 more threads
2021-08-21 16:58:16,297 worker exiting, processed 25 jobs
2021-08-21 16:58:16,298 EPOCH 6 - PROGRESS: at 98.16% examples, 152179 words/s, in_qsize 5, out_qsize 1
2021-08-21 16:58:16,299 

2021-08-21 16:58:30,574 EPOCH 11 - PROGRESS: at 63.71% examples, 141933 words/s, in_qsize 15, out_qsize 0
2021-08-21 16:58:31,517 job loop exiting, total 196 jobs
2021-08-21 16:58:31,591 EPOCH 11 - PROGRESS: at 94.43% examples, 144595 words/s, in_qsize 13, out_qsize 0
2021-08-21 16:58:31,697 worker exiting, processed 26 jobs
2021-08-21 16:58:31,698 worker thread finished; awaiting finish of 7 more threads
2021-08-21 16:58:31,705 worker exiting, processed 24 jobs
2021-08-21 16:58:31,705 worker thread finished; awaiting finish of 6 more threads
2021-08-21 16:58:31,725 worker exiting, processed 23 jobs
2021-08-21 16:58:31,725 worker thread finished; awaiting finish of 5 more threads
2021-08-21 16:58:31,730 worker exiting, processed 24 jobs
2021-08-21 16:58:31,730 worker thread finished; awaiting finish of 4 more threads
2021-08-21 16:58:31,735 worker exiting, processed 25 jobs
2021-08-21 16:58:31,735 worker thread finished; awaiting finish of 3 more threads
2021-08-21 16:58:31,746 worker 

2021-08-21 16:58:47,166 job loop exiting, total 196 jobs
2021-08-21 16:58:47,298 worker exiting, processed 24 jobs
2021-08-21 16:58:47,299 worker thread finished; awaiting finish of 7 more threads
2021-08-21 16:58:47,307 worker exiting, processed 23 jobs
2021-08-21 16:58:47,308 worker thread finished; awaiting finish of 6 more threads
2021-08-21 16:58:47,325 worker exiting, processed 24 jobs
2021-08-21 16:58:47,325 worker thread finished; awaiting finish of 5 more threads
2021-08-21 16:58:47,328 worker exiting, processed 24 jobs
2021-08-21 16:58:47,329 EPOCH 16 - PROGRESS: at 98.28% examples, 152030 words/s, in_qsize 4, out_qsize 1
2021-08-21 16:58:47,330 worker thread finished; awaiting finish of 4 more threads
2021-08-21 16:58:47,334 worker exiting, processed 25 jobs
2021-08-21 16:58:47,334 worker thread finished; awaiting finish of 3 more threads
2021-08-21 16:58:47,347 worker exiting, processed 24 jobs
2021-08-21 16:58:47,347 worker thread finished; awaiting finish of 2 more thread

2021-08-21 16:59:02,533 worker exiting, processed 25 jobs
2021-08-21 16:59:02,533 worker thread finished; awaiting finish of 6 more threads
2021-08-21 16:59:02,548 worker exiting, processed 26 jobs
2021-08-21 16:59:02,549 worker thread finished; awaiting finish of 5 more threads
2021-08-21 16:59:02,553 worker exiting, processed 24 jobs
2021-08-21 16:59:02,553 worker thread finished; awaiting finish of 4 more threads
2021-08-21 16:59:02,558 worker exiting, processed 25 jobs
2021-08-21 16:59:02,558 worker thread finished; awaiting finish of 3 more threads
2021-08-21 16:59:02,572 worker exiting, processed 24 jobs
2021-08-21 16:59:02,572 EPOCH 21 - PROGRESS: at 99.13% examples, 153370 words/s, in_qsize 2, out_qsize 1
2021-08-21 16:59:02,574 worker exiting, processed 24 jobs
2021-08-21 16:59:02,575 worker thread finished; awaiting finish of 2 more threads
2021-08-21 16:59:02,577 worker exiting, processed 25 jobs
2021-08-21 16:59:02,577 worker thread finished; awaiting finish of 1 more threa

2021-08-21 16:59:17,717 worker exiting, processed 23 jobs
2021-08-21 16:59:17,717 worker thread finished; awaiting finish of 5 more threads
2021-08-21 16:59:17,723 worker exiting, processed 26 jobs
2021-08-21 16:59:17,723 worker thread finished; awaiting finish of 4 more threads
2021-08-21 16:59:17,724 worker exiting, processed 25 jobs
2021-08-21 16:59:17,725 worker thread finished; awaiting finish of 3 more threads
2021-08-21 16:59:17,737 worker exiting, processed 24 jobs
2021-08-21 16:59:17,737 worker thread finished; awaiting finish of 2 more threads
2021-08-21 16:59:17,743 worker exiting, processed 25 jobs
2021-08-21 16:59:17,743 EPOCH 26 - PROGRESS: at 99.57% examples, 154649 words/s, in_qsize 1, out_qsize 1
2021-08-21 16:59:17,745 worker thread finished; awaiting finish of 1 more threads
2021-08-21 16:59:17,747 worker exiting, processed 24 jobs
2021-08-21 16:59:17,747 worker thread finished; awaiting finish of 0 more threads
2021-08-21 16:59:17,749 EPOCH - 26 : training on 194261

2021-08-21 16:59:32,880 worker exiting, processed 26 jobs
2021-08-21 16:59:32,881 worker thread finished; awaiting finish of 4 more threads
2021-08-21 16:59:32,884 worker exiting, processed 23 jobs
2021-08-21 16:59:32,884 worker thread finished; awaiting finish of 3 more threads
2021-08-21 16:59:32,895 worker exiting, processed 24 jobs
2021-08-21 16:59:32,895 worker thread finished; awaiting finish of 2 more threads
2021-08-21 16:59:32,901 worker exiting, processed 25 jobs
2021-08-21 16:59:32,901 EPOCH 31 - PROGRESS: at 99.57% examples, 154299 words/s, in_qsize 1, out_qsize 1
2021-08-21 16:59:32,904 worker exiting, processed 25 jobs
2021-08-21 16:59:32,904 worker thread finished; awaiting finish of 1 more threads
2021-08-21 16:59:32,905 worker thread finished; awaiting finish of 0 more threads
2021-08-21 16:59:32,905 EPOCH - 31 : training on 1942617 raw words (468977 effective words) took 3.0s, 154883 effective words/s
2021-08-21 16:59:33,927 EPOCH 32 - PROGRESS: at 32.25% examples, 14

2021-08-21 16:59:48,029 worker thread finished; awaiting finish of 3 more threads
2021-08-21 16:59:48,039 worker exiting, processed 25 jobs
2021-08-21 16:59:48,040 worker thread finished; awaiting finish of 2 more threads
2021-08-21 16:59:48,045 worker exiting, processed 24 jobs
2021-08-21 16:59:48,045 EPOCH 36 - PROGRESS: at 99.56% examples, 154714 words/s, in_qsize 1, out_qsize 1
2021-08-21 16:59:48,046 worker exiting, processed 26 jobs
2021-08-21 16:59:48,047 worker thread finished; awaiting finish of 1 more threads
2021-08-21 16:59:48,049 worker thread finished; awaiting finish of 0 more threads
2021-08-21 16:59:48,049 EPOCH - 36 : training on 1942617 raw words (468885 effective words) took 3.0s, 155268 effective words/s
2021-08-21 16:59:49,070 EPOCH 37 - PROGRESS: at 32.25% examples, 143578 words/s, in_qsize 15, out_qsize 0
2021-08-21 16:59:50,074 EPOCH 37 - PROGRESS: at 66.07% examples, 147625 words/s, in_qsize 15, out_qsize 0
2021-08-21 16:59:50,884 job loop exiting, total 196 j

2021-08-21 17:00:03,187 worker exiting, processed 26 jobs
2021-08-21 17:00:03,187 worker thread finished; awaiting finish of 2 more threads
2021-08-21 17:00:03,189 worker thread finished; awaiting finish of 1 more threads
2021-08-21 17:00:03,190 EPOCH 41 - PROGRESS: at 100.00% examples, 154978 words/s, in_qsize 0, out_qsize 1
2021-08-21 17:00:03,190 worker thread finished; awaiting finish of 0 more threads
2021-08-21 17:00:03,190 EPOCH - 41 : training on 1942617 raw words (468453 effective words) took 3.0s, 154930 effective words/s
2021-08-21 17:00:04,217 EPOCH 42 - PROGRESS: at 32.25% examples, 143334 words/s, in_qsize 15, out_qsize 0
2021-08-21 17:00:05,219 EPOCH 42 - PROGRESS: at 66.07% examples, 147702 words/s, in_qsize 16, out_qsize 0
2021-08-21 17:00:06,028 job loop exiting, total 196 jobs
2021-08-21 17:00:06,164 worker exiting, processed 23 jobs
2021-08-21 17:00:06,164 worker thread finished; awaiting finish of 7 more threads
2021-08-21 17:00:06,174 worker exiting, processed 24 

2021-08-21 17:00:18,358 worker thread finished; awaiting finish of 0 more threads
2021-08-21 17:00:18,359 EPOCH - 46 : training on 1942617 raw words (468886 effective words) took 3.0s, 154042 effective words/s
2021-08-21 17:00:19,379 EPOCH 47 - PROGRESS: at 32.25% examples, 143744 words/s, in_qsize 15, out_qsize 0
2021-08-21 17:00:20,382 EPOCH 47 - PROGRESS: at 66.07% examples, 147853 words/s, in_qsize 15, out_qsize 0
2021-08-21 17:00:21,188 job loop exiting, total 196 jobs
2021-08-21 17:00:21,321 worker exiting, processed 25 jobs
2021-08-21 17:00:21,322 worker thread finished; awaiting finish of 7 more threads
2021-08-21 17:00:21,333 worker exiting, processed 23 jobs
2021-08-21 17:00:21,334 worker thread finished; awaiting finish of 6 more threads
2021-08-21 17:00:21,355 worker exiting, processed 26 jobs
2021-08-21 17:00:21,356 worker thread finished; awaiting finish of 5 more threads
2021-08-21 17:00:21,358 worker exiting, processed 25 jobs
2021-08-21 17:00:21,358 worker thread finis

In [13]:
# save model
MODEL_PATH = Path("models/d2v")

# Make sure the target directory exists; on a fresh checkout it does not, and
# the save below would otherwise fail when opening the output file.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# File name encodes the category and the embedding size of this run.
model.save(str(MODEL_PATH / f"{CATEGORY}_{MODEL_PARAMS['vector_size']}_d2v.model"))

2021-08-21 17:02:33,784 Doc2Vec lifecycle event {'fname_or_handle': 'models/d2v/Grocery_and_Gourmet_Food_50_d2v.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-08-21T17:02:33.784912', 'gensim': '4.0.1', 'python': '3.9.6 (default, Jun 29 2021, 05:25:02) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'saving'}
2021-08-21 17:02:33,787 not storing attribute cum_table
2021-08-21 17:02:33,788 {'uri': 'models/d2v/Grocery_and_Gourmet_Food_50_d2v.model', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
2021-08-21 17:02:33,804 saved models/d2v/Grocery_and_Gourmet_Food_50_d2v.model


# Evaluate D2V Model

### Testing Retrieval of Vectors via Index

In [14]:
# Look up a document vector by integer position in the trained doc-vector store.
model.dv[0]

array([-1.9998994 , -1.1165652 , -0.66850775,  0.98593354, -0.1539128 ,
        1.836437  ,  0.04748935,  0.91502106, -0.27679813, -0.71812797,
       -0.707553  , -0.6039597 , -0.58606654,  1.0271451 , -0.03445855,
       -1.2801527 , -0.48076808, -0.33833098,  0.32701826, -0.56980634,
       -0.9162374 ,  0.33309776, -0.37450856,  0.02434505, -0.3233081 ,
        2.094295  , -1.4732084 ,  1.2163205 , -1.8009837 , -0.34398207,
        0.79519683,  0.48662725,  0.38086358,  0.9061864 ,  0.98379046,
       -0.49270517, -1.679146  , -0.8714166 , -0.8851388 ,  1.7304606 ,
       -0.21798603, -0.90019166, -1.5550374 , -1.2914916 ,  0.3585223 ,
        0.7392237 ,  0.06309976, -0.6030123 ,  1.4061383 ,  0.12848437],
      dtype=float32)

### Testing Retrieval of Vectors via Tags

In [15]:
# Look up a document vector by its tag (an ASIN string). The tag is taken from
# the first training row instead of being hard-coded, so this cell keeps working
# when CATEGORY points at a different dataset; for the data loaded above that
# row's ASIN is '9742356831'.
first_asin = train["asin"].iloc[0]
model.dv[first_asin]

array([-1.9998994 , -1.1165652 , -0.66850775,  0.98593354, -0.1539128 ,
        1.836437  ,  0.04748935,  0.91502106, -0.27679813, -0.71812797,
       -0.707553  , -0.6039597 , -0.58606654,  1.0271451 , -0.03445855,
       -1.2801527 , -0.48076808, -0.33833098,  0.32701826, -0.56980634,
       -0.9162374 ,  0.33309776, -0.37450856,  0.02434505, -0.3233081 ,
        2.094295  , -1.4732084 ,  1.2163205 , -1.8009837 , -0.34398207,
        0.79519683,  0.48662725,  0.38086358,  0.9061864 ,  0.98379046,
       -0.49270517, -1.679146  , -0.8714166 , -0.8851388 ,  1.7304606 ,
       -0.21798603, -0.90019166, -1.5550374 , -1.2914916 ,  0.3585223 ,
        0.7392237 ,  0.06309976, -0.6030123 ,  1.4061383 ,  0.12848437],
      dtype=float32)

### Assessing Model

In [22]:
# Sanity check: draw one item at random, pool the tokens of all its reviews into
# a single document, infer a vector for that document and see which trained
# item vectors are closest to it.
SAMPLE_SEED = None   # set to an int to reproduce a particular draw
INFER_EPOCHS = 50    # inference passes over the pooled document
TOP_N = 5            # number of neighbours to report

rng = np.random.default_rng(SAMPLE_SEED)
random_asin = rng.choice(train['asin'].unique())

# filter the frame once and reuse the result for both the tokens and the title
item_rows = train[train['asin'] == random_asin]

# combining all the words from all of the item's reviews
asin_review = [token for review in item_rows["processedReviewText"] for token in review]

# inferring vector
print(f"For item {random_asin}, {item_rows['title'].unique()[0]}...\n")
top_n = model.dv.most_similar([model.infer_vector(asin_review, epochs=INFER_EPOCHS)], topn=TOP_N)
print(f'Most similar D2V vectors: {top_n}')

# item details: join against a de-duplicated (asin, title) lookup so the merge
# does not first expand to one row per review and then get collapsed again
item_titles = train[['asin', 'title']].drop_duplicates()
sim_asins = (pd.DataFrame(top_n, columns=['asin', 'similarities'])
             .merge(item_titles, how='left', on='asin'))

# last expression of the cell -> rendered as a table rather than wrapped text
sim_asins

For item B001ELL67A,  Grocery &amp; Gourmet Food" />...

Most similar D2V vectors: [('B001ELL67A', 0.951135516166687), ('B0029XDZDK', 0.8652843832969666), ('B000TK6LBS', 0.8596549034118652), ('B003M7TTYG', 0.8523412942886353), ('B0029XDZFS', 0.840634286403656)]

          asin  similarities  \
0   B001ELL67A      0.951136   
14  B0029XDZDK      0.865284   
32  B000TK6LBS      0.859655   
46  B003M7TTYG      0.852341   
53  B0029XDZFS      0.840634   

                                                title  
0                      Grocery &amp; Gourmet Food" />  
14  Coffee People Black Tiger K-Cup Portion Pack f...  
32  Caribou Coffee Caribou Blend, Keurig Single-Se...  
46  Green Mountain Coffee Revv, 22-Count K-Cups fo...  
53  Coffee People Wake Up Call K-Cup Portion Pack ...  
