### 0. Import packages

In [18]:
import logging
import missingno as msno
import numpy as np
import pandas as pd
import warnings

from annoy import AnnoyIndex
from gensim.models.doc2vec import Doc2Vec
from pandarallel import pandarallel
from pathlib import Path
from pprint import pprint
from src.data import load_dataset
from src.features import build_features, preprocessing
from tqdm import tqdm

pandarallel.initialize()
tqdm.pandas()
warnings.filterwarnings("ignore")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### 1. Loading processed data pickle

In [2]:
# global variables
DATA_PATH = Path('data/processed/')
CATEGORY = 'Clothing_Shoes_and_Jewelry'

In [3]:
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [4]:
print(f"Train: {train.shape}")
print(f"Test: {test.shape}")

Train: (231491, 6)
Test: (47135, 6)


In [5]:
# check train
train.head().append(train.tail())

Unnamed: 0,index,overall,reviewerID,asin,reviewText,processedReviewText
0,0,5.0,A1KLRMWW2FWPL4,0000031887,This is a great tutu and at a really great pri...,this great tutu great price it look cheap glad...
1,1,5.0,A2G5TCU2WDFZ65,0000031887,I bought this for my 4 yr old daughter for dan...,buy yr old daughter dance class wore today tim...
2,2,5.0,A1RLQXYNCMWRWN,0000031887,What can I say... my daughters have it in oran...,what daughters orange black white pink think b...
3,3,5.0,A8U3FAMSJVHS5,0000031887,"We bought several tutus at once, and they are ...",we buy tutu get high review sturdy seemingly t...
4,4,5.0,A3GEOILWLK86XM,0000031887,Thank you Halo Heaven great product for Little...,thank halo heaven great product little girls m...
231486,278629,5.0,AEL6CQNQXONBX,B00KKXCJQU,This set of travel organizers includes four pi...,this set travel organizer include piece total ...
231487,278630,5.0,ACJT8MUC0LRF0,B00KKXCJQU,When I pack it looks like a disaster area in a...,when pack look like disaster area suitcase pac...
231488,278632,5.0,A1EVV74UQYVKRY,B00KKXCJQU,I've been traveling back and forth to England ...,travel forth england pack way suitcases some p...
231489,278633,5.0,A1UQBFCERIP7VJ,B00KKXCJQU,These are very nice packing cubes and the 18 x...,these nice packing cube laundry storage bag ni...
231490,278635,5.0,A30VWT3R25QAVD,B00KKXCJQU,When I signed up to receive a free set of Shac...,when sign receive free set shacke pak review t...


In [6]:
# check test
test.head().append(test.tail())

Unnamed: 0,index,overall,reviewerID,asin,reviewText,processedReviewText
0,11,5.0,A2A2WZYLU528RO,0000031887,My daughter has worn this skirt almost every d...,my daughter worn skirt day receive washer clot...
1,16,5.0,A1MXJVYXE2QU6H,0000031887,Perfect for my budding grand daughter ballerin...,perfect bud grand daughter ballerina beautiful...
2,41,5.0,A35V32HZEGZH04,1608299953,".When in the military, I was stationed in Eur...",when military station europe year the friend q...
3,44,5.0,A216NSW58Q3SCJ,1617160377,Although I haven't kept up with the lessons as...,although keep lesson learn lot italian level s...
4,47,5.0,AY3D7DG5L5WCK,1617160377,I love Rosetta Stone it is easy to use in a fe...,love rosetta stone easy use quick lesson speak...
47130,278605,5.0,A3VE6Q3RY2JIM1,B00KA602SY,Much better quality and texture than expected ...,much good quality texture expect price the col...
47131,278607,2.0,A2FENE35P9Z592,B00KCWMG5S,For tiny teenagers.,for tiny teenager
47132,278625,5.0,ACJT8MUC0LRF0,B00KGCLROK,You don&#8217;t have to be a yoga or martial a...,you yoga martial art practitioner enjoy half t...
47133,278631,5.0,A2DG63DN704LOI,B00KKXCJQU,I don't normally go ga-ga over a product very ...,normally ga ga product cub awesome help review...
47134,278634,5.0,A22CW0ZHY3NJH8,B00KKXCJQU,I am on vacation with my family of four and th...,vacation family shacke pak set wonderful excep...


### 2. Doc2Vec model

In [7]:
%%time
# tokenising review text
train['tokenizedReviewText'] = train['processedReviewText'].parallel_apply(lambda x: x.split())

CPU times: user 1.42 s, sys: 406 ms, total: 1.82 s
Wall time: 2.26 s


In [8]:
# check `tokenizedReviewText`
train.head()

Unnamed: 0,index,overall,reviewerID,asin,reviewText,processedReviewText,tokenizedReviewText
0,0,5.0,A1KLRMWW2FWPL4,31887,This is a great tutu and at a really great pri...,this great tutu great price it look cheap glad...,"[this, great, tutu, great, price, it, look, ch..."
1,1,5.0,A2G5TCU2WDFZ65,31887,I bought this for my 4 yr old daughter for dan...,buy yr old daughter dance class wore today tim...,"[buy, yr, old, daughter, dance, class, wore, t..."
2,2,5.0,A1RLQXYNCMWRWN,31887,What can I say... my daughters have it in oran...,what daughters orange black white pink think b...,"[what, daughters, orange, black, white, pink, ..."
3,3,5.0,A8U3FAMSJVHS5,31887,"We bought several tutus at once, and they are ...",we buy tutu get high review sturdy seemingly t...,"[we, buy, tutu, get, high, review, sturdy, see..."
4,4,5.0,A3GEOILWLK86XM,31887,Thank you Halo Heaven great product for Little...,thank halo heaven great product little girls m...,"[thank, halo, heaven, great, product, little, ..."


#### 2.1 Preparing `user` and `item` tagged documents

In [9]:
user_documents, asin_documents = preprocessing.prepare_tagged_documents(users='reviewerID', asins='asin', reviews='tokenizedReviewText', df=train)

In [10]:
pprint(user_documents[:5])

[TaggedDocument(words=['this', 'great', 'tutu', 'great', 'price', 'it', 'look', 'cheap', 'glad', 'look', 'amazon', 'affordable', 'tutu', 'poorly'], tags=['A1KLRMWW2FWPL4']),
 TaggedDocument(words=['buy', 'yr', 'old', 'daughter', 'dance', 'class', 'wore', 'today', 'time', 'teacher', 'think', 'adorable', 'buy', 'light', 'blue', 'long', 'sleeve', 'leotard', 'happy', 'color', 'match', 'great', 'price', 'good', 'dollar'], tags=['A2G5TCU2WDFZ65']),
 TaggedDocument(words=['what', 'daughters', 'orange', 'black', 'white', 'pink', 'think', 'buy', 'fuccia', 'it', 'good', 'way', 'exalt', 'dancer', 'outfit', 'great', 'color', 'comfortable', 'look', 'great', 'easy', 'wear', 'durables', 'little', 'girls', 'love', 'think', 'great', 'buy', 'costumer', 'play'], tags=['A1RLQXYNCMWRWN']),
 TaggedDocument(words=['we', 'buy', 'tutu', 'get', 'high', 'review', 'sturdy', 'seemingly', 'the', 'girl', 'wear', 'regularly', 'include', 'play', 'tutu', 'stand', 'fits', 'yr', 'old', 'yr', 'old', 'clearly', 'plenty', '

In [11]:
pprint(asin_documents[:5])

[TaggedDocument(words=['this', 'great', 'tutu', 'great', 'price', 'it', 'look', 'cheap', 'glad', 'look', 'amazon', 'affordable', 'tutu', 'poorly'], tags=['0000031887']),
 TaggedDocument(words=['buy', 'yr', 'old', 'daughter', 'dance', 'class', 'wore', 'today', 'time', 'teacher', 'think', 'adorable', 'buy', 'light', 'blue', 'long', 'sleeve', 'leotard', 'happy', 'color', 'match', 'great', 'price', 'good', 'dollar'], tags=['0000031887']),
 TaggedDocument(words=['what', 'daughters', 'orange', 'black', 'white', 'pink', 'think', 'buy', 'fuccia', 'it', 'good', 'way', 'exalt', 'dancer', 'outfit', 'great', 'color', 'comfortable', 'look', 'great', 'easy', 'wear', 'durables', 'little', 'girls', 'love', 'think', 'great', 'buy', 'costumer', 'play'], tags=['0000031887']),
 TaggedDocument(words=['we', 'buy', 'tutu', 'get', 'high', 'review', 'sturdy', 'seemingly', 'the', 'girl', 'wear', 'regularly', 'include', 'play', 'tutu', 'stand', 'fits', 'yr', 'old', 'yr', 'old', 'clearly', 'plenty', 'room', 'grow

#### 2.3 Training Doc2Vec model

In [26]:
# model parameters
VECTOR_SIZE = 150
MIN_COUNT = 10
NEGATIVE = 5
NS_EXPONENT = 0.5
SAMPLE = 1e-05
DM = 1
WORKERS = 8
EPOCHS = 10

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s')

##### 2.3.1 `user` d2v model

In [27]:
user_d2v = Doc2Vec(
    vector_size=VECTOR_SIZE,
    min_count=MIN_COUNT,
    negative=NEGATIVE,
    sample=SAMPLE,
    ns_exponent=NS_EXPONENT,
    dm=DM,
    workers=WORKERS,
)

# building vocab
user_d2v.build_vocab(user_documents)

# training model
user_d2v.train(user_documents, total_examples=user_d2v.corpus_count, epochs=EPOCHS)

2021-07-19 23:52:28,442 starting a new internal lifecycle event log for Doc2Vec
2021-07-19 23:52:28,443 Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d150,n5,w5,mc10,s1e-05,t8)', 'datetime': '2021-07-19T23:52:28.442206', 'gensim': '4.0.1', 'python': '3.9.4 (default, Apr  5 2021, 01:50:46) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'created'}
2021-07-19 23:52:28,444 collecting all words and their counts
2021-07-19 23:52:28,446 PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-07-19 23:52:28,516 PROGRESS: at example #10000, processed 282592 words (4060309/s), 12776 word types, 8139 tags
2021-07-19 23:52:28,577 PROGRESS: at example #20000, processed 569321 words (4795773/s), 18063 word types, 14160 tags
2021-07-19 23:52:28,640 PROGRESS: at example #30000, processed 862630 words (4649256/s), 22378 word types, 18865 tags
2021-07-19 23:52:28,703 PROGRESS: at example #40000, processed 1152769 words (4673148/s), 260

##### 2.3.2 `asin` d2v model

In [39]:
asin_d2v = Doc2Vec(
    vector_size=VECTOR_SIZE,
    min_count=MIN_COUNT,
    negative=NEGATIVE,
    sample=SAMPLE,
    ns_exponent=NS_EXPONENT,
    dm=DM,
    workers=WORKERS,
)

# building vocab
asin_d2v.build_vocab(asin_documents)

# training model
asin_d2v.train(asin_documents, total_examples=asin_d2v.corpus_count, epochs=EPOCHS)

2021-07-20 00:05:07,171 starting a new internal lifecycle event log for Doc2Vec
2021-07-20 00:05:07,172 Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d150,n5,w5,mc10,s1e-05,t8)', 'datetime': '2021-07-20T00:05:07.171231', 'gensim': '4.0.1', 'python': '3.9.4 (default, Apr  5 2021, 01:50:46) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'created'}
2021-07-20 00:05:07,173 collecting all words and their counts
2021-07-20 00:05:07,176 PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-07-20 00:05:07,237 PROGRESS: at example #10000, processed 282592 words (4685298/s), 12776 word types, 677 tags
2021-07-20 00:05:07,293 PROGRESS: at example #20000, processed 569321 words (5214871/s), 18063 word types, 1392 tags
2021-07-20 00:05:07,351 PROGRESS: at example #30000, processed 862630 words (5068506/s), 22378 word types, 2208 tags
2021-07-20 00:05:07,411 PROGRESS: at example #40000, processed 1152769 words (4908405/s), 26044 

In [50]:
MODEL_PATH = Path("models/d2v/")

user_d2v.save(f"{MODEL_PATH}/{CATEGORY}_user_d2v.model")
asin_d2v.save(f"{MODEL_PATH}/{CATEGORY}_asin_d2v.model")

2021-07-20 00:31:59,088 Doc2Vec lifecycle event {'fname_or_handle': 'models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-07-20T00:31:59.088398', 'gensim': '4.0.1', 'python': '3.9.4 (default, Apr  5 2021, 01:50:46) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'saving'}
2021-07-20 00:31:59,089 not storing attribute cum_table
2021-07-20 00:31:59,090 {'uri': 'models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
2021-07-20 00:31:59,147 saved models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model
2021-07-20 00:31:59,148 Doc2Vec lifecycle event {'fname_or_handle': 'models/d2v/Clothing_Shoes_and_Jewelry_asin_d2v.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': froze

In [51]:
# load model
MODEL_PATH = Path("models/d2v/")

user_d2v = Doc2Vec.load(f"{MODEL_PATH}/{CATEGORY}_user_d2v.model")
asin_d2v = Doc2Vec.load(f"{MODEL_PATH}/{CATEGORY}_asin_d2v.model")

2021-07-20 00:32:51,835 loading Doc2Vec object from models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model
2021-07-20 00:32:51,840 {'uri': 'models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
2021-07-20 00:32:51,878 loading dv recursively from models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model.dv.* with mmap=None
2021-07-20 00:32:51,879 loading wv recursively from models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model.wv.* with mmap=None
2021-07-20 00:32:51,881 setting ignored attribute cum_table to None
2021-07-20 00:32:52,012 Doc2Vec lifecycle event {'fname': 'models/d2v/Clothing_Shoes_and_Jewelry_user_d2v.model', 'datetime': '2021-07-20T00:32:52.012475', 'gensim': '4.0.1', 'python': '3.9.4 (default, Apr  5 2021, 01:50:46) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-11.4-x86_64-i386-64bit

#### Testing retrieval of vectors via index

In [52]:
print([i[1] for i in user_documents[:10]])

[['A1KLRMWW2FWPL4'], ['A2G5TCU2WDFZ65'], ['A1RLQXYNCMWRWN'], ['A8U3FAMSJVHS5'], ['A3GEOILWLK86XM'], ['A27UF1MSF3DB2'], ['A16GFPNVF4Y816'], ['A2M2APVYIB2U6K'], ['A1NJ71X3YPQNQ9'], ['A3EERSWHAI6SO']]


In [54]:
user_d2v.dv[0]

array([-0.02055808, -0.05843449, -0.05547184,  0.01778437,  0.00469217,
       -0.03501041, -0.00673267,  0.03883101, -0.01805281, -0.0209298 ,
       -0.03623148, -0.00159739, -0.05911888,  0.0729358 , -0.03065956,
       -0.02815126,  0.02357766, -0.05407812, -0.00607294,  0.10883669,
       -0.04466315,  0.00948019,  0.03358833, -0.01949031,  0.00525372,
        0.03148691, -0.0259487 , -0.00393125, -0.03223359, -0.07157879,
       -0.02014398,  0.02571197, -0.07092678, -0.01904643, -0.04474743,
       -0.00384329,  0.04944458, -0.00731054,  0.03500834, -0.03514026,
       -0.01279218,  0.02124795, -0.03646373, -0.04348272,  0.02164138,
       -0.03481154,  0.03408755, -0.0179861 , -0.00462096,  0.07106731,
       -0.03033135,  0.03183918, -0.0095906 , -0.0425368 , -0.05297974,
        0.00681152,  0.01870618, -0.00066938,  0.02898959, -0.04136917,
       -0.01876115, -0.02415633, -0.0366751 , -0.00385988,  0.07085671,
       -0.00878479,  0.00941915, -0.04667619, -0.02240438, -0.07

#### Testing retrieval of vectors via tags

In [55]:
user_d2v.dv['A1KLRMWW2FWPL4']   # '2' is at index of 2 as shown previously

array([-0.02055808, -0.05843449, -0.05547184,  0.01778437,  0.00469217,
       -0.03501041, -0.00673267,  0.03883101, -0.01805281, -0.0209298 ,
       -0.03623148, -0.00159739, -0.05911888,  0.0729358 , -0.03065956,
       -0.02815126,  0.02357766, -0.05407812, -0.00607294,  0.10883669,
       -0.04466315,  0.00948019,  0.03358833, -0.01949031,  0.00525372,
        0.03148691, -0.0259487 , -0.00393125, -0.03223359, -0.07157879,
       -0.02014398,  0.02571197, -0.07092678, -0.01904643, -0.04474743,
       -0.00384329,  0.04944458, -0.00731054,  0.03500834, -0.03514026,
       -0.01279218,  0.02124795, -0.03646373, -0.04348272,  0.02164138,
       -0.03481154,  0.03408755, -0.0179861 , -0.00462096,  0.07106731,
       -0.03033135,  0.03183918, -0.0095906 , -0.0425368 , -0.05297974,
        0.00681152,  0.01870618, -0.00066938,  0.02898959, -0.04136917,
       -0.01876115, -0.02415633, -0.0366751 , -0.00385988,  0.07085671,
       -0.00878479,  0.00941915, -0.04667619, -0.02240438, -0.07

### 3. Examining D2V Results

#### 3.1 Are inferred vectors close to the precalculated ones?

Example of detailed review with better results:

In [59]:
doc_id = np.random.choice(list(train['index'].unique()), 1)
doc_tokens = train[train['index'] == doc_id[0]]['tokenizedReviewText'].values[0]

print(f"For doc {doc_id[0]}...\nTokens: {doc_tokens}")
inferred_docvec = asin_d2v.infer_vector(doc_tokens, steps=100, alpha=0.025)
print(f'Most similar D2V vectors: {asin_d2v.dv.most_similar([inferred_docvec], topn=3)}')

For doc 187116...
Tokens: ['add', 'bracelet', 'it', 'pretty', 'look', 'nice', 'it', 'bright', 'stand']
Most similar D2V vectors: [('B007XE5TKM', 0.9632561206817627), ('B00FI9PPS0', 0.9579210877418518), ('B00CVP0IFY', 0.957432210445404)]


Example of less-detailed review with worse results:

In [None]:
doc_id = np.random.choice(list(train['index'].unique()), 1)
doc_tokens = train[train['index'] == doc_id[0]]['tokenizedReviewText'].values[0]

print(f"For doc {doc_id[0]}...\nTokens: {doc_tokens}")
inferred_docvec = model.infer_vector(doc_tokens, steps=100, alpha=0.025)
print(f'Most similar D2V vectors: {model.dv.most_similar([inferred_docvec], topn=3)}')

In [None]:
similar_tokens = train[train['index'] == 444194]['tokenizedReviewText'].values[0]
print(similar_tokens)

### (WIP) Creating appropriate mapping

Before we can generate recommendations based on the vectors trained by `D2V`, we will need to generate a few mappings to help us aggregate vectors on both users and items level.

As our model ideates based on the how both `user` and `item` can be represented by their `documents` – which, in this case is an aggregation of all the past reviews. Hence, we will need to generate mappings that identifies:

1. `User-Review`: What are the reviews that have been given by a user.
2. `Product-Review`: What are the reviews that have been received by a product.

With the above mapping, we are able to develop a item-level recommendation based on the past items purchased and reviewed by the users. 

In [None]:
train_user_review_map = train.groupby(['reviewerID'])['index'].progress_apply(list).to_dict()
pprint(list(train_user_review_map.items())[:5])

In [None]:
# OLD METHOD
# train_user_index_map = {}
# users = list(set(train['reviewerID']))
# index = 0
# for user in tqdm(users):
#     if user not in train_user_index_map.values():
#         train_user_index_map[index] = user
#         index += 1
#     else:
#         pass
# print(len(train_user_index_map))

train_user_index_map = pd.DataFrame({'reviewerID': list(set(train['reviewerID']))}).to_dict()['reviewerID']
print(len(train_user_index_map))

In [None]:
# verify checks
print(train['reviewerID'].nunique())
print(len(train_user_review_map))
print(len(train_user_index_map))

In [None]:
train_prod_review_map = train.groupby(['asin'])['index'].progress_apply(list).to_dict()
pprint(list(train_prod_review_map.items())[:5])

In [None]:
# OLD METHOD
# train_prod_index_map = {}
# prods = list(train_movie_reviews['asin'].unique())
# index = 0
# for prod in tqdm(prods):
#     if prod not in train_prod_index_map.values():
#         train_prod_index_map[index] = prod
#         index += 1
#     else:
#         pass
# print(len(train_prod_index_map))

train_prod_index_map = pd.DataFrame({'asin': list(set(train['asin']))}).to_dict()['asin']
pprint(list(train_prod_index_map.items())[:5])

In [None]:
# verify checks
print(train['asin'].nunique())
print(len(train_prod_review_map))
print(len(train_prod_index_map))

In [None]:
MAP_PATH = Path("data/processed/mappings/")

np.save(f'{MAP_PATH}/{CATEGORY}_train_user_review_map.npy', train_user_review_map)
np.save(f'{MAP_PATH}/{CATEGORY}_train_user_index_map.npy', train_user_index_map)
np.save(f'{MAP_PATH}/{CATEGORY}_train_prod_review_map.npy', train_prod_review_map)
np.save(f'{MAP_PATH}/{CATEGORY}_train_prod_index_map.npy', train_prod_index_map)

In [None]:
MAP_PATH = Path("data/processed/mappings/")

train_user_review_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_user_review_map.npy', allow_pickle=True).item()
train_user_index_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_user_index_map.npy', allow_pickle=True).item()
train_prod_review_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_prod_review_map.npy', allow_pickle=True).item()
train_prod_index_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_prod_index_map.npy', allow_pickle=True).item()

### Creating aggregated vector by product `asin` and building ANN trees

In [None]:
# creating approximate nearest neighbour setup
f = 150
t = AnnoyIndex(f, 'angular')    # using cosine measure

agg_prod_vectors = {}
# in range for all users
for k, v in tqdm(train_prod_index_map.items()):
    ind_reviews_vectors = np.zeros(f)
    # retrieve the agg reviews list based on user id
    # asin = train_prod_index_map[k]
    agg_list = train_prod_review_map[v]
    for j in agg_list:
        # retrieve the row number
        # row_num = temp.loc[temp['index'] == j].index[0]
        # retrieve the vector from d2v
        try:
            ind_reviews_vectors += np.array(model.dv[str(j)])
        except KeyError:
            pass
    # aggregate all doc vectors from a prod
    agg_vectors = np.divide(ind_reviews_vectors, len(agg_list))
    agg_prod_vectors[k] = agg_vectors
    # we need to add these into the ann object
    t.add_item(k, agg_vectors)

# build the trees
ANN_PATH = Path(f"anns/{CATEGORY}/")

t.build(300)
t.save(f'{ANN_PATH}/{CATEGORY}-f150.ann')

In [None]:
f = 150

ANN_PATH = Path(f"anns/{CATEGORY}/")
u = AnnoyIndex(f, 'angular')
u.load(f'{ANN_PATH}/{CATEGORY}-f150.ann')

In [None]:
np.save(f'{MAP_PATH}/{CATEGORY}_agg_prod_vectors.npy', agg_prod_vectors)

In [None]:
agg_prod_vectors = np.load(f'{MAP_PATH}/{CATEGORY}_agg_prod_vectors.npy', allow_pickle=True).item()

### Creating aggregated vector by users `reviewerID`

In [None]:
f = 150

agg_user_vectors = {}
# in range for all users
for k, v in tqdm(train_user_index_map.items()):
    ind_reviews_vectors = np.zeros(f)
    # retrieve the agg reviews list based on user id
    # user_id = train_user_index_map[i]
    agg_list = train_user_review_map[v]
    for j in agg_list:
        # retrieve the vector from d2v
        ind_reviews_vectors += np.array(model.dv[str(j)])
    # aggregate all doc vectors from a prod
    agg_vectors = np.divide(ind_reviews_vectors, len(agg_list))
    agg_user_vectors[k] = agg_vectors

In [None]:
np.save(f'{MAP_PATH}/{CATEGORY}_agg_user_vectors.npy', agg_user_vectors)

In [None]:
agg_user_vectors = np.load(f'{MAP_PATH}/{CATEGORY}_agg_user_vectors.npy', allow_pickle=True).item()

### Using Nearest Neighbor Search (NNS) to retrieve top 10 items 

Extending the idea of generating recommendations, we able to generate recommendations based on two factors:

1. Past users's purchase (using their reviews)
2. Item-level similarities based on aggregation of reviews embeddings

This meant that, given a user's past purchase history, where we generated an aggregation of reviews embeddings, we are able to use that same vector to help recommend items from the item *f*-dimensional space. Also, based on an instance where perhaps last click on a certain item, we are able to generate recommendations based on the comparison of similarities reviews from the clicked items to the other items within the item *f*-dimensional space.

#### Using users-level recommendation

In [None]:
def generate_random_user_recommendations():
    user_index = np.random.randint(0, len(train_user_index_map))
    reviewer_agg_vectors = agg_user_vectors[user_index]
    reviewer_id = train_user_index_map[user_index]

    ann_similar_items = u.get_nns_by_vector(reviewer_agg_vectors, 25, search_k=-1, include_distances=False)

    recommendations = pd.DataFrame(columns=['rank', 'asin', 'overall', 'review_counts', 'titles', 'brands'])
    ranks = [i for i in range(1,11,1)]
    asins = []
    overalls = []
    review_counts = []
    titles = []
    brands = []
    for item in ann_similar_items:
        user_history_train = train[train['reviewerID'] == reviewer_id]['asin'].unique()
        if item not in user_history_train:
            asin = train_prod_index_map[item]
            overall = train[train['asin'] == asin]['overall'].mean()
            review_count = len(train[train['asin'] == asin]['reviewText'])
            title = train[train['asin'] == asin]['title'].unique()[0]
            brand = train[train['asin'] == asin]['brand'].unique()[0]

            asins.append(asin)
            overalls.append(overall)
            review_counts.append(review_count)
            titles.append(title if len(title) > 0 else '')
            brands.append(brand if len(str(brand)) > 0 else '')
        else:
            pass

    recommendations['rank'] = ranks
    recommendations['asin'] = asins[:10]
    recommendations['overall'] = overalls[:10]
    recommendations['review_counts'] = review_counts[:10]
    recommendations['titles'] = titles[:10]
    recommendations['brands'] = brands[:10]
    
    # retrieving the recommendations
    print(f"For user: {reviewer_id}\n")
    print(f"Recommended items are:")
    for index, row in recommendations.iterrows():
        print(f"Rank: {row[0]} | {row[1]} | Rating: {row[2]:.2f} | Reviewed: {row[3]} | Title: {row[4]}")

    return reviewer_id

reviewer_id = generate_random_user_recommendations()

In [None]:
# looking at the past purchase history of the user
user_history_train= train[train['reviewerID'] == reviewer_id]
user_history_train[['title', 'asin', 'overall', 'reviewText']]

In [None]:
user_history_test = test[test['reviewerID'] == reviewer_id]
user_history_test[['title', 'asin', 'overall', 'reviewText']]

#### Using items-level recommendation

In [None]:
def generate_random_prod_recommendations():
    user_index = np.random.randint(0, len(train_user_index_map))
    reviewer_agg_vectors = agg_user_vectors[user_index]
    reviewer_id = train_user_index_map[user_index]

    prod_index = np.random.randint(0, len(train_prod_index_map))
    prod_agg_vectors = agg_prod_vectors[prod_index]
    prod_asin = train_prod_index_map[prod_index]
    prod_title = train[train['asin'] == prod_asin]['title'].unique()[0]

    ann_similar_items = u.get_nns_by_vector(prod_agg_vectors, 50, search_k=-1, include_distances=False)

    recommendations = pd.DataFrame(columns=['rank', 'asin', 'overall', 'review_counts', 'titles', 'brands'])
    ranks = [i for i in range(1,12,1)]
    asins = []
    overalls = []
    review_counts = []
    titles = []
    brands = []
    for item in ann_similar_items:
        user_history_train = train[train['reviewerID'] == reviewer_id]['asin'].unique()
        if item not in user_history_train:
            asin = train_prod_index_map[item]
            overall = train[train['asin'] == asin]['overall'].mean()
            review_count = len(train[train['asin'] == asin]['reviewText'])
            title = train[train['asin'] == asin]['title'].unique()[0]
            brand = train[train['asin'] == asin]['brand'].unique()[0]

            asins.append(asin)
            overalls.append(overall)
            review_counts.append(review_count)
            titles.append(title if len(title) > 0 else '')
            brands.append(brand if len(str(brand)) > 0 else '')

    recommendations['rank'] = ranks
    recommendations['asin'] = asins[:11]
    recommendations['overall'] = overalls[:11]
    recommendations['review_counts'] = review_counts[:11]
    recommendations['titles'] = titles[:11]
    recommendations['brands'] = brands[:11]
    
    # retrieving the recommendations
    print(f"For user: {reviewer_id}, given last viewed/clicked product: {prod_asin} – {prod_title}...\n")
    print(f"Recommended items are:")
    for index, row in recommendations.iterrows():
        print(f"Rank: {row[0]} | {row[1]} | Rating: {row[2]:.2f} | Reviewed: {row[3]} | Title: {row[4]}")

generate_random_prod_recommendations()

### Computing Metrics