### 0. Import packages

In [1]:
import logging
import missingno as msno
import numpy as np
import pandas as pd
import warnings

from annoy import AnnoyIndex
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pandarallel import pandarallel
from pathlib import Path
from pprint import pprint
from src.data import load_dataset
from src.features import build_features, preprocessing
from tqdm import tqdm

pandarallel.initialize()
tqdm.pandas()
warnings.filterwarnings("ignore")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### 1. Loading processed data pickle

In [2]:
# global variables
DATA_PATH = Path('data/processed/')
CATEGORY = 'Sports_and_Outdoors'

In [3]:
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [4]:
print(f"Train: {train.shape}")
print(f"Test: {test.shape}")

Train: (2242666, 8)
Test: (434692, 8)


In [None]:
# check train
train.head().append(train.tail())

In [None]:
# check test
test.head().append(test.tail())

### 2. Training Doc2Vec model

In [None]:
# tokenising review text
train['tokenizedReviewText'] = train['processedReviewText'].parallel_apply(lambda x: x.split())

In [None]:
# check `tokenizedReviewText`
train.head()

In [None]:
# preparing data for d2v
indexes = list(train['index'])
reviews = list(train['tokenizedReviewText'])

logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s",
    level=logging.INFO)

# creating tagged documents
documents = [TaggedDocument(doc[1], [str(doc[0])]) for _, doc in enumerate(zip(indexes, reviews))]
pprint(documents[:5])

In [None]:
# generating and training d2v
# model parameters
VECTOR_SIZE = 150
MIN_COUNT = 10
NEGATIVE = 5
SAMPLE = 1e-05
DM = 1
WORKERS = 8
EPOCHS = 20

# building the model
model = Doc2Vec(
    vector_size=VECTOR_SIZE,
    min_count=MIN_COUNT,
    negative=NEGATIVE,
    sample=SAMPLE,
    dm=DM,
    workers=WORKERS,
)
model.build_vocab(documents)

# training the model
model.train(documents, total_examples=model.corpus_count, epochs=EPOCHS)

In [None]:
MODEL_PATH = Path("models/d2v/")
model.save(f"{MODEL_PATH}/{CATEGORY}_d2v.model")

In [5]:
# load model
MODEL_PATH = Path("models/d2v/")
model = Doc2Vec.load(f"{MODEL_PATH}/{CATEGORY}_d2v.model")

#### Testing retrieval of vectors via index

In [None]:
print([i[1] for i in documents[:10]])

In [6]:
model.dv[2]

array([-0.00751332, -0.03107653, -0.01384184, -0.01297431, -0.00852802,
       -0.0106923 , -0.01367828,  0.03381048, -0.00208825,  0.02394518,
       -0.04435515, -0.00538987, -0.00521584,  0.01179986, -0.03706737,
       -0.03014077,  0.0459164 ,  0.0039641 ,  0.0058409 ,  0.06359709,
        0.00169906,  0.01850473,  0.04046334, -0.00611207,  0.01891683,
        0.0229055 , -0.0273975 ,  0.00510953, -0.01142103, -0.04234928,
        0.00269255, -0.00932012,  0.0163039 ,  0.0085518 , -0.01792221,
       -0.01183674, -0.0050256 ,  0.00414017, -0.00787533, -0.04172866,
       -0.01757223,  0.01712419, -0.00416967,  0.02410017, -0.02095975,
        0.01827184, -0.00526504, -0.01363115, -0.01183784,  0.02941062,
       -0.03414746,  0.01961362, -0.01695325,  0.02233639, -0.00714071,
        0.02432763, -0.03584614,  0.02528099, -0.00444841,  0.02611478,
        0.00618138, -0.01781676, -0.03504014,  0.04539623,  0.03562992,
        0.00398827,  0.00650692, -0.0088322 , -0.01472492,  0.04

#### Testing retrieval of vectors via tags

In [7]:
model.dv['2']   # '2' is at index of 2 as shown previously

array([-0.00751332, -0.03107653, -0.01384184, -0.01297431, -0.00852802,
       -0.0106923 , -0.01367828,  0.03381048, -0.00208825,  0.02394518,
       -0.04435515, -0.00538987, -0.00521584,  0.01179986, -0.03706737,
       -0.03014077,  0.0459164 ,  0.0039641 ,  0.0058409 ,  0.06359709,
        0.00169906,  0.01850473,  0.04046334, -0.00611207,  0.01891683,
        0.0229055 , -0.0273975 ,  0.00510953, -0.01142103, -0.04234928,
        0.00269255, -0.00932012,  0.0163039 ,  0.0085518 , -0.01792221,
       -0.01183674, -0.0050256 ,  0.00414017, -0.00787533, -0.04172866,
       -0.01757223,  0.01712419, -0.00416967,  0.02410017, -0.02095975,
        0.01827184, -0.00526504, -0.01363115, -0.01183784,  0.02941062,
       -0.03414746,  0.01961362, -0.01695325,  0.02233639, -0.00714071,
        0.02432763, -0.03584614,  0.02528099, -0.00444841,  0.02611478,
        0.00618138, -0.01781676, -0.03504014,  0.04539623,  0.03562992,
        0.00398827,  0.00650692, -0.0088322 , -0.01472492,  0.04

### 3. Examining D2V Results

#### 3.1 Are inferred vectors close to the precalculated ones?

In [None]:
train[train['index'] == doc_id[0]]

Example of detailed review with better results:

In [None]:
doc_id = np.random.choice(list(train['index'].unique()), 1)
doc_tokens = train[train['index'] == doc_id[0]]['tokenizedReviewText'].values[0]

print(f"For doc {doc_id[0]}...\nTokens: {doc_tokens}")
inferred_docvec = model.infer_vector(doc_tokens, steps=100, alpha=0.025)
print(f'Most similar D2V vectors: {model.dv.most_similar([inferred_docvec], topn=3)}')

In [None]:
# looking at the 2nd similar vector
similar_tokens = train[train['index'] == 881221]['tokenizedReviewText'].values[0]
print(similar_tokens)

Example of less-detailed review with worse results:

In [None]:
doc_id = np.random.choice(list(train['index'].unique()), 1)
doc_tokens = train[train['index'] == doc_id[0]]['tokenizedReviewText'].values[0]

print(f"For doc {doc_id[0]}...\nTokens: {doc_tokens}")
inferred_docvec = model.infer_vector(doc_tokens, steps=100, alpha=0.025)
print(f'Most similar D2V vectors: {model.dv.most_similar([inferred_docvec], topn=3)}')

In [None]:
similar_tokens = train[train['index'] == 444194]['tokenizedReviewText'].values[0]
print(similar_tokens)

### Creating appropriate mapping

Before we can generate recommendations based on the vectors trained by `D2V`, we will need to generate a few mappings to help us aggregate vectors on both users and items level.

As our model ideates based on the how both `user` and `item` can be represented by their `documents` – which, in this case is an aggregation of all the past reviews. Hence, we will need to generate mappings that identifies:

1. `User-Review`: What are the reviews that have been given by a user.
2. `Product-Review`: What are the reviews that have been received by a product.

With the above mapping, we are able to develop a item-level recommendation based on the past items purchased and reviewed by the users. 

In [None]:
train_user_review_map = train.groupby(['reviewerID'])['index'].progress_apply(list).to_dict()
pprint(list(train_user_review_map.items())[:5])

In [None]:
# OLD METHOD
# train_user_index_map = {}
# users = list(set(train['reviewerID']))
# index = 0
# for user in tqdm(users):
#     if user not in train_user_index_map.values():
#         train_user_index_map[index] = user
#         index += 1
#     else:
#         pass
# print(len(train_user_index_map))

train_user_index_map = pd.DataFrame({'reviewerID': list(set(train['reviewerID']))}).to_dict()['reviewerID']
print(len(train_user_index_map))

In [None]:
# verify checks
print(train['reviewerID'].nunique())
print(len(train_user_review_map))
print(len(train_user_index_map))

In [None]:
train_prod_review_map = train.groupby(['asin'])['index'].progress_apply(list).to_dict()
pprint(list(train_prod_review_map.items())[:5])

In [None]:
# OLD METHOD
# train_prod_index_map = {}
# prods = list(train_movie_reviews['asin'].unique())
# index = 0
# for prod in tqdm(prods):
#     if prod not in train_prod_index_map.values():
#         train_prod_index_map[index] = prod
#         index += 1
#     else:
#         pass
# print(len(train_prod_index_map))

train_prod_index_map = pd.DataFrame({'asin': list(set(train['asin']))}).to_dict()['asin']
pprint(list(train_prod_index_map.items())[:5])

In [None]:
# verify checks
print(train['asin'].nunique())
print(len(train_prod_review_map))
print(len(train_prod_index_map))

In [None]:
MAP_PATH = Path("data/processed/mappings/")

np.save(f'{MAP_PATH}/{CATEGORY}_train_user_review_map.npy', train_user_review_map)
np.save(f'{MAP_PATH}/{CATEGORY}_train_user_index_map.npy', train_user_index_map)
np.save(f'{MAP_PATH}/{CATEGORY}_train_prod_review_map.npy', train_prod_review_map)
np.save(f'{MAP_PATH}/{CATEGORY}_train_prod_index_map.npy', train_prod_index_map)

In [8]:
MAP_PATH = Path("data/processed/mappings/")

train_user_review_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_user_review_map.npy', allow_pickle=True).item()
train_user_index_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_user_index_map.npy', allow_pickle=True).item()
train_prod_review_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_prod_review_map.npy', allow_pickle=True).item()
train_prod_index_map = np.load(f'{MAP_PATH}/{CATEGORY}_train_prod_index_map.npy', allow_pickle=True).item()

### Creating aggregated vector by product `asin` and building ANN trees

In [None]:
# creating approximate nearest neighbour setup
f = 150
t = AnnoyIndex(f, 'angular')    # using cosine measure

agg_prod_vectors = {}
# in range for all users
for k, v in tqdm(train_prod_index_map.items()):
    ind_reviews_vectors = np.zeros(f)
    # retrieve the agg reviews list based on user id
    # asin = train_prod_index_map[k]
    agg_list = train_prod_review_map[v]
    for j in agg_list:
        # retrieve the row number
        # row_num = temp.loc[temp['index'] == j].index[0]
        # retrieve the vector from d2v
        try:
            ind_reviews_vectors += np.array(model.dv[str(j)])
        except KeyError:
            pass
    # aggregate all doc vectors from a prod
    agg_vectors = np.divide(ind_reviews_vectors, len(agg_list))
    agg_prod_vectors[k] = agg_vectors
    # we need to add these into the ann object
    t.add_item(k, agg_vectors)

# build the trees
ANN_PATH = Path(f"anns/{CATEGORY}/")

t.build(300)
t.save(f'{ANN_PATH}/{CATEGORY}-f150.ann')

In [10]:
f = 150

ANN_PATH = Path(f"anns/{CATEGORY}/")
u = AnnoyIndex(f, 'angular')
u.load(f'{ANN_PATH}/{CATEGORY}-f150.ann')

True

In [None]:
np.save(f'{MAP_PATH}/{CATEGORY}_agg_prod_vectors.npy', agg_prod_vectors)

In [9]:
agg_prod_vectors = np.load(f'{MAP_PATH}/{CATEGORY}_agg_prod_vectors.npy', allow_pickle=True).item()

### Creating aggregated vector by users `reviewerID`

In [None]:
f = 150

agg_user_vectors = {}
# in range for all users
for k, v in tqdm(train_user_index_map.items()):
    ind_reviews_vectors = np.zeros(f)
    # retrieve the agg reviews list based on user id
    # user_id = train_user_index_map[i]
    agg_list = train_user_review_map[v]
    for j in agg_list:
        # retrieve the vector from d2v
        ind_reviews_vectors += np.array(model.dv[str(j)])
    # aggregate all doc vectors from a prod
    agg_vectors = np.divide(ind_reviews_vectors, len(agg_list))
    agg_user_vectors[k] = agg_vectors

In [None]:
np.save(f'{MAP_PATH}/{CATEGORY}_agg_user_vectors.npy', agg_user_vectors)

In [11]:
agg_user_vectors = np.load(f'{MAP_PATH}/{CATEGORY}_agg_user_vectors.npy', allow_pickle=True).item()

### Using Nearest Neighbor Search (NNS) to retrieve top 10 items 

Extending the idea of generating recommendations, we able to generate recommendations based on two factors:

1. Past users's purchase (using their reviews)
2. Item-level similarities based on aggregation of reviews embeddings

This meant that, given a user's past purchase history, where we generated an aggregation of reviews embeddings, we are able to use that same vector to help recommend items from the item *f*-dimensional space. Also, based on an instance where perhaps last click on a certain item, we are able to generate recommendations based on the comparison of similarities reviews from the clicked items to the other items within the item *f*-dimensional space.

#### Using users-level recommendation

In [12]:
def generate_random_user_recommendations():
    user_index = np.random.randint(0, len(train_user_index_map))
    reviewer_agg_vectors = agg_user_vectors[user_index]
    reviewer_id = train_user_index_map[user_index]

    ann_similar_items = u.get_nns_by_vector(reviewer_agg_vectors, 25, search_k=-1, include_distances=False)

    recommendations = pd.DataFrame(columns=['rank', 'asin', 'overall', 'review_counts', 'titles', 'brands'])
    ranks = [i for i in range(1,11,1)]
    asins = []
    overalls = []
    review_counts = []
    titles = []
    brands = []
    for item in ann_similar_items:
        user_history_train = train[train['reviewerID'] == reviewer_id]['asin'].unique()
        if item not in user_history_train:
            asin = train_prod_index_map[item]
            overall = train[train['asin'] == asin]['overall'].mean()
            review_count = len(train[train['asin'] == asin]['reviewText'])
            title = train[train['asin'] == asin]['title'].unique()[0]
            brand = train[train['asin'] == asin]['brand'].unique()[0]

            asins.append(asin)
            overalls.append(overall)
            review_counts.append(review_count)
            titles.append(title if len(title) > 0 else '')
            brands.append(brand if len(str(brand)) > 0 else '')
        else:
            pass

    recommendations['rank'] = ranks
    recommendations['asin'] = asins[:10]
    recommendations['overall'] = overalls[:10]
    recommendations['review_counts'] = review_counts[:10]
    recommendations['titles'] = titles[:10]
    recommendations['brands'] = brands[:10]
    
    # retrieving the recommendations
    print(f"For user: {reviewer_id}\n")
    print(f"Recommended items are:")
    for index, row in recommendations.iterrows():
        print(f"Rank: {row[0]} | {row[1]} | Rating: {row[2]:.2f} | Reviewed: {row[3]} | Title: {row[4]}")

    return reviewer_id

reviewer_id = generate_random_user_recommendations()

For user: A247LS4EOGZJK9

Recommended items are:
Rank: 1 | B01BL0Z8GS | Rating: 4.28 | Reviewed: 69 | Title: Ulticlip 3
Rank: 2 | B00W5EC39K | Rating: 4.19 | Reviewed: 189 | Title: BLACKHAWK! Ambidextrous Appendix Reversible Carry Inside The Pants Holster
Rank: 3 | B01B5E3G4Q | Rating: 4.47 | Reviewed: 15 | Title: Desantis Slim-Tuk Inside Fits Glock 43 Ambidextrous Kydex Pants Holster, Black
Rank: 4 | B017AFD5JK | Rating: 3.67 | Reviewed: 57 | Title: Q-Series IWB Minimalist Concealed Carry Stealth Holster
Rank: 5 | B007F7IWGU | Rating: 4.13 | Reviewed: 30 | Title: 5.11 Tactical Unisex Adult Inside Waistband 19/23 26/27 RH Holster
Rank: 6 | B01DMMKUFC | Rating: 4.72 | Reviewed: 134 | Title: CYA Supply Co. IWB Holster Fits: Glock 43 - Veteran Owned Company - Made in USA - Inside Waistband Concealed Carry Holster
Rank: 7 | B001UOHLCU | Rating: 4.20 | Reviewed: 44 | Title: Galco Triton Kydex IWB Holster for Glock 19, 23, 32 (Black, Right-hand)
Rank: 8 | B00HZTAX42 | Rating: 4.50 | Reviewed

In [13]:
# looking at the past purchase history of the user
user_history_train= train[train['reviewerID'] == reviewer_id]
user_history_train[['title', 'asin', 'overall', 'reviewText']]

Unnamed: 0,title,asin,overall,reviewText
1175232,Techna Clip Conceal Carry Gun Belt Clips for B...,B0094T6MKK,5.0,perfect accessory for the bodyguard. For just ...
1875527,"Browning BR182BL-BRK Vanquish Linerlock Knife,...",B00VU4MBQG,5.0,the flip out top thumb guard is an awesome ide...
1881104,BLACKHAWK! Ambidextrous Appendix Reversible Ca...,B00W5EC39K,3.0,"The clip is too flimsy, if the clip was metal ..."
2137869,Ulticlip 3,B01BL0Z8GS,5.0,Can be a little hard to release depending on t...
2173979,CYA Supply Co. IWB Holster Fits: Glock 43 - Ve...,B01DMMKUFC,5.0,"excellent holster for the money!! bought ""claw..."


In [14]:
user_history_test = test[test['reviewerID'] == reviewer_id]
user_history_test[['title', 'asin', 'overall', 'reviewText']]

Unnamed: 0,title,asin,overall,reviewText
419015,Fairwin EDC Tactical Belt - Reinforced Thick C...,B01D3842OA,5.0,can't beat the cost VS product for this belt. ...


#### Using items-level recommendation

In [29]:
def generate_random_prod_recommendations():
    user_index = np.random.randint(0, len(train_user_index_map))
    reviewer_agg_vectors = agg_user_vectors[user_index]
    reviewer_id = train_user_index_map[user_index]

    prod_index = np.random.randint(0, len(train_prod_index_map))
    prod_agg_vectors = agg_prod_vectors[prod_index]
    prod_asin = train_prod_index_map[prod_index]
    prod_title = train[train['asin'] == prod_asin]['title'].unique()[0]

    ann_similar_items = u.get_nns_by_vector(prod_agg_vectors, 50, search_k=-1, include_distances=False)

    recommendations = pd.DataFrame(columns=['rank', 'asin', 'overall', 'review_counts', 'titles', 'brands'])
    ranks = [i for i in range(1,12,1)]
    asins = []
    overalls = []
    review_counts = []
    titles = []
    brands = []
    for item in ann_similar_items:
        user_history_train = train[train['reviewerID'] == reviewer_id]['asin'].unique()
        if item not in user_history_train:
            asin = train_prod_index_map[item]
            overall = train[train['asin'] == asin]['overall'].mean()
            review_count = len(train[train['asin'] == asin]['reviewText'])
            title = train[train['asin'] == asin]['title'].unique()[0]
            brand = train[train['asin'] == asin]['brand'].unique()[0]

            asins.append(asin)
            overalls.append(overall)
            review_counts.append(review_count)
            titles.append(title if len(title) > 0 else '')
            brands.append(brand if len(str(brand)) > 0 else '')

    recommendations['rank'] = ranks
    recommendations['asin'] = asins[:11]
    recommendations['overall'] = overalls[:11]
    recommendations['review_counts'] = review_counts[:11]
    recommendations['titles'] = titles[:11]
    recommendations['brands'] = brands[:11]
    
    # retrieving the recommendations
    print(f"For user: {reviewer_id}, given last viewed/clicked product: {prod_asin} – {prod_title}...\n")
    print(f"Recommended items are:")
    for index, row in recommendations.iterrows():
        print(f"Rank: {row[0]} | {row[1]} | Rating: {row[2]:.2f} | Reviewed: {row[3]} | Title: {row[4]}")

generate_random_prod_recommendations()

For user: AZJF5T6HWZZFD, given last viewed/clicked product: B00029EWIA – Spark-Lite Fire Starter...

Recommended items are:
Rank: 1 | B00029EWIA | Rating: 4.14 | Reviewed: 7 | Title: Spark-Lite Fire Starter
Rank: 2 | B00QGKYKJA | Rating: 4.91 | Reviewed: 57 | Title: Lightning Strike Standard Fire Starter Holland
Rank: 3 | B00029EWYY | Rating: 4.30 | Reviewed: 168 | Title: Four Seasons Survival Tinder-Quik Fire Tab
Rank: 4 | B0027T0KJS | Rating: 4.56 | Reviewed: 72 | Title: S.O.L. Survive Outdoors Longer Tinder Quik Fire Starters (12-Count)
Rank: 5 | B001H9N8BG | Rating: 4.62 | Reviewed: 52 | Title: Ultimate Survival Technologies StrikeForce Fire Starter
Rank: 6 | B0013L2DKU | Rating: 4.76 | Reviewed: 265 | Title: Light My Fire Original Swedish FireSteel Army 12,000 Strike Fire Starter - Black
Rank: 7 | B003R9VMPS | Rating: 4.69 | Reviewed: 32 | Title: FireSteel Armageddon with Super Scraper and Lanyard
Rank: 8 | B0035LVZFK | Rating: 4.03 | Reviewed: 32 | Title: Zippo Emergency Fire Sta

### Computing Metrics