In [1]:
!pip install gensim
!pip install pandas
!pip install pandarallel
!pip install numpy
!pip install tqdm
!pip install ipywidgets



In [2]:
import logging
import numpy as np
import pandas as pd
import warnings

from gensim.models.doc2vec import Doc2Vec
from pandarallel import pandarallel
from pathlib import Path
from pprint import pprint
from src.features import preprocessing
from tqdm import tqdm

# Start pandarallel so that `parallel_apply` is available on pandas objects,
# with a progress bar shown while the workers run.
pandarallel.initialize(progress_bar=True)
# Register tqdm's pandas integration.
tqdm.pandas()
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas dtype / deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# 1. Load data

Let's load the train/test data that we processed previously. We will do a quick check on the shape of both datasets, and also visually confirm that `processedReviewText` is still in `str` format – hence, we need to tokenize it before passing it into the `Doc2Vec` model.

In [3]:
# global variables
DATA_PATH = Path('data/processed/')
CATEGORY = 'Clothing_Shoes_and_Jewelry'

# Identifier columns are read explicitly as strings: ASINs such as '0000031887' consist
# only of digits, and letting pandas infer the dtype could parse them as integers and
# drop the leading zeros (any dtype warning would be hidden by the global warning filter).
ID_DTYPES = {'reviewerID': str, 'asin': str}

train = pd.read_csv(DATA_PATH / f"{CATEGORY}_train.csv", dtype=ID_DTYPES)
test = pd.read_csv(DATA_PATH / f"{CATEGORY}_test.csv", dtype=ID_DTYPES)

In [4]:
# Sanity check: frame shape plus the number of distinct reviewers and items in each split.
print(f"Train: {train.shape}, unique users: {train.reviewerID.nunique()}, unique items: {train.asin.nunique()}")
print(f"Test: {test.shape}, unique users: {test.reviewerID.nunique()}, unique items: {test.asin.nunique()}")

Train: (231491, 5), unique users: 39387, unique items: 23033
Test: (47145, 5), unique users: 39380, unique items: 17949


In [5]:
# check train: first and last five rows
# (`DataFrame.append` was removed in pandas 2.0, so the two slices are stacked with `pd.concat`)
pd.concat([train.head(), train.tail()])

Unnamed: 0,overall,reviewerID,asin,reviewText,processedReviewText
0,5.0,A1KLRMWW2FWPL4,0000031887,This is a great tutu and at a really great pri...,this great tutu great price it look cheap glad...
1,5.0,A2G5TCU2WDFZ65,0000031887,I bought this for my 4 yr old daughter for dan...,buy yr old daughter dance class wore today tim...
2,5.0,A1RLQXYNCMWRWN,0000031887,What can I say... my daughters have it in oran...,what daughters orange black white pink think b...
3,4.0,A27UF1MSF3DB2,0000031887,I received this today and I'm not a fan of it ...,receive today fan daughter think puffier look ...
4,5.0,A16GFPNVF4Y816,0000031887,Bought this as a backup to the regular ballet ...,bought backup regular ballet outfit daughter w...
231486,5.0,ACJT8MUC0LRF0,B00KKXCJQU,When I pack it looks like a disaster area in a...,when pack look like disaster area suitcase pac...
231487,5.0,A2DG63DN704LOI,B00KKXCJQU,I don't normally go ga-ga over a product very ...,normally ga ga product cub awesome help review...
231488,5.0,A1UQBFCERIP7VJ,B00KKXCJQU,These are very nice packing cubes and the 18 x...,these nice packing cube laundry storage bag ni...
231489,5.0,A22CW0ZHY3NJH8,B00KKXCJQU,I am on vacation with my family of four and th...,vacation family shacke pak set wonderful excep...
231490,5.0,A30VWT3R25QAVD,B00KKXCJQU,When I signed up to receive a free set of Shac...,when sign receive free set shacke pak review t...


In [6]:
# check test: first and last five rows
# (`DataFrame.append` was removed in pandas 2.0, so the two slices are stacked with `pd.concat`)
pd.concat([test.head(), test.tail()])

Unnamed: 0,overall,reviewerID,asin,reviewText,processedReviewText
0,5.0,A8U3FAMSJVHS5,0000031887,"We bought several tutus at once, and they are ...",we buy tutu get high review sturdy seemingly t...
1,5.0,A3GEOILWLK86XM,0000031887,Thank you Halo Heaven great product for Little...,thank halo heaven great product little girls m...
2,5.0,A2A2WZYLU528RO,0000031887,My daughter has worn this skirt almost every d...,my daughter worn skirt day receive washer clot...
3,5.0,A34ATJR9KFIXL9,0000031887,Full and well stitched. This tutu is a beauti...,full stitch this tutu beautiful purple color l...
4,5.0,A1MXJVYXE2QU6H,0000031887,Perfect for my budding grand daughter ballerin...,perfect bud grand daughter ballerina beautiful...
47140,5.0,A2XX2A4OJCDNLZ,B00KF9180W,While balaclavas can be used for a variety of ...,while balaclavas variety thing use mainly late...
47141,2.0,A34BZM6S9L7QI4,B00KGCLROK,These were a free sample for review. I was ex...,these free sample review excite try unfortunat...
47142,5.0,A25C2M3QF9G7OQ,B00KGCLROK,These socks are very nicely made and quite com...,these sock nicely comfortable wear the grip do...
47143,5.0,AEL6CQNQXONBX,B00KKXCJQU,This set of travel organizers includes four pi...,this set travel organizer include piece total ...
47144,5.0,A1EVV74UQYVKRY,B00KKXCJQU,I've been traveling back and forth to England ...,travel forth england pack way suitcases some p...


# 2. Doc2Vec model

## 2.1 Preparing `TaggedDocument` for Doc2Vec model

In the following section, we will generate the tagged documents that will be fed into the `Doc2Vec` model. Each document is 'tagged' with the `asin` of the item that the review addresses. This enables the Doc2Vec model to identify the documents associated with each asin in our training dataset, and to create a document vector based on the separate documents for each asin.

The intuition behind this preparation is that we assume each asin is represented by all the reviews customers have left after purchasing it. If customers like an aspect of the product, their reviews should contain relevant positive feedback on that aspect, e.g., "the boots are comfortable" – hence, we know that this particular pair of boots is comfortable. If a product (asin) has many such reviews, we can build an item profile that semantically associates these boots with being comfortable. As embeddings are generated in an *n*-dimensional vector space, we can then attempt to find similar products within the neighbourhood that have a similar profile, be it another pair of boots or a product that is comfortable in nature.

The reason we are building this at an item-item level is that, at a user level, interests may vary greatly depending on the user's needs when purchasing items. Also, as some users may be more negative in nature, their reviews may generally be more critical, which will inherently produce a profile that is critical in nature. If we were to place such a user profile vector into the *n*-dimensional vector space, we would likely be recommending products that were also critically (or negatively) reviewed. This means that for this user, we might only generate poor recommendations because the neighbourhood is associated with negative semantics. At the item level, however, it is highly unlikely that a user has only made poor purchases on the site; hence, it is more likely that the item profiles developed will have a mix of good and possibly poor semantics. This ensures that we will have positive recommendations generated if we implement a threshold of some sort, so that the average/weighted-average rating of the products meets an initial criterion for recommendations.

In [9]:
%%time
# tokenizng the `processedReviewText`
train['tokenizedReviewText'] = train['processedReviewText'].parallel_apply(lambda x: x.split())

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=28937), Label(value='0 / 28937')))…

Process ForkPoolWorker-37:
Process ForkPoolWorker-38:
Process ForkPoolWorker-36:
Process ForkPoolWorker-32:
Process ForkPoolWorker-35:
Process ForkPoolWorker-34:


KeyboardInterrupt: 

In [None]:
# check train (now including `tokenizedReviewText`): first and last five rows
# (`DataFrame.append` was removed in pandas 2.0, so the two slices are stacked with `pd.concat`)
pd.concat([train.head(), train.tail()])

## 2.2 Preparing `item-level` tagged documents

In [None]:
# Build the tagged documents from the training frame; the helper returns a pair and
# only its second element (the item-level documents) is kept here.
_, item_documents = preprocessing.prepare_tagged_documents(
    df=train,
    users='reviewerID',
    asins='asin',
    reviews='tokenizedReviewText',
)

In [None]:
# Preview the first ten item-level documents to eyeball their structure.
pprint(item_documents[:10])

## 2.3 Training `Doc2Vec` model

We will be training a `Doc2Vec` model with initial hyperparameters decided based on a study by [Caselles-Dupré, Lesaint, and Royo-Letelier (2018)](https://arxiv.org/abs/1804.04212), where they observed that `negative sampling distribution`, `number of epochs`, `subsampling parameter` and `window size` can significantly improve performance on recommendation tasks. Hence, we have decided to start with values similar to those presented in the study as the basis for improvement during this project.

In [None]:
# Emit timestamped log records; DEBUG level also surfaces gensim's training progress.
# NOTE(review): DEBUG is very verbose for a 50-epoch run — INFO may be enough.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s')

# model parameters (passed to gensim's Doc2Vec in the next cell)
VECTOR_SIZE = 150   # dimensionality of the learned document/word vectors
MIN_COUNT = 10      # ignore words whose total frequency is below this
NEGATIVE = 5        # number of "noise" words drawn per positive sample (negative sampling)
NS_EXPONENT = 0.5   # exponent shaping the negative-sampling distribution
SAMPLE = 1e-05      # threshold for randomly down-sampling high-frequency words
DM = 1              # 1 selects the distributed-memory (PV-DM) training algorithm
WORKERS = 8         # number of worker threads used for training
EPOCHS = 50         # number of passes over the corpus during training

In [None]:
# Create an untrained Doc2Vec model configured with the hyperparameters defined above.
# NOTE(review): EPOCHS is not passed to the constructor, only to `train()` below.
model = Doc2Vec(
    vector_size=VECTOR_SIZE,
    min_count=MIN_COUNT,
    negative=NEGATIVE,
    sample=SAMPLE,
    ns_exponent=NS_EXPONENT,
    dm=DM,
    workers=WORKERS,
)

# building vocab
model.build_vocab(item_documents)

# training model
# `corpus_count` is the number of documents seen by `build_vocab` above.
model.train(item_documents, total_examples=model.corpus_count, epochs=EPOCHS)

In [None]:
# saving model
MODEL_PATH = Path("models/d2v/")
# Make sure the target directory exists; otherwise `model.save` fails on a fresh checkout
# after the (expensive) training cell has already finished.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

model.save(f"{MODEL_PATH}/{CATEGORY}_{VECTOR_SIZE}_{SAMPLE}_{EPOCHS}_d2v.model")

## 2.4 Verifying Doc2Vec model

In [None]:
# Reload the model from disk, using the same file-name pattern as the save cell
# (category, vector size, sample threshold and epoch count are encoded in the name).
MODEL_PATH = Path("models/d2v/")

model = Doc2Vec.load(f"{MODEL_PATH}/{CATEGORY}_{VECTOR_SIZE}_{SAMPLE}_{EPOCHS}_d2v.model")

### 2.4.1 Testing retrieval of vectors via index

In [None]:
# Look up a document vector by integer position; position 0 is expected to be asin '0000031887'.
model.dv[0] # '0000031887'

### 2.4.2 Testing retrieval of vectors via tags

In [None]:
# Show element 1 of each of the first five documents.
# NOTE(review): presumably these are the documents' tags (the asin) — confirm against
# `preprocessing.prepare_tagged_documents`.
print([i[1] for i in item_documents[:5]])

In [None]:
# Look up the same document vector by its tag (the asin string) instead of by position.
model.dv['0000031887']

### 2.4.3 Checking number of document vectors

In [None]:
# One key per learned document vector; compare against the number of unique asins in train.
print(f"Number of document vectors: {len(model.dv.index_to_key)}")

We observed that retrieving the document vector via either the index or the actual tag returns the same vector. We also generated `23033` vectors, which is aligned with the number of unique items we have in the training data.

## 2.5 Examining Doc2Vec results

### 2.5.1 Are inferred vectors close to the precalculated ones?

In [None]:
# Sample one item id at random, re-infer a vector from that item's own training
# reviews, and list the stored document vectors closest to the inferred one.
random_asin = np.random.choice(list(train['asin'].unique()), 1)[0]

# flatten every tokenised review of the sampled item into a single token list
item_reviews = train.loc[train['asin'] == random_asin, "tokenizedReviewText"]
all_review = [token for review in item_reviews for token in review]

# inferring vector
print(f"For item {random_asin}...\n")
inferred_vec = model.infer_vector(all_review, epochs=5)
nearest = model.dv.most_similar([inferred_vec], topn=5)
print(f'Most similar D2V vectors: {nearest}')

In [None]:
# Repeat the inference check for another randomly sampled item id: infer a vector
# from the item's concatenated review tokens and look up its nearest stored vectors.
random_asin = np.random.choice(list(train['asin'].unique()), 1)[0]

# gather the tokens of all reviews written for the sampled item
all_review = []
for tokens in train.loc[train['asin'] == random_asin, "tokenizedReviewText"]:
    all_review += list(tokens)

# inferring vector
print(f"For item {random_asin}...\n")
neighbours = model.dv.most_similar(
    [model.infer_vector(all_review, epochs=5)],
    topn=5,
)
print(f'Most similar D2V vectors: {neighbours}')

We are able to verify that, by using the same tokens that were used for training, we can still infer accurate vectors from the model.

### 2.5.2 Generating top-N recommendations based on aggregated item history

As mentioned previously, we want to generate recommendations at the item level instead of the user level because a user's nature is dynamic. What a user likes now might not be the same a few days or weeks later. However, from an item perspective, if an item is good, the general public who purchased the item should have positive feedback. The nature of the item profile should not vary as much as it would at the user level.

Hence, instead of building a user profile based on the reviews that the user has given, we will build an *aggregated* item history profile based on the user's past purchase history. A simplification of the algorithm would be:

1. Identify the list of items previously purchased by a user
2. Using the Doc2Vec model trained on unique tags based on `asin`, we mean-aggregate the items' document vectors to produce an aggregated item purchase history vector
3. Using the aggregated item purchase history vector, we find the top-N (N = 10) recommendations while excluding previously purchased items and setting a threshold for the mean/weighted-mean rating of the recommended items to ensure that the recommended items are popular among other users as well.

In [None]:
# TODO: need to figure out how to define a threshold (`threshold` is accepted but not applied yet)
def get_top_n(user, n=10, threshold=4):
    """Recommend the `n` items closest to a user's aggregated purchase history.

    The user's purchased items are looked up in the global `train` frame, their
    document vectors (from the global `model`) are mean-aggregated, and the items
    most similar to that aggregate vector are returned. Items the user has already
    purchased are excluded from the result.

    Args:
        user: `reviewerID` of the user to recommend for.
        n: Maximum number of recommendations to return.
        threshold: Reserved for a future rating filter; currently unused.

    Returns:
        A list of at most `n` asins ordered from most to least similar, or an empty
        list when the user has no purchases in `train`.
    """
    # retrieving user's purchase history
    # (a boolean mask avoids regrouping the entire frame on every call)
    purchase_history = train.loc[train['reviewerID'] == user, 'asin'].tolist()
    if not purchase_history:
        # unknown user: there is nothing to aggregate, so nothing to recommend
        return []

    # mean aggregation; the vector dimension comes from the model's own vectors and the
    # mean is accumulated in float64
    purchase_history_vec = np.mean(
        [model.dv[item] for item in purchase_history],
        axis=0,
        dtype=np.float64,
    )

    # `most_similar` on a raw vector does not exclude anything, so over-fetch by the
    # number of purchased items and drop them before truncating to `n`
    purchased = set(purchase_history)
    candidates = model.dv.most_similar([purchase_history_vec], topn=n + len(purchased))
    return [asin for asin, _ in candidates if asin not in purchased][:n]

In [None]:
# draw one reviewer id at random from the training users
random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
# show what that user bought and wrote
user_records = train.loc[train['reviewerID'] == random_user, ["asin", "reviewText"]]
pprint(user_records)

print(f"\nFor user {random_user}...\n")
recommended = get_top_n(random_user)
print(f'Most similar item D2V vectors: {recommended}')

The example of recommendation above where user `A2OSOO0NRPLZRH`

# 3. Computing Metrics

Now that we are able to generate recommendations for users, we are going to compute metrics to better evaluate our model with existing techniques used in recommendations for comparison. 

We will be using the following metrics:

1. `Precision@K`: Proportion of recommendations that are relevant (i.e., items that the user has actually purchased).
2. `Recall@K`: Proportion of relevant recommendations retrieved.


In [None]:
# let's take a look at our testing set: first and last five rows
# (`DataFrame.append` was removed in pandas 2.0, so the two slices are stacked with `pd.concat`)
pd.concat([test.head(), test.tail()])

In [None]:
# one row per test-set user, with the list of asins that user reviewed in the test split
test_purchase_history = (
    test
    .groupby(['reviewerID'])['asin']
    .apply(list)
    .reset_index()
)

pprint(test_purchase_history.head(5))

In [None]:
# Generate recommendations for every user in the test set, in parallel via pandarallel.
# NOTE(review): the result is only displayed, not stored — assign it to a variable
# before computing the metrics described above.
test_purchase_history['reviewerID'].parallel_apply(get_top_n)