In [15]:
!pip install gensim
!pip install pandas
!pip install pandarallel
!pip install numpy
!pip install tqdm
!pip install ipywidgets
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp38-cp38-macosx_10_13_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 6.4 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Collecting joblib>=0.11
  Using cached joblib-1.0.1-py3-none-any.whl (303 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.0.1 scikit-learn-0.24.2 threadpoolctl-2.2.0


In [34]:
import logging
import numpy as np
import pandas as pd
import warnings

from pandarallel import pandarallel
from pathlib import Path
from pprint import pprint
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from src.features import preprocessing 
from tqdm import tqdm

pandarallel.initialize(progress_bar=True)
tqdm.pandas()
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## 1. Load data

In [3]:
# global variables
DATA_PATH = Path('data/processed/')
CATEGORY = 'Clothing_Shoes_and_Jewelry'

train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [4]:
print(f"Train: {train.shape}, unique users: {train.reviewerID.nunique()}, unique items: {train.asin.nunique()}")
print(f"Test: {test.shape}, unique users: {test.reviewerID.nunique()}, unique items: {test.asin.nunique()}")

Train: (231491, 5), unique users: 39387, unique items: 23033
Test: (47145, 5), unique users: 39380, unique items: 17949


In [5]:
# check train
train.head().append(train.tail())

Unnamed: 0,overall,reviewerID,asin,reviewText,processedReviewText
0,5.0,A1KLRMWW2FWPL4,0000031887,This is a great tutu and at a really great pri...,this great tutu great price it look cheap glad...
1,5.0,A2G5TCU2WDFZ65,0000031887,I bought this for my 4 yr old daughter for dan...,buy yr old daughter dance class wore today tim...
2,5.0,A1RLQXYNCMWRWN,0000031887,What can I say... my daughters have it in oran...,what daughters orange black white pink think b...
3,4.0,A27UF1MSF3DB2,0000031887,I received this today and I'm not a fan of it ...,receive today fan daughter think puffier look ...
4,5.0,A16GFPNVF4Y816,0000031887,Bought this as a backup to the regular ballet ...,bought backup regular ballet outfit daughter w...
231486,5.0,ACJT8MUC0LRF0,B00KKXCJQU,When I pack it looks like a disaster area in a...,when pack look like disaster area suitcase pac...
231487,5.0,A2DG63DN704LOI,B00KKXCJQU,I don't normally go ga-ga over a product very ...,normally ga ga product cub awesome help review...
231488,5.0,A1UQBFCERIP7VJ,B00KKXCJQU,These are very nice packing cubes and the 18 x...,these nice packing cube laundry storage bag ni...
231489,5.0,A22CW0ZHY3NJH8,B00KKXCJQU,I am on vacation with my family of four and th...,vacation family shacke pak set wonderful excep...
231490,5.0,A30VWT3R25QAVD,B00KKXCJQU,When I signed up to receive a free set of Shac...,when sign receive free set shacke pak review t...


In [6]:
# check test
test.head().append(test.tail())

Unnamed: 0,overall,reviewerID,asin,reviewText,processedReviewText
0,5.0,A8U3FAMSJVHS5,0000031887,"We bought several tutus at once, and they are ...",we buy tutu get high review sturdy seemingly t...
1,5.0,A3GEOILWLK86XM,0000031887,Thank you Halo Heaven great product for Little...,thank halo heaven great product little girls m...
2,5.0,A2A2WZYLU528RO,0000031887,My daughter has worn this skirt almost every d...,my daughter worn skirt day receive washer clot...
3,5.0,A34ATJR9KFIXL9,0000031887,Full and well stitched. This tutu is a beauti...,full stitch this tutu beautiful purple color l...
4,5.0,A1MXJVYXE2QU6H,0000031887,Perfect for my budding grand daughter ballerin...,perfect bud grand daughter ballerina beautiful...
47140,5.0,A2XX2A4OJCDNLZ,B00KF9180W,While balaclavas can be used for a variety of ...,while balaclavas variety thing use mainly late...
47141,2.0,A34BZM6S9L7QI4,B00KGCLROK,These were a free sample for review. I was ex...,these free sample review excite try unfortunat...
47142,5.0,A25C2M3QF9G7OQ,B00KGCLROK,These socks are very nicely made and quite com...,these sock nicely comfortable wear the grip do...
47143,5.0,AEL6CQNQXONBX,B00KKXCJQU,This set of travel organizers includes four pi...,this set travel organizer include piece total ...
47144,5.0,A1EVV74UQYVKRY,B00KKXCJQU,I've been traveling back and forth to England ...,travel forth england pack way suitcases some p...


## 2. Preparing data for `Item-KNN` model

In [21]:
# creating a pivot matrix
user_item_matrix_df = train.pivot_table(index='reviewerID', columns='asin', values='overall').fillna(0.0)

# checking the first 5 rows
user_item_matrix_df.head()

asin,0000031887,0123456479,1608299953,1617160377,B00001W0KA,B00001WRHJ,B00004SR8W,B00004SR8Z,B00004SR9P,B00004U1J2,...,B00K0BPFY0,B00K551QR6,B00K5T4NHC,B00K8J06CK,B00KA2X4QK,B00KA602SY,B00KCWMG5S,B00KF9180W,B00KGCLROK,B00KKXCJQU
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A001114613O3F18Q5NVR6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00146182PNM90WNNAZ5Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00165422B2GAUE3EL6Z0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00338282E99B8OR2JYTZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00354001GE099Q1FL0TU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2.1 Creating sparse matrix

In [10]:
user_item_matrix = csr_matrix(user_item_matrix_df.values)

In [13]:
print(f"User-Item Matrix: {user_item_matrix.shape}")

User-Item Matrix: (39387, 23033)


### 2.2 Training a KNN model

In [19]:
# instantiating model
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
# fitting model to data
model.fit(user_item_matrix)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=10)

### 2.3 Evaluating model queries

In [24]:
query_index = np.random.choice(user_item_matrix_df.shape[0])
print(f"Query index: {query_index}...")

distances, indices = model.kneighbors(user_item_matrix_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors=10)

Query index: 1573...


## 3. Generating recommendations

In [31]:
similar_users = []

for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f"Recommendation for {user_item_matrix_df.index[query_index]}...\n")
    else:
        print(f'{i}: {user_item_matrix_df.index[indices.flatten()[i]]}, with a distance of {distances.flatten()[i]:.5f}')
        similar_users.append(user_item_matrix_df.index[indices.flatten()[i]])

Recommendation for A15JPYV0L19RF...

1: A3ID3ZGPOA1LJ7, with a distance of 0.68107
2: AY8GWRA0LQTAB, with a distance of 0.71873
3: A1AFY8WLHKC20T, with a distance of 0.74842
4: A1VN1E366B3PUS, with a distance of 0.79035
5: A1MXJVANURX26J, with a distance of 0.79264
6: A394HK1TR2GV9P, with a distance of 0.79797
7: A15BCH9AYL7PD5, with a distance of 0.80415
8: APOXJ8I6412IR, with a distance of 0.80497
9: A3CHCNPBZJSTFE, with a distance of 0.80642


In [40]:
# computing weighted average
product_metrics = train.groupby(['asin']).agg({'overall': np.mean, 'reviewerID': 'count'}).rename(columns={'overall': 'rating_average', 'reviewerID': 'count'}).reset_index()
product_metrics = preprocessing.compute_weighted_ratings(product_metrics, 'count', 'rating_average')

# check dataframe
pprint(product_metrics)

             asin  rating_average  count  rating_weighted
0      0000031887        4.500000     18         4.402917
1      0123456479        4.200000      5         4.230287
2      1608299953        4.200000     10         4.223076
3      1617160377        4.538462     13         4.403525
4      B00001W0KA        4.714286      7         4.426922
...           ...             ...    ...              ...
23028  B00KA602SY        3.833333      6         4.099094
23029  B00KCWMG5S        2.500000      4         3.778973
23030  B00KF9180W        4.714286      7         4.426922
23031  B00KGCLROK        5.000000      3         4.406043
23032  B00KKXCJQU        5.000000      7         4.538033

[23033 rows x 4 columns]


In [41]:
# join the weighted average back into the training dataframe so we can sort
train = train.merge(product_metrics[['asin', 'rating_weighted']], on='asin')
train.head()

Unnamed: 0,overall,reviewerID,asin,reviewText,processedReviewText,rating_weighted
0,5.0,A1KLRMWW2FWPL4,31887,This is a great tutu and at a really great pri...,this great tutu great price it look cheap glad...,4.402917
1,5.0,A2G5TCU2WDFZ65,31887,I bought this for my 4 yr old daughter for dan...,buy yr old daughter dance class wore today tim...,4.402917
2,5.0,A1RLQXYNCMWRWN,31887,What can I say... my daughters have it in oran...,what daughters orange black white pink think b...,4.402917
3,4.0,A27UF1MSF3DB2,31887,I received this today and I'm not a fan of it ...,receive today fan daughter think puffier look ...,4.402917
4,5.0,A16GFPNVF4Y816,31887,Bought this as a backup to the regular ballet ...,bought backup regular ballet outfit daughter w...,4.402917


### 3.1 Retrieving past purchase history of similar users

In [54]:
similar_users_history = train[train['reviewerID'].isin(similar_users)][['asin', 'rating_weighted']].drop_duplicates().sort_values(by='rating_weighted', ascending=False)[:10]['asin'].to_list()

['B000GKUGC6',
 'B0013UMQX0',
 'B004LCPDBY',
 'B000JFHQWG',
 'B000XP0FDY',
 'B00383YAMK',
 'B000R7NYU4',
 'B0031U0PO2',
 'B000KK0OLA',
 'B004OYT3TM']

### 3.2 Generate a loop to go through all test users

In [113]:
test_purchase_history = test.groupby(['reviewerID'])['asin'].apply(list).to_frame().reset_index()

In [114]:
# let randomly sample 1000 rows to make predictions
sampled_test_purchase_history = test_purchase_history.sample(n=5000, random_state=42)

pprint(sampled_test_purchase_history)

           reviewerID                                               asin
109    A104QGECCAFCI9                                       [B00592VMNI]
15112  A2G5OW0UIBAUIT                           [B008SCM0AU, B00AOCV6OI]
13118  A29BPMJI0ZYH4H  [B0058XH5D4, B007BZ5CUU, B00A0SXLOO, B00AVPHH4...
37097   ARQZEE0LA1PBB                                       [B000A2KC7O]
31660   A8VSC4N8D63MJ                                       [B007ZRS0ZI]
...               ...                                                ...
14667  A2EP4PMBS78D5F                                       [B001SN8DHK]
16387  A2KNB31SNXN0MR                                       [B008MMJ27K]
33258   AENTXUFIYPSMZ                                       [B003NX8C2O]
7029    A1OJHJSWH0F4K                                       [B0087SX5YA]
26273  A3IKG99RBVQDMK                                       [B00CJ6YMES]

[5000 rows x 2 columns]


In [116]:
user_recommendations = {}

for user in tqdm(sampled_test_purchase_history.reviewerID.values):
    query_index = user_item_matrix_df.index.to_list().index(user)
    distances, indices = model.kneighbors(user_item_matrix_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors=10)
    recommendations = (train[train['reviewerID'].isin(similar_users)][['asin', 'rating_weighted']]
                   .drop_duplicates()
                   .sort_values(by='rating_weighted', ascending=False)[:10]['asin']
                   .to_list())
    # storing k,v - user, recommendations into dictionary
    user_recommendations[user] = recommendations

100%|██████████| 5000/5000 [01:51<00:00, 44.73it/s]


In [117]:
def precision_at_k(user, asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(user_recommendations[user])
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating precision@K - relevant / total recommended
    precision_at_k = num_relevant / k
    
    return precision_at_k

def recall_at_k(user, asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(user_recommendations[user])
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

In [119]:
sampled_test_purchase_history['precision@K'] = sampled_test_purchase_history.progress_apply(lambda x: precision_at_k(x.reviewerID, x.asin), axis=1)
sampled_test_purchase_history['recall@K'] = sampled_test_purchase_history.progress_apply(lambda x: recall_at_k(x.reviewerID, x.asin), axis=1)

100%|██████████| 5000/5000 [00:00<00:00, 42528.92it/s]
100%|██████████| 5000/5000 [00:00<00:00, 44877.59it/s]


In [121]:
# checking the dataframe
sampled_test_purchase_history.head().append(sampled_test_purchase_history.tail())

Unnamed: 0,reviewerID,asin,precision@K,recall@K
109,A104QGECCAFCI9,[B00592VMNI],0.0,0.0
15112,A2G5OW0UIBAUIT,"[B008SCM0AU, B00AOCV6OI]",0.0,0.0
13118,A29BPMJI0ZYH4H,"[B0058XH5D4, B007BZ5CUU, B00A0SXLOO, B00AVPHH4...",0.0,0.0
37097,ARQZEE0LA1PBB,[B000A2KC7O],0.0,0.0
31660,A8VSC4N8D63MJ,[B007ZRS0ZI],0.0,0.0
14667,A2EP4PMBS78D5F,[B001SN8DHK],0.0,0.0
16387,A2KNB31SNXN0MR,[B008MMJ27K],0.0,0.0
33258,AENTXUFIYPSMZ,[B003NX8C2O],0.0,0.0
7029,A1OJHJSWH0F4K,[B0087SX5YA],0.0,0.0
26273,A3IKG99RBVQDMK,[B00CJ6YMES],0.0,0.0


In [126]:
average_precision_at_k = sampled_test_purchase_history["precision@K"].mean()
average_recall_at_k = sampled_test_purchase_history["recall@K"].mean()

print(f"The model has a average precision@K: {average_precision_at_k:.5f}, average recall@K: {average_recall_at_k:.5f}.")

The model has a average precision@K: 0.00012, average recall@K: 0.00120.


### 3.3 Looking at the correct recommendations

In [125]:
sampled_test_purchase_history[sampled_test_purchase_history['recall@K'] == 1]

Unnamed: 0,reviewerID,asin,precision@K,recall@K
38958,AYEKQENVQ5D4,[B000KK0OLA],0.1,1.0
8059,A1S39JTPIBMR6C,[B0031U0PO2],0.1,1.0
37210,AS6Z51F731HD2,[B0031U0PO2],0.1,1.0
36938,AR6Y9LV34NIQT,[B0031U0PO2],0.1,1.0
27067,A3LB2NFSHKWGCJ,[B000XP0FDY],0.1,1.0
8510,A1TMAVN4CEM8U8,[B000GKUGC6],0.1,1.0
