In [1]:
import numpy as np 
import pandas as pd
import warnings

from gensim.models.doc2vec import Doc2Vec
from pandarallel import pandarallel
from src.models import cf
from tqdm import tqdm 

pandarallel.initialize()
tqdm.pandas()
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.




# Load Data and Model

In [2]:
# global variables
DATA_PATH = 'data/evaluation'
MODEL_PATH = 'models/d2v'
CATEGORY = 'Grocery_and_Gourmet_Food'

train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
model = Doc2Vec.load(f"{MODEL_PATH}/{CATEGORY}_50_d2v.model") 

In [3]:
train.head().append(train.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...
47769,77420,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A192LQZWDYPR4U,5.0,Another quality Reese Peanut Butter Cup produc...,2014-02-27,quality reese peanut butter cup product great ...
47770,77421,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A2QKXW3LDQ66P5,5.0,I purchased these for my husband who has every...,2013-02-20,purchase husband love reeses valentine day pre...
47771,77430,B00ID9VSOM,"Viva Labs Organic Coconut Sugar: Non-GMO, Low-...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2P3TGJU301KXD,5.0,this stuff is INCREDIBILY yummy! SO much bette...,2014-07-15,stuff incredibily yummy good regular brown sug...
47772,77456,B00IRL93SY,Barrie House Kenya Estate - AA Single Cup Caps...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AEFE9VDHTQ199,5.0,"Very nice aroma, body and taste! Will buy this...",2014-05-24,nice aroma body taste buy coffee good coffee a...
47773,77508,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2AEZQ3DGBBLPR,2.0,This is a no go for diabetics according to my ...,2014-06-26,diabetic accord wife doctor order intention us...


In [4]:
# testing model
model.dv[0]

array([-1.9998994 , -1.1165652 , -0.66850775,  0.98593354, -0.1539128 ,
        1.836437  ,  0.04748935,  0.91502106, -0.27679813, -0.71812797,
       -0.707553  , -0.6039597 , -0.58606654,  1.0271451 , -0.03445855,
       -1.2801527 , -0.48076808, -0.33833098,  0.32701826, -0.56980634,
       -0.9162374 ,  0.33309776, -0.37450856,  0.02434505, -0.3233081 ,
        2.094295  , -1.4732084 ,  1.2163205 , -1.8009837 , -0.34398207,
        0.79519683,  0.48662725,  0.38086358,  0.9061864 ,  0.98379046,
       -0.49270517, -1.679146  , -0.8714166 , -0.8851388 ,  1.7304606 ,
       -0.21798603, -0.90019166, -1.5550374 , -1.2914916 ,  0.3585223 ,
        0.7392237 ,  0.06309976, -0.6030123 ,  1.4061383 ,  0.12848437],
      dtype=float32)

# Generate User Embeddings

In [5]:
# get user rating history
user_rating_history = train.groupby(['reviewerID'])['asin'].apply(list)

print(user_rating_history)

reviewerID
A00177463W0XWB16A9O05                             [B0029XDZIK, B0094ISOMA]
A022899328A0QROR32DCT                             [B001ACMCNU, B003TO9RSU]
A068255029AHTHDXZURNU                             [B000K8WVYA, B0094ISOMA]
A06944662TFWOKKV4GJKX                                         [B000CQBZPG]
A1004703RC79J9                                                [B001E50THY]
                                               ...                        
AZWRZZAMX90VT            [B0007R9L5Q, B000CQ01GU, B000E123IC, B000E46LZ...
AZXKAH2DE6C8A            [B000EML7DS, B000ODF2ME, B001650XUK, B0018QLG9...
AZXON596A1VXC                         [B00113SKZW, B00113ZTVK, B001L4JH5I]
AZYXC63SS008M                                                 [B0040WCQKQ]
AZZ5ASC403N74                                                 [B004U49QU2]
Name: asin, Length: 13397, dtype: object


In [6]:
# getting unique users
unique_users = user_rating_history.reset_index()['reviewerID'].tolist()

# generating user embeddings for all unique users
user_embeddings = {}

for user in tqdm(unique_users):
    user_embedding = np.zeros(50)
    for item in user_rating_history[user]:
        user_embedding += model.dv[item]
        
    # mean aggregation
    user_embedding /= len(user_rating_history[user])
    user_embeddings[user] = user_embedding

100%|██████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 39805.31it/s]


# Generate Top-N Recommendations (N=10)

In [7]:
def get_top_n(d2v, user_embeddings, n=10):
    """Return the top-N recommendations for each user based on cosine similarity.
    
    Args:
        d2v ([Doc2Vec]): Doc2Vec of item representations based on reviews.
        user_embeddings ([dict]): User representations based on mean aggregation of item
            representations within the d2v vector dimension space.
    
    Returns:
        ([dict]): A dictionary of top-N recommendations for each unique user, sorted by
            cosine similarties.
    """
    
    # retrieve a 200 items candidate list based on similarities
    top_ns = {}
    for user in tqdm(user_embeddings.items()):
        candidate_items = [i[0] for i in d2v.dv.most_similar([user[1]], topn=200)]
        unrated_items = set(candidate_items) - set(user_rating_history[user[0]])
        
        user_top_n = []
        idx = 0
        while len(user_top_n) < n:
            if candidate_items[idx] in unrated_items:
                user_top_n.append(candidate_items[idx])
                idx += 1
            else:
                idx += 1
        
        top_ns[user[0]] = user_top_n
        
    return top_ns

In [8]:
top_ns = get_top_n(model, user_embeddings, 10)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:04<00:00, 2951.35it/s]


In [9]:
# generating a random user
random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
print(f"For user: {random_user}:")
print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

# find the recommendations
print(f"\nRecommending:\n")
print(f"{train[train['asin'].isin(top_ns[random_user])][['asin', 'title']].drop_duplicates(subset='asin')}")

For user: A1IRN1M05TPOVT:
Purchase History:
             asin                                              title
3202   B000CQE3HS  Slim Jim Giant Smoked Snack Sticks, Tabasco, ....
3810   B000E1BL5S  Planters Trail Mix, Sweet &amp; Salty 6 Oz (Pa...
5835   B000EQT77M  TERRA Sweets &amp; Beets Chips, No Salt Added,...
9160   B000H27NU6  Ghirardelli Chocolate Premium Hot Cocoa Mix, D...
15998  B0016MN9G8  Twizzlers Strawberry Candy Twists - 180 Pcs, 3...
21161  B001EQ4GPO   Planters Cashews with Almonds &amp; Pecans, 9...
21700  B001EQ55YU  Emerald Nuts Whole Cashews, 10-Ounce Canisters...
23518  B001HBXO54    Lindor Assorted Chocolate Truffles, 21.2 ounce 
24821  B001M0A8XQ  Special K Breakfast Cereal Original, 18 oz(Pac...
26527  B001YJBLMM  Crunchmaster Multi-Grain Crackers, Gluten Free...
33157  B003ZXFWP2  Cheez-It Baked Snack Mix, Classic, 10.5 oz Box...

Recommending:

             asin                                              title
8275   B000FYVKUA  Buffalo Bills 2-lb Mild 

# Evaluate Top-N Recommendations (N=10)

### Defining Evaluation Metrics

In [10]:
def precision_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating precision@K - relevant / total recommended
    precision_at_k = num_relevant / k
    
    return precision_at_k

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

In [11]:
# loading test dataset
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [12]:
test.head().append(test.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,2,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1TCSC0YWT82Q0,5.0,I love ethnic foods and to cook them. I recent...,2013-08-03,love ethnic food cook recently purchase produc...
1,8,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1Z7Y2GMAP9SRY,5.0,I like to make my own curry but this is a tast...,2014-06-27,like curry tasty alternative use base kind dif...
2,23,B00004S1C5,"Ateco Food Coloring Kit, 6 colors","['Grocery & Gourmet Food', 'Cooking & Baking',...",A14YSMLYLJEMET,1.0,This product is no where near natural / organi...,2013-03-29,product near natural organic wish review purch...
3,31,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",A2F488C4PLWGEI,5.0,If my wife drinks a cup of this tea when she f...,2014-03-23,wife drink cup tea feel attack come help avoid...
4,32,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AO1HXV7DWZZIR,5.0,I don't know about the medicinal aspects of th...,2014-02-06,know medicinal aspect tea flavor downright scr...
28001,77519,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A1WT3TVHANP7ZF,3.0,Hmmm. I really wanted to love this sweetener. ...,2014-07-22,hmmm want love sweetener half sugar half stevi...
28002,77520,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A3NEAETOSXDBOM,5.0,"I confess I have a sweet tooth, and love the t...",2014-06-30,confess sweet tooth love taste sugar recognize...
28003,77521,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",AD1ZOPB0BBEHB,4.0,"It has a little of the stevia aftertaste, but ...",2014-07-17,little stevia aftertaste fair compromise able ...
28004,77522,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A18ECVX2RJ7HUE,5.0,i love marinade for grilled flank steak or lon...,2014-05-30,love marinade grilled flank steak london broil...
28005,77523,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2G04D4QZAXL15,3.0,I've been using Truvia (a form of stevia) on m...,2014-05-27,use truvia form stevia cereal greek yogurt yea...


In [13]:
# generating test rating history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [14]:
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

In [15]:
# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

In [16]:
test_merged.head()

Unnamed: 0,reviewerID,asin,pred_asin
0,A00177463W0XWB16A9O05,"[B00474OR8G, B00BFM6OAW]","[B0033GMSTY, B001CHFUDC, B002AQ0OL2, B001EO5Y5..."
1,A022899328A0QROR32DCT,[B00CMQDKES],"[B000E48IP6, B000VK4F5A, B004K677KG, B005I3EGJ..."
2,A068255029AHTHDXZURNU,[B001FA1K2G],"[B001KZ1AE4, B003SBRQAU, B005DVUYCU, B0014EOUA..."
3,A06944662TFWOKKV4GJKX,[B000GFYRHG],"[B001EQ5IN8, B000L8CB76, B0000DBN1L, B001HX59Z..."
4,A1004703RC79J9,[B003GTR8IO],"[B001EQ55PE, B001E50TJC, B001EQ4Q72, B001M1JDM..."


In [17]:
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin), axis=1)

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 51524.91it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 53171.73it/s]


In [18]:
test_merged.head()

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
0,A00177463W0XWB16A9O05,"[B00474OR8G, B00BFM6OAW]","[B0033GMSTY, B001CHFUDC, B002AQ0OL2, B001EO5Y5...",0.0,0.0
1,A022899328A0QROR32DCT,[B00CMQDKES],"[B000E48IP6, B000VK4F5A, B004K677KG, B005I3EGJ...",0.0,0.0
2,A068255029AHTHDXZURNU,[B001FA1K2G],"[B001KZ1AE4, B003SBRQAU, B005DVUYCU, B0014EOUA...",0.0,0.0
3,A06944662TFWOKKV4GJKX,[B000GFYRHG],"[B001EQ5IN8, B000L8CB76, B0000DBN1L, B001HX59Z...",0.0,0.0
4,A1004703RC79J9,[B003GTR8IO],"[B001EQ55PE, B001E50TJC, B001EQ4Q72, B001M1JDM...",0.0,0.0


In [19]:
k = 10
average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

The MEM-ECF has a average precision@10: 0.00628, average recall@10: 0.03796.


# Looking at Relevant Recommendations

In [20]:
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
44,A10BWUA2MGA9BK,[B000S8593W],"[B004YTV5S4, B00856TSCC, B000S8593W, B00B8DSFX...",0.1,1.00
68,A10IRGY2RUZ6MW,[B008TSVXWE],"[B008TSVXWE, B001EQ5S9C, B000FYVKUA, B000RFS57...",0.1,1.00
77,A10N8O0MPJ7IRH,"[B00021639Y, B00021639Y]","[B0013JQON4, B000EDBPO8, B003LPKETS, B00021639...",0.1,0.50
83,A10PUMFCPP2SXV,[B009PARMMA],"[B001F10XUU, B009PARMMA, B004DJD0DS, B000GB0TM...",0.1,1.00
91,A10UCLH6YM8F79,"[B004SKON6U, B004UA52F4]","[B004UA52F4, B000VJM4FO, B0044R3DNG, B00FQGP20...",0.1,0.50
...,...,...,...,...,...
13245,AZKRFNQ8EFO4T,"[B000EICISA, B00473PVVO]","[B000EICISA, B000E7WM0U, B000EICJWA, B000LKVSD...",0.1,0.50
13253,AZOCADWL9N2H2,"[B000EML7FG, B001E4Q5GO, B001KUWFPC, B003DNL9VA]","[B00550MI00, B003DNL9VA, B001OCKI5U, B00B2JUEB...",0.1,0.25
13259,AZQA8ZIGS01FG,"[B001CHFUDC, B004538TME]","[B001CHFUDC, B000TK6LBS, B0033HGLTG, B001EO5Y5...",0.1,0.50
13271,AZWIAYHWL2FWE,[B004D6042G],"[B000K6Z22U, B004D6042G, B000G6Q4GW, B007VCF1Q...",0.1,1.00


# Generating Top-N Recommendations (N=25)

In [21]:
top_ns = get_top_n(model, user_embeddings, 25)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:05<00:00, 2463.37it/s]


In [22]:
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

In [23]:
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=25), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=25), axis=1)

k = 25
average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 38239.81it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 40204.20it/s]

The MEM-ECF has a average precision@25: 0.00442, average recall@25: 0.06376.





In [24]:
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
44,A10BWUA2MGA9BK,[B000S8593W],"[B004YTV5S4, B00856TSCC, B000S8593W, B00B8DSFX...",0.04,1.000000
55,A10F4G1THW8581,"[B0033GMSTY, B005ZBZLT4]","[B001CHDITU, B001CHFUDC, B0033GZMXS, B0033HGLT...",0.04,0.500000
56,A10FJGRMOTJ35Y,"[B0033HPPIO, B0046HG0SY, B007PA33MA]","[B0033HGLTG, B001CHH3PU, B004RYX8UO, B001ELL4F...",0.04,0.333333
68,A10IRGY2RUZ6MW,[B008TSVXWE],"[B008TSVXWE, B001EQ5S9C, B000FYVKUA, B000RFS57...",0.04,1.000000
77,A10N8O0MPJ7IRH,"[B00021639Y, B00021639Y]","[B0013JQON4, B000EDBPO8, B003LPKETS, B00021639...",0.04,0.500000
...,...,...,...,...,...
13253,AZOCADWL9N2H2,"[B000EML7FG, B001E4Q5GO, B001KUWFPC, B003DNL9VA]","[B00550MI00, B003DNL9VA, B001OCKI5U, B00B2JUEB...",0.04,0.250000
13255,AZOTVJHNSAQXG,"[B001EQ4RWQ, B005CULMWI]","[B00FKZ5TL0, B003DVKBK2, B001EQ5PKE, B001QKE59...",0.04,0.500000
13259,AZQA8ZIGS01FG,"[B001CHFUDC, B004538TME]","[B001CHFUDC, B000TK6LBS, B0033HGLTG, B001EO5Y5...",0.04,0.500000
13271,AZWIAYHWL2FWE,[B004D6042G],"[B000K6Z22U, B004D6042G, B000G6Q4GW, B007VCF1Q...",0.04,1.000000


# Generating Top-N Recommendations (N=30)

In [25]:
top_ns = get_top_n(model, user_embeddings, 30)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:05<00:00, 2677.10it/s]


In [26]:
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

In [27]:
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=30), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=30), axis=1)

k = 30
average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 47299.31it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 45175.29it/s]

The MEM-ECF has a average precision@30: 0.00404, average recall@30: 0.07000.





In [28]:
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
3,A06944662TFWOKKV4GJKX,[B000GFYRHG],"[B001EQ5IN8, B000L8CB76, B0000DBN1L, B001HX59Z...",0.033333,1.000000
44,A10BWUA2MGA9BK,[B000S8593W],"[B004YTV5S4, B00856TSCC, B000S8593W, B00B8DSFX...",0.033333,1.000000
55,A10F4G1THW8581,"[B0033GMSTY, B005ZBZLT4]","[B001CHDITU, B001CHFUDC, B0033GZMXS, B0033HGLT...",0.033333,0.500000
56,A10FJGRMOTJ35Y,"[B0033HPPIO, B0046HG0SY, B007PA33MA]","[B0033HGLTG, B001CHH3PU, B004RYX8UO, B001ELL4F...",0.033333,0.333333
68,A10IRGY2RUZ6MW,[B008TSVXWE],"[B008TSVXWE, B001EQ5S9C, B000FYVKUA, B000RFS57...",0.033333,1.000000
...,...,...,...,...,...
13253,AZOCADWL9N2H2,"[B000EML7FG, B001E4Q5GO, B001KUWFPC, B003DNL9VA]","[B00550MI00, B003DNL9VA, B001OCKI5U, B00B2JUEB...",0.033333,0.250000
13255,AZOTVJHNSAQXG,"[B001EQ4RWQ, B005CULMWI]","[B00FKZ5TL0, B003DVKBK2, B001EQ5PKE, B001QKE59...",0.033333,0.500000
13259,AZQA8ZIGS01FG,"[B001CHFUDC, B004538TME]","[B001CHFUDC, B000TK6LBS, B0033HGLTG, B001EO5Y5...",0.033333,0.500000
13271,AZWIAYHWL2FWE,[B004D6042G],"[B000K6Z22U, B004D6042G, B000G6Q4GW, B007VCF1Q...",0.033333,1.000000


# Generating Top-N Recommendations (N=45)

In [29]:
top_ns = get_top_n(model, user_embeddings, 45)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:04<00:00, 2814.53it/s]


In [30]:
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

In [31]:
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=45), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=45), axis=1)

k = 45
average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 44435.75it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 44212.78it/s]


The MEM-ECF has a average precision@45: 0.00343, average recall@45: 0.08819.


In [32]:
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
0,A00177463W0XWB16A9O05,"[B00474OR8G, B00BFM6OAW]","[B0033GMSTY, B001CHFUDC, B002AQ0OL2, B001EO5Y5...",0.022222,0.500000
3,A06944662TFWOKKV4GJKX,[B000GFYRHG],"[B001EQ5IN8, B000L8CB76, B0000DBN1L, B001HX59Z...",0.022222,1.000000
19,A102UXGLDF76G1,"[B0009TN7F2, B001HTC17S]","[B005761IRG, B000FDOUXA, B00DRA8HWI, B0012AL3W...",0.022222,0.500000
44,A10BWUA2MGA9BK,[B000S8593W],"[B004YTV5S4, B00856TSCC, B000S8593W, B00B8DSFX...",0.022222,1.000000
55,A10F4G1THW8581,"[B0033GMSTY, B005ZBZLT4]","[B001CHDITU, B001CHFUDC, B0033GZMXS, B0033HGLT...",0.022222,0.500000
...,...,...,...,...,...
13255,AZOTVJHNSAQXG,"[B001EQ4RWQ, B005CULMWI]","[B00FKZ5TL0, B003DVKBK2, B001EQ5PKE, B001QKE59...",0.022222,0.500000
13259,AZQA8ZIGS01FG,"[B001CHFUDC, B004538TME]","[B001CHFUDC, B000TK6LBS, B0033HGLTG, B001EO5Y5...",0.022222,0.500000
13271,AZWIAYHWL2FWE,[B004D6042G],"[B000K6Z22U, B004D6042G, B000G6Q4GW, B007VCF1Q...",0.022222,1.000000
13273,AZWP97BZPJI1D,"[B0025UCHS6, B0025UCHT0, B00BIEU5QQ]","[B007JFXXJY, B000V9NFLQ, B005CT9OEW, B008YUL4C...",0.022222,0.333333


# Evaluating `EmbeddedCF` class

In [33]:
class EmbeddedCF():
    """
    """
    def __init__(self, d2v):
        self.d2v = d2v
        self.user_rating_history = None
        self.user_embeddings = None
        
    def fit(self, train, dimension=50):
        # get user rating history
        user_rating_history = train.groupby(['reviewerID'])['asin'].apply(list)
        # getting unique users
        unique_users = user_rating_history.reset_index()['reviewerID'].tolist()

        # generating user embeddings for all unique users
        user_embeddings = {}

        for user in tqdm(unique_users):
            user_embedding = np.zeros(dimension)
            for item in user_rating_history[user]:
                user_embedding += self.d2v.dv[item]

            # mean aggregation
            user_embedding /= len(user_rating_history[user])
            user_embeddings[user] = user_embedding
        
        self.user_rating_history = user_rating_history
        self.user_embeddings = user_embeddings
        
    def predict(self, n=200):
        """Generate a list of n-number of candidates items.

        This only generates a generic candidate list of items which do not factor
        in existing rated items and also top-N items required for recommendations.

        """
        candidate_items = {}
        for user in tqdm(self.user_embeddings.items()):
            candidate_items[user[0]] = [i for i in self.d2v.dv.most_similar([user[1]], topn=n)]

        return candidate_items


In [34]:
mem_ecf = EmbeddedCF(model)

In [35]:
mem_ecf.fit(train, dimension=50)

100%|██████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 42798.92it/s]


In [36]:
mem_ecf.user_rating_history

reviewerID
A00177463W0XWB16A9O05                             [B0029XDZIK, B0094ISOMA]
A022899328A0QROR32DCT                             [B001ACMCNU, B003TO9RSU]
A068255029AHTHDXZURNU                             [B000K8WVYA, B0094ISOMA]
A06944662TFWOKKV4GJKX                                         [B000CQBZPG]
A1004703RC79J9                                                [B001E50THY]
                                               ...                        
AZWRZZAMX90VT            [B0007R9L5Q, B000CQ01GU, B000E123IC, B000E46LZ...
AZXKAH2DE6C8A            [B000EML7DS, B000ODF2ME, B001650XUK, B0018QLG9...
AZXON596A1VXC                         [B00113SKZW, B00113ZTVK, B001L4JH5I]
AZYXC63SS008M                                                 [B0040WCQKQ]
AZZ5ASC403N74                                                 [B004U49QU2]
Name: asin, Length: 13397, dtype: object

In [37]:
predictions = mem_ecf.predict(n=200)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:04<00:00, 2682.15it/s]


In [38]:
# check predictions
predictions['A00177463W0XWB16A9O05'][:5]

[('B0029XDZIK', 0.9551868438720703),
 ('B0033GMSTY', 0.8697038292884827),
 ('B001CHFUDC', 0.8662201166152954),
 ('B002AQ0OL2', 0.8370271325111389),
 ('B001EO5Y52', 0.8349511027336121)]

In [39]:
def get_top_n(predictions, user_rating_history, n=10):
    """Return the top-N recommendations for each user based on cosine similarity.
    
    Args:
    
    Returns:
        ([dict]): A dictionary of top-N recommendations for each unique user, sorted by
            cosine similarties.
    """
    
    # retrieve a 200 items candidate list based on similarities
    top_ns = {}
    for user in tqdm(predictions):
        rated_items = user_rating_history[user]
        candidate_items = [i[0] for i in predictions[user]]
        unrated_items = set(candidate_items) - set(rated_items)
        
        user_top_n = []
        idx = 0
        while len(user_top_n) < n:
            if candidate_items[idx] in unrated_items:
                user_top_n.append(candidate_items[idx])
                idx += 1
            else:
                idx += 1
        
        top_ns[user] = user_top_n
        
    return top_ns

### N=10

In [40]:
top_ns = get_top_n(predictions, mem_ecf.user_rating_history, n=10)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 10
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 25808.82it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46451.99it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 48486.85it/s]

The MEM-ECF has a average precision@10: 0.00628, average recall@10: 0.03796.





### N=25

In [41]:
top_ns = get_top_n(predictions, mem_ecf.user_rating_history, n=25)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 25
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 29920.31it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46217.08it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 40798.45it/s]

The MEM-ECF has a average precision@25: 0.00442, average recall@25: 0.06376.





### N=30

In [42]:
top_ns = get_top_n(predictions, mem_ecf.user_rating_history, n=30)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 30
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 22118.05it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 41727.69it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 43009.35it/s]

The MEM-ECF has a average precision@30: 0.00404, average recall@30: 0.07000.





### N=45

In [43]:
top_ns = get_top_n(predictions, mem_ecf.user_rating_history, n=45)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 45
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 25740.60it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 47591.19it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 48284.66it/s]

The MEM-ECF has a average precision@45: 0.00343, average recall@45: 0.08819.





In [44]:
# looking at how many get correct
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
0,A00177463W0XWB16A9O05,"[B00474OR8G, B00BFM6OAW]","[B0033GMSTY, B001CHFUDC, B002AQ0OL2, B001EO5Y5...",0.022222,0.500000
3,A06944662TFWOKKV4GJKX,[B000GFYRHG],"[B001EQ5IN8, B000L8CB76, B0000DBN1L, B001HX59Z...",0.022222,1.000000
19,A102UXGLDF76G1,"[B0009TN7F2, B001HTC17S]","[B005761IRG, B000FDOUXA, B00DRA8HWI, B0012AL3W...",0.022222,0.500000
44,A10BWUA2MGA9BK,[B000S8593W],"[B004YTV5S4, B00856TSCC, B000S8593W, B00B8DSFX...",0.022222,1.000000
55,A10F4G1THW8581,"[B0033GMSTY, B005ZBZLT4]","[B001CHDITU, B001CHFUDC, B0033GZMXS, B0033HGLT...",0.022222,0.500000
...,...,...,...,...,...
13255,AZOTVJHNSAQXG,"[B001EQ4RWQ, B005CULMWI]","[B00FKZ5TL0, B003DVKBK2, B001EQ5PKE, B001QKE59...",0.022222,0.500000
13259,AZQA8ZIGS01FG,"[B001CHFUDC, B004538TME]","[B001CHFUDC, B000TK6LBS, B0033HGLTG, B001EO5Y5...",0.022222,0.500000
13271,AZWIAYHWL2FWE,[B004D6042G],"[B000K6Z22U, B004D6042G, B000G6Q4GW, B007VCF1Q...",0.022222,1.000000
13273,AZWP97BZPJI1D,"[B0025UCHS6, B0025UCHT0, B00BIEU5QQ]","[B007JFXXJY, B000V9NFLQ, B005CT9OEW, B008YUL4C...",0.022222,0.333333
