In [1]:
import numpy as np
import pandas as pd
import warnings

from gensim.models.doc2vec import Doc2Vec
from pandarallel import pandarallel
from tqdm import tqdm

from src.models import cf

# start pandarallel; it logs the worker count and data-transfer mode it picked
pandarallel.initialize()
# register tqdm's pandas integration so `.progress_apply` is available further down
tqdm.pandas()
# NOTE: this silences *every* Python warning for the session, deprecation notices included
warnings.filterwarnings('ignore')



INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Load Data and Models

In [2]:
# global variables
# (both paths are relative, so they resolve against the notebook's working directory)
DATA_PATH = "data/evaluation"
D2V_PATH = "models/d2v"
CATEGORY = "Clothing_Shoes_and_Jewelry"

# training split of the reviews for CATEGORY
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
# pre-trained Doc2Vec model for the same category (the "_50_" file; vectors shown below have 50 entries)
d2v = Doc2Vec.load(f"{D2V_PATH}/{CATEGORY}_50_d2v.model")

In [3]:
# checking train dataframe: first and last five rows
# (`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent)
pd.concat([train.head(), train.tail()])

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A1KLRMWW2FWPL4,5.0,This is a great tutu and at a really great pri...,2011-02-12,great tutu great price look cheap glad look am...
1,1,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A2G5TCU2WDFZ65,5.0,I bought this for my 4 yr old daughter for dan...,2013-01-19,buy yr old daughter dance class wear today tim...
2,2,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A1RLQXYNCMWRWN,5.0,What can I say... my daughters have it in oran...,2013-01-04,daughter orange black white pink think buy fuc...
3,3,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A8U3FAMSJVHS5,5.0,"We bought several tutus at once, and they are ...",2014-04-27,buy tutu high review sturdy seemingly girl wea...
4,4,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A3GEOILWLK86XM,5.0,Thank you Halo Heaven great product for Little...,2014-03-15,thank halo heaven great product little girls g...
176500,278257,B00JULS24Q,Buyinhouse Feeling Adorable Cute Pink Bowknot ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A2IQT5AFFXA1OM,4.0,This item is exactly as described but for some...,2014-05-21,item exactly described reason like person like...
176501,278290,B00K035Y08,Sakkas 197 Oasis Gauzy Crepe Sleeveless Blouse...,"[['Clothing, Shoes & Jewelry', 'Women', 'Cloth...",A1XWMGHBAPKOV3,5.0,Put on and it fit great easy care also.Would t...,2014-06-15,fit great easy care tell friend buy thank hawa...
176502,278297,B00K0352PU,Sakkas Paradise Embroidered Relaxed Fit Blouse,"[['Clothing, Shoes & Jewelry', 'Women', 'Cloth...",A3VTQB69FYGQDU,3.0,Before I washed the top it was extremely large...,2014-06-06,wash extremely large fit perfectly wash embroi...
176503,278316,B00K8J06CK,TrendzArt Azules Poly Span Floral Print Full L...,"[['Clothing, Shoes & Jewelry', 'Women', 'Cloth...",A241BLSJL8AGY,4.0,This maxi skirt was very nice material it fit ...,2014-06-20,maxi skirt nice material fit nicely color pret...
176504,278345,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A2YKWYC3WQJX5J,5.0,This is truly a year round product. Here in th...,2014-06-21,truly year round product midwest summer use ba...


In [4]:
# sanity check that the model loaded: display the document vector stored at integer index 0
d2v.dv[0]

array([ 5.1905590e-01, -1.8664794e-02,  3.6558089e-01,  2.1327564e-01,
       -1.2871456e-01, -6.6470662e-03,  8.4289604e-01,  3.3393204e-01,
       -1.5057931e+00, -1.6294066e-03,  3.7197164e-01,  2.6250139e-01,
       -1.0654725e+00, -5.2654725e-01, -1.3182850e+00,  4.3912196e-01,
        7.9720378e-01,  2.8077942e-01, -8.1759244e-01, -1.0747051e+00,
       -1.4533414e-01,  4.2795819e-01,  9.3902397e-01,  1.1115754e+00,
        1.6438978e+00,  1.3701128e+00,  3.3023664e-01, -3.3916229e-01,
       -6.3672036e-02,  4.8618659e-01,  3.0647725e-01,  8.2723975e-01,
       -4.7669479e-01, -1.3936814e-02, -1.3913001e+00,  1.2560036e+00,
        2.0797744e+00,  8.0265391e-01,  7.8117311e-01,  2.2114015e+00,
        8.6191636e-01,  2.7219886e-01, -8.0978021e-02, -9.9884219e-02,
        2.2644877e+00, -8.0019486e-01,  1.3349795e-02, -8.4338784e-02,
       -1.2035748e+00,  6.8047214e-01], dtype=float32)

# Generate User Embeddings

In [5]:
# get user rating history: one list of reviewed `asin`s per `reviewerID`
# (`progress_apply` comes from the `tqdm.pandas()` call above and shows a progress bar)
train_user_rating_history = train.groupby(["reviewerID"])["asin"].progress_apply(list)
print(train_user_rating_history)

100%|██████████████████████████████████████████████████████| 39386/39386 [00:00<00:00, 64156.19it/s]

reviewerID
A001114613O3F18Q5NVR6     [B0016JNS44, B001T54XA8, B004AZXO1I, B004QJWKLS]
A00146182PNM90WNNAZ5Q     [B000JJX7C0, B000MX3SH2, B003CO205E, B008JXDFCU]
A00165422B2GAUE3EL6Z0     [B007WADN4G, B007WAEBPQ, B007WAT3I6, B008G51WHQ]
A00338282E99B8OR2JYTZ                 [B002FA5B8O, B003F06XQW, B00768LFYY]
A00354001GE099Q1FL0TU                 [B00387EEYA, B003RYZY8E, B0058XN9ZC]
                                               ...                        
AZZMQ85DPFEG3            [B0007KPPAI, B005EYUQ7E, B005VEMVI4, B007LOTZ5...
AZZNK89PXD006                         [B000KGOHLM, B005C3DH00, B005PQPLLC]
AZZT1ERHBSNQ8            [B000UANLGU, B002ATSG8C, B002UTJVMM, B007P83XJ...
AZZTOUKVTUMVM             [B000196UJ0, B009GE3XQ4, B00C2DJ66C, B00C4NV6LS]
AZZYW4YOE1B6E                         [B003V4AKTS, B004G8GOW0, B007LTV82W]
Name: asin, Length: 39386, dtype: object





In [6]:
def generate_user_embeddings(user_rating_history: pd.Series, d2v: Doc2Vec) -> dict:
    """Build one embedding per user by averaging the vectors of the items they rated.

    Args:
        user_rating_history ([pd.Series]): Maps each user id (the index) to the list
            of item ids that user rated.
        d2v ([Doc2Vec]): Trained model; ``d2v.dv[item]`` must return the vector
            stored for an item id.

    Returns:
        ([dict]): ``{user id: mean item vector}``. Each value is a float64 array of
            length ``d2v.vector_size``.
    """

    user_embeddings = {}
    # walk the (user, items) pairs directly instead of re-indexing the Series per user
    for user, items in tqdm(user_rating_history.items(), total=len(user_rating_history)):
        # size the accumulator from the model rather than hard-coding 50 dimensions
        user_embedding = np.zeros(d2v.vector_size)
        for item in items:
            user_embedding += d2v.dv[item]

        # computing mean aggregation
        user_embedding /= len(items)
        user_embeddings[user] = user_embedding

    return user_embeddings

In [7]:
# mean-pooled item embedding for every training user
# (NOTE(review): this variable is not referenced again in the cells visible here)
train_user_embeddings = generate_user_embeddings(train_user_rating_history, d2v)

100%|██████████████████████████████████████████████████████| 39386/39386 [00:01<00:00, 34280.59it/s]


# Utility Functions

In [8]:
def get_top_n(predictions: dict, user_rating_history: pd.DataFrame, n=10) -> dict:
    """Return the top-N not-yet-rated candidate items for each user.

    Args:
        predictions ([dict]): Maps each user to a ranked candidate list. Every entry
            carries the item id at position 0 (e.g. ``(item, score)`` pairs); the
            remaining positions are not used here.
        user_rating_history: Lookup indexable by user that yields the items the user
            has already rated. These items are never recommended.
        n ([int]): Maximum number of items to keep per user.

    Returns:
        ([dict]): For every user in ``predictions``, up to ``n`` item ids in the
            candidates' original order. Fewer than ``n`` are returned when the
            candidate list is exhausted first.
    """

    top_ns = {}
    for user, candidates in predictions.items():
        rated_items = set(user_rating_history[user])

        user_top_n = []
        # a bounded scan: the previous index-driven `while` loop ran past the end of
        # the candidate list (IndexError) whenever fewer than n unrated items existed
        for candidate in candidates:
            if len(user_top_n) >= n:
                break
            item = candidate[0]
            if item not in rated_items:
                user_top_n.append(item)

        top_ns[user] = user_top_n

    return top_ns

def recall_at_k(asins, predicted_asins, k=10):
    """Fraction of the relevant items that appear among the first ``k`` predictions.

    Args:
        asins: The relevant (ground-truth) item ids for one user.
        predicted_asins: Recommended item ids, best first.
        k: Cut-off; predictions beyond position ``k`` are ignored.

    Returns:
        float: ``|relevant ∩ top-k predictions| / len(asins)``, or ``0.0`` when
            ``asins`` is empty.
    """
    # an empty ground truth has nothing to recall; avoid dividing by zero
    if len(asins) == 0:
        return 0.0

    set_actual = set(asins)
    # honour the cut-off: only the first k predictions count towards recall@k
    set_preds = set(list(predicted_asins)[:k])
    num_relevant = len(set_actual & set_preds)

    # calculating recall@K - relevant / total relevant items
    return num_relevant / len(asins)

def novelty_at_k(item_popularity, predicted_asins, k=10):
    """Average novelty of a recommendation list: ``(k - summed popularity) / k``.

    Args:
        item_popularity: pandas object indexed by item id that holds each item's
            popularity score.
        predicted_asins: Recommended item ids; looked up with ``.loc``.
        k: List length used for the normalisation.

    Returns:
        Whatever ``.sum()`` yields for ``item_popularity`` after the arithmetic:
        a one-entry Series for a single-column DataFrame, a scalar for a Series.
    """
    recommended_popularity = item_popularity.loc[predicted_asins]
    total_popularity = recommended_popularity.sum()

    return (k - total_popularity) / k

def generate_item_popularity(train: pd.DataFrame) -> pd.DataFrame:
    """Score every item's popularity as its review count relative to the busiest item.

    Args:
        train: Frame with an ``asin`` column and a ``processedReviewText`` column.

    Returns:
        Frame indexed by ``asin`` with a single ``processedReviewText`` column
        holding ``item review count / largest review count``.
    """
    # reviews per item (non-null `processedReviewText` entries), indexed by `asin`
    review_counts = train.groupby(['asin']).agg({'processedReviewText': 'count'})

    # the most-reviewed item sets the scale, so its popularity is exactly 1
    max_reviews = review_counts.max().values[0]

    return review_counts.apply(lambda column: column / max_reviews)
    

def evaluate_recommendations(top_ns: dict, user_rating_history: pd.DataFrame, item_popularity: pd.DataFrame, k=10) -> pd.DataFrame:
    """Score each user's recommendations with recall@k and novelty@k and print the averages.

    Args:
        top_ns: Maps ``reviewerID`` to that user's list of recommended item ids.
        user_rating_history: Frame with a ``reviewerID`` column and an ``asin``
            column holding the items the recommendations are compared against.
        item_popularity: Popularity lookup handed to ``novelty_at_k``.
        k: Cut-off forwarded to both metrics and shown in the printed summary.

    Returns:
        The merged frame (only users present in both inputs) with the added
        ``recall@k`` and ``novelty@k`` columns.
    """
    recommendations = pd.DataFrame(top_ns.items(), columns=["reviewerID", "pred_asin"])

    # inner join: a user is scored only when both a history and recommendations exist
    scored = pd.merge(left=user_rating_history, right=recommendations, on="reviewerID", how="inner")

    # per-user metrics, then their means across all scored users
    scored["recall@k"] = scored.apply(lambda row: recall_at_k(row.asin, row.pred_asin, k=k), axis=1)
    scored["novelty@k"] = scored.apply(lambda row: novelty_at_k(item_popularity, row.pred_asin, k=k), axis=1)
    average_recall_at_k = scored["recall@k"].mean()
    average_novelty_at_k = scored["novelty@k"].mean()

    print(f"The MEM-ECF has an average recall@{k}: {average_recall_at_k:.5f}, average novelty@{k}: {average_novelty_at_k:.5f}")

    return scored

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [9]:
# held-out reviews for the same category, read from the same directory as the training split
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [10]:
# first and last five rows of the test split
# (`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent)
pd.concat([test.head(), test.tail()])

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,6,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A16GFPNVF4Y816,5.0,Bought this as a backup to the regular ballet ...,2014-05-03,bought backup regular ballet outfit daughter w...
1,17,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A2XJ13PIXVJFJH,1.0,Never GOT this item - but gave a 1 STAR becaus...,2014-05-12,item star reply supplier great try send item r...
2,23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A2WNN1DQVL4LH5,5.0,The minute I saw this my heart skipped a beat....,2013-11-07,minute saw heart skip beat nice case sort coll...
3,24,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A1ZPOCG2ST2CY3,5.0,Love this Jewelry Box so well put together ho...,2014-01-19,love jewelry box hold plendy love pink look ni...
4,27,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A1JC50F14SLAEV,3.0,I wanted to have the title summarize my though...,2014-05-12,want title summarize thought decide read entir...
99497,278341,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A1EVV74UQYVKRY,4.0,I go walking a lot in all kinds of weather and...,2014-06-16,walk lot kind weather know lot like block cold...
99498,278342,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",ABUE0ALHKWKHC,5.0,This two pack of Balaclavas makes for a very n...,2014-06-09,pack balaclava nice purchase balaclava fit snu...
99499,278343,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A1PI8VBCXXSGC7,5.0,"Well, the first thing I did was try the balacl...",2014-06-13,thing try balaclava hubby try fit average size...
99500,278344,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A2XX2A4OJCDNLZ,5.0,While balaclavas can be used for a variety of ...,2014-06-13,balaclava use variety thing use mainly late fa...
99501,278346,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A3UJRNI8UR4871,4.0,"Nice material, but not as nice as silk or mer...",2014-06-09,nice material nice silk merino wool course mat...


In [11]:
# generating test history: one row per reviewer with the list of `asin`s they reviewed
# in the test split (`reset_index()` already returns a DataFrame)
test_user_history = test.groupby(['reviewerID'])['asin'].apply(list).reset_index()

In [12]:
# plain-text dump of the per-user test history (pandas truncates the middle rows)
print(test_user_history)

                  reviewerID                                              asin
0      A001114613O3F18Q5NVR6              [B000J6ZYL0, B005BXP7R2, B0093STGGO]
1      A00146182PNM90WNNAZ5Q              [B006Y4QDVQ, B00823Y41S, B00BQJV1LG]
2      A00165422B2GAUE3EL6Z0                          [B008SBGKP2, B00BLW2PZY]
3      A00338282E99B8OR2JYTZ                          [B003F8BKGW, B00DVFNNQE]
4      A00354001GE099Q1FL0TU                          [B0058YTOP0, B00BTWAZ0I]
...                      ...                                               ...
39358          AZZMQ85DPFEG3  [B000S7O8AS, B009N0CSXU, B00CKGB85I, B00DUW4VAA]
39359          AZZNK89PXD006                          [B004L7J7IO, B008ZBPQJ6]
39360          AZZT1ERHBSNQ8              [B00856U6BE, B0089GNZNQ, B00CPK44DM]
39361          AZZTOUKVTUMVM                          [B000VL04LS, B0053XF2U2]
39362          AZZYW4YOE1B6E                          [B007UNSF3O, B008J4RESK]

[39363 rows x 2 columns]


## Instantiate Memory-based Embedding CF (Item-based)

In [13]:
# item-based CF built on the loaded Doc2Vec model; the class lives in src.models.cf,
# which is not shown in this notebook
mem_ecf = cf.EmbeddedItemBasedCF(d2v)

In [14]:
# fit learning algorithm to training data
# (presumably this also populates `mem_ecf.user_rating_history`, which is read when the
# top-n lists are built below — TODO confirm in src.models.cf)
mem_ecf.fit(train)

100%|██████████████████████████████████████████████████████| 39386/39386 [00:01<00:00, 32447.20it/s]


In [15]:
# generate the candidate items for every user (200 per user according to the original
# note — TODO confirm in src.models.cf). `get_top_n` consumes the result as
# {user: [entry, ...]} and reads the item id from position 0 of each entry.
candidate_items = mem_ecf.test()

100%|███████████████████████████████████████████████████████| 39386/39386 [00:28<00:00, 1398.69it/s]


## Loop through N = {10, 25, 30, 45}

In [16]:
# generate item popularity: each item's review count in `train`, scaled by the largest count
item_popularity = generate_item_popularity(train)

In [17]:
# maps each list length n to (top-n lists per user, per-user evaluation frame)
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    # (items already in the fitted model's user history are filtered out)
    top_ns = get_top_n(candidate_items, mem_ecf.user_rating_history, n)
    # evaluate how well the recommended items predicted the future purchases
    # (prints the average recall@n / novelty@n line seen in the output)
    n_recommended_items = evaluate_recommendations(top_ns, test_user_history, item_popularity, n)
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

The MEM-ECF has an average recall@10: 0.01172, average novelty@10: 0.97791
The MEM-ECF has an average recall@25: 0.02399, average novelty@25: 0.97883
The MEM-ECF has an average recall@30: 0.02740, average novelty@30: 0.97905
The MEM-ECF has an average recall@45: 0.03610, average novelty@45: 0.97953


# Evaluate N-Recommendations

In [18]:
def retrieve_recommendations(train: pd.DataFrame, top_ns: dict, user=None):
    """Print one user's purchase history next to the items recommended to them.

    Args:
        train: Frame with ``reviewerID``, ``asin`` and ``title`` columns.
        top_ns: Maps each user to their ordered list of recommended item ids.
        user: User to inspect. When ``None`` (the default) a user is drawn with
            ``np.random.choice`` from the ``train`` users that have an entry in
            ``top_ns``.

    Raises:
        ValueError: ``user`` is ``None`` and no user in ``train`` has recommendations.
        KeyError: ``user`` has no entry in ``top_ns``, or a recommended item id does
            not occur in ``train``.
    """
    if user is None:
        # draw only from users that actually have recommendations, so the lookup
        # further down cannot fail for a randomly picked user
        eligible_users = [u for u in train['reviewerID'].unique() if u in top_ns]
        if not eligible_users:
            raise ValueError("no user in `train` has an entry in `top_ns`")
        user = np.random.choice(eligible_users, 1)[0]

    print(f"For user: {user}:")
    print(f"Purchase History:\n{train[train['reviewerID'] == user][['asin', 'title']]}")

    # find the recommendations
    print("\nRecommending:\n")
    recommended_asins = top_ns[user]
    recommendations = (train[train['asin'].isin(recommended_asins)][['asin', 'title']]
                       .drop_duplicates(subset='asin')
                       .set_index('asin'))
    # selecting with the recommendation list puts the rows back in ranking order
    print(f"{recommendations.loc[recommended_asins].reset_index()}")

## N=10

In [19]:
# element 0 of the stored tuple is the {user: top-10 items} mapping
top_ns_10 = n_recommendations[10][0]

In [20]:
# print a randomly picked training user's purchase history and their 10 recommendations
retrieve_recommendations(train, top_ns_10)

For user: A3O9GI4D85UIVA:
Purchase History:
              asin                                              title
89231   B0049PMHTO                   Black Military Wool Glove Liners
106992  B0057F9JYG  Columbia Sportswear Women's Descender Trail Ru...
108767  B00594LZNI        Vivobarefoot Women's Neo Trail Running Shoe

Recommending:

         asin                                              title
0  B000HKPFS0                Black GI Polypropylene Glove Liners
1  B0000DYNCP               Smartwool Trekking Heavy, Crew beige
2  B002YN977I  Darn Tough Vermont Merino Wool Mountaineering ...
3  B0009MZVO4    Thorlo Women's Lite Running Mini Ankle-Cut Sock
4  B0000DYNCD            Smartwool Men's Hiking Medium Crew Sock
5  B001H0FE2G                  Isotoner Women's Solid Knit Glove
6  B0009MZXT2          Thorlo Men's Wool/Silk Lt Hiker Crew Sock
7  B0009MZW2A  Thorlo Women's Moderate Cushion Light  Hiking ...
8  B000W8UUW8                         Sorel Women's Glacier Boot
9  B00011V

## N=25

In [21]:
# element 0 of the stored tuple is the {user: top-25 items} mapping
top_ns_25 = n_recommendations[25][0]

In [22]:
# print a randomly picked training user's purchase history and their 25 recommendations
retrieve_recommendations(train, top_ns_25)

For user: A1BJ7X0KD3URX4:
Purchase History:
             asin                                              title
17292  B000EZTOYO                 Birkenstock Gizeh Birko-Flor Thong
32836  B000UK0Q6I  Cocoons Fitovers Polarized Sunglasses Mini Sli...
53824  B001SN8BLS  Skechers Women's Shape Ups - Sleek Fit Fitness...
93013  B004HHUNI6                    Clarks Women's May Poppy Loafer

Recommending:

          asin                                              title
0   B002UXRZOO          Clarks Women's Wave.Cruise Mary Jane Flat
1   B005DNM2RI            Clarks Women's Lexi Willow Slide Sandal
2   B0000BW6IN                   New Balance Women's W801 Sneaker
3   B002LZURAA                  Birkenstock Women's Mayari Sandal
4   B0044QPXTE             Birkenstock Women's Adria Thong Sandal
5   B0036OS0FE      Crocs Women's Wrapped Clog,Black/Black,4 M US
6   B008LYRVCW                  Clarks Women's Leisa Lolly Sandal
7   B006R2QJ14              Clarks Women's Clarks Lexi Cedar Mule
8

## N=30

In [23]:
# element 0 of the stored tuple is the {user: top-30 items} mapping
top_ns_30 = n_recommendations[30][0]

In [24]:
# print a randomly picked training user's purchase history and their 30 recommendations
retrieve_recommendations(train, top_ns_30)

For user: A2VNIZTV0BGUCX:
Purchase History:
              asin                                              title
26438   B000N63OB8  Hanes 5.2 oz Women's COMFORTSOFT Relax Fit V-N...
41060   B0015A6WKQ                    Levi's Juniors 535 Legging Jean
41963   B001718DSM                       Nike Womens Celso Girl Thong
42754   B0018N8YRE         Funtasma by Pleaser Women's Lust-2000 Boot
62102   B002JB6VEC                  Nike Women's Comfort Thong Sandal
141983  B0085VGQAO    Womens Elegant Sleek Patent Leather Skinny Belt
174552  B00EVPPG5Y     Vip Women's Long Sleeved Scoop Neck Midi Dress

Recommending:

          asin                                              title
0   B00C2QLEXC  Lucky Brand Women's Sweet N Straight Crop, 28,...
1   B005OKDO22          Levi's Women's 512 Bermuda, Social Blue,4
2   B007OQ8FRS  Lee Women's Misses Slender Secret Scarlett Bar...
3   B007Y8WR8O  Calvin Klein Jeans Women's Indigo Denim Straig...
4   B00EB10XEW  WallFlower Juniors Luscious Curvy 

## N=45

In [25]:
# element 0 of the stored tuple is the {user: top-45 items} mapping
top_ns_45 = n_recommendations[45][0]

In [26]:
# print a randomly picked training user's purchase history and their 45 recommendations
retrieve_recommendations(train, top_ns_45)

For user: A17WAIYTFIQYB3:
Purchase History:
              asin                                              title
51900   B001NXBVY2  FitFlop Women's Walkstar 3 Metallic Sandal San...
54976   B001UV3FYG            Dansko Women's Professional Tooled Clog
132886  B007HXMQXM  Allegra K Women Boat Neck Long Batwing Sleeve ...
158234  B009ZDEXQK         Skechers Women's Go Walk 2 Fashion Sneaker
162881  B00AVPHH4Q      Top Moda Women's COCO 1 Knee High Riding Boot
168863  B00CIBCJ62  Cotton Cantina Soft Chevron Sheer Infinity Sca...

Recommending:

          asin                                              title
0   B00DEQJLJ8        Clarks Women's Ashland India Mary Jane Flat
1   B004VUXAZC         Klogs USA Women's Tropical Platform Sandal
2   B00ANJJZAO                       crocs Women's Gianna LK Flat
3   B004WNFCUE             Soft Style Women's Easy To Pleats Pump
4   B0044QPXTE             Birkenstock Women's Adria Thong Sandal
5   B005OHS826           Hush Puppies Women's Vesper S