In [1]:
import numpy as np
import pandas as pd
import warnings

from gensim.models.doc2vec import Doc2Vec
from pandarallel import pandarallel
from tqdm import tqdm

from src.models import cf

pandarallel.initialize()
tqdm.pandas()
warnings.filterwarnings('ignore')



INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Load Data and Models

In [2]:
# global variables
DATA_PATH = "data/evaluation"
D2V_PATH = "models/d2v"
CATEGORY = "Pet_Supplies"


train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
d2v = Doc2Vec.load(f"{D2V_PATH}/{CATEGORY}_50_d2v.model")

In [3]:
# checking train dataframe
train.head().append(train.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A14CK12J7C7JRK,3.0,I purchased the Trilogy with hoping my two cat...,2011-01-12,purchase trilogy hop cat age interested yr old...
1,2,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2CR37UY3VR7BN,4.0,I bought the triliogy and have tested out all ...,2012-12-19,buy triliogy test dvd appear volume receive re...
2,3,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2A4COGL9VW2HY,4.0,My female kitty could care less about these vi...,2011-05-12,female kitty care video care little male dig a...
3,4,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2UBQA85NIGLHA,3.0,"If I had gotten just volume two, I would have ...",2012-03-05,volume star trilogy star read review know vol ...
4,5,B00005MF9U,LitterMaid LM900 Mega Self-Cleaning Litter Box,"['Pet Supplies', 'Cats', 'Litter &amp; Housebr...",A2BH04B9G9LOYA,1.0,"First off, it seems that someone is spamming t...",2006-12-31,spamming review glow reviewer review amazon ba...
68865,111581,B00K3YPOO0,Brightest Black Light Flashlight on Amazon- UV...,[],A11J1FHCK5U06J,4.0,Now I know exactly where the trouble spots are...,2014-05-23,know exactly trouble spot sniffing guess invis...
68866,111585,B00K3YPOO0,Brightest Black Light Flashlight on Amazon- UV...,[],A18JF0T0GOCORW,4.0,I use this light to help me find stains when I...,2014-05-24,use light help stain carpet clean pre treat ca...
68867,111595,B00K7EG97C,Nutro Crunchy Dog Treats with Real Mixed Berri...,"['Pet Supplies', 'Dogs', 'Treats', 'Cookies, B...",A3GRPCW9DG427Z,5.0,We are owned by the 3 pickiest pooches in the ...,2013-07-27,pickiest pooch world love fool reject doggie t...
68868,111598,B00K7EG97C,Nutro Crunchy Dog Treats with Real Mixed Berri...,"['Pet Supplies', 'Dogs', 'Treats', 'Cookies, B...",A2X6TLAX3JEO1A,5.0,My highly allergic white boxer loves these tre...,2014-05-09,highly allergic white boxer love treat meat co...
68869,111602,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A9PG9ODPPP31N,5.0,Works great on my medium sized dog. She has ve...,2014-07-09,work great medium size dog coarse hair work gr...


In [4]:
# testing d2v models
d2v.dv[0]

array([-0.31036553, -0.15850139,  0.02544439, -0.0646498 , -0.03718435,
        0.02328839,  0.03318371,  0.04919509, -0.18090156,  0.11933047,
        0.01682451,  0.1686323 ,  0.06326769,  0.09223235, -0.01427258,
       -0.06473075,  0.0102353 , -0.02016022, -0.12578371, -0.03764864,
        0.00058584, -0.01052261, -0.12321293, -0.08263619, -0.00429671,
        0.15283036,  0.01789919, -0.16985203, -0.1345357 , -0.17634651,
        0.08240214,  0.13576838,  0.04829112,  0.11729528,  0.05669812,
        0.1893843 , -0.1255381 , -0.21937071,  0.16547264,  0.08096074,
       -0.04227924,  0.092145  , -0.0230357 , -0.01015091,  0.0279709 ,
        0.13947988,  0.13985303, -0.08256152, -0.03683715, -0.01097574],
      dtype=float32)

# Generate User Embeddings

In [5]:
# get user rating history
train_user_rating_history = train.groupby(["reviewerID"])["asin"].progress_apply(list)
print(train_user_rating_history)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19058/19058 [00:00<00:00, 57523.37it/s]

reviewerID
A04173782GDZSQ91AJ7OD                 [B0002AT464, B0002AT464, B00078ZK2S]
A042274212BJJVOBS4Q85                 [B001AT9B8M, B001E8LD3K, B00II7195M]
A0436342QLT4257JODYJ     [B000255NCI, B000255NCI, B000255O90, B000255O9...
A04795073FIBKY8GSLZYI                             [B000O39TDC, B000O39TE6]
A06658082A27F4VB5UG8E                             [B0006MU8WC, B000FS4OYA]
                                               ...                        
AZYJE40XW6MFG                                     [B0002IEYIE, B0002IEYIE]
AZZ56WF4X19G2                                                 [B0010P0YSW]
AZZNK89PXD006            [B0002DHV16, B0006N9D4A, B0018CIPS8, B001Q9EGK...
AZZV9PDNMCOZW                         [B0006N9LN8, B004PU7SBU, B00DIIKLCI]
AZZYW4YOE1B6E            [B0002ARQV4, B0002H3R2E, B0002H3R2E, B000MLG4K...
Name: asin, Length: 19058, dtype: object





In [6]:
def generate_user_embeddings(user_rating_history: pd.DataFrame, d2v: Doc2Vec) -> dict:
    """
    
    Args:
        unique_users ([list]):
        d2v ([Doc2Vec]):
    """
    
    # generate unique users
    unique_users = user_rating_history.reset_index()["reviewerID"].tolist()
    
    user_embeddings = {}
    for user in tqdm(unique_users):
        user_embedding = np.zeros(50)
        for item in user_rating_history[user]:
            user_embedding += d2v.dv[item]
            
        # computing mean aggregation
        user_embedding /= len(user_rating_history[user])
        user_embeddings[user] = user_embedding
        
    return user_embeddings

In [7]:
train_user_embeddings = generate_user_embeddings(train_user_rating_history, d2v)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19058/19058 [00:00<00:00, 38658.15it/s]


# Utility Functions

In [8]:
def get_top_n(predictions: dict, user_rating_history: pd.DataFrame, n=10) -> dict:
    """Return the top-N recommendations for each user based on cosine similarity.
    
    Args:
        d2v ([Doc2Vec]): Doc2Vec of item representations based on reviews.
        user_rating_history ([dict]): "Future" purchases aggregated by users based on
            test set.
    
    Returns:
        ([dict]): A dictionary of top-N recommendations for each unique user, sorted by
            cosine similarties.
    """
    
    # retrieve a 200 items candidate list based on similarities
    top_ns = {}
    for user in predictions:
        rated_items = user_rating_history[user]
        candidate_items = [i[0] for i in predictions[user]]
        unrated_items = set(candidate_items) - set(rated_items)
        
        user_top_n = []
        idx = 0
        while len(user_top_n) < n:
            if candidate_items[idx] in unrated_items:
                user_top_n.append(candidate_items[idx])
                idx += 1
            else:
                idx += 1
        
        top_ns[user] = user_top_n
        
    return top_ns

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

def novelty_at_k(item_popularity, predicted_asins, k=10):
    """
    """
    # finding avg novelty
    popularity_sum = item_popularity.loc[predicted_asins].sum()
    novelty_at_k = ((k*1) - popularity_sum) / k
    
    return novelty_at_k

def generate_item_popularity(train: pd.DataFrame) -> pd.DataFrame:
    """
    """
    
    # create a mapping of item popularatity
    # based on sum(item's review / max reviews) / no items
    max_reviews = (train.groupby(['asin'])
                   .agg({'processedReviewText': 'count'})
                   .max()
                   .values[0])
    item_popularity = (train.groupby(['asin'])
                       .agg({'processedReviewText': 'count'})
                       .apply(lambda x: x/max_reviews))
    
    return item_popularity
    

def evaluate_recommendations(top_ns: dict, user_rating_history: pd.DataFrame, item_popularity: pd.DataFrame, k=10) -> pd.DataFrame:
    """
    
    Args:
        top_ns
        user_rating_history
    """
    
    test_recommendations = pd.DataFrame(top_ns.items(), columns=["reviewerID", "pred_asin"])
    
    # combined test history and recommendations
    test_merged = pd.merge(user_rating_history, test_recommendations, on="reviewerID", how="inner")
    
    # generating recall@k metrics
    test_merged["recall@k"] = test_merged.apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)
    test_merged["novelty@k"] = test_merged.apply(lambda x: novelty_at_k(item_popularity, x.pred_asin, k=k), axis=1)
    average_recall_at_k = test_merged["recall@k"].mean()
    average_novelty_at_k = test_merged["novelty@k"].mean()
    
    print(f"The MEM-ECF has an average recall@{k}: {average_recall_at_k:.5f}, average novelty@{k}: {average_novelty_at_k:.5f}")
    
    return test_merged

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [9]:
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [10]:
test.head().append(test.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,1,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A39QHP5WLON5HV,5.0,There are usually one or more of my cats watch...,2013-09-14,usually cat watch tv stay trouble dvd play lik...
1,104,B00005MF9V,LitterMaid Universal Cat Privacy Tent (LMT100),"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A366V0GCEPH5CX,5.0,My cats love it and so do I. I no longer have ...,2013-02-02,cat love longer cat litter fly floor litter fl...
2,133,B00005MF9T,LitterMaid LM500 Automated Litter Box,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",ALWWS8QBYN80B,1.0,I have one female cat that weighs under 10 pou...,2004-11-17,female cat weigh pound year old use everclean ...
3,153,B00005MF9W,LitterMaid Waste Receptacles Automatic Litter ...,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A3PVI3NE7OY1SP,5.0,I love these. They make the clean up so much e...,2013-09-26,love clean easy clean box manually use issue w...
4,154,B00005MF9W,LitterMaid Waste Receptacles Automatic Litter ...,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A2H83XMHUVDLJY,4.0,"I love this litter box. I do not use the lids,...",2014-06-26,love litter box use lid use receptacle tear cr...
41564,111601,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],AV34KNYW82YSS,4.0,Pulled lots of hair out of my Labs coat. Didn'...,2014-07-18,pulled lot hair labs coat think prove wrong co...
41565,111603,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A1YMNTFLNDYQ1F,5.0,I have been trying to find a rubber bristle br...,2014-07-16,try rubber bristle brush persian year lose glo...
41566,111604,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A1FQ3HRVXA4A5B,5.0,Great product to use on your pets knowing this...,2014-07-11,great product use pet know gentle rubber damag...
41567,111605,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A3OP6CI0XCRQXO,5.0,I bought a second one because I have two cats ...,2014-07-22,buy second cat american short hair buy brush m...
41568,111606,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A11LC938XF35XN,5.0,Our dogs love getting brushed with this. It m...,2014-07-17,dog love brush massage remove heavy undercoat ...


In [11]:
# generating test history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [12]:
print(test_user_history)

                  reviewerID                                  asin
0      A04173782GDZSQ91AJ7OD              [B0090Z9AYS, B00CPDWT2M]
1      A042274212BJJVOBS4Q85              [B005AZ4M3Q, B00771WQIY]
2       A0436342QLT4257JODYJ  [B0018CDR68, B003SJTM8Q, B00474A3DY]
3      A04795073FIBKY8GSLZYI              [B001PKT30M, B005DGI2RY]
4      A06658082A27F4VB5UG8E              [B000TZ1TTM, B0019VUHH0]
...                      ...                                   ...
18993          AZYJE40XW6MFG              [B00HVAKJZS, B00IDZT294]
18994          AZZ56WF4X19G2                          [B004A7X218]
18995          AZZNK89PXD006  [B0002DHV16, B005BP8MQ8, B009RTX4SU]
18996          AZZV9PDNMCOZW              [B007EQL390, B00ISBWVT6]
18997          AZZYW4YOE1B6E  [B0002AQPA2, B0002AQPA2, B0002ARQV4]

[18998 rows x 2 columns]


## Instantiate Memory-based Embedding CF (Item-based)

In [13]:
mem_ecf = cf.EmbeddedItemBasedCF(d2v)

In [14]:
%%time
# fit learning algorithm to training data
mem_ecf.fit(train)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19058/19058 [00:00<00:00, 41982.69it/s]

CPU times: user 709 ms, sys: 18 ms, total: 727 ms
Wall time: 726 ms





In [15]:
%%time
# generate n-number of candidates items (200)
candidate_items = mem_ecf.test()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19058/19058 [00:07<00:00, 2593.10it/s]

CPU times: user 24.7 s, sys: 4.01 s, total: 28.7 s
Wall time: 7.35 s





## Loop through N = {10, 25, 30, 45}

In [16]:
# generate item popularity
item_popularity = generate_item_popularity(train)

In [17]:
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    top_ns = get_top_n(candidate_items, mem_ecf.user_rating_history, n)
    # evaluate how well the recommended items predicted the future purchases
    n_recommended_items = evaluate_recommendations(top_ns, test_user_history, item_popularity, n)
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

The MEM-ECF has an average recall@10: 0.02538, average novelty@10: 0.95381
The MEM-ECF has an average recall@25: 0.04702, average novelty@25: 0.96009
The MEM-ECF has an average recall@30: 0.05238, average novelty@30: 0.96143
The MEM-ECF has an average recall@45: 0.06805, average novelty@45: 0.96418


# Evaluate N-Recommendations

In [18]:
def retrieve_recommendations(train: pd.DataFrame, top_ns: dict):
    """
    """
    # generating a random user
    random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
    print(f"For user: {random_user}:")
    print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

    # find the recommendations
    print(f"\nRecommending:\n")
    recommendations = (train[train['asin']
                             .isin(top_ns[random_user])][['asin', 'title']]
                       .drop_duplicates(subset='asin')
                       .set_index('asin'))
    print(f"{recommendations.loc[top_ns[random_user]].reset_index()}")

## N=10

In [19]:
top_ns_10 = n_recommendations[10][0]

In [20]:
retrieve_recommendations(train, top_ns_10)

For user: A6CZTBVG7PR8O:
Purchase History:
             asin                                      title
30205  B0009YD810        Peter's Woven Grass Mat for Rabbits
30505  B0009YJ3OG               Peters Woven Grass Play Ball
39094  B000MDZE2E  Peters Woven Grass Hide-A-Way Hut Rabbits

Recommending:

         asin                                              title
0  B0013UQQIQ                   Snoozer Luxury Cozy Cave Pet Bed
1  B0064ZX6RO           P.L.A.Y. Pet Lifestyle and You Crate Pad
2  B00027ZVKA      Superpet Bunny Flip N Toss Carrot (Pack of 3)
3  B000QOHVSO                         Hartz Bizzy Balls Cat Toys
4  B000MFOB2Q           Boss Pet - Prestige Large Dog Anchor Kit
5  B003954NDI  Jakks Pacific Pawdoodles Krinklers Dog Toy, Fo...
6  B000EFV9YM                          Marshall Ferret Octo-Play
7  B000HHJF70  JPI JAKKS Pacific CFA Active Plush House Mouse...
8  B001N2PU6I  Caitec Creative Foraging Systems Four Corners ...
9  B001O8L2UO                      Trixie 5-in-

## N=25

In [21]:
top_ns_25 = n_recommendations[25][0]

In [22]:
retrieve_recommendations(train, top_ns_25)

For user: A79EOKB6XXAGF:
Purchase History:
             asin                                   title
21887  B0002EOVZO             Planet Dog Orbee-Tuff Orbee
21909  B0002EOVZO             Planet Dog Orbee-Tuff Orbee
22792  B0002I0RNK       PetSafe Busy Buddy Waggle Dog Toy
22829  B0002I0RNK       PetSafe Busy Buddy Waggle Dog Toy
49887  B001JQLNB4  StarMark Bob-A-Lot Interactive Dog Toy

Recommending:

          asin                                              title
0   B004A7X218  West Paw Zogoflex Tux Interactive Treat Dispen...
1   B000A61GNO           PetSafe Busy Buddy Squirrel Dude Dog Toy
2   B000084ESL  KONG - Dental - Durable Rubber, Teeth and Gum ...
3   B0009YD8NS                         Treat Dispensing Chew Ball
4   B001PKTWDA  West Paw Zogoflex Hurley Durable Dog Bone Chew...
5   B000MCXXZU  JW Pet Company iSqueak Ball Rubber Dog Toy, Co...
6   B001E8SZE0  West Paw Design Zogoflex Dog Toy, Tux, Colors ...
7   B003YHB8DO  StarMark Everlasting Fun Ball, Dog Toy Medium/...

## N=30

In [23]:
top_ns_30 = n_recommendations[30][0]

In [24]:
retrieve_recommendations(train, top_ns_30)

For user: A2VUF4LUC6BN7T:
Purchase History:
             asin                                              title
59591  B004CRDIZQ  COA61120-KW Baskerville Ultra Dog Muzzle, Size...
63402  B005D7FAXW  W.C. Redmon Pet Scale, Tote Bag for Pet Scale ...

Recommending:

          asin                                              title
0   B0012NTI70          Puppia Soft Dog Harness Spring Pink Large
1   B003MU9NP8  Outward Hound Kyjen   Designer Pet Saver Life ...
2   B0002DJVAU                Fashion Pet Puddles Rain Dog Poncho
3   B0002DGVY4  Herm Sprenger Pet Supply Imports Chrome Plated...
4   B0012NVAOY          Puppia Soft Dog Harness Spring Pink Small
5   B000O5FYU2  Pet Gear AT3 Generation 2 All-Terrain Pet Stro...
6   B000AYB4VK  Planet Dog 5' Natural Hemp Leash with Fleece H...
7   B0015UGWYW  Zack &amp; Zoey Get Your Nose Pet Tee Shirt - ...
8   B00074VT5E  LupinePet 1-Inch Martingale Combo Collar for L...
9   B0056F53YM                Signature K9 Standard Leather Leash
10  B00

## N=45

In [25]:
top_ns_45 = n_recommendations[45][0]

In [26]:
retrieve_recommendations(train, top_ns_45)

For user: A37H473BZHX9P0:
Purchase History:
             asin                                 title
9512   B0002APRKQ                     Lee's Net Breeder
9518   B0002APRKQ                     Lee's Net Breeder
42789  B000YDQ90I  Aqueon Replacement Filter Cartridges

Recommending:

          asin                                              title
0   B0002602SC  Aquarium Filter Hydro-Sponge IV by Lustar, for...
1   B0002568HW  Fluval Aquastop For Fluval 104-404, 105-405 Se...
2   B007KKU8QE  PanaView 5-Gallon Fish Tank with LED Lighting ...
3   B0009YHSE8  MarineLand Cartridge Media Refill for Penguin ...
4   B006OMKMTM  Oceanic Systems, Inc. Evolve 4 - LED Light Aqu...
5   B0002602S2  Hydro-Sponge Filter I--HS910 - Aquarium Techno...
6   B005VFLAG2       Finnex External Refugium Breeder Hang-On Box
7   B0002AQGZQ                                Elite Sponge Filter
8   B0002DJLE6  Penn Plax Cascade 300 Submersible Aquarium Fil...
9   B0002AQKIO        Fluval Carbon, 100-gram Nylon Bags