In [1]:
import numpy as np
import pandas as pd
import warnings

from gensim.models.doc2vec import Doc2Vec
from pandarallel import pandarallel
from tqdm import tqdm

from src.models import cf

# Initialise pandarallel (the cell output reports the worker count and transfer mode).
pandarallel.initialize()
# Register tqdm's pandas integration; `progress_apply` is used further down.
tqdm.pandas()
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from pandas/gensim — consider a narrower filter.
warnings.filterwarnings('ignore')



INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Load Data and Models

In [2]:
# global variables
# Directory that holds the per-category train/test CSV files.
DATA_PATH = "data/evaluation"
# Directory that holds the saved Doc2Vec models.
D2V_PATH = "models/d2v"
# Product category; used as the file-name prefix for both the CSVs and the model.
CATEGORY = "Grocery_and_Gourmet_Food"


# Training reviews for the selected category.
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")
# Doc2Vec model for the category; the "_50_" in the file name presumably denotes
# the vector size (the vector printed below has 50 entries) — TODO confirm.
d2v = Doc2Vec.load(f"{D2V_PATH}/{CATEGORY}_50_d2v.model")

In [3]:
# checking train dataframe: first and last five rows
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent)
pd.concat([train.head(), train.tail()])

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...
47769,77420,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A192LQZWDYPR4U,5.0,Another quality Reese Peanut Butter Cup produc...,2014-02-27,quality reese peanut butter cup product great ...
47770,77421,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A2QKXW3LDQ66P5,5.0,I purchased these for my husband who has every...,2013-02-20,purchase husband love reeses valentine day pre...
47771,77430,B00ID9VSOM,"Viva Labs Organic Coconut Sugar: Non-GMO, Low-...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2P3TGJU301KXD,5.0,this stuff is INCREDIBILY yummy! SO much bette...,2014-07-15,stuff incredibily yummy good regular brown sug...
47772,77456,B00IRL93SY,Barrie House Kenya Estate - AA Single Cup Caps...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AEFE9VDHTQ199,5.0,"Very nice aroma, body and taste! Will buy this...",2014-05-24,nice aroma body taste buy coffee good coffee a...
47773,77508,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2AEZQ3DGBBLPR,2.0,This is a no go for diabetics according to my ...,2014-06-26,diabetic accord wife doctor order intention us...


In [4]:
# testing d2v models: show the document vector stored under key/index 0
# (later cells look vectors up by asin instead, e.g. `d2v.dv[item]`)
d2v.dv[0]

array([-1.68350220e-01,  1.28843918e-01,  1.53609708e-01,  1.02531262e-01,
       -1.78938702e-01,  1.40786603e-01, -1.02161698e-01,  2.32769642e-02,
       -2.05874681e-01,  1.07798077e-01, -6.60957396e-02, -3.01132966e-02,
       -4.72943723e-01,  8.28767791e-02,  2.58622289e-01, -1.09628424e-01,
        4.43287522e-01, -3.27611864e-01, -1.31865367e-01, -1.37127474e-01,
       -9.12572891e-02,  2.37981174e-02,  4.94413167e-01, -7.89995044e-02,
       -4.79375347e-02,  5.91341615e-01, -4.05233592e-01,  4.74170223e-03,
       -4.25971776e-01,  3.98977101e-02,  1.94822073e-01,  3.21712404e-01,
       -9.83737037e-02,  4.36548710e-01,  1.68986227e-02,  3.02458614e-01,
       -2.99157470e-01, -3.90456207e-02,  3.00976545e-01,  1.16236672e-01,
       -4.14809547e-02, -2.12868959e-01, -2.89017200e-01, -2.83463597e-01,
        2.26068288e-01, -1.09374970e-01, -7.62177492e-03, -1.16938069e-01,
       -1.03234852e-04,  8.88586491e-02], dtype=float32)

# Generate User Embeddings

In [5]:
# get user rating history: a Series indexed by reviewerID whose values are the
# lists of asins each user reviewed in the training set (with a tqdm progress bar)
train_user_rating_history = train.groupby(["reviewerID"])["asin"].progress_apply(list)
print(train_user_rating_history)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 62043.53it/s]

reviewerID
A00177463W0XWB16A9O05                             [B0029XDZIK, B0094ISOMA]
A022899328A0QROR32DCT                             [B001ACMCNU, B003TO9RSU]
A068255029AHTHDXZURNU                             [B000K8WVYA, B0094ISOMA]
A06944662TFWOKKV4GJKX                                         [B000CQBZPG]
A1004703RC79J9                                                [B001E50THY]
                                               ...                        
AZWRZZAMX90VT            [B0007R9L5Q, B000CQ01GU, B000E123IC, B000E46LZ...
AZXKAH2DE6C8A            [B000EML7DS, B000ODF2ME, B001650XUK, B0018QLG9...
AZXON596A1VXC                         [B00113SKZW, B00113ZTVK, B001L4JH5I]
AZYXC63SS008M                                                 [B0040WCQKQ]
AZZ5ASC403N74                                                 [B004U49QU2]
Name: asin, Length: 13397, dtype: object





In [6]:
def generate_user_embeddings(user_rating_history: pd.Series, d2v: Doc2Vec) -> dict:
    """Build one embedding per user by averaging the vectors of the items they rated.

    Args:
        user_rating_history (pd.Series): Series indexed by user id (``reviewerID``)
            whose values are lists of item ids; every id is looked up in ``d2v.dv``.
        d2v (Doc2Vec): Trained Doc2Vec model; ``d2v.dv[item]`` must return the
            vector of ``item``.

    Returns:
        dict: Mapping of user id -> float64 ``np.ndarray`` of length
            ``d2v.vector_size`` holding the mean of the user's item vectors.
            A user with an empty item list gets the zero vector.

    Raises:
        KeyError: If an item id is not present in ``d2v.dv``.
    """
    # size the accumulator from the model instead of hard-coding 50 dimensions,
    # so models trained with a different vector size work as well
    embedding_dim = d2v.vector_size

    user_embeddings = {}
    for user, rated_items in tqdm(user_rating_history.items(),
                                  total=len(user_rating_history)):
        user_embedding = np.zeros(embedding_dim)
        for item in rated_items:
            user_embedding += d2v.dv[item]

        # computing mean aggregation; skip the division for an empty history
        # so the result is a zero vector rather than NaNs from 0/0
        if len(rated_items) > 0:
            user_embedding /= len(rated_items)
        user_embeddings[user] = user_embedding

    return user_embeddings

In [7]:
# mean item-vector embedding for every user in the training rating history
train_user_embeddings = generate_user_embeddings(train_user_rating_history, d2v)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 40451.55it/s]


# Utility Functions

In [8]:
def get_top_n(predictions: dict, user_rating_history: pd.DataFrame, n=10) -> dict:
    """Return the top-N not-yet-rated candidate items for each user.

    Args:
        predictions (dict): Mapping of user id -> ordered candidate list. Each
            candidate is a sequence whose first element is the item id (for
            example ``(item_id, score)``); the list order is taken as the ranking.
        user_rating_history: Mapping-like object (dict or Series indexed by user
            id) giving the items each user has already rated.
        n (int): Maximum number of recommendations per user.

    Returns:
        dict: Mapping of user id -> list of at most ``n`` item ids, in candidate
            order, excluding items the user already rated. The list is shorter
            than ``n`` when the user has fewer than ``n`` unrated candidates.

    Raises:
        KeyError: If a user in ``predictions`` is missing from
            ``user_rating_history``.
    """
    top_ns = {}
    for user, candidates in predictions.items():
        already_rated = set(user_rating_history[user])

        user_top_n = []
        # Walk the ranked candidates once and stop at n. Bounding the loop by the
        # candidate list avoids the IndexError the previous unbounded index loop
        # raised when a user had fewer than n unrated candidates.
        for candidate in candidates:
            if len(user_top_n) >= n:
                break
            item = candidate[0]
            if item not in already_rated:
                user_top_n.append(item)

        top_ns[user] = user_top_n

    return top_ns

def recall_at_k(asins, predicted_asins, k=10):
    """Recall@k: share of the relevant items that appear in the top-k predictions.

    Args:
        asins: Iterable of relevant (actually purchased/reviewed) item ids.
        predicted_asins: Ranked iterable of recommended item ids.
        k (int): Only the first ``k`` predictions are considered.

    Returns:
        float: ``|relevant ∩ top-k predicted| / |relevant|`` computed on distinct
            item ids, or ``0.0`` when there are no relevant items.
    """
    relevant = set(asins)
    # no relevant items: recall is undefined, report 0.0 instead of dividing by zero
    if not relevant:
        return 0.0

    # honour k: only the first k predictions count towards recall@k
    top_k = set(list(predicted_asins)[:k])
    num_relevant = len(relevant & top_k)

    # divide by the number of distinct relevant items so repeated ids in `asins`
    # cannot push the denominator above the achievable numerator
    return num_relevant / len(relevant)

def novelty_at_k(item_popularity, predicted_asins, k=10):
    """Average novelty of one recommendation list.

    Looks up the popularity of every id in ``predicted_asins`` by label in
    ``item_popularity``, sums them and returns ``(k - total) / k``.

    Args:
        item_popularity: pandas object indexed by item id holding popularity scores.
        predicted_asins: Recommended item ids; each must be a label of
            ``item_popularity``.
        k (int): Normalising list length.

    Returns:
        The value of ``(k - total) / k``, in whatever type ``.sum()`` produces
        for ``item_popularity`` (one entry per column for a DataFrame).
    """
    recommended_popularity = item_popularity.loc[predicted_asins]
    total_popularity = recommended_popularity.sum()

    return (k - total_popularity) / k

def generate_item_popularity(train: pd.DataFrame) -> pd.DataFrame:
    """Compute a popularity score per item, relative to the most-reviewed item.

    Args:
        train (pd.DataFrame): Reviews with at least the columns ``asin`` and
            ``processedReviewText``.

    Returns:
        pd.DataFrame: Indexed by ``asin`` with a single ``processedReviewText``
            column holding (non-null review count of the item) divided by
            (largest such count over all items).
    """
    # non-null review count per item
    review_counts = train.groupby(['asin']).agg({'processedReviewText': 'count'})
    # count of the most reviewed item, used as the normaliser
    most_reviews = review_counts.max().values[0]

    return review_counts.apply(lambda counts: counts / most_reviews)
    

def evaluate_recommendations(top_ns: dict, user_rating_history: pd.DataFrame, item_popularity: pd.DataFrame, k=10) -> pd.DataFrame:
    """Score per-user recommendations with recall@k and novelty@k and print the averages.

    Args:
        top_ns (dict): Mapping of reviewerID -> list of recommended asins.
        user_rating_history (pd.DataFrame): Frame with a ``reviewerID`` column and
            an ``asin`` column holding each user's list of held-out items.
        item_popularity (pd.DataFrame): Popularity lookup passed through to
            ``novelty_at_k``.
        k (int): Cut-off forwarded to both metrics and shown in the printed summary.

    Returns:
        pd.DataFrame: The inner join of ``user_rating_history`` and the
            recommendations on ``reviewerID``, with added ``pred_asin``,
            ``recall@k`` and ``novelty@k`` columns.
    """
    
    # one row per user: (reviewerID, list of recommended asins)
    test_recommendations = pd.DataFrame(top_ns.items(), columns=["reviewerID", "pred_asin"])
    
    # combined test history and recommendations
    # (inner join: users present in only one of the two inputs are dropped)
    test_merged = pd.merge(user_rating_history, test_recommendations, on="reviewerID", how="inner")
    
    # generating recall@k and novelty@k metrics, row by row
    test_merged["recall@k"] = test_merged.apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)
    test_merged["novelty@k"] = test_merged.apply(lambda x: novelty_at_k(item_popularity, x.pred_asin, k=k), axis=1)
    # averages over all merged users
    average_recall_at_k = test_merged["recall@k"].mean()
    average_novelty_at_k = test_merged["novelty@k"].mean()
    
    print(f"The MEM-ECF has an average recall@{k}: {average_recall_at_k:.5f}, average novelty@{k}: {average_novelty_at_k:.5f}")
    
    return test_merged

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [9]:
# held-out reviews for the same category; used below as the "future" purchases
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [10]:
# first and last five rows of the test set
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent)
pd.concat([test.head(), test.tail()])

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,2,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1TCSC0YWT82Q0,5.0,I love ethnic foods and to cook them. I recent...,2013-08-03,love ethnic food cook recently purchase produc...
1,8,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1Z7Y2GMAP9SRY,5.0,I like to make my own curry but this is a tast...,2014-06-27,like curry tasty alternative use base kind dif...
2,23,B00004S1C5,"Ateco Food Coloring Kit, 6 colors","['Grocery & Gourmet Food', 'Cooking & Baking',...",A14YSMLYLJEMET,1.0,This product is no where near natural / organi...,2013-03-29,product near natural organic wish review purch...
3,31,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",A2F488C4PLWGEI,5.0,If my wife drinks a cup of this tea when she f...,2014-03-23,wife drink cup tea feel attack come help avoid...
4,32,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AO1HXV7DWZZIR,5.0,I don't know about the medicinal aspects of th...,2014-02-06,know medicinal aspect tea flavor downright scr...
28001,77519,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A1WT3TVHANP7ZF,3.0,Hmmm. I really wanted to love this sweetener. ...,2014-07-22,hmmm want love sweetener half sugar half stevi...
28002,77520,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A3NEAETOSXDBOM,5.0,"I confess I have a sweet tooth, and love the t...",2014-06-30,confess sweet tooth love taste sugar recognize...
28003,77521,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",AD1ZOPB0BBEHB,4.0,"It has a little of the stevia aftertaste, but ...",2014-07-17,little stevia aftertaste fair compromise able ...
28004,77522,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A18ECVX2RJ7HUE,5.0,i love marinade for grilled flank steak or lon...,2014-05-30,love marinade grilled flank steak london broil...
28005,77523,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2G04D4QZAXL15,3.0,I've been using Truvia (a form of stevia) on m...,2014-05-27,use truvia form stevia cereal greek yogurt yea...


In [11]:
# generating test history: one row per reviewer with the list of asins they
# reviewed in the test set (columns: reviewerID, asin)
test_user_history = (
    test.groupby(['reviewerID'])['asin']
    .apply(list)
    .reset_index()
)

In [12]:
# plain-text dump of the per-user test history (reviewerID, list of asins)
print(test_user_history)

                  reviewerID  \
0      A00177463W0XWB16A9O05   
1      A022899328A0QROR32DCT   
2      A068255029AHTHDXZURNU   
3      A06944662TFWOKKV4GJKX   
4             A1004703RC79J9   
...                      ...   
13274          AZWRZZAMX90VT   
13275          AZXKAH2DE6C8A   
13276          AZXON596A1VXC   
13277          AZYXC63SS008M   
13278          AZZ5ASC403N74   

                                                    asin  
0                               [B00474OR8G, B00BFM6OAW]  
1                                           [B00CMQDKES]  
2                                           [B001FA1K2G]  
3                                           [B000GFYRHG]  
4                                           [B003GTR8IO]  
...                                                  ...  
13274  [B0007R9L4M, B000CN7BMA, B001EQ5D1K, B002VT3GX...  
13275   [B000MAK41I, B004X8TJP2, B006H34CUS, B007W14RMM]  
13276                           [B001EO5S0I, B00271QQ7Q]  
13277                    

## Instantiate Memory-based Embedding CF (Item-based)

In [13]:
# item-based embedding CF model from src.models.cf, constructed with the loaded Doc2Vec model
mem_ecf = cf.EmbeddedItemBasedCF(d2v)

In [14]:
%%time
# fit learning algorithm to training data
# (`mem_ecf.user_rating_history` is read after this call when building the top-N lists)
mem_ecf.fit(train)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:00<00:00, 41946.42it/s]

CPU times: user 489 ms, sys: 13.4 ms, total: 503 ms
Wall time: 500 ms





In [15]:
%%time
# generate the candidate items for every user; the candidate list is indexed
# per user downstream and each entry's first element is the item id
# NOTE(review): the original note says 200 candidates per user — confirm in cf.EmbeddedItemBasedCF.test
candidate_items = mem_ecf.test()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:05<00:00, 2601.82it/s]

CPU times: user 16.9 s, sys: 2.92 s, total: 19.8 s
Wall time: 5.15 s





## Loop through N = {10, 25, 30, 45}

In [16]:
# generate item popularity from the training reviews:
# per-item review count divided by the count of the most-reviewed item
item_popularity = generate_item_popularity(train)

In [17]:
# n -> (top-n recommendations per user, per-user evaluation frame)
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    # (items already in the user's training history are filtered out)
    top_ns = get_top_n(candidate_items, mem_ecf.user_rating_history, n)
    # evaluate how well the recommended items predicted the future purchases
    # (prints the average recall@n and novelty@n)
    n_recommended_items = evaluate_recommendations(top_ns, test_user_history, item_popularity, n)
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

The MEM-ECF has an average recall@10: 0.02156, average novelty@10: 0.94243
The MEM-ECF has an average recall@25: 0.04002, average novelty@25: 0.95109
The MEM-ECF has an average recall@30: 0.04542, average novelty@30: 0.95319
The MEM-ECF has an average recall@45: 0.05798, average novelty@45: 0.95722


# Evaluate N-Recommendations

In [18]:
def retrieve_recommendations(train: pd.DataFrame, top_ns: dict, user=None):
    """Print one user's purchase history next to the items recommended to them.

    Args:
        train (pd.DataFrame): Training reviews with ``reviewerID``, ``asin`` and
            ``title`` columns; also used to resolve recommended asins to titles.
        top_ns (dict): Mapping of reviewerID -> ordered list of recommended asins.
        user: Optional reviewerID to inspect. When omitted, a user is drawn at
            random from ``train`` with ``np.random.choice`` (seed NumPy's global
            RNG beforehand, or pass ``user``, to get a reproducible example).

    Returns:
        None. The history and the recommendations are printed.

    Raises:
        KeyError: If the selected user has no entry in ``top_ns``, or a
            recommended asin does not occur in ``train``.
    """
    if user is None:
        # generating a random user
        random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
    else:
        random_user = user

    # fail early with a readable message instead of after half the output is printed
    if random_user not in top_ns:
        raise KeyError(f"No recommendations available for user {random_user!r}")

    print(f"For user: {random_user}:")
    print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

    # find the recommendations
    print("\nRecommending:\n")
    recommended_asins = top_ns[random_user]
    # one (asin, title) row per recommended item, indexed by asin for ordered lookup
    recommendations = (train[train['asin'].isin(recommended_asins)][['asin', 'title']]
                       .drop_duplicates(subset='asin')
                       .set_index('asin'))
    # .loc with the recommendation list restores the ranking order
    print(f"{recommendations.loc[recommended_asins].reset_index()}")

## N=10

In [19]:
# per-user top-10 recommendation lists (first element of the stored tuple)
top_ns_10 = n_recommendations[10][0]

In [38]:
# print the purchase history and top-10 recommendations of one randomly drawn user
retrieve_recommendations(train, top_ns_10)

For user: A9I40WFF40R4:
Purchase History:
             asin                                              title
14256  B00119OLSQ  South Beach Living Snack Pack Delights Dark Ch...
35357  B004FELBH8  Newtons Fruit Thins Fig and Honey, 10.5-Ounce ...
35635  B004JGQ16I  Oreo Golden Oreo Fudge Creme, 11.3-Ounce (Pack...

Recommending:

         asin                                              title
0  B004FEJ968  Newtons Fruit Thins Cranberry Citrus Oats, 10....
1  B00027CE2S  Newman's Own Organics Newman O's, (Original) C...
2  B005KK3N36  Quaker Soft Baked Oatmeal Cookie, Chocolate Al...
3  B00119OL8G  South Beach Living Snack Pack Delights Dark Ch...
4  B000FA38ZY  Mallomars Pure Chocolate Cookies, 8-Ounce Boxe...
5  B00B18PAWI  Nestle Skinny Cow Divine Filled Chocolates, Ca...
6  B007JFMIWW  Quaker Stila Crispy Oat Cookie Bar - Blueberry...
7  B000EMQFY4  Nature Valley Chewy Granola Bar, Trail Mix, Fr...
8  B006BXV1H6  Kellogg's Krave Double Chocolate Cereal, 11-Ou...
9  B00DBSG2NC   

## N=25

In [21]:
# per-user top-25 recommendation lists (first element of the stored tuple)
top_ns_25 = n_recommendations[25][0]

In [55]:
# print the purchase history and top-25 recommendations of one randomly drawn user
retrieve_recommendations(train, top_ns_25)

For user: A1PMWT1I668I15:
Purchase History:
             asin                                              title
7101   B000FA38ZY  Mallomars Pure Chocolate Cookies, 8-Ounce Boxe...
23297  B001H3201Q  Werther's Original Caramel Milk Chocolate, 5.2...
29853  B002NHSQ6I  Newman's Own Organics Organic Premium Chocolat...
36393  B004K00DGC  Jamba Juice Energy Drink, Crisp Apple, 8.4-Oun...
38987  B004YGQPAK  Ambrosoli-HoneesEnergy Plus Vitamin Honey (Pac...
40883  B005IW4WEA  KIND Healthy Grains Clusters, Peanut Butter Wh...
41362  B005KK3N36  Quaker Soft Baked Oatmeal Cookie, Chocolate Al...
42708  B006BXUZVO  Kellogg's Raisin Bran Cereals, Cinnamon and Al...
42898  B006BXUY2Y  Kellogg's Special K Granola Bar, Dark Chocolat...

Recommending:

          asin                                              title
0   B000H26J7E  Lindt Excellence Bar, 70% Cocoa Smooth Dark Ch...
1   B0013TO51W  Wonka Nerds Giant Chewy Candies, 1.8-Ounce Bag...
2   B001E6KBYE  Special K Chocolatey Delight Cereal,

## N=30

In [23]:
# per-user top-30 recommendation lists (first element of the stored tuple)
top_ns_30 = n_recommendations[30][0]

In [24]:
# print the purchase history and top-30 recommendations of one randomly drawn user
retrieve_recommendations(train, top_ns_30)

For user: A2BBZGVLEGIAVN:
Purchase History:
             asin                                              title
11307  B000LQJT1O  NongShim Bowl Noodle, Hot and Spicy, 3.03-Ounc...
24970  B001NJJOCW     Chef Boyardee Microwavable 8 Bowl Variety Pack
26014  B001UFSOXE  Coffee Masters Around The World In Twelve Coff...

Recommending:

          asin                                              title
0   B001NZLUYG                                 Shanti's Spice Box
1   B000H241DS                        Sushi Chef Sushi Making Kit
2   B001GVIT6E  Jacobs Kronung Coffee, 17.6-Ounce Vacuum Packs...
3   B00826F95K  Igourmet Four Continents of Cheese on a Budget...
4   B000OUX2QA             Ras El Hanout 4.0 Oz By Zamouri Spices
5   B000EZZISK                  Tone's Onion Powder - 20oz shaker
6   B000BBY7ZC                              Nature's Best Bouquet
7   B00631TI2Q  Kirkland Signature Whole Black Peppercorns, 14...
8   B001VNEBSC   Frontier Cinnamon Powder, Korintje (a Grade) ...
9   

## N=45

In [25]:
# per-user top-45 recommendation lists (first element of the stored tuple)
top_ns_45 = n_recommendations[45][0]

In [26]:
# print the purchase history and top-45 recommendations of one randomly drawn user
retrieve_recommendations(train, top_ns_45)

For user: A2MJZT59XTMN6U:
Purchase History:
             asin                                              title
12868  B000V1RBRS  Bob's Red Mill Pancake &amp; Waffle Mix Hi Fib...
47070  B00BY6NN30  Yogi Tea, Cinnamon Vanilla Healthy Skin, 16 Co...

Recommending:

          asin                                              title
0   B005ER1JI6  Life Cereal, Maple and Brown Sugar, 13-Ounce (...
1   B001EQ5NLU  Spam with Real Hormel Bacon, 12 Ounce Can (Pac...
2   B00286M6TW  Hormel  Compleats Beef Steak &amp; Peppers wit...
3   B000EA2D9W  Billington's Natural Light Brown Muscovado Sug...
4   B001E5E29A  Stonewall Kitchen Farmhouse Pancake and Waffle...
5   B000MIFS3Y  Chef Boyardee Beef Ravioli, 14.25-Ounce Microw...
6   B000EITYUU  Fine Ground Celtic Sea Salt &ndash; (1) 16 Oun...
7   B002WTE0MQ                            Sunchang Gochujang 500g
8   B000E5GFQE       Bragg Organic Sprinkle Seasoning 1.50 Ounces
9   B000EF3E54  Crown Prince Natural Flat Fillets of Anchovies...
10  B00