In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import warnings

from surprise import Dataset, Reader
from tqdm import tqdm

from src.models import cf

tqdm.pandas()
warnings.filterwarnings('ignore')



# Load Data

In [2]:
# global variables
DATA_PATH = "data/evaluation"
CATEGORY = "Grocery_and_Gourmet_Food"

# training parameters
N_EPOCHS = 10
LR_ALL = 0.005
BETA = 0.1

train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [3]:
# checking train dataframe
train.head().append(train.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...
47769,77420,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A192LQZWDYPR4U,5.0,Another quality Reese Peanut Butter Cup produc...,2014-02-27,quality reese peanut butter cup product great ...
47770,77421,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A2QKXW3LDQ66P5,5.0,I purchased these for my husband who has every...,2013-02-20,purchase husband love reeses valentine day pre...
47771,77430,B00ID9VSOM,"Viva Labs Organic Coconut Sugar: Non-GMO, Low-...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2P3TGJU301KXD,5.0,this stuff is INCREDIBILY yummy! SO much bette...,2014-07-15,stuff incredibily yummy good regular brown sug...
47772,77456,B00IRL93SY,Barrie House Kenya Estate - AA Single Cup Caps...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AEFE9VDHTQ199,5.0,"Very nice aroma, body and taste! Will buy this...",2014-05-24,nice aroma body taste buy coffee good coffee a...
47773,77508,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2AEZQ3DGBBLPR,2.0,This is a no go for diabetics according to my ...,2014-06-26,diabetic accord wife doctor order intention us...


# Preparing Dataset for Surprise's Algorithm

In [4]:
# create reader 
reader = Reader(rating_scale=(1,5))

# Utility Functions

In [5]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in tqdm(predictions):
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in tqdm(top_n.items()):
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

def novelty_at_k(item_popularity, predicted_asins, k=10):
    """
    """
    # finding avg novelty
    popularity_sum = item_popularity.loc[predicted_asins].sum()
    novelty_at_k = ((k*1) - popularity_sum) / k
    
    return novelty_at_k

def generate_item_popularity(train: pd.DataFrame) -> pd.DataFrame:
    """
    """
    
    # create a mapping of item popularatity
    # based on sum(item's review / max reviews) / no items
    max_reviews = (train.groupby(['asin'])
                   .agg({'processedReviewText': 'count'})
                   .max()
                   .values[0])
    item_popularity = (train.groupby(['asin'])
                       .agg({'processedReviewText': 'count'})
                       .apply(lambda x: x/max_reviews))
    
    return item_popularity
    

def evaluate_recommendations(top_ns: dict, user_rating_history: pd.DataFrame, item_popularity: pd.DataFrame, k=10) -> pd.DataFrame:
    """
    
    Args:
        top_ns
        user_rating_history
    """
    
    test_recommendations = pd.DataFrame(top_ns.items(), columns=["reviewerID", "pred_asin"])
    test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])
    
    # combined test history and recommendations
    test_merged = pd.merge(user_rating_history, test_recommendations, on="reviewerID", how="inner")
    
    # generating recall@k metrics
    test_merged["recall@k"] = test_merged.apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)
    test_merged["novelty@k"] = test_merged.apply(lambda x: novelty_at_k(item_popularity, x.pred_asin, k=k), axis=1)
    average_recall_at_k = test_merged["recall@k"].mean()
    average_novelty_at_k = test_merged["novelty@k"].mean()
    
    print(f"The MEM-ECF has an average recall@{k}: {average_recall_at_k:.5f}, average novelty@{k}: {average_novelty_at_k:.5f}")
    
    return test_merged

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [6]:
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [7]:
test.head().append(test.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,2,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1TCSC0YWT82Q0,5.0,I love ethnic foods and to cook them. I recent...,2013-08-03,love ethnic food cook recently purchase produc...
1,8,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1Z7Y2GMAP9SRY,5.0,I like to make my own curry but this is a tast...,2014-06-27,like curry tasty alternative use base kind dif...
2,23,B00004S1C5,"Ateco Food Coloring Kit, 6 colors","['Grocery & Gourmet Food', 'Cooking & Baking',...",A14YSMLYLJEMET,1.0,This product is no where near natural / organi...,2013-03-29,product near natural organic wish review purch...
3,31,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",A2F488C4PLWGEI,5.0,If my wife drinks a cup of this tea when she f...,2014-03-23,wife drink cup tea feel attack come help avoid...
4,32,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AO1HXV7DWZZIR,5.0,I don't know about the medicinal aspects of th...,2014-02-06,know medicinal aspect tea flavor downright scr...
28001,77519,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A1WT3TVHANP7ZF,3.0,Hmmm. I really wanted to love this sweetener. ...,2014-07-22,hmmm want love sweetener half sugar half stevi...
28002,77520,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A3NEAETOSXDBOM,5.0,"I confess I have a sweet tooth, and love the t...",2014-06-30,confess sweet tooth love taste sugar recognize...
28003,77521,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",AD1ZOPB0BBEHB,4.0,"It has a little of the stevia aftertaste, but ...",2014-07-17,little stevia aftertaste fair compromise able ...
28004,77522,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A18ECVX2RJ7HUE,5.0,i love marinade for grilled flank steak or lon...,2014-05-30,love marinade grilled flank steak london broil...
28005,77523,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2G04D4QZAXL15,3.0,I've been using Truvia (a form of stevia) on m...,2014-05-27,use truvia form stevia cereal greek yogurt yea...


In [8]:
# generating test history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [9]:
print(test_user_history)

                  reviewerID  \
0      A00177463W0XWB16A9O05   
1      A022899328A0QROR32DCT   
2      A068255029AHTHDXZURNU   
3      A06944662TFWOKKV4GJKX   
4             A1004703RC79J9   
...                      ...   
13274          AZWRZZAMX90VT   
13275          AZXKAH2DE6C8A   
13276          AZXON596A1VXC   
13277          AZYXC63SS008M   
13278          AZZ5ASC403N74   

                                                    asin  
0                               [B00474OR8G, B00BFM6OAW]  
1                                           [B00CMQDKES]  
2                                           [B001FA1K2G]  
3                                           [B000GFYRHG]  
4                                           [B003GTR8IO]  
...                                                  ...  
13274  [B0007R9L4M, B000CN7BMA, B001EQ5D1K, B002VT3GX...  
13275   [B000MAK41I, B004X8TJP2, B006H34CUS, B007W14RMM]  
13276                           [B001EO5S0I, B00271QQ7Q]  
13277                    

# Instantiate FunkSVD (Matrix Factorization)

In [10]:
# instantiating funksvd 
funk_svd = cf.FunkMF(n_epochs=N_EPOCHS, lr_all=LR_ALL, reg_all=BETA)

In [11]:
# fitting to the training data
funk_svd.fit(train)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


In [12]:
%%time
# creating testset for prediction
testset = funk_svd.trainset.build_anti_testset()

CPU times: user 32 s, sys: 1.43 s, total: 33.4 s
Wall time: 33.8 s


In [13]:
%%time
# generate candidate times
candidate_items = funk_svd.test(testset)

CPU times: user 6min 49s, sys: 1min 38s, total: 8min 28s
Wall time: 9min 1s


## Loop through N = {10, 25, 30, 45}

In [14]:
# generate item popularity
item_popularity = generate_item_popularity(train)

In [15]:
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    top_ns = get_top_n(candidate_items, n)
    # evaluate how well the recommended items predicted the future purchases
    n_recommended_items = evaluate_recommendations(top_ns, test_user_history, item_popularity, n)
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63307346/63307346 [00:41<00:00, 1530531.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:46<00:00, 285.31it/s]


The MEM-ECF has an average recall@10: 0.01318, average novelty@10: 0.87762


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63307346/63307346 [01:11<00:00, 887807.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:42<00:00, 315.00it/s]


The MEM-ECF has an average recall@25: 0.02329, average novelty@25: 0.89942


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63307346/63307346 [01:08<00:00, 917753.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:42<00:00, 314.50it/s]


The MEM-ECF has an average recall@30: 0.02630, average novelty@30: 0.90351


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63307346/63307346 [01:08<00:00, 924059.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13397/13397 [00:39<00:00, 339.89it/s]


The MEM-ECF has an average recall@45: 0.03396, average novelty@45: 0.91186


# Evaluate N-Recommendations

In [16]:
def retrieve_recommendations(train: pd.DataFrame, top_ns: dict):
    """
    """
    # generating a random user
    random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
    print(f"For user: {random_user}:")
    print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

    # find the recommendations
    print(f"\nRecommending:\n")
    recommendations = (train[train['asin']
                             .isin([i[0] for i in top_ns[random_user]])][['asin', 'title']]
                       .drop_duplicates(subset='asin')
                       .set_index('asin'))
    print(f"{recommendations.loc[[i[0] for i in top_ns[random_user]]].reset_index()}")

## N=10

In [17]:
top_ns_10 = n_recommendations[10][0]
retrieve_recommendations(train, top_ns_10)

For user: A28RSMADFCBJDT:
Purchase History:
             asin                                              title
25175  B001OCKIBY  Planters Big Nut Bars, Roasted Triple Nut, 5-C...
29074  B002D4DY8G  Gevalia Dark Chocolate Truffle Ground Coffee, ...
30677  B003120YHI  Sun Crystals Stevia and Sugar Cane Granular Sw...
34756  B004CYLW7A  Crunchy Nut Roasted Nut and Honey, 10.8-Ounce ...

Recommending:

         asin                                              title
0  B003OGKCDC  Nature's Way Organic Extra Virgin Coconut Oil-...
1  B00DS842HS  Viva Naturals Organic Extra Virgin Coconut Oil...
2  B0054TWQMM         Nutiva Organic White Chia Seeds 12oz2 pack
3  B000F4D5GC  Let's Do Organic Shredded, Unsweetened Coconut...
4  B0001M0Z6Q  Spicy World Peppercorn (Whole)-Black Tellicher...
5  B002AUF0Q2   Glutenfreeda Gluten Free Instant Oatmeal, Var...
6  B001O1Q0NA  The Spice Lab Pink Himalayan Salt - 1 Pound X-...
7  B00014JNI0  YS Organic Bee Farms CERTIFIED ORGANIC RAW HON...
8  B000G82

## N=25

In [18]:
top_ns_25 = n_recommendations[25][0]
retrieve_recommendations(train, top_ns_25)

For user: A3L83OV2D94LHJ:
Purchase History:
             asin                                              title
9430   B000HDK0DC  YumEarth Organic Lollipops, Assorted Flavors, ...
21685  B001EQ55PE   Eight O&#39;Clock Coffee, Original Whole Bean...
21955  B001EQ5IAG  NOW Foods Erythritol Pure Sweetener,16-Ounce (...
30519  B002ZOG2K6  Eight O'Clock Coffee, Colombian Peaks Whole Be...
39376  B00511MJ2K  Wholesome Sweeteners Zero Calorie Free Pouch, ...
45081  B007TGO1TY  NECTRESSE Natural No Calorie Sweetener, 140 Se...

Recommending:

          asin                                              title
0   B0001M0Z6Q  Spicy World Peppercorn (Whole)-Black Tellicher...
1   B00DS842HS  Viva Naturals Organic Extra Virgin Coconut Oil...
2   B000EDG3UE  Bob's Red Mill Organic Grain Quinoa, 26 Ounce ...
3   B000F4D5GC  Let's Do Organic Shredded, Unsweetened Coconut...
4   B003OGKCDC  Nature's Way Organic Extra Virgin Coconut Oil-...
5   B000EDDS6Q  Bob's Red Mill Old Country Style Muesli Cerea

## N=30

In [19]:
top_ns_30 = n_recommendations[30][0]
retrieve_recommendations(train, top_ns_30)

For user: AEMSGB00FL3YO:
Purchase History:
             asin                                              title
23063  B001GVIS4M  Yogourmet Freeze Dried Yogurt Starter, 1 ounce...
27480  B00282UD0K  SweetLeaf Sweet Drops Liquid Stevia Sweetener,...
34062  B00462Z2QA  Red Rain Energy Drink, Regular, 16-Ounce Cans ...
36055  B004JRMG98      Taco Bell Jalapeno Sauce, 8-Ounce (Pack of 6)

Recommending:

          asin                                              title
0   B000EDDS6Q  Bob's Red Mill Old Country Style Muesli Cereal...
1   B001O1Q0NA  The Spice Lab Pink Himalayan Salt - 1 Pound X-...
2   B00DS842HS  Viva Naturals Organic Extra Virgin Coconut Oil...
3   B000HDJXLW  Muir Glen Canned Tomatoes, Organic Diced Tomat...
4   B00014JNI0  YS Organic Bee Farms CERTIFIED ORGANIC RAW HON...
5   B00338DSQ4      Barilla Spaghetti Pasta, 32 Ounce (Pack of 6)
6   B0001M0Z6Q  Spicy World Peppercorn (Whole)-Black Tellicher...
7   B001EQ5JLE   PG Tips Black Tea, Pyramid Tea Bags, 240-Coun...
8 

## N=45

In [20]:
top_ns_45 = n_recommendations[45][0]
retrieve_recommendations(train, top_ns_45)

For user: A1JS6NH1BYZA12:
Purchase History:
             asin                                              title
6220   B000EVT08S      Haribo Gummi Candy, Techno Bears, 5-Pound Bag
21395  B001EQ54J6  DeLallo Grated Parmesan, 8-Ounce Units (Pack o...

Recommending:

          asin                                              title
0   B001O1Q0NA  The Spice Lab Pink Himalayan Salt - 1 Pound X-...
1   B0001M0Z6Q  Spicy World Peppercorn (Whole)-Black Tellicher...
2   B000G82L62  Lundberg Family Farms Wild Blend Rice, 16 Ounc...
3   B000EDDS6Q  Bob's Red Mill Old Country Style Muesli Cereal...
4   B000EDG3UE  Bob's Red Mill Organic Grain Quinoa, 26 Ounce ...
5   B00DS842HS  Viva Naturals Organic Extra Virgin Coconut Oil...
6   B003OGKCDC  Nature's Way Organic Extra Virgin Coconut Oil-...
7   B0001CXUHW                   Saf Instant Yeast, 1 Pound Pouch
8   B00014JNI0  YS Organic Bee Farms CERTIFIED ORGANIC RAW HON...
9   B001PEWJWC  Garbanzo Beans aka Chickpeas or Ceci Beans | N...
10  B00