In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from tqdm import tqdm

# register `progress_apply` on pandas objects so the row-wise metric
# computations further down can display a tqdm progress bar
tqdm.pandas()

# Load Data

In [2]:
# Notebook-wide configuration: the dataset category and the directory holding
# its CSV splits. Both are interpolated into the train/test file paths below.
CATEGORY = "Grocery_and_Gourmet_Food"
DATA_PATH = "data/evaluation"

In [3]:
# load the training split for the configured category
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [4]:
# check first 5 rows
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...


In [None]:
# create reader; rating_scale gives surprise the (min, max) of the ratings
# NOTE(review): assumes every `overall` value lies within [1, 5] — confirm
reader = Reader(rating_scale=(1,5))

# generate data required for surprise; the columns are passed in
# (user id, item id, rating) order
data = Dataset.load_from_df(train[['reviewerID', 'asin', 'overall']], reader)

In [None]:
# generating training set from the whole of `data` (no hold-out split is
# made here; the test split is loaded from its own CSV later on)
trainset = data.build_full_trainset()

# Training Funk's SVD

In [None]:
# Funk SVD with 50 latent factors trained for 5 epochs.
# random_state pins the random initialisation of the factors so that a
# Restart-and-Run-All reproduces the same model and the same metrics.
algo = SVD(n_factors=50, n_epochs=5, verbose=True, random_state=42)

In [None]:
# fitting to the trainset
algo.fit(trainset)

# Generate Predictions

In [None]:
# candidate (user, item) pairs to score; the next cell describes them as all
# pairs that are NOT in the training set, so this list can be very large
testset = trainset.build_anti_testset()

In [None]:
%%time
# predict ratings for all pairs (u, i) that are NOT in the training set
predictions = algo.test(testset, verbose=False)

In [5]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendations for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Every entry must
            unpack into five fields, of which the 1st (user id), 2nd (item
            id) and 4th (estimated rating) are used.
        n(int): The maximum number of recommendations to output for each
            user. Must be non-negative. Default is 10.

    Returns:
    A defaultdict(list) where keys are user (raw) ids and values are lists of
    tuples [(raw item id, rating estimation), ...] ordered by decreasing
    estimation and holding at most n entries (fewer when a user has fewer
    than n predictions). Because it is a defaultdict, indexing it with an
    unknown user id yields (and stores) an empty list instead of raising.

    Raises:
        ValueError: If n is negative.
    """
    # A negative n would make the slice below drop items from the tail
    # instead of keeping the head, silently returning the wrong lists.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _ in tqdm(predictions):
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    # list.sort is stable, so items with equal estimates keep the order in
    # which they appeared in `predictions`.
    for uid, user_ratings in tqdm(top_n.items()):
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over its items.
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
# top-N recommendations per user, using get_top_n's default of n=10
top_ns = get_top_n(predictions)

# Evaluate Top-N Recommendations

### Defining Evaluation Metrics

In [6]:
def precision_at_k(asins, predicted_asins, k=10):
    """Compute precision@k for one user.

    Args:
        asins: Iterable of item ids the user actually interacted with
            (the relevant items).
        predicted_asins: Iterable of recommended item ids, best first.
            Only the first k entries are considered.
        k(int): Cutoff of the recommendation list. Must be positive.
            Default is 10.

    Returns:
        float: (# of distinct relevant items among the first k
        recommendations) / k.

    Raises:
        ValueError: If k is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    relevant = set(asins)
    # Only the first k recommendations may count; otherwise a list longer
    # than k could push the score above 1.
    top_k = list(predicted_asins)[:k]
    # set.intersection de-duplicates, so a repeated recommendation scores once
    num_relevant = len(relevant.intersection(top_k))

    # precision@K - relevant recommended / total recommended
    return num_relevant / k

def recall_at_k(asins, predicted_asins, k=10):
    """Compute recall@k for one user.

    Args:
        asins: Iterable of item ids the user actually interacted with
            (the relevant items). Duplicates are counted once.
        predicted_asins: Iterable of recommended item ids, best first.
            Only the first k entries are considered.
        k(int): Cutoff of the recommendation list. Must be positive.
            Default is 10.

    Returns:
        float: (# of distinct relevant items among the first k
        recommendations) / (# of distinct relevant items), or 0.0 when the
        user has no relevant items.

    Raises:
        ValueError: If k is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    relevant = set(asins)
    # Nothing to recall: report 0.0 instead of dividing by zero.
    if not relevant:
        return 0.0

    # Honour the cutoff so the value really is recall *at k*.
    top_k = list(predicted_asins)[:k]
    num_relevant = len(relevant.intersection(top_k))

    # recall@K - relevant recommended / total relevant items. The
    # denominator uses the de-duplicated set so that it is counted the same
    # way as the numerator.
    return num_relevant / len(relevant)

In [7]:
# read the held-out split for the configured category
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

# collapse the test split to one row per reviewer, with the list of asins
# that reviewer has in the test split
test_user_history = (
    test.groupby(['reviewerID'])['asin']
    .apply(list)
    .reset_index()
)

In [None]:
# draw one reviewer at random from the training users
known_users = list(train['reviewerID'].unique())
random_user = np.random.choice(known_users, 1)[0]

# what this reviewer has in the training data
purchase_history = train[train['reviewerID'] == random_user][['asin', 'title']]
print(f"For user: {random_user}:")
print(f"Purchase History:\n{purchase_history}")

# look up titles for the recommended asins; rows come back in the order of
# `train`, not in ranking order
recommended_asins = [iid for iid, _ in top_ns[random_user]]
recommended_items = (
    train[train['asin'].isin(recommended_asins)][['asin', 'title']]
    .drop_duplicates(subset='asin')
)
print("\nRecommending:\n")
print(f"{recommended_items}")

### N=10

In [None]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 10
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this notebook evaluates Funk SVD — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

### N=25

In [None]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 25
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this notebook evaluates Funk SVD — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

### N=30

In [None]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 30
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this notebook evaluates Funk SVD — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

### N=45

In [None]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 45
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this notebook evaluates Funk SVD — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

In [None]:
# users in the current `test_merged` for whom at least one recommended item
# appears in their test history (recall@k > 0)
test_merged[test_merged['recall@k'] > 0]

# Evaluate `FunkMF` Class

In [8]:
from src.models import cf



In [9]:
# instantiating funk's svd/mf
funk_mf = cf.FunkMF(n_epochs=20, lr_all=0.01)

In [10]:
# fitting to training data
funk_mf.fit(train)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [None]:
# peek at the first 5 entries of the model's testset to confirm it is available
funk_mf.testset[:5]

In [11]:
%%time
# generating predictions
predictions = funk_mf.predict()

CPU times: user 6min 18s, sys: 1min 26s, total: 7min 45s
Wall time: 8min 29s


### N=10

In [12]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 10
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this section evaluates the FunkMF class — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:15<00:00, 840487.24it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [01:15<00:00, 177.09it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46170.96it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 52741.33it/s]

The MEM-ECF has a average precision@10: 0.00069, average recall@10: 0.00309.





### N=25

In [13]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 25
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this section evaluates the FunkMF class — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [04:43<00:00, 223209.37it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:57<00:00, 231.29it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46216.13it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 44950.01it/s]


The MEM-ECF has a average precision@25: 0.00069, average recall@25: 0.00865.


### N=30

In [14]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 30
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this section evaluates the FunkMF class — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:28<00:00, 718307.85it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [01:14<00:00, 178.75it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 42995.70it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 45999.36it/s]


The MEM-ECF has a average precision@30: 0.00069, average recall@30: 0.01029.


### N=45

In [15]:
# single cutoff shared by the recommender (top-n) and the metrics (@k)
k = 45
top_ns = get_top_n(predictions, n=k)

# one row per user, keeping only the recommended item ids (estimates dropped)
test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(
    lambda recs: [iid for iid, _ in recs]
)

# inner join: only users present in both the test history and the
# recommendations are scored
test_merged = test_user_history.merge(test_recommendations, on='reviewerID', how='inner')

test_merged['precision@k'] = test_merged.progress_apply(
    lambda row: precision_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)
test_merged['recall@k'] = test_merged.progress_apply(
    lambda row: recall_at_k(row['asin'], row['pred_asin'], k=k), axis=1
)

average_precision_at_k = test_merged['precision@k'].mean()
average_recall_at_k = test_merged['recall@k'].mean()

# NOTE(review): the label says "MEM-ECF" although this section evaluates the FunkMF class — confirm the intended model name
print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:41<00:00, 620753.80it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [01:22<00:00, 161.67it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 42421.66it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 45020.13it/s]

The MEM-ECF has a average precision@45: 0.00067, average recall@45: 0.01507.





In [16]:
# users in the current `test_merged` for whom at least one recommended item
# appears in their test history (recall@k > 0)
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
7,A100DXY4SLAMPM,"[B000FK63QA, B000KEJMRI]","[B0000DI085, B0000IJYK4, B00015HNMM, B0001CXUH...",0.022222,0.500000
37,A10AFVU66A79Y1,"[B000E1FZHS, B000JMAVYO, B001E6K6B2, B002NKPCZ...","[B002HG9R1I, B004LKVRKM, B000EITYUU, B000EDG3U...",0.022222,0.100000
44,A10BWUA2MGA9BK,[B000S8593W],"[B000216O16, B0002YGSJQ, B000EDDS6Q, B000EDK5L...",0.022222,1.000000
171,A11OQUV1ZI2MT2,"[B002DM62BY, B008YUL4KI, B00HKGB9ZW]","[B000KEPB9Q, B002HG9R1I, B000EMM976, B000F4DKA...",0.022222,0.333333
173,A11OTLEDSW8ZXD,"[B002RBRY0Y, B007K5KAJY, B00BNR7I18, B00C1CLQG...","[B0002YGSJQ, B000E5GFQE, B000EDG3UE, B000H11C6...",0.022222,0.111111
...,...,...,...,...,...
13213,AZ61VB6SPTQWJ,[B000JMAVYO],"[B0000DI085, B00014JNI0, B0001EJ4CU, B0001M0Z6...",0.022222,1.000000
13217,AZ6OA110XCE5F,"[B000EVE3YE, B001ELL86Y]","[B0000DI085, B00014JNI0, B0001M0Z6Q, B000216O1...",0.022222,0.500000
13219,AZ8C1QH6OQ7T5,[B003OGKCDC],"[B0000DI085, B0000IJYK4, B00015HNMM, B0001EJ4C...",0.022222,1.000000
13251,AZNS7TH82KH9K,[B00DS842HS],"[B0000DI085, B00014JNI0, B0001EJ4CU, B000G82L6...",0.022222,1.000000
