In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from tqdm import tqdm

tqdm.pandas()

# Load Data

In [2]:
# global variable
CATEGORY = "Grocery_and_Gourmet_Food"
DATA_PATH = "data/evaluation"

In [3]:
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [4]:
# check first 5 rows
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...


In [5]:
# create reader
reader = Reader(rating_scale=(1,5))

# generate data required for surprise
data = Dataset.load_from_df(train[['reviewerID', 'asin', 'overall']], reader)

In [6]:
# generating training set
trainset = data.build_full_trainset()

# Training Funk's SVD

In [None]:
algo = SVD(n_factors=50, n_epochs=5, verbose=True)

In [None]:
# fitting to the trainset
algo.fit(trainset)

# Generate Predictions

In [None]:
testset = trainset.build_anti_testset()

In [None]:
%%time
# predict ratings for all pairs (u, i) that are NOT in the training set
predictions = algo.test(testset, verbose=False)

In [8]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in tqdm(predictions):
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in tqdm(top_n.items()):
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
top_ns = get_top_n(predictions)

# Evaluate Top-N Recommendations

### Defining Evaluation Metrics

In [7]:
def precision_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating precision@K - relevant / total recommended
    precision_at_k = num_relevant / k
    
    return precision_at_k

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

In [9]:
# loading test dataset
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

# generating test rating history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [None]:
# generating a random user
random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
print(f"For user: {random_user}:")
print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

# find the recommendations
print(f"\nRecommending:\n")
print(f"{train[train['asin'].isin([i[0] for i in top_ns[random_user]])][['asin', 'title']].drop_duplicates(subset='asin')}")

### N=10

In [None]:
top_ns = get_top_n(predictions, n=10)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 10
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

### N=25

In [None]:
top_ns = get_top_n(predictions, n=25)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 25
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

### N=30

In [None]:
top_ns = get_top_n(predictions, n=30)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 30
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

### N=45

In [None]:
top_ns = get_top_n(predictions, n=45)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 45
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

In [None]:
# looking at how many get correct
test_merged[test_merged['recall@k'] > 0]

# Evaluate `FunkMF` Class

In [10]:
from src.models import cf



In [16]:
# instantiating funk's svd/mf
funk_mf = cf.FunkMF(n_epochs=5)

In [17]:
# fitting to training data
funk_mf.fit(train)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [18]:
# check trainset and testset availability
funk_mf.testset[:5]

[('A23RYWDS884TUL', 'B00004S1C5', 4.244714698371499),
 ('A23RYWDS884TUL', 'B00005344V', 4.244714698371499),
 ('A23RYWDS884TUL', 'B0000CDEPD', 4.244714698371499),
 ('A23RYWDS884TUL', 'B0000CFPI2', 4.244714698371499),
 ('A23RYWDS884TUL', 'B0000CH39R', 4.244714698371499)]

In [19]:
%%time
# generating predictions
predictions = funk_mf.predict()

CPU times: user 6min 54s, sys: 2min 29s, total: 9min 23s
Wall time: 10min 44s


### N=10

In [20]:
top_ns = get_top_n(predictions, n=10)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 10
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████| 63307346/63307346 [00:53<00:00, 1186295.07it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:51<00:00, 259.87it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 44104.84it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46562.74it/s]

The MEM-ECF has a average precision@10: 0.00279, average recall@10: 0.01434.





### N=25

In [21]:
top_ns = get_top_n(predictions, n=25)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 25
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:56<00:00, 541456.69it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:57<00:00, 231.25it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 37074.18it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46545.27it/s]


The MEM-ECF has a average precision@25: 0.00210, average recall@25: 0.02678.


### N=30

In [22]:
top_ns = get_top_n(predictions, n=30)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 30
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:30<00:00, 701828.37it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:52<00:00, 252.91it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 38075.14it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 49282.01it/s]

The MEM-ECF has a average precision@30: 0.00202, average recall@30: 0.03096.





### N=45

In [23]:
top_ns = get_top_n(predictions, n=45)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 45
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:32<00:00, 687870.58it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:46<00:00, 288.41it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46487.42it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 48520.13it/s]

The MEM-ECF has a average precision@45: 0.00184, average recall@45: 0.04235.





In [24]:
# looking at how many get correct
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
10,A100VQNP6I54HS,"[B001VNEICQ, B004EKHN4I, B0057POYGY]","[B00DS842HS, B00017028M, B0054TWQMM, B000BD0SD...",0.022222,0.333333
18,A1025ZA8TGG21H,"[B000ED9L6C, B002BTI9B0]","[B001PEWJWC, B00DS842HS, B000EDG3UE, B00014JNI...",0.022222,0.500000
21,A1047EDJ84IMAS,"[B00014JNI0, B00014JNI0, B004CWO9Y0, B004I5KO9...","[B00014JNI0, B00271OPVU, B000Z93FQC, B0001CXUH...",0.022222,0.166667
26,A105S56ODHGJEK,"[B0025UCHRC, B005V9YXTO, B007JFXXJY, B00934WBRO]","[B00014JNI0, B005ZBZLT4, B001EO5U3I, B0001M0Z6...",0.022222,0.250000
42,A10BD0288TGRVS,"[B00017LEXE, B00017LEXE, B001D3K2GA, B0029XLH4Y]","[B00014JNI0, B00DS842HS, B000Z93FQC, B0001M0Z6...",0.022222,0.250000
...,...,...,...,...,...
13238,AZGV51M0UUJ8B,[B00DS842HS],"[B00014JNI0, B000Z93FQC, B00DS842HS, B00271OPV...",0.022222,1.000000
13251,AZNS7TH82KH9K,[B00DS842HS],"[B0001M0Z6Q, B00014JNI0, B000Z93FQC, B000ED9L6...",0.022222,1.000000
13260,AZQGJ5CEAJGXB,"[B005A1LINC, B00DS842HS]","[B00014JNI0, B00DS842HS, B000G82L62, B00271OPV...",0.022222,0.500000
13270,AZVJHW8TARWV9,[B001PEWJWC],"[B0001M0Z6Q, B00014JNI0, B00DS842HS, B000EDG3U...",0.022222,1.000000
