In [1]:
# set the environment path to find Recommenders
import sys
sys.path.append("../../")

import logging
import time

import numpy as np
import pandas as pd
import papermill as pm

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.sar import SAR

# Record the interpreter and pandas versions that produced the outputs below.
version_report = [
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
]
print("\n".join(version_report))

System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]
Pandas version: 0.24.2


In [2]:
# Number of top items to recommend per user. The same value is used as the
# cutoff `k` for the ranking metrics (MAP, NDCG, precision, recall) and to
# truncate the per-user ground truth shown at the end of the notebook.
TOP_K = 10

In [3]:
# MovieLens dataset variant to load; passed as `size` to movielens.load_pandas_df.
MOVIELENS_DATA_SIZE = '100k'

In [4]:
# Load the MovieLens ratings of the configured size into a pandas DataFrame.
data = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE)

# Keep ratings as 32-bit floats to reduce memory consumption.
data = data.astype({'rating': np.float32})

# Preview the first rows.
data.head()

4.93MB [00:07, 692kB/s]                                                                                                


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [5]:
# Stratified split with a 0.75 ratio for the training set; the fixed seed makes
# the split reproducible across runs.
train, test = python_stratified_split(
    data,
    ratio=0.75,
    col_user='userID',
    col_item='itemID',
    seed=42,
)

In [7]:
# Collect, for each split, the number of ratings and of distinct users/items,
# keyed by the placeholder names used in the report template below.
split_counts = {}
for split_name, split_frame in (("train", train), ("test", test)):
    split_counts[split_name + "_total"] = len(split_frame)
    split_counts[split_name + "_users"] = len(split_frame['userID'].unique())
    split_counts[split_name + "_items"] = len(split_frame['itemID'].unique())

print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(**split_counts))


Train:
Total Ratings: 74992
Unique Users: 943
Unique Items: 1649

Test:
Total Ratings: 25008
Unique Users: 943
Unique Items: 1444



In [8]:
# Configure root logging with timestamped records so the model's progress
# messages show up in the cell outputs.
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.DEBUG,
)

# Column names of the ratings DataFrame that SAR should read.
sar_columns = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
)

# Item-similarity metric and time-decay settings.
# NOTE(review): time_decay_coefficient is presumably a half-life in days -- confirm against the SAR docs.
sar_settings = dict(
    similarity_type="jaccard",
    time_decay_coefficient=30,
    timedecay_formula=True,
)

model = SAR(**sar_columns, **sar_settings)

In [9]:
# Fit SAR on the training split and report how long it took.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration is not affected by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()

model.fit(train)

train_time = time.perf_counter() - start_time
print("Took {} seconds for training.".format(train_time))

2019-08-20 16:06:00,879 INFO     Collecting user affinity matrix
2019-08-20 16:06:00,956 INFO     Calculating time-decayed affinities
2019-08-20 16:06:01,788 INFO     Creating index columns
2019-08-20 16:06:01,932 INFO     Building user affinity sparse matrix
2019-08-20 16:06:02,080 INFO     Calculating item co-occurrence
2019-08-20 16:06:02,695 INFO     Calculating item similarity
2019-08-20 16:06:02,713 INFO     Using jaccard based similarity
2019-08-20 16:06:04,112 INFO     Done training


Took 3.2928342819213867 seconds for training.


In [10]:
# Generate recommendations for the users in the test split and time the call.
# perf_counter() is a monotonic clock, so the duration is immune to system
# clock adjustments.
start_time = time.perf_counter()

# Request exactly TOP_K items per user so the recommendation list length stays
# in sync with the `k=TOP_K` cutoff used by the evaluation metrics.
# remove_seen=True excludes items the user already rated in training.
top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

test_time = time.perf_counter() - start_time
print("Took {} seconds for prediction.".format(test_time))

2019-08-20 16:06:40,731 INFO     Calculating recommendation scores
2019-08-20 16:06:41,633 INFO     Removing seen items


Took 1.354874610900879 seconds for prediction.


In [11]:
# Preview the first rows of the recommendations (userID, itemID, prediction).
# NOTE(review): in the recorded output the scores for a user appear in
# ascending order, so rows are apparently not ranked by score -- confirm
# before relying on row order.
display(top_k.head())

Unnamed: 0,userID,itemID,prediction
0,1,58,3.049881
1,1,7,3.053073
2,1,318,3.059262
3,1,210,3.095604
4,1,96,3.124997


In [12]:
# Mean average precision of the recommendations against the test split, cut off at TOP_K.
eval_map = map_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [13]:
# Normalized discounted cumulative gain of the recommendations against the test split, cut off at TOP_K.
eval_ndcg = ndcg_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [14]:
# Precision of the recommendations against the test split, cut off at TOP_K.
eval_precision = precision_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [15]:
# Recall of the recommendations against the test split, cut off at TOP_K.
eval_recall = recall_at_k(
    test,
    top_k,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    k=TOP_K,
)

In [16]:
# Summarise the evaluation as one "label<TAB>value" entry per line.
# The model entry reports the class name of the evaluated model so that the
# "Model:" label is not left without a value.
metric_lines = [
    "Model:\t%s" % type(model).__name__,
    "Top K:\t%d" % TOP_K,
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print(*metric_lines, sep='\n')

Model:	
Top K:	10
MAP:	0.110591
NDCG:	0.382461
Precision@K:	0.330753
Recall@K:	0.176385


In [20]:
# Now let's look at the results for a specific user
user_id = 876

# The user's highest-rated items in the test split (at most TOP_K of them).
ground_truth = test[test['userID'] == user_id].sort_values(by='rating', ascending=False).head(TOP_K)

# Request exactly TOP_K unseen items so the recommendation list uses the same
# cutoff as the ground truth above.
prediction = model.recommend_k_items(
    pd.DataFrame(dict(userID=[user_id])),
    top_k=TOP_K,
    remove_seen=True,
)

# The left join keeps every ground-truth row; `prediction` is NaN for items
# that were not among the recommendations.
pd.merge(ground_truth, prediction, on=['userID', 'itemID'], how='left')

2019-08-20 16:14:03,799 INFO     Calculating recommendation scores
2019-08-20 16:14:04,516 INFO     Removing seen items


Unnamed: 0,userID,itemID,rating,timestamp,prediction
0,876,523,5.0,879428378,
1,876,529,4.0,879428451,
2,876,174,4.0,879428378,0.353567
3,876,276,4.0,879428354,
4,876,288,3.0,879428101,


In [21]:
# Let's look at the results of user #3
user_id = 3

# The user's highest-rated items in the test split (at most TOP_K of them).
ground_truth = test[test['userID'] == user_id].sort_values(by='rating', ascending=False).head(TOP_K)

# Request exactly TOP_K unseen items so the recommendation list uses the same
# cutoff as the ground truth above.
prediction = model.recommend_k_items(
    pd.DataFrame(dict(userID=[user_id])),
    top_k=TOP_K,
    remove_seen=True,
)

# The left join keeps every ground-truth row; `prediction` is NaN for items
# that were not among the recommendations.
pd.merge(ground_truth, prediction, on=['userID', 'itemID'], how='left')

2019-08-20 16:14:23,160 INFO     Calculating recommendation scores
2019-08-20 16:14:23,877 INFO     Removing seen items


Unnamed: 0,userID,itemID,rating,timestamp,prediction
0,3,346,5.0,889237455,
1,3,327,4.0,889237455,4.726932
2,3,181,4.0,889237482,
3,3,303,3.0,889236983,
4,3,354,3.0,889237004,
5,3,271,3.0,889237224,4.796582
6,3,350,3.0,889237076,
7,3,351,3.0,889237315,
8,3,345,3.0,889237004,
9,3,268,3.0,889236961,4.475295


In [22]:
# Let's look at the results of user #17
user_id = 17

# The user's highest-rated items in the test split (at most TOP_K of them).
ground_truth = test[test['userID'] == user_id].sort_values(by='rating', ascending=False).head(TOP_K)

# Request exactly TOP_K unseen items so the recommendation list uses the same
# cutoff as the ground truth above.
prediction = model.recommend_k_items(
    pd.DataFrame(dict(userID=[user_id])),
    top_k=TOP_K,
    remove_seen=True,
)

# The left join keeps every ground-truth row; `prediction` is NaN for items
# that were not among the recommendations.
pd.merge(ground_truth, prediction, on=['userID', 'itemID'], how='left')


2019-08-20 16:16:15,232 INFO     Calculating recommendation scores
2019-08-20 16:16:16,273 INFO     Removing seen items


Unnamed: 0,userID,itemID,rating,timestamp,prediction
0,17,150,5.0,885272654,
1,17,100,4.0,885272520,1.753743
2,17,222,3.0,885272751,1.485181
3,17,111,3.0,885272674,1.459573
4,17,245,2.0,885166209,
5,17,125,1.0,885272538,1.453929
6,17,323,1.0,885166256,
