In [2]:
import pandas as pd
import datetime

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot

from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

from sklearn.model_selection import train_test_split as  train_test_split_sklearn

df = pd.read_csv("data/user-item-interaction.csv")

In [3]:
print(df.shape)
df.tail()

(2633521, 8)


Unnamed: 0,event_time,order_id,product_id,category_id,category_code,brand,price,user_id
2633516,2020-11-21 10:10:01 UTC,2388440981134693942,1515966223526602848,2.268105e+18,electronics.smartphone,oppo,138.87,1.515916e+18
2633517,2020-11-21 10:10:13 UTC,2388440981134693943,1515966223509089282,2.268105e+18,electronics.smartphone,apple,418.96,1.515916e+18
2633518,2020-11-21 10:10:30 UTC,2388440981134693944,1515966223509089917,2.268105e+18,appliances.personal.scales,vitek,12.48,1.515916e+18
2633519,2020-11-21 10:10:30 UTC,2388440981134693944,2273948184839454837,2.268105e+18,,moulinex,41.64,1.515916e+18
2633520,2020-11-21 10:10:30 UTC,2388440981134693944,1515966223509127566,2.268105e+18,appliances.kitchen.blender,redmond,53.22,1.515916e+18


In [4]:
df.drop_duplicates('order_id', inplace=True)

In [5]:
df.dropna(
    subset=['event_time', 'order_id', 'product_id', 'category_id', 'price', 'user_id'],
    inplace=True
)

#To run faster we will use data from last month
df = df[df['event_time'] >= "2020-10-21"]

# Transform product_id, user_id and category_id into sequential values to a better visualization
# to sequential values
df['product_id'] = df['product_id'].rank(method='dense', ascending=False).astype(int)
df['user_id'] = df['user_id'].rank(method='dense', ascending=False).astype(int)
df['category_id'] = df['category_id'].rank(method='dense', ascending=False).astype(int)



df.reset_index(drop=True ,inplace=True)
df

Unnamed: 0,event_time,order_id,product_id,category_id,category_code,brand,price,user_id
0,2020-10-21 00:44:29 UTC,2388440981134624539,3890,2,electronics.video.tv,xiaomi,347.20,4336
1,2020-10-21 01:01:21 UTC,2388440981134625049,8121,446,computers.notebook,apple,1620.30,3958
2,2020-10-21 01:13:07 UTC,2388440981134625050,551,317,electronics.smartphone,samsung,694.42,3776
3,2020-10-21 01:29:43 UTC,2388440981134625051,8539,317,electronics.smartphone,apple,1317.57,3786
4,2020-10-21 01:34:49 UTC,2388440981134625052,3109,429,,,749.98,12450
...,...,...,...,...,...,...,...,...
62961,2020-11-21 10:08:14 UTC,2388440981134693940,2176,444,,xiaomi,16.18,9556
62962,2020-11-21 10:08:54 UTC,2388440981134693941,3605,2,electronics.video.tv,samsung,1736.09,1828
62963,2020-11-21 10:10:01 UTC,2388440981134693942,3338,317,electronics.smartphone,oppo,138.87,3
62964,2020-11-21 10:10:13 UTC,2388440981134693943,8584,317,electronics.smartphone,apple,418.96,1


In [29]:
split_date = "2020-11-14" # 3 weeks to train, one to test

train = df[df['event_time'] < split_date]
test = df[df['event_time'] >= split_date]

In [33]:
surprise_df = train.groupby(["product_id", 'user_id'],as_index=False).size()
surprise_df.rename(columns={'size': 'rating'}, inplace=True)
surprise_df['rating'].describe()
surprise_df['rating'] = surprise_df['rating'].apply(lambda x: x if x < 3 else 4)



In [34]:
print(surprise_df.product_id.unique().shape)
print(surprise_df.user_id.unique().shape)

surprise_df

(7502,)
(10284,)


Unnamed: 0,product_id,user_id,rating
0,42,5274,1
1,42,6883,1
2,44,449,1
3,44,1015,1
4,44,1212,1
...,...,...,...
38237,8715,4364,4
38238,8715,5370,1
38239,8715,5504,1
38240,8715,7121,1


## rating distribution

In [38]:

data = surprise_df['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / surprise_df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} product-ratings'.format(train.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [39]:
from surprise import Dataset
from surprise import Reader

reader = Reader(rating_scale=(1, 4))
data = Dataset.load_from_df(surprise_df, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7f6f535eb970>

In [40]:
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split

trainset, testset = train_test_split(data, test_size=.25)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4664  0.4647  0.4752  0.4687  0.0046  
MAE (testset)     0.2266  0.2263  0.2293  0.2274  0.0013  
Fit time          0.80    0.86    0.82    0.82    0.02    
Test time         0.04    0.04    0.04    0.04    0.00    


{'test_rmse': array([0.46636754, 0.46469502, 0.47515447]),
 'test_mae': array([0.22663949, 0.2262801 , 0.22930631]),
 'fit_time': (0.7983338832855225, 0.8567812442779541, 0.8157951831817627),
 'test_time': (0.04352712631225586, 0.04456067085266113, 0.043941497802734375)}

In [41]:
predictions = svd.test(testset)
test = pd.DataFrame(predictions)

test = test.rename(columns={'uid':'user_id', 'iid': 'product_id', 
                            'r_ui':'actual', 'est':'prediction'})

cf_model = test.pivot_table(index='user_id', 
                            columns='product_id', values='prediction').fillna(0)


cf_model

product_id,98,106,172,258,336,338,367,428,431,432,...,12954,12962,12965,12966,12969,12975,12976,12979,13003,13011
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
def get_users_predictions(user_id, n, model):
    recommended_items = pd.DataFrame(model.loc[user_id])
    recommended_items.columns = ["predicted_rating"]
    recommended_items = recommended_items.sort_values('predicted_rating', ascending=False)    
    recommended_items = recommended_items.head(n)
    return recommended_items.index.tolist()

def get_recs(model, k):
    recs = []
    for user in model.index:
        cf_predictions = get_users_predictions(user, k, model)
        recs.append(cf_predictions)
    return recs

In [102]:
# Top-10 recommendations for each user
k = 100
recs = get_recs(cf_model, k)
preds = pd.DataFrame(index=cf_model.index)
preds["recs"] = recs
preds

Unnamed: 0_level_0,recs
user_id,Unnamed: 1_level_1
44,"[6614, 1213, 98, 7981, 7983, 7985, 7991, 7992,..."
63,"[1243, 98, 8029, 7985, 7991, 7992, 8000, 8001,..."
64,"[6353, 98, 8019, 7983, 7985, 7991, 7992, 8000,..."
70,"[6932, 8019, 7983, 7985, 7991, 7992, 8000, 800..."
73,"[7865, 98, 8018, 7982, 7983, 7985, 7991, 7992,..."
...,...
8710,"[3447, 98, 8019, 7983, 7985, 7991, 7992, 8000,..."
8712,"[2032, 5049, 2557, 7983, 7985, 7991, 7992, 800..."
8713,"[1114, 2556, 6086, 3444, 8018, 7983, 7985, 799..."
8714,"[6902, 98, 7974, 7982, 7983, 7985, 7991, 7992,..."


In [98]:
agg = test.groupby("user_id")['product_id']

recs = [preds.loc[i].values.tolist()[0] for i in preds.index]
reals = [group.values.tolist() for name, group in agg]

In [99]:
def HitRate(recs, reals):
    hits = 0
    total = 0
    
    for rec, real in zip(recs, reals):
        for r in rec:
            if r in real:
                hits += 1
                break
        total += 1
    
    return hits/total

def AverageReciprocalHitRank(recs, reals):
    summation = 0
    total = 0
    
    for rec, real in zip(recs, reals):
        hitRank = 0
        rank = 0
        for r in rec:
            rank = rank + 1
            if r in real:
                hitRank = rank
                break
        
        if (hitRank > 0) :
            summation += 1.0 / hitRank

        total += 1
    
    return summation / total


In [100]:
HitRate(recs, reals)

0.46213895394223264

In [101]:
AverageReciprocalHitRank(recs, reals)

0.4442595665354558

In [96]:
print(recs)

[[[6614, 1213, 98, 7981, 7983, 7985, 7991, 7992, 8000, 8001, 8004, 8005, 8006, 8015, 8018, 8019, 8029, 8037, 8045, 8050, 8058, 8061, 8067, 8070, 8075, 8077, 8079, 7982, 7974, 8094, 7937, 7910, 7911, 7912, 7914, 7915, 7917, 7919, 7922, 7927, 7931, 7939, 7972, 7944, 7945, 7947, 7950, 7954, 7955, 7960, 7961, 7967, 7971, 8082, 8097, 7903, 8228, 8200, 8204, 8205, 8206, 8210, 8211, 8216, 8218, 8222, 8223, 8227, 8231, 8098, 8233, 8234, 8237, 8238, 8239, 8240, 8242, 8243, 8249, 8250, 8261, 8198, 8196, 8195, 8194, 8101, 8114, 8120, 8124, 8125, 8129, 8131, 8135, 8138, 8143, 8146, 8155, 8156, 8164, 8171]], [[1243, 98, 8029, 7985, 7991, 7992, 8000, 8001, 8004, 8005, 8006, 8015, 8018, 8019, 8037, 7982, 8045, 8050, 8058, 8061, 8067, 8070, 8075, 8077, 8079, 8082, 7983, 7981, 8097, 7937, 7905, 7910, 7911, 7912, 7914, 7915, 7917, 7919, 7922, 7927, 7931, 7939, 7974, 7944, 7945, 7947, 7950, 7954, 7955, 7960, 7961, 7967, 7971, 7972, 8094, 8098, 8263, 8198, 8204, 8205, 8206, 8210, 8211, 8216, 8218, 8222, 8

In [97]:
print(reals)

[[1213, 6614], [1243], [6353], [6932], [7865], [1215], [10955], [4936], [11054], [10965], [10419, 1434], [1248, 1252, 1220], [2019], [4679], [1219, 1220, 1210, 1242, 1221, 2501, 2089], [7643], [1244, 1252], [11367, 1216, 8416], [11798], [7832], [1217, 1200, 1752, 1253, 1229], [2759], [2960], [448, 1252, 1231, 1218, 1762], [9355], [11956, 1317, 11737], [1581], [4231], [2019], [1218, 8227], [2554, 9198], [6633, 8425], [11500], [8534], [1230], [5939], [1234], [2553], [6907, 1219, 1233, 9894], [4911], [1228], [1144, 2086, 1002], [5742, 2470, 452, 8396, 10771], [979, 4928, 3990, 1869, 1619], [1203, 2036, 10523, 4932, 1426, 2414, 11473, 12573, 1987, 10974, 1459, 1871, 3724, 1700], [1224], [2943], [6985], [2010], [1216], [11280], [3751, 2088, 448, 7686], [1222, 4162, 1254], [2306, 1808, 1776, 5808, 1855, 5512], [6868, 600], [452, 11796, 9964, 5602, 5932, 3320], [9669, 1074, 1253, 1968, 1844, 432, 2716, 1070, 7665, 12551, 2089, 1391, 11294, 1016], [2916, 10036, 5083, 1420, 8739, 11988, 2448, 1