In [1]:
# Import necessary libraries 

import surprise 
import pandas as pd 
import numpy as np 

from recommenders.datasets.sparse import AffinityMatrix
from recommenders.datasets.python_splitters import numpy_stratified_split, python_random_split
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
    map_at_k,
    get_top_k_items
)

from recommenders.utils.timer import Timer
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions


In [2]:
# Load Data
df = pd.read_csv("../../00_Data/online_retail_prep.csv", index_col=0)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,3249,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,4048,United Kingdom,1
1,536365,2649,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
2,536365,2855,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,4048,United Kingdom,1
3,536365,2803,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
4,536365,2802,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1


In [3]:
#to use standard names across the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

#instantiate the sparse matrix generation  
am = AffinityMatrix(df = df, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

#obtain the sparse matrix 
X, _, _ = am.gen_affinity_matrix()

# check that the generated matrix has the correct dimensions
assert (X.shape[0] == df.CustomerID.unique().shape[0]) & (
        X.shape[1] == df.StockCode.unique().shape[0]
    )

In [4]:
# Split the Data
Xtr, Xtst = numpy_stratified_split(X, ratio=0.8, seed=1)

In [5]:
# Turn all values to 5 that are above 0 
Xtr = np.where(Xtr > 0, 5, Xtr)
Xtst = np.where(Xtst > 0, 5, Xtst)

In [6]:
train, test = python_random_split(df, 0.8, seed=1)

In [7]:
train.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
130524,547497,1809,3 PIECE SPACEBOY COOKIE CUTTER SET,1,2011-03-23 12:15:00,2.1,1563,United Kingdom,1
159870,550389,1821,CIRCUS PARADE CHILDRENS EGG CUP,2,2011-04-18 11:53:00,1.25,3634,United Kingdom,1
493914,578255,2291,MINI LIGHTS WOODLAND MUSHROOMS,1,2011-11-23 12:58:00,3.75,1725,United Kingdom,1
23522,538205,2715,"STRING OF 8 BUTTERFLIES,PINK",2,2010-12-10 11:24:00,1.65,330,United Kingdom,1
270638,560579,2783,"AIRLINE LOUNGE,METAL SIGN",2,2011-07-19 15:13:00,2.1,2613,United Kingdom,1


In [8]:
reader = surprise.Reader(rating_scale=(0,1))
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

In [18]:
# Model
try:
    del(model)
except:
    pass

# Create Object for model 
model = surprise.SVDpp(random_state=0, n_factors=200, n_epochs=30, verbose=True)


model.fit(train_set)
    

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7ff1108a29a0>

In [19]:
predictions = predict(model, test, usercol='CustomerID', itemcol='StockCode')
predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1731,1714,0.997164
1,4327,2252,1.0
2,2572,2844,0.990465
3,3381,2923,1.0
4,2697,1257,0.99914


In [20]:
predictions.shape

(80768, 3)

In [51]:
print(predictions.shape)
print(test.shape)

(80768, 3)
(80768, 9)


In [52]:
predictions.describe()

Unnamed: 0,CustomerID,StockCode,prediction
count,80768.0,80768.0,80768.0
mean,2173.234276,1573.69093,0.990685
std,1256.5928,842.314373,0.022736
min,1.0,0.0,0.654897
25%,1193.0,937.0,0.993069
50%,2066.0,1526.0,1.0
75%,3287.0,2106.0,1.0
max,4371.0,3683.0,1.0


In [33]:
all_predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode', remove_seen=True)

In [34]:
all_predictions.shape

(12189849, 3)

In [35]:
all_predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
80768,1731,2844,1.0
80769,1731,2923,0.986689
80770,1731,2778,1.0
80771,1731,2282,1.0
80772,1731,1624,1.0


In [36]:
all_predictions.prediction.min()

0.3457865802979627

In [40]:
all_predictions['prediction'] = np.where((all_predictions['prediction']>0.5), 1,0)

In [41]:
all_predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
80768,1731,2844,1
80769,1731,2923,1
80770,1731,2778,1
80771,1731,2282,1
80772,1731,1624,1


In [44]:
all_predictions.prediction.value_counts()

1    12189767
0          82
Name: prediction, dtype: int64

In [45]:
eval_precision = precision_at_k(test, all_predictions, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('precision \t:', eval_precision)
eval_recall = recall_at_k(test, all_predictions,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('recall \t:', eval_recall)

precision 	: 0.0
recall 	: 0.0
