# Loading Libraries

In [63]:
import pandas as pd
import numpy as np

import surprise
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k


# Reading Data

In [64]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_train_mixed.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_test_mixed.csv", index_col=0)

In [65]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (287957, 3)
Shape of Test: 	 (71990, 3)


In [66]:
# Check Head of Train
train.head()

Unnamed: 0,StockCode,CustomerID,purchased
296882,2016,1620,0
171197,1917,441,1
262509,673,124,0
219802,2369,145,1
239431,2855,4208,1


# Preparation of Train Dataset for Surprise Models


In [67]:
# Reader 
reader = surprise.Reader(rating_scale=(0,1))

# Build Train Set from Custom Dataset
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

# Baseline Model

In [68]:
# base_model
try:
    del(base_model)
except:
    pass

# Create Object for base_model 
base_model = surprise.SVD(random_state=0, n_factors=4, n_epochs=20, verbose=True)

# Fit the base_model
base_model.fit(train_set)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe349178820>

# Prediction: Test Set Only

In [69]:
# Make Predictions
test_pred = predict(base_model, test, usercol='CustomerID', itemcol='StockCode')

# Show first five rows
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1895,2069,0.994923
1,1549,1369,0.762107
2,3801,391,1.0
3,3818,1048,1.0
4,428,2156,0.640251


In [70]:
# Check Statistics of Prediction
test_pred.prediction.describe()

count    71990.000000
mean         0.715270
std          0.237177
min          0.000000
25%          0.562692
50%          0.760069
75%          0.912426
max          1.000000
Name: prediction, dtype: float64

In [71]:
# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']<0.5), 0,1)

In [72]:
# Check Distribution
test_pred['prediction'].value_counts()

1    58355
0    13635
Name: prediction, dtype: int64

In [73]:
# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

In [74]:
# First Reorder columns for test 
test = test[['CustomerID', 'StockCode', 'purchased']]
# Head of Test
test.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,1,87,1
1,1,206,1
2,1,275,1
3,1,316,1
4,1,395,0


In [75]:
# Head of Test_pred
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1,87,1
1,1,206,0
2,1,275,0
3,1,316,1
4,1,395,1


In [76]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.7971246006389776

# Prediction: Top N 

In [78]:
# Predict all pairs that are not in the train set
predictions = compute_ranking_predictions(base_model, test, usercol='CustomerID', itemcol='StockCode')

In [79]:
# Check first five rows
predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1,87,0.614476
1,1,206,0.378941
2,1,275,0.494187
3,1,316,0.91813
4,1,395,0.924775


In [80]:
# Filter Test for purchased items only
test = test[test['purchased']==1]

In [81]:
# Now filter out top N 
# First write a function
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

# Filter Top 10 
top_10 = filter_top_n(predictions, 10)

# Filter Top 20 
top_20 = filter_top_n(predictions, 20)


In [82]:
# Evaluate Precision at 10 
eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('precision at 10 \t:', eval_precision_10)

# Evaluate Recall at 10 
eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('recall at 10 \t:', eval_recall_10)

precision at 10 	: 0.027178082191780827
recall at 10 	: 0.02707264585614802


In [83]:
# Evaluate Precision at 20 
eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('precision at 20 \t:', eval_precision_20)

# Evaluate Recall at 20 
eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('recall at 20 \t:', eval_recall_20)

precision at 20 	: 0.025726027397260275
recall at 20 	: 0.050062646006226466


# Hyper-Parameter Tuning

In [87]:
# Define Set of Hyperparameters: 
n_factors = [10, 100, 1000]
reg_all = [0.01, 0.02, 0.03]
lr_det = [0.001, 0.005, 0.01]

# Initiate Lists 
factor_nr = []
regulation_all = []
lr_all = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []

for factors in n_factors:
    for regulation in reg_all:
        for learning in lr_det:
            try:
                del(model)
            except:
                pass 
            # Define Model 
            model = surprise.SVD(random_state=0, n_factors=factor, n_epochs=20, lr_all=learning, reg_all=regulation)
            # Fit model
            model.fit(train_set)
            
            # Predict All Pairs of Users & Items that are NOT in the Trainset 
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')
            
            # Filter Top 10 & 20 
            top_10 = filter_top_n(predictions, 10)
            top_20 = filter_top_n(predictions, 20)
            
            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                        col_rating="purchased", col_prediction="prediction", 
                                        relevancy_method="top_k", k=10)
            # Evaluate Recall at 20 
            eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20)
            
            # Evaluate Precision at 10 
            eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=10)
            # Evaluate Precision at 20 
            eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20) 
            
            print("recall at for", factors, "factors: \t", eval_recall_10)
            factor_nr.append(factors)
            regulation_all.append(regulation)
            lr_all.append(learning)
            recall_10.append(eval_recall_10)
            recall_20.append(eval_recall_20)
            precision_10.append(eval_precision_10)
            precision_20.append(eval_precision_20)

recall at for 10 factors: 	 0.026719390610760566
recall at for 10 factors: 	 0.02759160354307616
recall at for 10 factors: 	 0.026166537720266832
recall at for 10 factors: 	 0.0271050752427132
recall at for 10 factors: 	 0.02734271287738868
recall at for 10 factors: 	 0.026933028064299498
recall at for 10 factors: 	 0.02749930913426135
recall at for 10 factors: 	 0.027727257460327788
recall at for 10 factors: 	 0.026853473584787242
recall at for 100 factors: 	 0.026719390610760566
recall at for 100 factors: 	 0.02759160354307616
recall at for 100 factors: 	 0.026166537720266832
recall at for 100 factors: 	 0.0271050752427132
recall at for 100 factors: 	 0.02734271287738868
recall at for 100 factors: 	 0.026933028064299498
recall at for 100 factors: 	 0.02749930913426135
recall at for 100 factors: 	 0.027727257460327788
recall at for 100 factors: 	 0.026853473584787242
recall at for 1000 factors: 	 0.026719390610760566
recall at for 1000 factors: 	 0.02759160354307616
recall at for 1000

In [88]:
dict_param1 = {
    "factor": factor_nr,
    "regulation_all": regulation_all,
    "learning_rate_all": lr_all,
    "recall_10": recall_10,
    "recall_20": recall_20,
    "precision_10" : precision_10,
    "precision_20" : precision_20
}

df_param1 = pd.DataFrame(dict_param1)
df_param1.sort_values(by='recall_10', ascending=False)

Unnamed: 0,factor,regulation_all,learning_rate_all,recall_10,recall_20,precision_10,precision_20
25,1000,0.03,0.005,0.027727,0.049425,0.027479,0.025466
16,100,0.03,0.005,0.027727,0.049425,0.027479,0.025466
7,10,0.03,0.005,0.027727,0.049425,0.027479,0.025466
1,10,0.01,0.005,0.027592,0.048038,0.027425,0.025425
19,1000,0.01,0.005,0.027592,0.048038,0.027425,0.025425
10,100,0.01,0.005,0.027592,0.048038,0.027425,0.025425
15,100,0.03,0.001,0.027499,0.046765,0.028027,0.025932
6,10,0.03,0.001,0.027499,0.046765,0.028027,0.025932
24,1000,0.03,0.001,0.027499,0.046765,0.028027,0.025932
13,100,0.02,0.005,0.027343,0.048409,0.026658,0.024918


# Tuning 2 

In [89]:
# Define Set of Hyperparameters: 
n_factors = [5, 10, 15]
reg_all = [0.01, 0.02, 0.03]
lr_det = [0.001, 0.005, 0.01]

# Initiate Lists 
factor_nr = []
regulation_all = []
lr_all = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []

for factors in n_factors:
    for regulation in reg_all:
        for learning in lr_det:
            try:
                del(model)
            except:
                pass 
            # Define Model 
            model = surprise.SVD(random_state=0, n_factors=factor, n_epochs=20, lr_all=learning, reg_all=regulation)
            # Fit model
            model.fit(train_set)
            
            # Predict All Pairs of Users & Items that are NOT in the Trainset 
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')
            
            # Filter Top 10 & 20 
            top_10 = filter_top_n(predictions, 10)
            top_20 = filter_top_n(predictions, 20)
            
            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                        col_rating="purchased", col_prediction="prediction", 
                                        relevancy_method="top_k", k=10)
            # Evaluate Recall at 20 
            eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20)
            
            # Evaluate Precision at 10 
            eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=10)
            # Evaluate Precision at 20 
            eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20) 
            
            print("recall at for", factors, "factors: \t", eval_recall_10)
            factor_nr.append(factors)
            regulation_all.append(regulation)
            lr_all.append(learning)
            recall_10.append(eval_recall_10)
            recall_20.append(eval_recall_20)
            precision_10.append(eval_precision_10)
            precision_20.append(eval_precision_20)

recall at for 5 factors: 	 0.026719390610760566
recall at for 5 factors: 	 0.02759160354307616
recall at for 5 factors: 	 0.026166537720266832
recall at for 5 factors: 	 0.0271050752427132
recall at for 5 factors: 	 0.02734271287738868
recall at for 5 factors: 	 0.026933028064299498
recall at for 5 factors: 	 0.02749930913426135
recall at for 5 factors: 	 0.027727257460327788
recall at for 5 factors: 	 0.026853473584787242
recall at for 10 factors: 	 0.026719390610760566
recall at for 10 factors: 	 0.02759160354307616
recall at for 10 factors: 	 0.026166537720266832
recall at for 10 factors: 	 0.0271050752427132
recall at for 10 factors: 	 0.02734271287738868
recall at for 10 factors: 	 0.026933028064299498
recall at for 10 factors: 	 0.02749930913426135
recall at for 10 factors: 	 0.027727257460327788
recall at for 10 factors: 	 0.026853473584787242
recall at for 15 factors: 	 0.026719390610760566
recall at for 15 factors: 	 0.02759160354307616
recall at for 15 factors: 	 0.0261665377

In [90]:
dict_param2 = {
    "factor": factor_nr,
    "regulation_all": regulation_all,
    "learning_rate_all": lr_all,
    "recall_10": recall_10,
    "recall_20": recall_20,
    "precision_10" : precision_10,
    "precision_20" : precision_20
}

df_param2 = pd.DataFrame(dict_param2)
df_param2.sort_values(by='recall_10', ascending=False)

Unnamed: 0,factor,regulation_all,learning_rate_all,recall_10,recall_20,precision_10,precision_20
25,15,0.03,0.005,0.027727,0.049425,0.027479,0.025466
16,10,0.03,0.005,0.027727,0.049425,0.027479,0.025466
7,5,0.03,0.005,0.027727,0.049425,0.027479,0.025466
1,5,0.01,0.005,0.027592,0.048038,0.027425,0.025425
19,15,0.01,0.005,0.027592,0.048038,0.027425,0.025425
10,10,0.01,0.005,0.027592,0.048038,0.027425,0.025425
15,10,0.03,0.001,0.027499,0.046765,0.028027,0.025932
6,5,0.03,0.001,0.027499,0.046765,0.028027,0.025932
24,15,0.03,0.001,0.027499,0.046765,0.028027,0.025932
13,10,0.02,0.005,0.027343,0.048409,0.026658,0.024918


# Tuning 3

In [91]:
# Define Set of Hyperparameters: 
n_factors = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
reg_all = [0.03]
lr_det = [0.005]

# Initiate Lists 
factor_nr = []
regulation_all = []
lr_all = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []

for factors in n_factors:
    for regulation in reg_all:
        for learning in lr_det:
            try:
                del(model)
            except:
                pass 
            # Define Model 
            model = surprise.SVD(random_state=0, n_factors=factor, n_epochs=20, lr_all=learning, reg_all=regulation)
            # Fit model
            model.fit(train_set)
            
            # Predict All Pairs of Users & Items that are NOT in the Trainset 
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')
            
            # Filter Top 10 & 20 
            top_10 = filter_top_n(predictions, 10)
            top_20 = filter_top_n(predictions, 20)
            
            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                        col_rating="purchased", col_prediction="prediction", 
                                        relevancy_method="top_k", k=10)
            # Evaluate Recall at 20 
            eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20)
            
            # Evaluate Precision at 10 
            eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=10)
            # Evaluate Precision at 20 
            eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20) 
            
            print("recall at for", factors, "factors: \t", eval_recall_10)
            factor_nr.append(factors)
            regulation_all.append(regulation)
            lr_all.append(learning)
            recall_10.append(eval_recall_10)
            recall_20.append(eval_recall_20)
            precision_10.append(eval_precision_10)
            precision_20.append(eval_precision_20)

recall at for 2 factors: 	 0.027727257460327788
recall at for 3 factors: 	 0.027727257460327788
recall at for 4 factors: 	 0.027727257460327788
recall at for 5 factors: 	 0.027727257460327788
recall at for 6 factors: 	 0.027727257460327788
recall at for 7 factors: 	 0.027727257460327788
recall at for 8 factors: 	 0.027727257460327788
recall at for 9 factors: 	 0.027727257460327788
recall at for 10 factors: 	 0.027727257460327788
recall at for 11 factors: 	 0.027727257460327788
recall at for 12 factors: 	 0.027727257460327788
recall at for 13 factors: 	 0.027727257460327788
recall at for 14 factors: 	 0.027727257460327788
recall at for 15 factors: 	 0.027727257460327788
recall at for 16 factors: 	 0.027727257460327788
recall at for 17 factors: 	 0.027727257460327788
recall at for 18 factors: 	 0.027727257460327788
recall at for 19 factors: 	 0.027727257460327788
recall at for 20 factors: 	 0.027727257460327788


In [92]:
dict_param3 = {
    "factor": factor_nr,
    "regulation_all": regulation_all,
    "learning_rate_all": lr_all,
    "recall_10": recall_10,
    "recall_20": recall_20,
    "precision_10" : precision_10,
    "precision_20" : precision_20
}

df_param3 = pd.DataFrame(dict_param3)
df_param3.sort_values(by='recall_10', ascending=False)

Unnamed: 0,factor,regulation_all,learning_rate_all,recall_10,recall_20,precision_10,precision_20
0,2,0.03,0.005,0.027727,0.049425,0.027479,0.025466
10,12,0.03,0.005,0.027727,0.049425,0.027479,0.025466
17,19,0.03,0.005,0.027727,0.049425,0.027479,0.025466
16,18,0.03,0.005,0.027727,0.049425,0.027479,0.025466
15,17,0.03,0.005,0.027727,0.049425,0.027479,0.025466
14,16,0.03,0.005,0.027727,0.049425,0.027479,0.025466
13,15,0.03,0.005,0.027727,0.049425,0.027479,0.025466
12,14,0.03,0.005,0.027727,0.049425,0.027479,0.025466
11,13,0.03,0.005,0.027727,0.049425,0.027479,0.025466
9,11,0.03,0.005,0.027727,0.049425,0.027479,0.025466


# Best Model: Accuracy on Test

In [93]:
# Create Object for Model
best_model = surprise.SVD(random_state=0, n_factors=2, n_epochs=20, lr_all=0.005, reg_all=0.03)

# Fit the base_model
best_model.fit(train_set)

# Make Predictions
test_pred = predict(best_model, test, usercol='CustomerID', itemcol='StockCode')

# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0.5), 1,0)

# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.9226527858106806