# Loading Libraries

In [1]:
import pandas as pd
import numpy as np

import surprise
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k
 
from sklearn.metrics import accuracy_score



# Reading Data

In [2]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/rec_sys_binary_train_mixed.csv", index_col=0)
test = pd.read_csv("../../00_Data/rec_sys_binary_test_mixed.csv", index_col=0)

In [3]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (193076, 3)
Shape of Test: 	 (48269, 3)


In [6]:
# Check Head of Train
train.head()

Unnamed: 0,StockCode,CustomerID,purchased
147307,47518F,16701,1
26104,21498,12758,1
46697,22030,17864,1
111830,22960,13804,1
94927,22725,12560,1


In [7]:
train.purchased.value_counts()

1    138161
0     54915
Name: purchased, dtype: int64

In [5]:
test.purchased.value_counts()

1    34540
0    13729
Name: purchased, dtype: int64

In [8]:
print(train.purchased.value_counts()[0]/train.purchased.value_counts()[1])
print(test.purchased.value_counts()[0]/test.purchased.value_counts()[1])

0.3974710663646036
0.397481181239143


# Preparation of Train Dataset for Surprise Models


In [9]:
# Reader 
reader = surprise.Reader(rating_scale=(0,1))

# Build Train Set from Custom Dataset
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

# Baseline Model

In [10]:
# base_model
try:
    del(base_model)
except:
    pass

# Create Object for base_model 
base_model = surprise.SVDpp(random_state=0, n_factors=4, n_epochs=20, verbose=True, lr_all=0.001, reg_all=0.01)

# Fit the base_model
base_model.fit(train_set)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fb1700791f0>

# Prediction: Test Set Only

In [11]:
# Make Predictions
test_pred = predict(base_model, test, usercol='CustomerID', itemcol='StockCode')

# Show first five rows
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,17865,23072,0.627339
1,17960,22139,0.809282
2,15358,10133,0.878206
3,17787,22188,0.837835
4,17076,84843,0.421679


In [12]:
# Check Statistics of Prediction
test_pred.prediction.describe()

count    48269.000000
mean         0.738662
std          0.168462
min          0.226186
25%          0.616031
50%          0.749223
75%          0.871218
max          1.000000
Name: prediction, dtype: float64

In [13]:
# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0.5), 1,0)

In [14]:
# Check Distribution
test_pred['prediction'].value_counts()

1    43553
0     4716
Name: prediction, dtype: int64

In [15]:
# Accuracy on positive Test only
test_positive = test[test['purchased']==1]
test_positive_pred = test_positive.merge(test_pred, on=['CustomerID', 'StockCode'], how='left', indicator=True)
test_positive_pred.purchased.value_counts()
# Accuracy 
accuracy_score(test_positive_pred.purchased, test_positive_pred.prediction)


0.965083960625362

In [16]:
# Accuracy on entire Test
# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

In [17]:
# First Reorder columns for test 
test = test[['CustomerID', 'StockCode', 'purchased']]
# Head of Test
test.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,12347,21035,1
1,12347,21041,1
2,12347,21064,1
3,12347,21307,0
4,12347,21719,0


In [18]:
# Head of Test_pred
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,12347,21035,1
1,12347,21041,1
2,12347,21064,1
3,12347,21307,1
4,12347,21719,1


In [19]:
# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.7633056413018707

# Prediction: Top N 

In [20]:
# Predict all pairs that are not in the train set
predictions = compute_ranking_predictions(base_model, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)

In [21]:
# Check first five rows
predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
193076,16701,22030,0.851516
193077,16701,22960,0.964769
193078,16701,21509,0.719401
193079,16701,22810,0.616594
193080,16701,22780,0.704339


In [22]:
# Filter Test for purchased items only
test = test[test['purchased']==1]

In [23]:
# Now filter out top N 
# First write a function
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

# Filter Top 10 
top_10 = filter_top_n(predictions, 10)

# Filter Top 20 
top_20 = filter_top_n(predictions, 20)


In [24]:
# Evaluate Precision at 10 
eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('precision at 10 \t:', eval_precision_10)

# Evaluate Recall at 10 
eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('recall at 10 \t:', eval_recall_10)

precision at 10 	: 0.03535726266353573
recall at 10 	: 0.03518155771306669


In [25]:
# Evaluate Precision at 20 
eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('precision at 20 \t:', eval_precision_20)

# Evaluate Recall at 20 
eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('recall at 20 \t:', eval_recall_20)

precision at 20 	: 0.03108017443810802
recall at 20 	: 0.06144071476616181


# Hyper-Parameter Tuning

## Tuning Round 1 

In [26]:
# Define Set of Hyperparameters: 
n_factors = [10, 100]
lr_rates = [0.001]
reg_terms = [0.01]

# Initiate List for Metrics
factors = []
recall_k = []
learning_rates = []
regulation_terms = []


# Start for loop
for factor in n_factors:
    for lr in lr_rates:
        for reg in reg_terms:
            # First Delete Model Everytime
            try:
                del(model)
            except:
                pass

            # Define Model with Parameters 
            model = surprise.SVDpp(random_state=0,
                                    n_factors=factor,
                                    n_epochs=5,
                                    lr_all=lr,
                                    reg_all=reg)

            # Fit model
            model.fit(train_set)

            # Predict all pairs that are not in the train set
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 
            top_10 = filter_top_n(predictions, 10)

            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                                col_rating="purchased", col_prediction="prediction", 
                                                relevancy_method="top_k", k=10)

            print("recall at for", factor, "factors: \t", eval_recall_10)

            # Append Lists
            factors.append(factor)
            recall_k.append(eval_recall_10)
            learning_rates.append(lr)
            regulation_terms.append(reg) 

recall at for 10 factors: 	 0.03126162933357219
recall at for 100 factors: 	 0.014360187107250753


In [27]:
dict_param = {
    "factor": factors,
    "regulation_term": regulation_terms,
    "learning_rate": learning_rates,
    "recall_10": recall_k
}

In [28]:
df_param = pd.DataFrame(dict_param)
df_param

Unnamed: 0,factor,regulation_term,learning_rate,recall_10
0,10,0.01,0.001,0.031262
1,100,0.01,0.001,0.01436


In [98]:
# Use for furhter 
df_param.to_csv('svdpp_binary_hpt_f10to1000_lr0001to01_reg001to002.csv')

## Tuning Round 2
- Factors 10-30
- Learning Rate around 0.001
- Regulation Term around 0.01

In [29]:
# Define Set of Hyperparameters: 
n_factors = [10, 20, 30]
lr_rates = [0.001]
reg_terms = [0.01]

# Initiate List for Metrics
factors = []
recall_k = []
learning_rates = []
regulation_terms = []


# Start for loop
for factor in n_factors:
    for lr in lr_rates:
        for reg in reg_terms:
            # First Delete Model Everytime
            try:
                del(model)
            except:
                pass

            # Define Model with Parameters 
            model = surprise.SVDpp(random_state=0,
                                    n_factors=factor,
                                    n_epochs=5,
                                    lr_all=lr,
                                    reg_all=reg)

            # Fit model
            model.fit(train_set)

            # Predict all pairs that are not in the train set
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 
            top_10 = filter_top_n(predictions, 10)

            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                                col_rating="purchased", col_prediction="prediction", 
                                                relevancy_method="top_k", k=10)

            print("recall at for", factor, "factors: \t", eval_recall_10)

            # Append Lists
            factors.append(factor)
            recall_k.append(eval_recall_10)
            learning_rates.append(lr)
            regulation_terms.append(reg) 

recall at for 10 factors: 	 0.03126162933357219
recall at for 20 factors: 	 0.02698097891984196
recall at for 30 factors: 	 0.024610013828802912


In [30]:
dict_param2 = {
    "factor": factors,
    "regulation_term": regulation_terms,
    "learning_rate": learning_rates,
    "recall_10": recall_k
}

df_param2 = pd.DataFrame(dict_param2)
df_param2

Unnamed: 0,factor,regulation_term,learning_rate,recall_10
0,10,0.01,0.001,0.031262
1,20,0.01,0.001,0.026981
2,30,0.01,0.001,0.02461


In [101]:
# Use for furhter 
df_param2.to_csv('svdpp_binary_hpt_f90to110_lr01to02_reg01.csv')

## Tuning Round 3
- Factors around 10 
- Learning Rate 0.001
- Regulation Term  0.01

In [31]:
# Define Set of Hyperparameters: 
n_factors = [5, 10, 15]
lr_rates = [0.001]
reg_terms = [0.01]

# Initiate List for Metrics
factors = []
recall_k = []
learning_rates = []
regulation_terms = []


# Start for loop
for factor in n_factors:
    for lr in lr_rates:
        for reg in reg_terms:
            # First Delete Model Everytime
            try:
                del(model)
            except:
                pass

            # Define Model with Parameters 
            model = surprise.SVDpp(random_state=0,
                                    n_factors=factor,
                                    n_epochs=5,
                                    lr_all=lr,
                                    reg_all=reg)

            # Fit model
            model.fit(train_set)

            # Predict all pairs that are not in the train set
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 
            top_10 = filter_top_n(predictions, 10)

            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                                col_rating="purchased", col_prediction="prediction", 
                                                relevancy_method="top_k", k=10)

            print("recall at for", factor, "factors: \t", eval_recall_10)

            # Append Lists
            factors.append(factor)
            recall_k.append(eval_recall_10)
            learning_rates.append(lr)
            regulation_terms.append(reg) 

recall at for 5 factors: 	 0.032998515986824024
recall at for 10 factors: 	 0.03126162933357219
recall at for 15 factors: 	 0.02880378370919426


In [32]:
dict_param3 = {
    "factor": factors,
    "regulation_term": regulation_terms,
    "learning_rate": learning_rates,
    "recall_10": recall_k
}

df_param3 = pd.DataFrame(dict_param3)
df_param3

# Save for further analysis
#df_param3.to_csv('svdpp_binary_hpt_f110to130_lr01to02_reg01.csv')

Unnamed: 0,factor,regulation_term,learning_rate,recall_10
0,5,0.01,0.001,0.032999
1,10,0.01,0.001,0.031262
2,15,0.01,0.001,0.028804


## Final Tuning

Now the final tuning with more parameters. This tuning process will be used for model explanation

In [34]:
# Define Set of Hyperparameters: 
n_factors = [2, 3, 4, 5, 6]
lr_rates = [0.001, 0.002, 0.003]
reg_terms = [0.01, 0.02]

# Initiate List for Metrics
factors = []
learning_rates = []
regulation_terms = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = [] 


# Start for loop
for factor in n_factors:
    for lr in lr_rates:
        for reg in reg_terms:
            # First Delete Model Everytime
            try:
                del(model)
            except:
                pass

            # Define Model with Parameters 
            model = surprise.SVDpp(random_state=0,
                                    n_factors=factor,
                                    n_epochs=20,
                                    lr_all=lr,
                                    reg_all=reg)

            # Fit model
            model.fit(train_set)

            # Predict all pairs that are not in the train set
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 & 20  
            top_10 = filter_top_n(predictions, 10)
            top_20 = filter_top_n(predictions, 20)

            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                                col_rating="purchased", col_prediction="prediction", 
                                                relevancy_method="top_k", k=10)
            
            # Evaluate Recall at 20 
            eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20)
            
            # Evaluate Precision at 10 
            eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=10)
            # Evaluate Precision at 20 
            eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                col_rating="purchased", col_prediction="prediction", 
                                relevancy_method="top_k", k=20) 

            print("recall at for", factor, "factors: \t", eval_recall_10)

            # Append Lists
            factors.append(factor)
            recall_10.append(eval_recall_10)
            recall_20.append(eval_recall_20)
            precision_10.append(eval_precision_10)
            precision_20.append(eval_precision_20)
            learning_rates.append(lr)
            regulation_terms.append(reg) 

recall at for 2 factors: 	 0.03422518907927705
recall at for 2 factors: 	 0.034101718721857445
recall at for 2 factors: 	 0.03313489540860049
recall at for 2 factors: 	 0.033964694253029705
recall at for 2 factors: 	 0.03377514076948185
recall at for 2 factors: 	 0.03328879620304329
recall at for 3 factors: 	 0.03330974472326468
recall at for 3 factors: 	 0.033573715087182175
recall at for 3 factors: 	 0.03291273135061098
recall at for 3 factors: 	 0.033858932636463696
recall at for 3 factors: 	 0.033657173978577516
recall at for 3 factors: 	 0.03405741386006994
recall at for 4 factors: 	 0.03162871730822714
recall at for 4 factors: 	 0.03203059316113658
recall at for 4 factors: 	 0.031573645343891746
recall at for 4 factors: 	 0.032095579305846715
recall at for 4 factors: 	 0.032153790272016515
recall at for 4 factors: 	 0.03213547056916712
recall at for 5 factors: 	 0.03234939119085691
recall at for 5 factors: 	 0.03297958991485781
recall at for 5 factors: 	 0.03318708624100177
recal

In [36]:
dict_param5 = {
    "factor": factors,
    "regulation_term": regulation_terms,
    "learning_rate": learning_rates,
    "recall_10": recall_10,
    "recall_20" : recall_20,
    "precision_10" : precision_10,
    "precision_20" : precision_20
}

df_param5 = pd.DataFrame(dict_param5)
df_param5.sort_values(by='recall_10', ascending=False)

Unnamed: 0,factor,regulation_term,learning_rate,recall_10,recall_20,precision_10,precision_20
0,2,0.01,0.001,0.034225,0.056716,0.030963,0.027155
1,2,0.02,0.001,0.034102,0.057301,0.031365,0.027608
11,3,0.02,0.003,0.034057,0.057052,0.029353,0.026602
3,2,0.02,0.002,0.033965,0.057819,0.029285,0.027088
9,3,0.02,0.002,0.033859,0.055961,0.029923,0.026166
4,2,0.01,0.003,0.033775,0.056999,0.028145,0.025914
10,3,0.01,0.003,0.033657,0.055802,0.028682,0.025981
7,3,0.02,0.001,0.033574,0.056354,0.030761,0.027508
23,5,0.02,0.003,0.033424,0.058303,0.028615,0.026032
6,3,0.01,0.001,0.03331,0.056203,0.030359,0.027122


# Best Model: Accuracy on Test

In [38]:
# Create Object for Model
best_model = surprise.SVDpp(random_state=0, n_factors=2, n_epochs=20, lr_all=0.001, reg_all=0.01)

# Fit the base_model
best_model.fit(train_set)

# Make Predictions
test_pred = predict(best_model, test, usercol='CustomerID', itemcol='StockCode')

# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0.5), 1,0)

# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.9660104226983208