# Loading Libraries

In [1]:
import pandas as pd
import numpy as np

import surprise
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k


# Reading Data

In [3]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_ratings_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_ratings_test.csv", index_col=0)

In [4]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (349978, 3)
Shape of Test: 	 (87558, 3)


In [5]:
# Check Head of Train
train.head()

Unnamed: 0,StockCode,CustomerID,purchased
167423,1923,1,2
95278,1292,1,2
176306,2012,1,2
294620,1336,1,-1
94566,1286,1,2


# Preparation of Train Dataset for Surprise Models


In [6]:
# Reader 
reader = surprise.Reader(rating_scale=(-2,2))

# Build Train Set from Custom Dataset
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

# Baseline Model

In [7]:
# base_model
try:
    del(base_model)
except:
    pass

# Create Object for base_model 
base_model = surprise.SVDpp(random_state=0, n_factors=200, n_epochs=20, verbose=True)

# Fit the base_model
base_model.fit(train_set)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fe319103640>

# Prediction: Test Set Only

In [8]:
# Make Predictions
test_pred = predict(base_model, test, usercol='CustomerID', itemcol='StockCode')

# Show first five rows
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1,989,-0.540224
1,1,648,1.224336
2,1,2900,1.30367
3,1,320,1.194785
4,1,2318,1.381025


In [9]:
# Check Statistics of Prediction
test_pred.prediction.describe()

count    87558.000000
mean         0.432954
std          0.919772
min         -2.000000
25%         -0.604333
50%          0.848500
75%          1.153366
max          2.000000
Name: prediction, dtype: float64

In [10]:
# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0),1,0)

# Convert Test
test['purchased'] = np.where((test['purchased']>0),1,0)

In [11]:
# Check Distribution
test_pred['prediction'].value_counts()

1    57659
0    29899
Name: prediction, dtype: int64

In [12]:
# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

In [13]:
# First Reorder columns for test 
test = test[['CustomerID', 'StockCode', 'purchased']]
# Head of Test
test.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,1,87,1
1,1,316,1
2,1,320,1
3,1,648,1
4,1,989,0


In [14]:
# Head of Test_pred
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1,87,1
1,1,316,1
2,1,320,1
3,1,648,1
4,1,989,0


In [15]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.9232622947075082

# Prediction: Top N 

In [17]:
# Predict all pairs that are not in the train set
predictions = compute_ranking_predictions(base_model, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)

In [18]:
# Check first five rows
predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
349978,1,920,1.463282
349979,1,753,0.852112
349980,1,989,-0.540224
349981,1,2177,-0.631089
349982,1,1806,-0.69952


In [19]:
# Now filter out top N 
# First write a function
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

# Filter Top 10 
top_10 = filter_top_n(predictions, 10)

# Filter Top 20 
top_20 = filter_top_n(predictions, 20)


In [20]:
# Evaluate Precision at 10 
eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('precision at 10 \t:', eval_precision_10)

# Evaluate Recall at 10 
eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('recall at 10 \t:', eval_recall_10)

precision at 10 	: 0.020194647201946473
recall at 10 	: 0.0076947532106192065


In [21]:
# Evaluate Precision at 20 
eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('precision at 20 \t:', eval_precision_20)

# Evaluate Recall at 20 
eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('recall at 20 \t:', eval_recall_20)

precision at 20 	: 0.018085969180859692
recall at 20 	: 0.013873478688957884


# Hyper-Parameter Tuning

In [None]:
# Define Set of Hyperparameters: 
n_factors = [5, 10, 100, 200, 1000]

# Initiate List for Metrics
factors = []
recall_k = []

# Start for loop
for factor in n_factors:
    # First Delete Model Everytime
    try:
        del(model)
    except:
        pass

    # Define Model with Parameters 
    model = surprise.SVDpp(random_state=0, n_factors=factor, n_epochs=10)

    # Fit model
    model.fit(train_set)

    # Predict all pairs that are not in the train set
    predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

    # Filter Top 10 
    top_10 = filter_top_n(predictions, 10)

    # Evaluate Recall at 10 
    eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                        col_rating="purchased", col_prediction="prediction", 
                                        relevancy_method="top_k", k=10)

    print("recall at for", factor, "factors: \t", eval_recall_10)

    # Append Lists
    factors.append(factor)
    recall_k.append(eval_recall_10)