# Loading Libraries

In [114]:
import pandas as pd
import numpy as np

import logging
import tensorflow as tf


#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
)


# Reading Data

In [115]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_test.csv", index_col=0)

In [116]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (207084, 3)
Shape of Test: 	 (51892, 3)


In [117]:
# Check Number of Unique Items and User in Train & Test 
print("Unique Users in Train:", train.CustomerID.nunique())
print("Unique Users in Test:", test.CustomerID.nunique())
print("Unique Items in Train:", train.StockCode.nunique())
print("Unique Items in Test:", test.StockCode.nunique())

Unique Users in Train: 3692
Unique Users in Test: 3692
Unique Items in Train: 2716
Unique Items in Test: 2716


# Preparation of Datasets for Model

In [118]:
# Import Affinity Matrix
from recommenders.datasets.sparse import AffinityMatrix


#to use standard names across the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

#instantiate the sparse matrix generation  
am_train = AffinityMatrix(df = train, col_user='CustomerID', col_item='StockCode', col_rating='purchased')
am_test = AffinityMatrix(df = test, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

#obtain the sparse matrix 
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

In [119]:
# Sanity Check: Print Shape of Train & Test Matrix 
print(Xtr.shape)
print(Xtst.shape)

(3692, 2716)
(3692, 2716)


In [120]:
print(np.sum(Xtr))
print(np.sum(Xtst))

207084
51892


# Baseline Model

## Prediction: Test Only

### Training for Test Only

In [168]:
try:
    del(model)
except:
    pass

#First we initialize the model class
model = RBM(
    possible_ratings=[1., 5.],
    visible_units=Xtr.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42
)

In [169]:
#Model Fit
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 40.06 seconds for training.


### Test Set Prediction

In [207]:
# Make Predictions for entire Test Matrix
pred_test = model.predict(Xtst) ## Always Use Xtst for prediction - better results 

In [208]:
# Map Back Pred Test
pred_test = am_test.map_back_sparse(pred_test, kind = 'prediction')

In [209]:
# Value counts in Pred Test: Entire Dataset
pred_test.prediction.value_counts()

5.0    9050628
1.0     976844
Name: prediction, dtype: int64

In [210]:
# Merge test_pred with test
test_prediction_am = test.merge(pred_test, on=['StockCode', 'CustomerID'], how='left')

# Change Purchased to 5 
test_prediction_am['purchased'] = 5

In [211]:
# Check first five rows 
test_prediction_am.head()

Unnamed: 0,StockCode,CustomerID,purchased,prediction
0,1249,1,5,5.0
1,396,1,5,5.0
2,2601,1,5,5.0
3,912,1,5,5.0
4,1032,1,5,5.0


In [212]:
# Check Distribution of Prediction Values in Test Set
test_prediction_am.prediction.value_counts()

5.0    51830
1.0       62
Name: prediction, dtype: int64

In [213]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test_prediction_am.purchased, test_prediction_am.prediction)

0.9988052108224774

## Prediction: Top N

### Data Preparation for TOP N

In [177]:
# Sanity check: Xtr & Xtst before reloading & Transformation
print(np.unique(Xtr))
print(np.unique(Xtst))

[1 5]
[1 5]


In [178]:
#obtain the sparse matrix Again  
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

In [179]:
# Change Values above 0 to 5 for Top N training
Xtst = np.where(Xtst > 0, 5, Xtst)
Xtr = np.where(Xtr > 0, 5, Xtr)

In [180]:
# Sanity check: Xtr & Xtst After reloading & Transformation
print(np.unique(Xtr))
print(np.unique(Xtst))

[0 5]
[0 5]


### Training for TOP N

In [181]:
# Delete First Model 
try:
    del(model)
except:
    pass

# First we initialize the model class
model = RBM(
    possible_ratings=[1., 5.], # Always provide this range - way better results! 
    visible_units=Xtr.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42)

#Model Fit
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 41.29 seconds for training.


In [220]:
# Model prediction on the test set Xtst.
with Timer() as prediction_time:
    top_k_20 =  model.recommend_k_items(Xtst, top_k=20, remove_seen=True) # Xtst best

print("Took {:.2f} seconds for prediction.".format(prediction_time.interval))

Took 1.80 seconds for prediction.


### Evaluation

In [221]:
# Map Back Xtst & Top_k 
df_top_k_20 = am_test.map_back_sparse(top_k_20, kind = 'prediction')
df_test = am_test.map_back_sparse(Xtst, kind = 'ratings')

In [222]:
eval_recall = recall_at_k(df_test, df_top_k_20, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
eval_recall

0.41936458053028974