# Loading Libraries

In [2]:
import pandas as pd
import numpy as np

import logging
import tensorflow as tf


#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
)


# Reading Data

In [3]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_ratings_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_ratings_test.csv", index_col=0)

In [4]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (349978, 3)
Shape of Test: 	 (87558, 3)


In [5]:
# Check Number of Unique Items and User in Train & Test 
print("Unique Users in Train:", train.CustomerID.nunique())
print("Unique Users in Test:", test.CustomerID.nunique())
print("Unique Items in Train:", train.StockCode.nunique())
print("Unique Items in Test:", test.StockCode.nunique())

Unique Users in Train: 3699
Unique Users in Test: 3699
Unique Items in Train: 2730
Unique Items in Test: 2730


In [6]:
# Value Counts in Train
train.purchased.value_counts()

-1    152118
 1    144542
 2     47533
-2      5785
Name: purchased, dtype: int64

In [7]:
# Value Counts in Test
test.purchased.value_counts()

 1    47060
-1    25526
 2    13446
-2     1526
Name: purchased, dtype: int64

# Preparation of Datasets for Model

In [5]:
# Import Affinity Matrix
from recommenders.datasets.sparse import AffinityMatrix


#to use standard names across the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

#instantiate the sparse matrix generation  
am_train = AffinityMatrix(df = train, col_user='CustomerID', col_item='StockCode', col_rating='purchased')
am_test = AffinityMatrix(df = test, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

#obtain the sparse matrix 
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

In [6]:
# Sanity Check: Print Shape of Train & Test Matrix 
print(Xtr.shape)
print(Xtst.shape)

(3699, 2730)
(3699, 2730)


# Baseline Model

## Prediction: Test Only

### Training for Test Only

In [18]:
try:
    del(model)
except:
    pass

#First we initialize the model class
model = RBM(
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
    visible_units=Xtr.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42
)

In [19]:
#Model Fit
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 39.66 seconds for training.


### Test Set Prediction

In [20]:
# Make Predictions for entire Test Matrix
pred_test = model.predict(Xtst) ## Always Use Xtst for prediction - better results 

In [21]:
# Map Back Pred Test
pred_test = am_test.map_back_sparse(pred_test, kind = 'prediction')

In [22]:
# Value counts in Pred Test: Entire Dataset
pred_test.prediction.value_counts()

 2.0    5314296
-2.0    3840844
-1.0     943130
Name: prediction, dtype: int64

In [26]:
# Merge test_pred with test
test_prediction_am = test.merge(pred_test, on=['StockCode', 'CustomerID'], how='left')

# Change Purchased above 0 to 1 and below zero to 0 
test_prediction_am['purchased'] = test_prediction_am['purchased'].apply(lambda x: 0 if x < 0 else 1 )

# Change prediction above 0 to 1 and below zero to 0 
test_prediction_am['prediction'] = test_prediction_am['prediction'].apply(lambda x: 0 if x < 0 else 1 )


In [27]:
# Check first five rows 
test_prediction_am.head()

Unnamed: 0,StockCode,CustomerID,purchased,prediction
0,989,1,0,0
1,648,1,1,1
2,2900,1,1,1
3,320,1,1,1
4,2318,1,1,1


In [28]:
# Check Distribution of Prediction Values in Test Set
test_prediction_am.prediction.value_counts()

1    57876
0    29682
Name: prediction, dtype: int64

In [29]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test_prediction_am.purchased, test_prediction_am.prediction)

0.8429840791246945

## Prediction: Top N

### Data Preparation for TOP N

In [30]:
# Sanity check: Xtr & Xtst before reloading & Transformation
print(np.unique(Xtr))
print(np.unique(Xtst))

[-2 -1  0  1  2]
[-2 -1  0  1  2]


### Training for TOP N

In [31]:
# Delete First Model 
try:
    del(model)
except:
    pass

# First we initialize the model class
model = RBM(
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])), # Always provide this range - way better results! 
    visible_units=Xtr.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42)

#Model Fit
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 39.14 seconds for training.


In [32]:
# Model prediction on the test set Xtst.
with Timer() as prediction_time:
    top_k_20 =  model.recommend_k_items(Xtst, top_k=20, remove_seen=True) # Xtst best

print("Took {:.2f} seconds for prediction.".format(prediction_time.interval))

Took 1.41 seconds for prediction.


### Evaluation

In [33]:
# Map Back Xtst & Top_k 
df_top_k_20 = am_test.map_back_sparse(top_k_20, kind = 'prediction')
df_test = am_test.map_back_sparse(Xtst, kind = 'ratings')

In [34]:
eval_recall = recall_at_k(df_test, df_top_k_20, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
eval_recall

0.05616079361183164

In [35]:
eval_manual = df_test.merge(df_top_k_20, on=['CustomerID', 'StockCode'], how='inner')
eval_manual

Unnamed: 0,CustomerID,StockCode,purchased,prediction
0,2,914,2,1.983786
1,5,1484,1,1.968137
2,7,2035,1,1.900548
3,10,1749,1,1.967080
4,10,2686,1,1.949279
...,...,...,...,...
5123,4370,1516,2,1.939546
5124,4370,1607,2,1.977694
5125,4370,166,2,1.928714
5126,4371,2678,1,1.936056


In [36]:
eval_manual.shape[0] /  df_test.shape[0]

0.05856689280248521