# Book Reading Propensity Model

In [1]:
# NOTE(review): wildcard import — the cells below rely on it for `Propensity` and `SVD`;
# consider importing those names explicitly so their origin is visible.
from propensity_utils import *

## Set the Parameters

In [2]:
# Configuration: one shared seed plus the keyword arguments handed to the
# algorithm class when the Propensity model is built.
random_state = 42
algo_params = dict(
    n_factors=128,
    n_epochs=100,
    lr_all=0.005,
    reg_all=0.1,
    random_state=random_state,
)

## Train/Test Data Paths

In [3]:
# CSV inputs: the train/validation file is handed to the model constructor,
# the test file to the evaluation and batch-inference calls.
train_val_path, test_path = (
    './data/goodreads_2016_train_val.csv',
    './data/goodreads_2016_test.csv',
)

## Propensity Class and Its Methods

### Instantiate a new model

In [4]:
# Build the propensity model around the SVD algorithm class, configured with
# `algo_params` and pointed at the train/validation CSV.
propensity = Propensity(
    train_data_path=train_val_path,
    algo_params=algo_params,
    algo_class=SVD,
)

### Evaluate the model via cross-validation on the train set

In [5]:
%%time
# 3-split cross-validation; the printed report lists RMSE, MAE and FCP per fold,
# together with fit and test times.
propensity.cross_validate(cv=3)

Evaluating RMSE, MAE, FCP of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.7951  0.7956  0.7956  0.7955  0.0002  
MAE (testset)     0.6077  0.6083  0.6081  0.6080  0.0002  
FCP (testset)     0.6192  0.6173  0.6177  0.6181  0.0008  
Fit time          126.59  111.36  116.84  118.26  6.30    
Test time         28.71   16.24   11.82   18.92   7.15    
CPU times: user 6min 43s, sys: 50 s, total: 7min 33s
Wall time: 7min 57s


### Train the model, and evaluate it on the test set

In [6]:
%%time
# Train, then evaluate against the test CSV; the report printed below shows
# RMSE, MAE, FCP and accuracy (presumably emitted because verbose=True — confirm).
propensity.train_and_test(testset_path=test_path, verbose=True)

Evaluating the model performance on the test set
RMSE: 0.8599
MAE:  0.6681
FCP:  0.5877
Accuracy (Test Set): 65.97%
CPU times: user 2min 15s, sys: 10.1 s, total: 2min 25s
Wall time: 2min 30s


### Estimate the confidence on the train set

In [7]:
%%time
# Estimates the confidence over 3 CV folds. The call's return value (the
# estimated confidence) is the last expression, so it is shown as the cell result.
propensity.estimate_confidence(n_cv_folds=3, verbose=True)

Estimating the Confidence on the Train Set...
Fold  1: C = 0.70444. Time elapsed:  3.83 minutes
Fold  2: C = 0.70346. Time elapsed:  4.30 minutes
Fold  3: C = 0.70477. Time elapsed:  4.52 minutes
************************************************************
Estimated Confidence (on a 3-fold CV): 0.70
************************************************************
CPU times: user 9min 16s, sys: 2min 13s, total: 11min 29s
Wall time: 12min 41s


0.7042216144751977

### Train the model

In [10]:
%%time
# Optional shortcut: to skip the slow confidence-estimation cell, uncomment the
# assignment below. The value is the estimate that cell returned in this run.
# NOTE(review): presumably train_model reads propensity.confidence — confirm in propensity_utils.
# propensity.confidence = 0.7042216144751977
propensity.train_model(verbose=True)

CPU times: user 3min 7s, sys: 22.2 s, total: 3min 29s
Wall time: 3min 43s


### Look at the fitted train set (with predicted propensity & estimated confidence)

__Note:__ The estimated confidence is a weighted average of two point estimates: 
 * Global confidence (probability of a correct binary prediction for the entire dataset)
 * User-based confidence (probability of a correct binary prediction for a given user)

In [11]:
# Seeded with the notebook-wide `random_state` so the same 5 rows are shown on
# every Restart & Run All (an unseeded sample changes the displayed output each run).
propensity.train_data_fitted.sample(5, random_state=random_state)

Unnamed: 0,user_id,book_id,rating,rating_predicted,would_recommend,would_recommend_pred,prediction_is_correct,user_correct_pred_cnt,user_rating_cnt,user_correct_pred_fraction,confidence
1703630,65b9e320beda7353c3455b752bdf6d26,7507908,3,4.014392,0,1,0,7,10,0.7,0.702111
1732739,834b0a31496b41d11343d4bfb97abf70,7171637,5,4.566122,1,1,1,47,49,0.959184,0.91597
2040158,560913dd21187b9d928f54702ab788d4,13206900,5,4.694133,1,1,1,16,17,0.941176,0.853415
2067686,803d3c2fae0bf5ef52ea715ff22350d3,71811,4,3.4147,1,0,0,22,43,0.511628,0.547966
2620873,221ff13d393301162833ff0a277ac67c,13497,5,4.770233,1,1,1,22,24,0.916667,0.854183


### User confidence table

In [12]:
# Seeded with the notebook-wide `random_state` so the same 5 users are shown on
# every Restart & Run All (an unseeded sample changes the displayed output each run).
propensity.user_confidence.sample(5, random_state=random_state)

Unnamed: 0,user_id,confidence
3298004,389538565a500c066dcbc461c9fc6224,0.347778
3427244,4e463f146a3c6b8a8cdac937847546ee,0.826013
2426863,186b758d78d9326d7ec275d12bfdb2a3,0.640201
1764701,d4875f63f92eb4714501607dc9d57555,0.859153
980079,5c2448139a79df2698faecb1d0fa3663,0.841689


### Make a prediction

In [13]:
%%time
# Single (user, book) lookup; the bare `res` on the last line displays the
# tuple returned by the call.
user_id = '8842281e1d1347389f2ab93d60773d4d'
item_id = 76620
res = propensity.infer_propensity_for_pair(user_id, item_id, verbose=False)
res

CPU times: user 103 ms, sys: 46.2 ms, total: 149 ms
Wall time: 163 ms


(1, 0.7311105586138161)

### Batch prediction: entire test set

In [14]:
%%time
# Batch inference over the whole test CSV; the result is kept in `res_df` for inspection.
# NOTE(review): despite the `_from_df` name, a file path (not a DataFrame) is passed
# here and the call completed in this run — confirm which input types the method supports.
res_df = propensity.infer_propensity_from_df(test_path, verbose=False)

CPU times: user 5.07 s, sys: 134 ms, total: 5.21 s
Wall time: 5.32 s


In [15]:
# Seeded with the notebook-wide `random_state` so the same 10 predictions are shown
# on every Restart & Run All (an unseeded sample changes the displayed output each run).
res_df.sample(10, random_state=random_state)

Unnamed: 0,user_id,book_id,would_recommend_pred,confidence
159888,9f111d5d9d86f7460e86972db85ac52c,25241477,1,0.767189
11176,fca6a41f55842d0435d48cc8076a8fe6,91475,0,0.300704
197960,c680b6e0e998d0aa8386587a32d26a44,24866845,1,0.704222
173867,e9262f2dab75dc99e7d984afae17ea42,4769247,1,0.917546
299232,b79a23fed94235a8e4a98aff257004b5,17331518,1,0.704222
205945,14dcc06a5b8836bdce618891c6ae461a,6668868,0,0.563819
230952,ce93ba0b6e41e93b81b81968a0ccd650,30245,1,0.779341
53938,6d7b490c88b5cca494ad8f7d87a7f2c3,4978,0,0.637323
303726,346c318e6c257f50ba58d82a99f46bbd,13023,1,0.704222
96652,98af18e6e108b206195b4f9d5c4b0df5,18874336,1,0.902633
