# Book Reading Propensity Model

In [1]:
from propensity_utils import *

## Set the Parameters

In [2]:
# Seed and hyper-parameters for the algorithm. `algo_params` is handed to
# Propensity (together with algo_class=SVD) when the model is instantiated;
# the same seed is reused inside the dict so there is one place to change it.
random_state = 42
algo_params = dict(
    n_factors=128,
    n_epochs=1,
    lr_all=0.005,
    reg_all=0.1,
    random_state=random_state,
)

## Train/Test Data Paths

In [3]:
# CSV inputs, relative to the notebook's working directory:
#   train_val_path -> passed to Propensity as train_data_path
#   test_path      -> used for test-set evaluation and batch inference
train_val_path, test_path = (
    './data/goodreads_2016_train_val.csv',
    './data/goodreads_2016_test.csv',
)

## Propensity Class and Its Methods

### Instantiate a new model

In [4]:
# Create the Propensity object: SVD is the algorithm class, `algo_params`
# carries its hyper-parameters, and the train/validation CSV is the data source.
propensity = Propensity(
    algo_class=SVD,
    algo_params=algo_params,
    train_data_path=train_val_path,
)

### Evaluate the model via cross-validation on the train set

In [5]:
%%time
# 3-fold cross-validation. In the saved run this prints per-fold RMSE, MAE and
# FCP along with fit/test times; %%time adds the total cost of the cell.
propensity.cross_validate(cv=3)

Evaluating RMSE, MAE, FCP of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8963  0.8955  0.8955  0.8958  0.0004  
MAE (testset)     0.7037  0.7032  0.7034  0.7035  0.0002  
FCP (testset)     0.5526  0.5509  0.5502  0.5512  0.0010  
Fit time          3.52    4.62    4.26    4.14    0.46    
Test time         26.60   9.81    12.40   16.27   7.38    
CPU times: user 1min 23s, sys: 26 s, total: 1min 49s
Wall time: 2min 3s


### Train the model, and evaluate it on the test set

In [5]:
%%time
# Per the section heading: train, then evaluate against the CSV at test_path.
# With verbose=True the saved run prints RMSE, MAE, FCP and accuracy.
propensity.train_and_test(testset_path=test_path, verbose=True)

Evaluating the model performance on the test set
RMSE: 0.9119
MAE:  0.7126
FCP:  0.5630
Accuracy (Test Set): 72.75%
CPU times: user 19.6 s, sys: 2.88 s, total: 22.5 s
Wall time: 23.9 s


### Estimate the confidence on the train set

In [6]:
%%time
# 3-fold CV estimate of the confidence. The call's return value is what the
# cell displays (0.6779931938697099 in the saved run); it is not assigned to
# a variable here.
propensity.estimate_confidence(n_cv_folds=3, verbose=True)

Estimating the Confidence on the Train Set...
Fold  1: C = 0.67718. Time elapsed:  0.53 minutes
Fold  2: C = 0.67845. Time elapsed:  0.59 minutes
Fold  3: C = 0.67835. Time elapsed:  0.64 minutes
************************************************************
Estimated Confidence (on a 3-fold CV): 0.68
************************************************************
CPU times: user 1min 19s, sys: 19 s, total: 1min 38s
Wall time: 1min 48s


0.6779931938697099

### Train the model

In [5]:
%%time
# NOTE(review): 0.68 is a hard-coded copy of the rounded 3-fold CV estimate
# reported by estimate_confidence in the saved run (0.6779931938697099).
# Re-check this value whenever the data or algo_params change.
propensity.confidence = 0.68
# Presumably train_model uses the confidence set above — confirm in propensity_utils.
propensity.train_model(verbose=True)

CPU times: user 1min 6s, sys: 2.82 s, total: 1min 9s
Wall time: 1min 10s


### Look at the fitted train set (with predicted propensity & estimated confidence)

__Note:__ The estimated confidence is a weighted average of two point estimates: 
 * Global confidence (probability of a correct binary prediction for the entire dataset)
 * User-based confidence (probability of a correct binary prediction for a given user)

In [6]:
# Preview five random rows of the fitted training data. The draw is seeded
# with the notebook-wide `random_state` so a fresh Restart & Run All shows
# the same rows every time (assumes train_data_fitted is a pandas DataFrame).
propensity.train_data_fitted.sample(5, random_state=random_state)

Unnamed: 0,user_id,book_id,rating,rating_predicted,would_recommend,would_recommend_pred,prediction_is_correct,user_correct_pred_cnt,user_rating_cnt,user_correct_pred_fraction,confidence
2258068,44734772c453622a4eb917b0cf986030,1162543,5,4.128079,1,1,1,21,21,1.0,0.896774
256969,57d04050a88c2f5b17be2d51dbf53ca1,17838859,4,4.063139,1,1,1,16,18,0.888889,0.814286
740646,2319c9ed57a6048e54cbf37c1527ca40,149267,5,3.968632,1,0,0,14,40,0.35,0.416
2511372,3a1ab829c0c911352a8f14f9cdc0fe4c,81167,5,4.11421,1,1,1,52,60,0.866667,0.84
2042049,11787115f9614b248fe3322ba924e255,2366001,5,4.061298,1,1,1,8,9,0.888889,0.778947


### User confidence table

In [7]:
# Preview five random rows of the per-user confidence table. The draw is
# seeded with the notebook-wide `random_state` so the preview is reproducible
# across runs (assumes user_confidence is a pandas DataFrame).
propensity.user_confidence.sample(5, random_state=random_state)

Unnamed: 0,user_id,confidence
687491,0bc9df0c42ce1adffb16900795241823,0.8
676217,ecebda8a5ee51e5dabd01db91e17eabd,0.709091
1440110,e1dbbbf245202490d0ea4458c97b2713,0.811765
3075417,4a0f0ce7893f03b53215f9a8443fbcd8,0.709091
2878286,366f09037c8f4298219cec896403ddb8,0.6


### Make a prediction

In [6]:
%%time
user_id, item_id = '8842281e1d1347389f2ab93d60773d4d', 76620
res = propensity.infer_propensity_for_pair(user_id, item_id, verbose=False)
res

CPU times: user 61.5 ms, sys: 8.78 ms, total: 70.3 ms
Wall time: 76.6 ms


(1, 0.7090909090909091)

### Batch prediction: entire test set

In [10]:
%%time
# Batch inference using the CSV at test_path; the result is kept in `res_df`
# so it can be inspected in the following cell.
res_df = propensity.infer_propensity_from_df(test_path, verbose=False)

CPU times: user 5.26 s, sys: 291 ms, total: 5.55 s
Wall time: 5.82 s


In [11]:
# Preview ten random predictions. The draw is seeded with the notebook-wide
# `random_state` so the same rows appear on every run (assumes res_df is a
# pandas DataFrame).
res_df.sample(10, random_state=random_state)

Unnamed: 0,user_id,book_id,would_recommend_pred,confidence
59939,3eb791d5e2d4092fd869abb1e36f5526,16006945,1,0.821053
40555,fb87d7393306d9f86ee76a5a62cf76de,21942107,1,0.783333
246765,361c448eef97bbc1b96227cab3fdb5d0,769483,1,0.68
287492,e36b4b82588fa6385405c254cba92533,16065004,0,0.504918
269627,9a37476b41edfb5c07874d33b3447b45,18138189,1,0.859459
280107,4960c0409d32e4c9df2eabf6fe9eafb3,29213435,0,0.55625
194962,8c85faabaaba7de7a6b34c2c11f105cb,24464141,0,0.678947
44001,b9b56784330b1a4d8b72548ded30b31b,20821111,1,0.684615
34951,5179cfb050c2273623aef110fd0932a3,6582637,1,0.964571
68857,577a86412918bd00235ef4794187d0f2,2319370,0,0.386735
