In [1]:

# Reloading
%reload_ext autoreload
%autoreload 2

import pandas as pd
print('pandas version:{}'.format(pd.__version__))
import numpy as np
print('numpy version:{}'.format(np.__version__))
import math
from datetime import datetime
from dgpylib import dg_athena
from dgpylib import dg_s3
import os
import re
from matplotlib.ticker import MaxNLocator
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import sys
sys.path.insert(0, '../Modelling')
import helper_function2 as hf

import sys
sys.path.insert(0, '../EDA')
import data_integrity_fixer as dint
import basicstatsandplotter as bsp
import data_imputer as di
import data_integrity_fixer as dif
import parameters

##for modelling
import h2o
print('h2o version:{}'.format(h2o.__version__))
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2OTargetEncoderEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.tree import H2OTree
from h2o.tree import H2ONode
from h2o.tree import H2OSplitNode
from h2o.tree import H2OLeafNode
from h2o.estimators.kmeans import H2OKMeansEstimator

#import xgboost as xgb
import sklearn
import category_encoders as ce
print('sklearn version:{}'.format(sklearn.__version__))
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.preprocessing import PowerTransformer
import time
import parameters
from statsmodels.stats.outliers_influence import variance_inflation_factor

import math
from h2o.grid.grid_search import H2OGridSearch

%run 'functions.ipynb'

# Notebook-wide matplotlib defaults: 10x5 inch figures rendered at 100 dpi.
plt.rcParams.update({
    'figure.figsize': [10, 5],
    'figure.dpi': 100,
})

pandas version:1.4.1
numpy version:1.22.3
h2o version:3.30.0.4
sklearn version:1.0.2


In [2]:
# Lift pandas' display truncation so every column and every row is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Open the S3 location that holds the cancellation-model extracts.
conn_s3 = dg_s3.Connect('dgdatadump/DS&CA/Pricing/02 Projects/2022/Tel_Reg_2.0/Cancellation Model')

# Load one extract per file suffix (12mo, 9mo, 6mo, 3mo); reads are issued in that order.
df_raw12, df_raw9, df_raw6, df_raw3 = [
    conn_s3.read(f'TelReg_Canx_filt_{suffix}mo.csv') for suffix in (12, 9, 6, 3)
]

In [4]:
# Parallel lists: data_list[k] is the variable name of the frame stored at data_set[k].
data_set = [df_raw3, df_raw6, df_raw9, df_raw12]
data_list = [f'df_raw{suffix}' for suffix in (3, 6, 9, 12)]

In [5]:
# Load the previously saved feature-importance results; the modelling loop reads
# its 'feats_stored_gbm' column to choose predictors.
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
feat_imp_performance_df = pd.read_pickle(filepath_or_buffer="feat_imp_performance_df.pkl")

# Modelling
<a id='section4'></a>

In [6]:
# Initialise the H2O session with default settings. In the recorded run below it
# attached to an already-running instance at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 49 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,"2 years, 3 months and 7 days !!!"
H2O_cluster_name:,H2O_from_python_ubuntu_vwxp7y
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,27.46 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


## Model: GBM
<a id='gbm'></a>

In [7]:
# Train one H2O GBM per cancellation dataset on its top-ranked features and
# collect, per dataset: variable importance, the ordered feature list, the
# train/test metrics from model_perf_stats, the sklearn classification report on
# the hold-out split, and the cross-validation AUC / metrics summary.
# Result: `performance_df`, one row per entry of `data_list`.

N_TOP_FEATURES = 20  # how many of the stored top-ranked features are used as predictors
response = 'cancelflag'
performance_columns = ['name', 'varimp', 'feats_stored_gbm', 'perf_metrics',
                       'classif_rep', 'xval_auc', 'xval_metrics']

# data_list (names) and data_set (frames) are parallel lists. Pairing them
# directly replaces the former if/elif chain, whose `else` branch silently
# mapped any unrecognised name to df_raw3.
if len(data_list) != len(data_set):
    raise ValueError('data_list and data_set must have the same length')

# Rows are accumulated here and the frame is built once after the loop.
# Assigning each mixed-type row with `.loc[idx, :] = [...]` forced pandas to
# coerce a ragged list through np.asarray, which emitted a warning per
# iteration and is rejected outright by newer NumPy releases.
performance_rows = []

for idx, (dataset, df) in enumerate(zip(data_list, data_set)):
    # NOTE(review): the feature ranking is looked up by position, so the rows of
    # feat_imp_performance_df are assumed to follow data_list order - confirm.
    predictors1 = feat_imp_performance_df.loc[idx, 'feats_stored_gbm']
    predictors1 = predictors1[:N_TOP_FEATURES]

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        df[predictors1], df[response], test_size=0.2, random_state=1
    )
    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)

    # preserve training & test data's index to be used as a key to merge
    # prediction probability with main dataset
    xtrain_index = x_train.index
    xtest_index = x_test.index

    # convert to h2o dataframe format
    hf_train = h2o.H2OFrame(train)
    hf_test = h2o.H2OFrame(test)

    # the target must be a factor for H2O to fit a classifier
    hf_train[response] = hf_train[response].asfactor()
    hf_test[response] = hf_test[response].asfactor()

    # initialise the estimator (5-fold CV, hold-out predictions kept, fixed seed)
    df_gbm = H2OGradientBoostingEstimator(keep_cross_validation_predictions=True, nfolds=5, seed=1)

    # train the model and report the elapsed wall-clock seconds
    start = time.time()
    df_gbm.train(x=predictors1, y=response, training_frame=hf_train)
    end = time.time()
    print(end - start)

    # variable importance, most important first, and the matching feature order
    varimp = df_gbm.varimp(use_pandas=True).sort_values('scaled_importance', ascending=False)
    feats_stored_gbm = list(varimp['variable'])

    # train / test performance metrics (helper comes from functions.ipynb)
    perf_metrics = model_perf_stats(df_gbm, hf_train, hf_test)

    # hold-out predictions as a pandas frame
    y_true = y_test
    y_pred = df_gbm.predict(hf_test).as_data_frame()

    # classification report
    classif_rep = classification_report(y_true, y_pred['predict'], labels=[0, 1])

    # AUC of cross-validated holdout predictions
    xval_auc = f'AUC of cross-validated holdout predictions: {df_gbm.auc(xval=True)}'

    # cross-validation metrics summary
    xval_metrics = df_gbm.cross_validation_metrics_summary()

    performance_rows.append({
        'name': dataset,
        'varimp': varimp,
        'feats_stored_gbm': feats_stored_gbm,
        'perf_metrics': perf_metrics,
        'classif_rep': classif_rep,
        'xval_auc': xval_auc,
        'xval_metrics': xval_metrics,
    })

# dtype=object keeps every cell exactly as stored (frames, lists, strings, H2O tables).
performance_df = pd.DataFrame(performance_rows, columns=performance_columns, dtype=object)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
32.50166440010071
gbm prediction progress: |████████████████████████████████████████████████| 100%


  arr_value = np.asarray(value)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
27.189404487609863
gbm prediction progress: |████████████████████████████████████████████████| 100%


  arr_value = np.asarray(value)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
19.566553831100464
gbm prediction progress: |████████████████████████████████████████████████| 100%


  arr_value = np.asarray(value)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
14.002660751342773
gbm prediction progress: |████████████████████████████████████████████████| 100%


  arr_value = np.asarray(value)


In [8]:
# Persist the per-dataset GBM results so they can be reloaded without retraining.
performance_df.to_pickle("performance_df.pkl")

In [9]:
# For each stored result row, print its name, feature list, train/test metrics,
# classification report and cross-validated AUC, in that order.
summary_fields = ('name', 'feats_stored_gbm', 'perf_metrics', 'classif_rep', 'xval_auc')
for row_label in range(len(performance_df)):
    for field in summary_fields:
        print(performance_df.loc[row_label, field])

df_raw3
['ManufacturerBrandCode', 'ONS_%people_working_FT', 'ONS_avg_age', 'PlansAcceptedPast1YearCount', 'PlanLiveCount', 'ClientAccountDesc', 'PurchasePrice', 'AppAge', 'PlansActivePast5yrCount', 'ClientGroupDesc', 'ApplianceCode', 'ONS_n_families_per_population', 'Fee', 'price_diff', 'ONS_%people_working_49+', 'ONS_avg_distance_travelled_to_work(km)', 'ONS_avg_household_size', 'ONS_avg_dependent_children_per_family', 'ONS_bedrooms per rooms', 'GoodsColour']
['MCC: 0.15283 / 0.12671', 'F1: 0.26612 / 0.25665', 'AUC: 0.63350 / 0.61699', 'AUC PR: 0.23835 / 0.21159', 'Accuracy: 0.87872 / 0.87771', 'Logloss: 0.35945 / 0.36331']
              precision    recall  f1-score   support

           0       0.90      0.65      0.75     62513
           1       0.17      0.51      0.25      8899

    accuracy                           0.63     71412
   macro avg       0.54      0.58      0.50     71412
weighted avg       0.81      0.63      0.69     71412

AUC of cross-validated holdout predictio

In [10]:
# For every cancellation cohort, print its name followed by its
# cross-validation metrics summary.
for row_label in range(len(performance_df)):
    for field in ('name', 'xval_metrics'):
        print(performance_df.loc[row_label, field])

df_raw3

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.5907699,0.02851111,0.62302303,0.572043,0.604119,0.60303456,0.5516297
1,auc,0.6126119,0.0011605019,0.61273324,0.6145789,0.6118037,0.6120954,0.61184824
2,aucpr,0.21355721,0.0008720812,0.21293922,0.2142159,0.21286897,0.21475933,0.21300264
3,err,0.40923014,0.02851111,0.376977,0.427957,0.395881,0.39696544,0.44837028
4,err_count,23377.4,1599.8572,21571.0,24477.0,22663.0,22631.0,25545.0
5,f0point5,0.19195,0.0030485012,0.19670467,0.19244376,0.18994235,0.19191717,0.18874204
6,f1,0.25462848,0.0024889435,0.25645444,0.25744015,0.25123072,0.25322554,0.25479156
7,f2,0.37840548,0.011040336,0.3683387,0.3887311,0.37091234,0.37209076,0.39195448
8,lift_top_group,4.2650156,0.09092015,4.141169,4.205629,4.3419414,4.3544526,4.281885
9,logloss,0.3630558,0.002807214,0.36435515,0.36542052,0.35846817,0.362341,0.3646942



See the whole table with table.as_data_frame()

df_raw6

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.5312438,0.043960076,0.49329925,0.48074967,0.5516512,0.5427883,0.5877307
1,auc,0.61795557,0.0023525665,0.6159587,0.61909276,0.6191348,0.61501944,0.6205721
2,aucpr,0.27751416,0.0025333418,0.2740267,0.2802953,0.27903676,0.27835554,0.2758565
3,err,0.46875617,0.043960076,0.50670075,0.51925033,0.44834882,0.4572117,0.41226932
4,err_count,20292.2,1923.5203,22005.0,22442.0,19496.0,19736.0,17782.0
5,f0point5,0.24586754,0.0043473216,0.24207942,0.24254002,0.24969588,0.24363074,0.2513916
6,f1,0.32048568,0.0027089904,0.32127324,0.32317993,0.32244387,0.31657317,0.31895825
7,f2,0.46093738,0.019634815,0.477474,0.4841508,0.45500913,0.45185843,0.4361945
8,lift_top_group,3.8915555,0.10934594,3.9272923,3.7943835,3.7683742,4.0333176,3.9344106
9,logloss,0.43786472,0.002807153,0.43917358,0.44084293,0.43951872,0.43542054,0.4343679



See the whole table with table.as_data_frame()

df_raw9

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.4766172,0.031485815,0.47014815,0.44699365,0.46611112,0.53032976,0.46950334
1,auc,0.6150042,0.0042644343,0.6142886,0.6093765,0.61662066,0.6210264,0.6137086
2,aucpr,0.31807268,0.007450073,0.32467943,0.30834475,0.32518283,0.3196294,0.31252697
3,err,0.5233828,0.031485815,0.52985185,0.55300635,0.5338889,0.46967027,0.53049666
4,err_count,14056.0,897.1062,14235.0,14872.0,14415.0,12520.0,14238.0
5,f0point5,0.27727798,0.007271135,0.27628756,0.26624018,0.2820554,0.28535047,0.27645633
6,f1,0.36113527,0.0060338145,0.36077058,0.3518696,0.3686769,0.36272013,0.36163917
7,f2,0.51816064,0.012624833,0.5196771,0.5186946,0.53208435,0.4976535,0.5226937
8,lift_top_group,3.6653688,0.16628969,3.6217902,3.7750835,3.8937736,3.541835,3.494362
9,logloss,0.48549926,0.0035665394,0.48620015,0.48064575,0.4904784,0.48416418,0.48600787



See the whole table with table.as_data_frame()

df_raw12

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.47549376,0.0147072375,0.45651162,0.47832206,0.46450004,0.49100837,0.48712668
1,auc,0.6301007,0.0070304903,0.6303236,0.61835456,0.63087267,0.6362818,0.6346711
2,aucpr,0.37104255,0.010234709,0.37088135,0.3563073,0.36806312,0.37590668,0.38405433
3,err,0.5245063,0.0147072375,0.5434884,0.5216779,0.53549993,0.5089916,0.51287335
4,err_count,5614.6,191.07275,5830.0,5559.0,5800.0,5406.0,5478.0
5,f0point5,0.31694004,0.0038626858,0.31353343,0.31457955,0.3157129,0.3175615,0.32331282
6,f1,0.4067711,0.0037034443,0.4058296,0.40245083,0.40767974,0.40541136,0.4124839
7,f2,0.5677706,0.007968775,0.57513434,0.55843925,0.5752493,0.5604549,0.56957525
8,lift_top_group,3.4532664,0.052340303,3.537352,3.445147,3.3924413,3.4437726,3.4476187
9,logloss,0.5197081,0.0027469478,0.51941633,0.52275413,0.51979196,0.515392,0.52118593



See the whole table with table.as_data_frame()



In [11]:
# Write the GBM results to "feat_imp_performance_df.pkl".
# NOTE(review): this is the same file this notebook reads its feature ranking
# from, so this write replaces that input with the results of the reduced-feature
# models (already saved to "performance_df.pkl" earlier). Confirm this overwrite
# is intended; otherwise drop this cell or use a distinct file name.
performance_df.to_pickle("feat_imp_performance_df.pkl")