In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
import gc
gc.enable()
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
import lightgbm as lgb



file = "dataset/transformed_dataset.csv"
reg_cols = ['atlas_pct_diabetes_adults13',
 'atlas_pct_wic15',
 'total_physician_office_net_paid_pmpm_cost_9to12m_b4',
 'atlas_pct_laccess_hisp15',
 'atlas_pct_fmrkt_frveg16',
 'credit_hh_nonmtgcredit_60dpd',
 'atlas_dirsales_farms12',
 'rx_nonmaint_pmpm_ct',
 'zip_cd',
 'atlas_pct_laccess_white15',
 'credit_hh_bankcard_severederog',
 'atlas_pct_fmrkt_credit16',
 'credit_bal_autofinance_new',
 'rej_days_since_last_clm',
 'rx_generic_pmpm_ct_0to3m_b4',
 'rwjf_social_associate_rate',
 'med_physician_office_ds_clm_6to9m_b4',
 'atlas_totalocchu',
 'atlas_veg_acrespth12',
 'atlas_pct_loclsale12',
 'atlas_pct_fmrkt_anmlprod16',
 'atlas_freshveg_farms12',
 'rwjf_resident_seg_black_inx',
 'atlas_pct_loclfarm12',
 'total_outpatient_mbr_resp_pmpm_cost_6to9m_b4',
 'atlas_berry_acrespth12',
 'rx_maint_pmpm_ct_9to12m_b4',
 'rx_tier_2_pmpm_ct',
 'atlas_agritrsm_rct12',
 'atlas_pct_laccess_snap15',
 'atlas_deep_pov_all',
 'ccsp_227_pct',
 'bh_outpatient_net_paid_pmpm_cost',
 'atlas_veg_farms12',
 'rx_hum_16_pmpm_ct',
 'cms_risk_adjustment_factor_a_amt',
 'atlas_recfac14',
 'total_physician_office_copay_pmpm_cost',
 'atlas_pc_fsrsales12',
 'atlas_pct_fmrkt_baked16',
 'atlas_net_international_migration_rate',
 'rx_maint_mbr_resp_pmpm_cost_6to9m_b4',
 'rx_generic_pmpm_cost_6to9m_b4',
 'rx_gpi2_49_pmpm_cost_0to3m_b4',
 'atlas_pct_sbp15',
 'atlas_pct_laccess_child15',
 'met_obe_diag_pct',
 'atlas_orchard_acrespth12',
 'atlas_pct_laccess_hhnv15',
 'cnt_cp_webstatement_pmpm_ct',
 'atlas_pct_laccess_lowi15',
 'rx_gpi2_02_pmpm_cost',
 'cms_partd_ra_factor_amt',
 'atlas_pct_free_lunch14',
 'rx_tier_2_pmpm_ct_3to6m_b4',
 'cons_chva',
 'atlas_pct_fmrkt_wiccash16',
 'rx_overall_net_paid_pmpm_cost_6to9m_b4',
 'total_med_allowed_pmpm_cost_9to12m_b4',
 'bh_physician_office_copay_pmpm_cost_6to9m_b4',
 'atlas_pct_snap16',
 'atlas_ghveg_sqftpth12',
 'atlas_pc_dirsales12',
 'atlas_pct_reduced_lunch14',
 'ccsp_236_pct',
 'atlas_deep_pov_children',
 'atlas_pct_sfsp15',
 'rwjf_air_pollute_density',
 'rx_generic_pmpm_cost',
 'cms_tot_partd_payment_amt',
 'cons_nwperadult',
 'rx_days_since_last_script',
 'atlas_pct_laccess_nhasian15',
 'rx_nonbh_mbr_resp_pmpm_cost_6to9m_b4',
 'rx_days_since_last_script_6to9m_b4',
 'atlas_pct_obese_adults13',
 'credit_bal_consumerfinance',
 'atlas_pct_fmrkt_wic16',
 'atlas_orchard_farms12',
 'atlas_berry_farms12',
 'atlas_pct_laccess_multir15',
 'rx_bh_mbr_resp_pmpm_cost_9to12m_b4',
 'atlas_pc_wic_redemp12',
 'rwjf_mv_deaths_rate',
 'atlas_povertyunder18pct',
 'rx_gpi2_72_pmpm_cost_6to9m_b4',
 'atlas_pct_fmrkt_snap16',
 'atlas_medhhinc',
 'rx_nonbh_net_paid_pmpm_cost',
 'credit_bal_bankcard_severederog',
 'bh_ip_snf_net_paid_pmpm_cost',
 'atlas_pc_snapben15',
 'rx_nonbh_pmpm_ct_0to3m_b4',
 'rx_overall_mbr_resp_pmpm_cost_0to3m_b4',
 'auth_3mth_post_acute_mean_los',
 'rx_branded_mbr_resp_pmpm_cost',
 'rx_tier_1_pmpm_ct_0to3m_b4',
 'bh_ncdm_pct',
 'atlas_naturalchangerate1016',
 'rx_mail_mbr_resp_pmpm_cost_0to3m_b4',
 'credit_bal_autobank',
 'rx_nonotc_dist_gpi6_pmpm_ct',
 'cons_cgqs',
 'rx_overall_gpi_pmpm_ct_0to3m_b4',
 'credit_hh_bankcardcredit_60dpd',
 'rx_gpi2_01_pmpm_cost_0to3m_b4',
 'cci_dia_m_pmpm_ct',
 'atlas_pct_nslp15',
 'mcc_end_pct',
 'atlas_pct_laccess_black15',
 'credit_bal_mtgcredit_new',
 'credit_hh_1stmtgcredit',
 'cons_chmi',
 'rwjf_income_inequ_ratio',
 'atlas_pct_laccess_pop15',
 'atlas_pc_ffrsales12',
 'atlas_hh65plusalonepct',
 'atlas_pct_fmrkt_sfmnp16',
 'auth_3mth_acute_mean_los',
 'rx_hum_28_pmpm_cost',
 'atlas_pct_laccess_nhna15',
 'atlas_povertyallagespct',
 'rx_nonbh_mbr_resp_pmpm_cost',
 'rx_nonmaint_mbr_resp_pmpm_cost_9to12m_b4',
 'atlas_pct_fmrkt_otherfood16',
 'lab_dist_loinc_pmpm_ct',
 'rx_generic_mbr_resp_pmpm_cost',
 'atlas_pct_laccess_seniors15',
 'atlas_pct_cacfp15',
 'total_outpatient_allowed_pmpm_cost_6to9m_b4',
 'rx_nonmaint_mbr_resp_pmpm_cost',
 'credit_bal_nonmtgcredit_60dpd',
 'atlas_ownhomepct',
 'rx_overall_mbr_resp_pmpm_cost',
 'atlas_redemp_snaps16',
 'atlas_netmigrationrate1016',
 'atlas_percapitainc',
 'phy_em_px_pct',
 'rx_generic_mbr_resp_pmpm_cost_0to3m_b4']

cat_cols = ['bh_ncdm_ind',
 'auth_3mth_post_acute_inf',
 'rx_maint_net_paid_pmpm_cost_t_9-6-3m_b4',
 'ccsp_065_pmpm_ct',
 'auth_3mth_acute_vco',
 'rx_gpi2_72_pmpm_ct_6to9m_b4',
 'auth_3mth_post_acute_men',
 'rej_total_physician_office_visit_ct_pmpm_0to3m_b4',
 'total_physician_office_net_paid_pmpm_cost_t_9-6-3m_b4',
 'bh_ip_snf_net_paid_pmpm_cost_0to3m_b4',
 'mcc_ano_pmpm_ct_t_9-6-3m_b4',
 'atlas_type_2015_update',
 'atlas_retirement_destination_2015_upda',
 'auth_3mth_post_acute_sns',
 'atlas_hiamenity',
 'cons_ltmedicr',
 'auth_3mth_acute_ccs_086',
 'total_physician_office_mbr_resp_pmpm_cost_t_9-6-3m_b4',
 'auth_3mth_acute_cir',
 'atlas_csa12',
 'total_med_net_paid_pmpm_cost_t_6-3-0m_b4',
 'cons_n2pwh',
 'auth_3mth_snf_post_hsp',
 'auth_3mth_post_acute_inj',
 'med_outpatient_mbr_resp_pmpm_cost_t_9-6-3m_b4',
 'rx_gpi2_56_dist_gpi6_pmpm_ct_3to6m_b4',
 'atlas_low_employment_2015_update',
 'auth_3mth_acute_inf',
 'lab_albumin_loinc_pmpm_ct',
 'rx_gpi2_17_pmpm_cost_t_12-9-6m_b4',
 'cons_rxadhs',
 'cons_mobplus',
 'atlas_foodinsec_child_03_11',
 'lang_spoken_cd',
 'bh_ip_snf_mbr_resp_pmpm_cost_9to12m_b4',
 'auth_3mth_post_acute_gus',
 'auth_3mth_acute_cad',
 'rx_maint_pmpm_ct_t_6-3-0m_b4',
 'auth_3mth_acute_ccs_044',
 'cons_hxmioc',
 'med_outpatient_visit_ct_pmpm_t_12-9-6m_b4',
 'med_physician_office_allowed_pmpm_cost_t_9-6-3m_b4',
 'auth_3mth_acute_res',
 'auth_3mth_acute_chf',
 'auth_3mth_acute_ccs_030',
 'auth_3mth_dc_hospice',
 'auth_3mth_acute_neo',
 'atlas_type_2015_recreation_no',
 'hum_region',
 'atlas_ghveg_farms12',
 'rx_maint_net_paid_pmpm_cost_t_12-9-6m_b4',
 'auth_3mth_acute_ccs_048',
 'rx_overall_gpi_pmpm_ct_t_6-3-0m_b4',
 'rx_overall_gpi_pmpm_ct_t_12-9-6m_b4',
 'rx_nonbh_pmpm_ct_t_9-6-3m_b4',
 'mcc_chf_pmpm_ct_t_9-6-3m_b4',
 'auth_3mth_post_acute_chf',
 'auth_3mth_psychic',
 'rx_nonotc_pmpm_cost_t_6-3-0m_b4',
 'auth_3mth_acute_end',
 'atlas_low_education_2015_update',
 'src_div_id',
 'auth_3mth_bh_acute',
 'auth_3mth_acute_ccs_067',
 'atlas_type_2015_mining_no',
 'cons_n2pmr',
 'rx_mail_net_paid_pmpm_cost_t_6-3-0m_b4',
 'rej_med_er_net_paid_pmpm_cost_t_9-6-3m_b4',
 'med_outpatient_deduct_pmpm_cost_t_9-6-3m_b4',
 'rej_med_ip_snf_coins_pmpm_cost_t_9-6-3m_b4',
 'rx_generic_dist_gpi6_pmpm_ct_t_9-6-3m_b4',
 'auth_3mth_dc_home',
 'auth_3mth_acute_bld',
 'auth_3mth_acute_ner',
 'oontwk_mbr_resp_pmpm_cost_t_6-3-0m_b4',
 'rx_gpi2_90_dist_gpi6_pmpm_ct_9to12m_b4',
 'atlas_foodhub16',
 'rx_maint_pmpm_cost_t_6-3-0m_b4',
 'auth_3mth_post_acute_ben',
 'est_age',
 'auth_3mth_post_acute_cer',
 'auth_3mth_acute_ccs_153',
 'auth_3mth_acute_dig',
 'total_ip_maternity_net_paid_pmpm_cost_t_12-9-6m_b4',
 'auth_3mth_post_acute_cad',
 'rx_bh_pmpm_ct_0to3m_b4',
 'rx_nonmail_dist_gpi6_pmpm_ct_t_9-6-3m_b4',
 'atlas_persistentchildpoverty_1980_2011',
 'atlas_slhouse12',
 'atlas_population_loss_2015_update',
 'auth_3mth_acute_ccs_094',
 'auth_3mth_post_acute_ner',
 'auth_3mth_acute_ccs_227',
 'rx_overall_dist_gpi6_pmpm_ct_t_6-3-0m_b4',
 'auth_3mth_acute_trm',
 'auth_3mth_post_acute',
 'auth_3mth_acute_dia',
 'auth_3mth_acute_ccs_043',
 'rx_overall_mbr_resp_pmpm_cost_t_6-3-0m_b4',
 'cms_orig_reas_entitle_cd',
 'auth_3mth_post_acute_end',
 'auth_3mth_acute_can',
 'auth_3mth_acute_ccs_172',
 'auth_3mth_dc_home_health',
 'atlas_hipov_1115',
 'rx_phar_cat_cvs_pmpm_ct_t_9-6-3m_b4',
 'rx_gpi2_62_pmpm_cost_t_9-6-3m_b4',
 'cons_n2phi',
 'auth_3mth_post_acute_hdz',
 'auth_3mth_bh_acute_mean_los',
 'auth_3mth_post_acute_dig',
 'auth_3mth_transplant',
 'rx_mail_mbr_resp_pmpm_cost_t_9-6-3m_b4',
 'auth_3mth_acute_sns',
 'auth_3mth_post_acute_vco',
 'auth_3mth_home',
 'rx_nonbh_net_paid_pmpm_cost_t_6-3-0m_b4',
 'auth_3mth_post_acute_ckd',
 'rx_gpi2_34_dist_gpi6_pmpm_ct',
 'rx_gpi2_33_pmpm_ct_0to3m_b4',
 'auth_3mth_dc_ltac',
 'cons_estinv30_rc',
 'rx_phar_cat_humana_pmpm_ct_t_9-6-3m_b4',
 'auth_3mth_acute_men',
 'auth_3mth_dc_snf',
 'cons_hhcomp',
 'bh_ip_snf_mbr_resp_pmpm_cost_6to9m_b4',
 'auth_3mth_acute_inj',
 'total_physician_office_visit_ct_pmpm_t_6-3-0m_b4',
 'mabh_seg',
 'auth_3mth_post_acute_res',
 'auth_3mth_bh_acute_men',
 'auth_3mth_acute_hdz',
 'hedis_dia_hba1c_ge9',
 'auth_3mth_post_acute_trm',
 'auth_3mth_hospice',
 'rx_gpi2_39_pmpm_cost_t_6-3-0m_b4',
 'atlas_vlfoodsec_13_15',
 'auth_3mth_dc_acute_rehab',
 'rx_generic_pmpm_cost_t_6-3-0m_b4',
 'auth_3mth_acute_ccs_154',
 'cons_rxmaint',
 'total_bh_copay_pmpm_cost_t_9-6-3m_b4',
 'rx_nonmaint_dist_gpi6_pmpm_ct_t_12-9-6m_b4',
 'rej_med_outpatient_visit_ct_pmpm_t_6-3-0m_b4',
 'cons_rxadhm',
 'auth_3mth_acute_mus',
 'rx_nonbh_pmpm_cost_t_9-6-3m_b4',
 'rx_days_since_last_script_0to3m_b4',
 'auth_3mth_post_acute_cir',
 'auth_3mth_post_acute_dia',
 'auth_3mth_post_er',
 'auth_3mth_dc_no_ref',
 'bh_ip_snf_mbr_resp_pmpm_cost_3to6m_b4',
 'auth_3mth_acute',
 'rx_branded_pmpm_ct_t_6-3-0m_b4',
 'atlas_farm_to_school13',
 'auth_3mth_acute_cer',
 'med_ambulance_coins_pmpm_cost_t_9-6-3m_b4',
 'auth_3mth_acute_gus',
 'rx_gpi4_6110_pmpm_ct',
 'cons_hxwearbl',
 'auth_3mth_ltac',
 'auth_3mth_acute_ckd',
 'bh_ip_snf_net_paid_pmpm_cost_6to9m_b4',
 'sex_cd',
 'days_since_last_clm_0to3m_b4',
 'atlas_perpov_1980_0711',
 'auth_3mth_post_acute_mus',
 'auth_3mth_non_er',
 'bh_ncal_ind',
 'auth_3mth_facility',
 'atlas_foodinsec_13_15',
 'auth_3mth_dc_left_ama',
 'race_cd',
 'bh_ip_snf_admit_days_pmpm_t_9-6-3m_b4',
 'auth_3mth_dc_other',
 'cons_stlnindx',
 'auth_3mth_acute_skn',
 'total_allowed_pmpm_cost_t_9-6-3m_b4',
 'auth_3mth_rehab',
 'bh_urgent_care_copay_pmpm_cost_t_12-9-6m_b4',
 'auth_3mth_dc_custodial',
 'auth_3mth_snf_direct',
 'auth_3mth_acute_ccs_042',
 'bh_ip_snf_net_paid_pmpm_cost_9to12m_b4',
 'bh_ip_snf_net_paid_pmpm_cost_3to6m_b4',
 'rx_maint_pmpm_cost_t_12-9-6m_b4',
 'auth_3mth_post_acute_rsk',
 'rev_cms_ansth_pmpm_ct',
 'cons_cwht']

target = "covid_vaccination"

student_id = 2000728661

id = "ID"
reg_scalar = MinMaxScaler()
reg_f, cat_f = fdf = SelectKBest(f_regression, k=50), SelectKBest(f_classif, k=70)
def scale_df(dataframe, train=True):
    
    if train: dataframe[reg_cols] = reg_scalar.fit_transform(dataframe[reg_cols])
    else: dataframe[reg_cols] = reg_scalar.transform(dataframe[reg_cols])
    return dataframe

def get_reduced_features(dataframe, train=True):
    rcat, rreg = [], []
    if train: 
        rcat = cat_f.fit(dataframe[cat_cols], dataframe[target]).get_support(indices=True)
        rreg = reg_f.fit(dataframe[reg_cols], dataframe[target]).get_support(indices=True)
    else:
        rcat = cat_f.get_support(indices=True)
        rreg = reg_f.get_support(indices=True)
    rcat = [i for idx, i in enumerate(dataframe[cat_cols].columns) if idx in rcat]
    rreg = [i for idx, i in enumerate(dataframe[reg_cols].columns) if idx in rreg]
    return rcat, rreg

df = pd.read_csv('dataset/transformed_dataset.csv')

# cat_cols, reg_cols = get_reduced_features(df)


df = df[cat_cols + reg_cols + [target]]
df = scale_df(df)
df.head()

Unnamed: 0,bh_ncdm_ind,auth_3mth_post_acute_inf,rx_maint_net_paid_pmpm_cost_t_9-6-3m_b4,ccsp_065_pmpm_ct,auth_3mth_acute_vco,rx_gpi2_72_pmpm_ct_6to9m_b4,auth_3mth_post_acute_men,rej_total_physician_office_visit_ct_pmpm_0to3m_b4,total_physician_office_net_paid_pmpm_cost_t_9-6-3m_b4,bh_ip_snf_net_paid_pmpm_cost_0to3m_b4,...,rx_nonmaint_mbr_resp_pmpm_cost,credit_bal_nonmtgcredit_60dpd,atlas_ownhomepct,rx_overall_mbr_resp_pmpm_cost,atlas_redemp_snaps16,atlas_netmigrationrate1016,atlas_percapitainc,phy_em_px_pct,rx_generic_mbr_resp_pmpm_cost_0to3m_b4,covid_vaccination
0,0,0,1,1,1,1,0,1,9,1,...,0.003288,0.159828,0.748533,0.008597,0.184427,0.434299,0.351813,0.0,0.018443,0
1,0,0,8,1,1,1,0,1,9,1,...,0.008956,0.197643,0.677332,0.016252,0.282557,0.436958,0.382757,0.0,0.059102,0
2,0,0,0,1,1,1,0,1,1,1,...,0.008518,0.305488,0.677281,0.014049,0.285884,0.36241,0.363665,0.130694,0.010014,0
3,0,0,0,1,1,2,0,1,9,1,...,0.0,0.269741,0.463229,0.003586,0.366044,0.303851,0.300465,0.0,0.012345,0
4,0,0,4,1,1,1,0,1,9,1,...,0.0,0.090292,0.456923,0.0,0.249197,0.384314,0.536348,0.5,0.0,0


In [2]:



X_T, y_T = df[reg_cols + cat_cols], df[target]
# X_T, X_t, y_T, y_t = train_test_split(X, y, test_size=.2, random_state=student_id, shuffle=True, stratify=y)


In [3]:
params = {
    'max_depth':[9, 10, 12],
    'learning_rate': [.1],
    'n_estimators': [200, 250]
}

# m = CatBoostClassifier(random_state = student_id, task_type="GPU", devices='0:1', 
#                            eval_metric='AUC', thread_count=1, 
#                            cat_features=cat_cols, metric_period=40,
#                            od_type='Iter', loss_function="Logloss", 
#                        depth=12, learning_rate=learning_rate[1],n_estimators=n_estimators[0])

In [4]:
# m.fit(X_T,y=y_T,eval_set=(X_t, y_t),verbose=True,plot=True, use_best_model=True)

In [5]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, params={'verbose': -1}, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
        params['verbose']: -1
        
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, 
                           verbose_eval =200, metrics=['auc'], categorical_feature=cat_cols)
        return max(cv_result['auc-mean'])
     
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)
                                           }, 
                                 random_state=student_id)

    
    #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
    
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']

In [6]:
opt_params = bayes_parameter_opt_lgb(X_T, y_T, init_round=5, opt_round=10, n_folds=3, random_seed=student_id,n_estimators=10000)
opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf']))
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin']))
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
opt_params[1]['is_unbalance']=True
opt_params[1]['boost_from_average']=False
opt_params=opt_params[1]


|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 112968, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3128
[LightGBM] [Info] Number of data points in the train set: 649894, number of used features: 286
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3128
[LightGBM] [Info] Number of data points in the train set: 649895, number of used features: 286
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set

| [0m 4       [0m | [0m 0.6571  [0m | [0m 0.8209  [0m | [0m 0.8487  [0m | [0m 0.8517  [0m | [0m 71.76   [0m | [0m 27.36   [0m | [0m 57.34   [0m | [0m 46.06   [0m | [0m 53.96   [0m | [0m 0.2541  [0m |
[LightGBM] [Info] Number of positive: 112968, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 649894, number of used features: 277
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 649895, number of used features: 277
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.

| [0m 8       [0m | [0m 0.6616  [0m | [0m 0.8066  [0m | [0m 0.2714  [0m | [0m 0.7571  [0m | [0m 23.55   [0m | [0m 17.72   [0m | [0m 35.98   [0m | [0m 18.69   [0m | [0m 56.66   [0m | [0m 0.904   [0m |
[LightGBM] [Info] Number of positive: 112968, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3295
[LightGBM] [Info] Number of data points in the train set: 649894, number of used features: 286
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3295
[LightGBM] [Info] Number of data points in the train set: 649895, number of used features: 286
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_col_wise=true` to remove the overhead.

| [0m 12      [0m | [0m 0.6649  [0m | [0m 0.8327  [0m | [0m 0.5664  [0m | [0m 0.5681  [0m | [0m 28.66   [0m | [0m 27.46   [0m | [0m 21.45   [0m | [0m 29.49   [0m | [0m 49.16   [0m | [0m 0.4746  [0m |
[LightGBM] [Info] Number of positive: 112968, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3285
[LightGBM] [Info] Number of data points in the train set: 649894, number of used features: 281
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3285
[LightGBM] [Info] Number of data points in the train set: 649895, number of used features: 281
[LightGBM] [Info] Number of positive: 112969, number of negative: 536926
You can set `force_row_wise=true` to remove the overhead.

In [7]:
opt_params

{'bagging_fraction': 0.8855002052522746,
 'feature_fraction': 0.4435584169139978,
 'learning_rate': 0.11326924831768609,
 'max_bin': 23,
 'max_depth': 22,
 'min_data_in_leaf': 26,
 'min_sum_hessian_in_leaf': 15.18694006998691,
 'num_leaves': 49,
 'subsample': 0.8970360999639048,
 'objective': 'binary',
 'metric': 'auc',
 'is_unbalance': True,
 'boost_from_average': False}

In [8]:

# m = CatBoostClassifier(random_state = student_id, task_type="GPU", devices='0:1', 
#                            eval_metric='AUC', thread_count=1, 
#                            cat_features=cat_cols, metric_period=40,
#                            od_type='Iter', loss_function="Logloss",
#                        depth=9, learning_rate=.1,n_estimators=200)

# # best_params = m.grid_search(params, X,y=y, plot=True, stratified=True, cv=4, verbose=True)
# m.fit(X, y=y, plot=True, )

# m.save_model('models/catboost.cbm',
#            format="cbm",
#            export_parameters=None,
#            pool=None)

In [9]:
tdf = pd.read_csv('dataset/transformed_dataset_holdout.csv')
tdf = tdf[reg_cols + cat_cols + [id]]
tdf = scale_df(tdf, False)

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=31416)
oof = np.zeros(len(X_T))
predictions = np.zeros(len(tdf))
feature_importance_df = pd.DataFrame()
features = reg_cols + cat_cols

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_T.values, y_T.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(X_T.iloc[trn_idx][features], label=y_T.iloc[trn_idx])
    val_data = lgb.Dataset(X_T.iloc[val_idx][features], label=y_T.iloc[val_idx])

    num_round = 15000
    clf = lgb.train(opt_params, trn_data, num_round, valid_sets = [trn_data, val_data], 
                    verbose_eval=500, early_stopping_rounds = 250, categorical_feature=cat_cols)
    oof[val_idx] = clf.predict(X_T.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(tdf[features], num_iteration=clf.best_iteration) / folds.n_splits


Fold 0
[LightGBM] [Info] Number of positive: 152507, number of negative: 724850
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4851
[LightGBM] [Info] Number of data points in the train set: 877357, number of used features: 287
Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[92]	training's auc: 0.719469	valid_1's auc: 0.677969
Fold 1
[LightGBM] [Info] Number of positive: 152507, number of negative: 724850
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4837
[LightGBM] [Info] Number of data points in the train set: 877357, number of used features: 282
Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[109]	training's auc: 0.727599	valid_1's auc: 0.674185
Fold 2
[LightGBM] [Info] Number of 

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[115]	training's auc: 0.729188	valid_1's auc: 0.679894
Fold 8
[LightGBM] [Info] Number of positive: 152508, number of negative: 724850
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4844
[LightGBM] [Info] Number of data points in the train set: 877358, number of used features: 285
Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[81]	training's auc: 0.71461	valid_1's auc: 0.678083
Fold 9
[LightGBM] [Info] Number of positive: 152507, number of negative: 724851
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4851
[LightGBM] [Info] Number of data points in the train set: 877358, number of used features: 287
Training until validation scores don

In [10]:
fold_importance_df

Unnamed: 0,Feature,importance,fold
0,atlas_pct_diabetes_adults13,4,10
1,atlas_pct_wic15,4,10
2,total_physician_office_net_paid_pmpm_cost_9to1...,0,10
3,atlas_pct_laccess_hisp15,1,10
4,atlas_pct_fmrkt_frveg16,2,10
...,...,...,...
325,bh_ip_snf_net_paid_pmpm_cost_3to6m_b4,0,10
326,rx_maint_pmpm_cost_t_12-9-6m_b4,10,10
327,auth_3mth_post_acute_rsk,0,10
328,rev_cms_ansth_pmpm_ct,3,10


In [11]:
oof.shape, 

((974842,),)

In [12]:
print("CV score: {:<8.5f}".format(roc_auc_score(y_T, oof)))

CV score: 0.67755 


In [13]:
predictions, type(predictions)

(array([0.41475253, 0.19525717, 0.51993198, ..., 0.63572648, 0.54156594,
        0.7428567 ]),
 numpy.ndarray)

In [14]:



tdf[target] = predictions > .5
tdf['SCORE'] = 1. - predictions



In [15]:
tdf['RANK'] = tdf['SCORE'].rank(ascending=False, method='first').astype(np.int64)
tdf['SCORE'] = tdf['SCORE'].round(10)

In [16]:
y_T.value_counts()

0    805389
1    169453
Name: covid_vaccination, dtype: int64

In [17]:
tdf.to_csv('dataset/2021CaseCompetition_Ashutosh_Tiwari_20211006-2.csv', index=False, columns=[id, 'SCORE', 'RANK'])

In [18]:
tdf.shape

(525158, 334)

In [19]:
tdf[target].value_counts()

False    286984
True     238174
Name: covid_vaccination, dtype: int64

In [20]:
feature_importance_df

Unnamed: 0,Feature,importance,fold
0,atlas_pct_diabetes_adults13,1,1
1,atlas_pct_wic15,7,1
2,total_physician_office_net_paid_pmpm_cost_9to1...,0,1
3,atlas_pct_laccess_hisp15,0,1
4,atlas_pct_fmrkt_frveg16,2,1
...,...,...,...
325,bh_ip_snf_net_paid_pmpm_cost_3to6m_b4,0,10
326,rx_maint_pmpm_cost_t_12-9-6m_b4,10,10
327,auth_3mth_post_acute_rsk,0,10
328,rev_cms_ansth_pmpm_ct,3,10
