In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import cluster
from sklearn.decomposition import PCA
# Notebook display settings: never truncate cell text, and show up to 1000 rows
# when a frame or series is rendered.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1000

In [2]:
# Read the holdout CSV. `health` is a working copy taken BEFORE the re-index, so it still
# carries 'person_id_syn' as a regular column; `original` is then re-indexed by that id.
original = pd.read_csv("data/2020_Competition_Holdout.csv")
health = original.copy()
original = original.set_index('person_id_syn')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def mapStringVariables(data):
    """
    Reorders columns alphabetically and regroups the coded string variables so they are ready to be
    turned into dummy variables.

    One input:

    data: a pandas data frame. Must contain 'cons_cmys', 'cons_hhcomp', 'cms_ra_factor_type_cd',
    'cons_homstat', 'sex_cd' and 'lang_spoken_cd', and have at least 212 columns in total
    (an IndexError is raised otherwise).

    Returns a new pandas data frame (the frame passed in is left untouched) in which:
      * columns are sorted alphabetically,
      * 'cons_cmys', 'cons_hhcomp', 'cms_ra_factor_type_cd' and 'cons_homstat' hold group labels,
        with any code outside the mapping (including missing values) set to 'Unknown',
      * 'sex_cd' is 0 for 'M' and 1 for 'F' (anything else becomes NaN),
      * the nine columns at sorted positions 203-211 are recoded from 'Y'/'N' to 1/0,
      * 'lang_spoken_cd' is collapsed to 'ENG' / 'SPA' (anything else becomes NaN).
    """
    data = data.reindex(sorted(data.columns), axis = 1)

    # NOTE(review): the keys below are strings; if 'cons_cmys' is parsed as a numeric column every
    # row falls through to 'Unknown' -- confirm the dtype produced by read_csv.
    education_groups = {'0': 'Unknown',
                        '1': 'Less Than High School',
                        '2': 'Less Than High School',
                        '3': 'High School Diploma',
                        '4': 'Some College',
                        '5': 'Associate Degree',
                        '6': 'Bachelors Degree',
                        '7': 'Graduate Degree',
                        '8': 'Professional School Degree',
                        '9': 'Doctorate Degree'}
    household_groups = {'A': 'Min Two People, Children',
                        'C': 'Min Two People, Children',
                        'D': 'Min Two People, No Children',
                        'E': 'Min Two People, Children',
                        'F': 'Min Two People, No Children',
                        'G': 'Min Two People, Children',
                        'H': 'Min Two People, No Children',
                        'B': 'Min Two People, No Children',
                        'I': 'One Person, Children',
                        'J': 'One Person, No Children',
                        'K': 'One Person, Children',
                        'L': 'One Person, No Children',
                        'U': 'Unknown'}
    factor_type_groups = {'CN': 'CN',
                          'CP': 'CP',
                          'E': 'E',
                          'CF': 'CF',
                          'D': 'D',
                          '1': 'Other',
                          'C2': 'Other',
                          'I': 'Other',
                          'SE': 'Other',
                          '*': 'Other'}
    home_status_groups = {'P': 'Homeowner',
                          'R': 'Renter',
                          'T': 'Renter',
                          'U': 'Unknown',
                          'Y': 'Homeowner'}

    # Assign the mapped-and-filled result back to the column. Calling fillna(inplace=True) on the
    # selected column is chained assignment and does not update the frame under Copy-on-Write.
    grouped_columns = [('cons_cmys', education_groups),
                       ('cons_hhcomp', household_groups),
                       ('cms_ra_factor_type_cd', factor_type_groups),
                       ('cons_homstat', home_status_groups)]
    for column, groups in grouped_columns:
        data[column] = data[column].map(groups).fillna('Unknown')

    data['sex_cd'] = data['sex_cd'].map({'M': 0, 'F': 1})

    # Positions refer to the alphabetically sorted frame built above, so they shift whenever a
    # column is added or removed upstream.
    # NOTE(review): presumably these nine positions are the Y/N flag columns -- confirm, and
    # consider listing them by name.
    # Writing by label replaces the whole column; a positional iloc write would try to keep the
    # existing string dtype.
    for position in range(203, 212):
        column = data.columns[position]
        data[column] = data[column].map({'Y': 1, 'N': 0})

    data['lang_spoken_cd'] = data['lang_spoken_cd'].map({'E': 'ENG', 'ENG': 'ENG', 'SPA': 'SPA'})
    return data

In [4]:
def dropMajorityNAcolumns(data, threshold = 0.5):
    """
    Removes, in place, every column whose share of missing values is at or above the threshold.

    Two Inputs:

    data: a pandas dataframe (it is modified in place)
    threshold (default = 0.5): fraction of NA values, from 0.0 to 1.0, at or above which a column
    is dropped

    Prints the name of every dropped column and returns the same DataFrame object.
    """
    na_share = data.isna().sum() / len(data)
    too_sparse = na_share[na_share >= threshold].index

    for name in too_sparse:
        data.drop(columns = name, inplace = True)
        print("Dropped: ", name)
    return data

In [5]:
def columnBinning(data):
    """
    Rescales three 'cons_n2*' columns and collapses ten CCSP indicator columns into one flag.

    One Input:

    data: a pandas dataframe (it is modified in place). Must contain 'cons_n2mob', 'cons_n2pbl',
    'cons_n2pmv' and the ten 'ccsp_*_ind' columns listed below.

    Returns the same DataFrame with:
      * the three 'cons_n2*' columns rounded to 2 decimals and divided by 100,
      * a new float column 'ccsp_extra_group' that is 1.0 when any of the ten CCSP indicators
        equals 1 and 0.0 otherwise,
      * the ten CCSP indicator columns removed.
    """

    ccsp_columns = ['ccsp_014_ind', 'ccsp_021_ind', 'ccsp_034_ind', 'ccsp_060_ind', 'ccsp_080_ind', 'ccsp_107_ind',
                   'ccsp_125_ind', 'ccsp_204_ind', 'ccsp_212_ind', 'ccsp_242_ind']

    for scaled_column in ('cons_n2mob', 'cons_n2pbl', 'cons_n2pmv'):
        data[scaled_column] = round(data[scaled_column], 2) / 100

    # Build the flag in one step so the column always exists, even when no row has any indicator
    # set (row-by-row .loc writes would never create it in that case).
    data['ccsp_extra_group'] = data[ccsp_columns].eq(1).any(axis = 1).astype(float)
    data.drop(columns = ccsp_columns, inplace = True)

    return data
    

In [6]:
def columnsToDrop(data):
    """
    Drops all columns that are not going to be used in the model.

    One Input:

    data: a pandas dataframe (it is modified in place). Every column named below must be present;
    a KeyError is raised otherwise and nothing is dropped.

    Returns the same DataFrame without:
      * nine lab / inpatient-admit / platform columns, and
      * for each SUBMCC stem listed below, both its 'submcc_<stem>_pmpm_ct' and
        'submcc_<stem>_ind' column.
    """

    columns_drop = ['lab_bnp_abn_result_ind', 'lab_hba1_c_abn_result_ind', 'med_ip_ltach_admit_ct_pmpm',
                    'med_ip_ltach_admit_days_pmpm', 'med_ip_maternity_admit_ct_pmpm', 'med_ip_maternity_admit_days_pmpm',
                    'med_ip_mhsa_admit_ct_pmpm', 'med_ip_mhsa_admit_days_pmpm',
                    'src_platform_cd']

    # One stem list feeds both suffixes, so the count and indicator columns cannot drift apart.
    submcc_stems = ["ano_mus", "ano_othr", "ben_lymp", "ben_ner",
                    "brn_acc", "cad_fh/ho", "cad_ptca", "can_brst",
                    "can_leuk", "can_ner",
                    "gus_othr", "hdz_arrh", "hdz_it_i", "hdz_surg",
                    "hdz_valv", "hiv_kapo", "hiv_pcp", "inf_men",
                    "inf_myco", "inj_comp", "mus_othr", "neo_fh/ho",
                    "pre_care", "pre_del", "pre_l/d", "pre_mul",
                    "pre_ect", "pre_othr", "rar_als",
                    "rar_cf", "rar_drm", "rar_mg", "rar_othr",
                    "rar_pol", "rar_sca", "rar_scl", "rsk_an",
                    "rsk_fh/h", "rsk_othr", "rsk_pcos",
                    "trm_fxu", "trm_fxul", "vco_end"]

    columns_drop += ["submcc_" + stem + "_pmpm_ct" for stem in submcc_stems]
    columns_drop += ["submcc_" + stem + "_ind" for stem in submcc_stems]

    # A single drop avoids rebuilding the frame once per column.
    data.drop(columns = columns_drop, inplace = True)

    return data

In [7]:
def LabelEncoding(data):
    '''
    Replaces four location-type columns with integer codes and resets a sentinel value in the
    'pdc_*' columns.

    Takes one input:
    data: a pandas data frame (it is modified in place)

    Returns the same dataframe where:
      * 'rucc_category', 'state_cd', 'zip_cd' and 'cnty_cd' hold the integer labels produced by
        sklearn's LabelEncoder (each column is fitted independently),
      * every value equal to 1.1 in the eight 'pdc_*' columns is replaced by 0.
        NOTE(review): 1.1 looks like an out-of-range marker -- confirm with the data dictionary.
    '''

    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()

    # fit_transform refits the encoder on each call, so one instance can serve all four columns.
    for coded_column in ('rucc_category', 'state_cd', 'zip_cd', 'cnty_cd'):
        data[coded_column] = encoder.fit_transform(data[coded_column])

    for pdc_column in ('pdc_ast', 'pdc_cvd', 'pdc_dep', 'pdc_dia',
                       'pdc_hf', 'pdc_ht', 'pdc_lip', 'pdc_ost'):
        is_sentinel = data[pdc_column] == 1.1
        data.loc[is_sentinel, pdc_column] = 0

    return data

In [8]:
def betosEngineering(data, random_state = None):
    '''
    Builds summary features from seventeen BETOS claim-count columns and then drops those columns.

    Takes two inputs:
    data: a pandas data frame. Must contain the BETOS '*_pmpm_ct' columns listed below and
          'est_age'. Columns are added to it in place until the PCA step, which returns a new frame.
    random_state (default = None): seed forwarded to PCA and KMeans; None keeps scikit-learn's
          unseeded behaviour, an int makes the components and cluster labels repeatable.

    Returns a pandas dataframe with these new columns:
      * 'betos_common_visits_pmpm_ct', 'betos_uncommon_visits_pmpm_ct',
        'betos_critical_visits_pmpm_ct': row sums of the three column groups
      * 'PCA1_betos' ... 'PCA7_betos': principal components of the seventeen raw columns
      * '<column>_rank': percentile rank (1-100) of every raw column
      * 'Betos_Score' (mean percentile rank) and 'Betos_AVG_Score' (Standardize of that mean)
      * 'est_age_std': Standardize of 'est_age' (RxEngineering reads this column)
      * 'Betos_Score_clusters': KMeans label (20 clusters) on Betos_Score and est_age_std
    The seventeen raw BETOS columns are removed.
    '''
    common_visits = ['betos_m1b_pmpm_ct', 'betos_o1b_pmpm_ct', 'betos_o1e_pmpm_ct', 'betos_o1g_pmpm_ct',
                'betos_t1a_pmpm_ct', 'betos_t1b_pmpm_ct', 'betos_t1e_pmpm_ct', 'betos_t1h_pmpm_ct',
                'betos_t2a_pmpm_ct', 'betos_y2_pmpm_ct']
    uncommon = ['betos_d1c_pmpm_ct', 'betos_m5b_pmpm_ct', 'betos_m5c_pmpm_ct', 'betos_m5d_pmpm_ct']
    critical = ['betos_d1d_pmpm_ct', 'betos_m2c_pmpm_ct', 'betos_o1a_pmpm_ct']

    data['betos_common_visits_pmpm_ct'] = data[common_visits].sum(axis = 1)
    data['betos_uncommon_visits_pmpm_ct'] = data[uncommon].sum(axis = 1)
    data['betos_critical_visits_pmpm_ct'] = data[critical].sum(axis = 1)

    total = common_visits + uncommon + critical

    pca = PCA(n_components = 7, random_state = random_state)
    pc = pca.fit_transform(data[total])

    # Reuse data's index: concat aligns on index labels, so a fresh 0..n-1 index would scatter the
    # components into extra NaN rows whenever data is not indexed 0..n-1.
    pca_data = pd.DataFrame(data = pc, index = data.index,
                            columns = ['PCA1_betos', 'PCA2_betos', 'PCA3_betos',
                                       'PCA4_betos', 'PCA5_betos', 'PCA6_betos',
                                       'PCA7_betos'])
    data = pd.concat([data, pca_data], axis = 1)

    # rank(method='first') breaks ties by position so qcut always gets 100 distinct bin edges.
    labels = range(1, 101)
    betos_list = []
    for single in total:
        name = single + "_rank"
        data[name] = pd.qcut(data[single].rank(method = 'first'), q = 100, labels = labels)
        data[name] = data[name].astype(int)
        betos_list.append(name)

    # Only the mean-based values are kept (a sum-based pair used to be computed first and was
    # overwritten before it was read).
    # NOTE(review): the Credit/CMS/RX feature groups standardize their *_Score column too, while
    # Betos_Score stays on the raw 1-100 scale and is clustered next to a standardized age.
    # Confirm whether that is intended before changing it; downstream models see these values.
    data['Betos_Score'] = data[betos_list].mean(axis=1)
    data['Betos_AVG_Score'] = Standardize(data['Betos_Score'])
    data['est_age_std'] = Standardize(data['est_age'])

    model_betos = cluster.KMeans(n_clusters = 20, random_state = random_state)
    data['Betos_Score_clusters'] = model_betos.fit_predict(data[['Betos_Score', 'est_age_std']])

    data.drop(columns = total, inplace = True)
    return data

In [9]:
def medEngineering(data):
    '''
    Adds 'med_admit_days', the row-wise total of the acute, rehab and SNF admit-day columns
    (missing values are skipped), and removes those three source columns.

    Takes one input:
    data: a pandas data frame (it is modified in place)

    Returns the same dataframe with the new column in place of the three source columns
    '''
    admit_day_columns = ['med_ip_acute_admit_days_pmpm',
                         'med_ip_rehab_admit_days_pmpm',
                         'med_ip_snf_admit_days_pmpm']

    data['med_admit_days'] = data.loc[:, admit_day_columns].sum(axis = 1)
    data.drop(columns = admit_day_columns, inplace = True)
    return data

In [10]:
def _rankPcaCluster(data, columns, pca_tag, score_name, n_components, n_clusters, random_state = None):
    '''
    Applies the shared rank / PCA / score / cluster recipe to one group of credit columns.

    data: a pandas data frame that already holds `columns` and 'default_risk_std'
    columns: names of the raw columns in the group
    pca_tag: suffix for the component columns ('PCA<i>_<pca_tag>')
    score_name: prefix for '<score_name>_Score', '<score_name>_AVG_Score' and
                '<score_name>_Score_clusters'
    n_components / n_clusters: sizes passed to PCA and KMeans
    random_state: seed forwarded to PCA and KMeans (None = unseeded)

    Returns a new data frame with the '<column>_rank', PCA, score and cluster columns appended.
    '''
    # rank(method='first') breaks ties by position so qcut always gets 100 distinct bin edges.
    rank_names = []
    for column in columns:
        rank_name = column + "_rank"
        data[rank_name] = pd.qcut(data[column].rank(method = 'first'), q = 100, labels = range(1, 101))
        data[rank_name] = data[rank_name].astype(int)
        rank_names.append(rank_name)

    pca = PCA(n_components = n_components, random_state = random_state)
    pc = pca.fit_transform(data[columns])
    pca_names = ['PCA{}_{}'.format(number, pca_tag) for number in range(1, n_components + 1)]
    # Reuse data's index so concat lines the components up with their rows.
    pca_data = pd.DataFrame(data = pc, columns = pca_names, index = data.index)
    data = pd.concat([data, pca_data], axis = 1)

    score = score_name + '_Score'
    avg_score = score_name + '_AVG_Score'
    data[score] = data[rank_names].sum(axis = 1)
    data[avg_score] = data[rank_names].mean(axis = 1)
    data[score] = Standardize(data[score])
    data[avg_score] = Standardize(data[avg_score])

    model = cluster.KMeans(n_clusters = n_clusters, random_state = random_state)
    data[score + '_clusters'] = model.fit_predict(data[[score, 'default_risk_std']])
    return data


def creditDataEngineering(data, random_state = None):
    '''
    Creates new features based on credit data information and drops the raw credit columns used.

    Takes two inputs:
    data: a pandas data frame. Must contain 'est_age' and every credit column listed below.
          Columns are added to it in place until the first PCA step, which returns a new frame.
    random_state (default = None): seed forwarded to PCA and KMeans; None keeps scikit-learn's
          unseeded behaviour, an int makes components and cluster labels repeatable.

    Returns a pandas dataframe with these new columns:
      * 'average_minmob_percent_of_age' (capped at 1), 'average_percent_balance_total'
      * 'credit_num_new', 'credit_num_general', 'credit_num_dpd', 'credit_num_overage'
      * 'credit_hh_new_average', 'credit_hh_dpd_average', 'credit_weighted_pct_dpd_overdue'
      * 'credit_financial_index' and 'default_risk' (both min-max scaled)
      * 'credit_total_new', 'credit_dpd_exposure', 'credit_overage_exposure'
      * for each of the balance / number / household groups: '<column>_rank' columns,
        'PCA<i>_creditbal' (12), 'PCA<i>_creditnum' (5), 'PCA<i>_credithh' (7), and the
        'Credit_Bal_*', 'Credit_Num_*', 'Credit_HH_*' Score / AVG_Score / Score_clusters columns
    The raw balance, number, household, minmob and percent columns are removed.

    NOTE(review): the number and household components are now fitted on their own column groups,
    and the 'PCA<i>_creditnum' columns are now appended to the output. Any model fitted on the
    earlier output must be refit.
    '''
    credit_bal_general = ['credit_bal_autobank', 'credit_bal_autofinance', 'credit_bal_consumerfinance']

    credit_bal_new = ['credit_bal_agencyfirstmtg_new', 'credit_bal_autobank_new', 'credit_bal_autofinance_new',
                     'credit_bal_consumerfinance_new','credit_bal_mtgcredit_new']

    credit_bal_dpd = ['credit_bal_1stmtg_30to59dpd', 'credit_bal_1stmtg_60to89dpd', 'credit_bal_1stmtgcredit_60dpd',
                     'credit_bal_agencyfirstmtg_60dpd', 'credit_bal_heloc_60dpd', 'credit_bal_mtg_90to119dpd',
                     'credit_bal_nonagn1stmorg_30to59dpd', 'credit_bal_nonagn1stmorg_60to89dpd',
                     'credit_bal_nonagn1stmorg_90to119dp', 'credit_bal_nonagnfirstmtg_60dpd', 'credit_bal_nonmtgcredit_60dpd',
                     'credit_bal_studentloan_60dpd']

    credit_bal_overage = ['credit_bal_1stmtg_collections', 'credit_bal_1stmtg_severederog', 'credit_bal_agency1stmorg_collectio',
                         'credit_bal_bankcard_severederog', 'credit_bal_heloc_severederog', 'credit_bal_mtg_bankruptcy',
                         'credit_bal_mtg_severederog', 'credit_bal_nonagn1stmorg_bankruptc', 'credit_bal_nonagn1stmorg_collectio']

    credit_num_new = ['credit_num_1stmtgcredit_new', 'credit_num_agencyfirstmtg_new', 'credit_num_autobank_new',
                 'credit_num_autofinance_new', 'credit_num_consumerfinance_new', 'credit_num_mtgcredit_new']

    credit_num_general = ['credit_num_1stmtgcredit', 'credit_num_agencyfirstmtg', 'credit_num_autobank',
                         'credit_num_autofinance', 'credit_num_consumerfinance', 'credit_num_studentloan']

    credit_num_dpd = ['credit_num_1stmtg_30to59dpd', 'credit_num_1stmtg_60to89dpd', 'credit_num_agencyfirstmtg_60dpd',
                      'credit_num_mtg_60to89dpd', 'credit_num_mtg_90to119dpd', 'credit_num_nonagn1stmorg_30to59dpd',
                      'credit_num_nonagn1stmorg_60to89dpd', 'credit_num_nonagn1stmorg_90to119dp',
                      'credit_num_nonmtgcredit_60dpd', 'credit_num_studentloan_60dpd', 'credit_num_heloc_60dpd']

    credit_num_danger = ['credit_num_1stmtg_bankruptcy', 'credit_num_1stmtg_collections', 'credit_num_1stmtg_severederog',
                         'credit_num_agency1stmorg_collectio', 'credit_num_bankcard_severederog', 'credit_num_heloc_severederog',
                         'credit_num_mtg_collections', 'credit_num_mtg_severederog', 'credit_num_nonagn1stmorg_bankruptc',
                         'credit_num_nonagn1stmorg_collectio']

    credit_hh_new = ['credit_hh_1stmtgcredit_new', 'credit_hh_agencyfirstmtg_new', 'credit_hh_autobank_new',
                 'credit_hh_autofinance_new', 'credit_hh_consumerfinance_new', 'credit_hh_mtgcredit_new']

    credit_hh_dpd = ['credit_hh_bankcardcredit_60dpd', 'credit_hh_nonmtgcredit_60dpd',
                     'credit_hh_studentloan_60dpd']

    percent_balance = ['credit_prcnt_agencyfirstmtg', 'credit_prcnt_autobank',
                      'credit_prcnt_mtgcredit', 'credit_prcnt_nonagnfirstmtg']

    credit_hh_clusters = ['credit_hh_1stmtg_severederog', 'credit_hh_1stmtgcredit', 'credit_hh_1stmtgcredit_new',
                          'credit_hh_agencyfirstmtg', 'credit_hh_agencyfirstmtg_new', 'credit_hh_autobank',
                          'credit_hh_autobank_new', 'credit_hh_autofinance', 'credit_hh_autofinance_new',
                          'credit_hh_bankcard_severederog', 'credit_hh_bankcardcredit_60dpd',
                          'credit_hh_consumerfinance', 'credit_hh_consumerfinance_new', 'credit_hh_mtg_severederog',
                          'credit_hh_mtgcredit_new', 'credit_hh_nonagnfirstmtg', 'credit_hh_nonmtgcredit_60dpd',
                          'credit_hh_studentloan', 'credit_hh_studentloan_60dpd', 'credit_hh_totalallcredit_bankruptcy',
                          'credit_hh_totalallcredit_collections', 'credit_hh_totalallcredit_severederog']

    age_accounts = ['credit_minmob_1stmtgcredit', 'credit_minmob_agencyfirstmtg', 'credit_minmob_mtgcredit']

    credit_bal_clusters = credit_bal_general + credit_bal_dpd + credit_bal_overage + credit_bal_new
    credit_num_clusters = credit_num_new + credit_num_general + credit_num_dpd + credit_num_danger

    data['average_minmob_percent_of_age'] = round(data[age_accounts].mean(axis = 1) / data['est_age'], 2)
    data.loc[data['average_minmob_percent_of_age'] > 1, 'average_minmob_percent_of_age'] = 1

    data['average_percent_balance_total'] = round(data[percent_balance].sum(axis = 1) / len(percent_balance), 1)

    data['credit_num_new'] = data[credit_num_new].sum(axis = 1)
    data['credit_num_general'] = data[credit_num_general].sum(axis = 1)
    data['credit_num_dpd'] = data[credit_num_dpd].sum(axis = 1)
    data['credit_num_overage'] = data[credit_num_danger].sum(axis = 1)

    data['credit_hh_new_average'] = data[credit_hh_new].mean(axis = 1)
    data['credit_hh_dpd_average'] = data[credit_hh_dpd].mean(axis = 1)

    # Weighted mix of the household-level bankruptcy / collections / derogatory / past-due
    # columns, divided by 100.
    data['credit_weighted_pct_dpd_overdue'] = (0.3 * data['credit_hh_totalallcredit_bankruptcy'] +
                                               0.25 * data['credit_hh_totalallcredit_collections'] +
                                               0.25 * data['credit_hh_totalallcredit_severederog'] +
                                               0.2 * (data[credit_hh_dpd].sum(axis = 1))) / 100

    credit_bal_num = ['credit_num_new', 'credit_num_general', 'credit_num_dpd', 'credit_num_overage']
    credit_total = data[credit_bal_clusters].sum(axis = 1)
    credit_num_total = data[credit_bal_num].sum(axis = 1)

    # Rows whose balance total or account total is 0 produce NaN/inf here and are not filled in
    # by this function.
    data['credit_financial_index'] = 0.075 * data[credit_bal_general].sum(axis = 1) / credit_total +\
                                       0.075 * data['credit_num_general'] / credit_num_total +\
                                       0.25 * data[credit_bal_dpd].sum(axis = 1) / credit_total +\
                                       0.05 * data['credit_num_dpd'] / credit_num_total +\
                                       0.30 * data[credit_bal_overage].sum(axis = 1) / credit_total +\
                                       0.10 * data['credit_num_overage'] / credit_num_total +\
                                       0.075 * data[credit_bal_new].sum(axis = 1) / credit_total +\
                                       0.075 * data['credit_num_new'] / credit_num_total
    data['credit_financial_index'] = (data['credit_financial_index'] - data['credit_financial_index'].min()) /\
                                      (data['credit_financial_index'].max() - data['credit_financial_index'].min())

    data['credit_total_new'] = data[credit_bal_new].sum(axis = 1)
    data['credit_dpd_exposure'] = data['credit_num_dpd'] * data[credit_bal_dpd].sum(axis = 1)
    data['credit_overage_exposure'] = data['credit_num_overage'] * data[credit_bal_overage].sum(axis = 1)

    log_credit_total = np.log(credit_total + 1)
    data['default_risk'] = 0.2 * data['credit_dpd_exposure'] / log_credit_total +\
                            0.75 * data['credit_overage_exposure'] / log_credit_total +\
                            0.05 * data['credit_total_new'] / log_credit_total
    data.loc[data['default_risk'] > 4, 'default_risk'] = 4
    data['default_risk'] = (data['default_risk'] - data['default_risk'].min()) / (data['default_risk'].max() - data['default_risk'].min())
    # Assign the filled column back; fillna(inplace=True) on the selected column is chained
    # assignment and does not update the frame under Copy-on-Write.
    data['default_risk'] = data['default_risk'].fillna(0)

    # Second clustering axis shared by all three groups; dropped again at the end.
    data['default_risk_std'] = Standardize(data['default_risk'])

    #Credit Balance Clustering Engineering
    data = _rankPcaCluster(data, credit_bal_clusters, 'creditbal', 'Credit_Bal',
                           n_components = 12, n_clusters = 30, random_state = random_state)

    #Credit Number Clustering Engineering
    data = _rankPcaCluster(data, credit_num_clusters, 'creditnum', 'Credit_Num',
                           n_components = 5, n_clusters = 25, random_state = random_state)

    #Credit HH % Clustering Engineering
    data = _rankPcaCluster(data, credit_hh_clusters, 'credithh', 'Credit_HH',
                           n_components = 7, n_clusters = 25, random_state = random_state)

    credit = age_accounts + percent_balance + credit_bal_clusters +\
    credit_num_clusters + ['default_risk_std'] + credit_hh_clusters

    data.drop(columns = credit, inplace = True)

    return data

In [11]:
def FillNaN(data):
    '''
    Replaces every remaining missing value in the frame with 0.

    Takes one input:
    data: a pandas dataframe (it is modified in place)

    Returns the same dataframe object with all NaN values set to 0
    '''
    data.fillna(value = 0, inplace = True)
    return data

In [12]:
def CMSEngineering(data):
    '''
    Adds two features derived from the CMS risk scores.

    Takes one input:
    data: a pandas data frame (it is modified in place). Must contain 'cms_ma_risk_score_nbr',
          'cms_rx_risk_score_nbr', 'cms_risk_adjustment_factor_a_amt' and 'hcc_weighted_sum'.

    Returns the same dataframe with:
      * 'cms_risk_ma_nbr_rx_combined' = (ma * rx / 2) * (ma + rx)
      * 'reverse_raf' = cms_risk_adjustment_factor_a_amt / (1 + hcc_weighted_sum)
    '''
    ma_score = data['cms_ma_risk_score_nbr']
    rx_score = data['cms_rx_risk_score_nbr']

    # NOTE(review): operators apply left to right, so the sum MULTIPLIES the half-product.
    # If ma * rx / (2 * (ma + rx)) was meant, parentheses are missing -- confirm intent.
    data['cms_risk_ma_nbr_rx_combined'] = ma_score * rx_score / 2 * (ma_score + rx_score)

    hcc_plus_one = 1 + data['hcc_weighted_sum']
    data['reverse_raf'] = data['cms_risk_adjustment_factor_a_amt'] / hcc_plus_one

    return data

In [13]:
def healthFactorsEngineering(data, random_state = None):
    '''
    Creates new features based on some of the health scoring metrics and drops the raw CMS amount
    columns used.

    Takes two inputs:
    data: a pandas data frame. Must contain 'sex_cd', 'est_age', 'cci_score', 'dcsi_score',
          'fci_score', 'hcc_weighted_sum' and the five CMS amount columns listed below.
          Columns are added to it in place until the PCA step, which returns a new frame.
    random_state (default = None): seed forwarded to PCA and KMeans; None keeps scikit-learn's
          unseeded behaviour, an int makes components and cluster labels repeatable.

    Returns a pandas dataframe with:
      * 'life_remain': computed from sex_cd, est_age and the RAW cci_score
      * 'cci_score', 'dcsi_score', 'fci_score', 'hcc_weighted_sum' min-max scaled to 0-1
      * 'weighted_three_scores', 'weighted_full_health': weighted mixes of the scaled scores
      * 'PCA1_cms', 'PCA2_cms': principal components of the five CMS amount columns
      * '<column>_rank' percentile ranks (1-100), 'CMS_Score', 'CMS_AVG_Score'
      * 'CMS_Score_clusters': KMeans label (20 clusters) on CMS_Score and the standardized
        weighted_full_health
    The five CMS amount columns are removed.
    '''
    cms_numeric_data = ['cms_partd_ra_factor_amt',
                        'cms_risk_adj_payment_rate_a_amt',
                        'cms_risk_adj_payment_rate_b_amt',
                        'cms_risk_adjustment_factor_a_amt',
                        'cms_tot_ma_payment_amt']

    # Must run before the scaling below: the formula reads cci_score on its original scale.
    data['life_remain'] = (67.98999 + data['sex_cd'] * 5 + (data['est_age'] * -0.67718)) * ((-6.51961 * data['cci_score'] + 104.04545) / 100)

    # A constant column gives 0/0 here and ends up all-NaN.
    for score_column in ('cci_score', 'dcsi_score', 'fci_score', 'hcc_weighted_sum'):
        lowest = data[score_column].min()
        highest = data[score_column].max()
        data[score_column] = (data[score_column] - lowest) / (highest - lowest)

    data['weighted_three_scores'] = data['fci_score'] * .40 + data['cci_score'] * .40  + data['dcsi_score'] * .20
    data['weighted_full_health']= data['fci_score'] * .25 + data['cci_score'] * .25  + data['dcsi_score'] * .25 + data['hcc_weighted_sum'] * 0.25

    pca = PCA(n_components = 2, random_state = random_state)
    pc = pca.fit_transform(data[cms_numeric_data])

    # Reuse data's index: concat aligns on index labels, so a fresh 0..n-1 index would scatter the
    # components into extra NaN rows whenever data is not indexed 0..n-1.
    pca_data = pd.DataFrame(data = pc, columns = ['PCA1_cms', 'PCA2_cms'], index = data.index)
    data = pd.concat([data, pca_data], axis = 1)

    # rank(method='first') breaks ties by position so qcut always gets 100 distinct bin edges.
    labels = range(1, 101)
    cms = []
    for columns in cms_numeric_data:
        name = columns + "_rank"
        data[name] = pd.qcut(data[columns].rank(method = 'first'), q = 100, labels = labels)
        data[name] = data[name].astype(int)
        cms.append(name)

    data['CMS_Score'] = data[cms].sum(axis=1)
    data['CMS_AVG_Score'] = data[cms].mean(axis=1)
    data['CMS_Score'] = Standardize(data['CMS_Score'])
    data['CMS_AVG_Score'] = Standardize(data['CMS_AVG_Score'])
    data['weighted_full_health_std'] = Standardize(data['weighted_full_health'])

    model_cms = cluster.KMeans(n_clusters = 20, random_state = random_state)
    data['CMS_Score_clusters'] = model_cms.fit_predict(data[['CMS_Score', 'weighted_full_health_std']])

    # The standardized helper column is only needed for clustering.
    data.drop(columns = cms_numeric_data + ['weighted_full_health_std'], inplace = True)

    return data

In [14]:
def DummyVariables(data, skip):
    '''
    One-hot encodes the categorical columns of the frame with pd.get_dummies.

    Takes two arguments:

    data: A pandas dataframe (with categorical variables)
    skip: when truthy, 'person_id_syn' is first moved into the index so that it is not
          dummy-encoded; when falsy the frame is encoded as it is

    Returns a new pandas data frame with each categorical column replaced by its dummy columns
    '''
    frame_to_encode = data.set_index('person_id_syn') if skip else data
    return pd.get_dummies(frame_to_encode)

In [15]:
def MobilityandStressIndex(data):
    '''
    Adds two weighted index scores, 'MobilityIndex' and 'StressIndex', built from financial, health
    and demographic columns. Must be run after the dummy variables are created, because it reads
    the 'cons_hhcomp_*' dummy columns.

    Takes one input:
    data: a pandas data frame (it is modified in place)

    Returns the same dataframe where:
      * 'MobilityIndex' and 'StressIndex' are weighted sums rescaled to the 0-1 range (min-max),
      * 'est_age' is replaced by its natural logarithm (after the indexes are computed),
      * remaining NaN values are filled with 0,
      * the four '*_Unknown' dummy columns are removed.
    '''

    def _addInOrder(terms):
        # Left-to-right accumulation, one term at a time.
        running = terms[0]
        for term in terms[1:]:
            running = running + term
        return running

    def _minMax(values):
        return (values - values.min()) / (values.max() - values.min())

    unknown_dummies = ['cms_ra_factor_type_cd_Unknown', 'cons_cmys_Unknown',
                       'cons_hhcomp_Unknown', 'cons_homstat_Unknown']

    mobility_terms = [
        0.2021 * data['cms_disabled_ind'],
        0.1295 * data['est_age'] / data['est_age'].max(),
        0.1062 * data['life_remain'] / data['life_remain'].max(),
        0.0586 * data['credit_financial_index'],
        0.0725 * data['cons_n65p_y'],
        0.0572 * data['cms_low_income_ind'],
        0.034 * data['cons_n2pmv'],
        0.151 * data['weighted_full_health'],
        data['hlth_pgm_slvrsnkr_par_status'] * -0.03,
        0.1207 * (9 - data['rucc_category']),
        0.1364 * data['cons_hhcomp_Min Two People, Children'],
        0.00521 * data['cons_hhcomp_Min Two People, No Children'],
        0.0682 * data['cons_hhcomp_One Person, Children'],
        0.02728 * data['cons_hhcomp_One Person, No Children'],
        0.0492 * data['cons_hhcomp_Unknown'],
    ]
    data['MobilityIndex'] = _minMax(_addInOrder(mobility_terms))

    stress_terms = [
        0.027 * data['cons_veteran_y'],
        0.1517 * data['credit_financial_index'],
        0.0385 * data['cons_n65p_y'],
        0.1326 * data['cms_low_income_ind'],
        0.2243 * data['default_risk'],
        0.0293 * data['smoker_current_ind'],
        0.0783 * data['submcc_men_alco_ind'],
        0.0921 * data['submcc_men_depr_ind'],
        0.0894 * data['submcc_men_abus_ind'],
        0.0868 * data['cons_hhcomp_Min Two People, Children'],
        0.05872 * data['cons_hhcomp_Min Two People, No Children'],
        0.1468 * data['cons_hhcomp_One Person, Children'],
        0.02936 * data['cons_hhcomp_One Person, No Children'],
        0.08808 * data['cons_hhcomp_Unknown'],
    ]
    data['StressIndex'] = _minMax(_addInOrder(stress_terms))

    # The log transform comes last so both indexes use the untransformed age.
    data['est_age'] = np.log(data['est_age'])
    data.fillna(0, inplace = True)

    for unknown_column in unknown_dummies:
        data.drop(columns = unknown_column, inplace = True)
    return data

In [16]:
def hospitalEngineering(data):
    '''
    Condenses the 'total_*' hospital utilisation columns.

    Removes the four LTACH / maternity admit columns, then adds two columns:

    1. total_emer_visits: ambulance visits + ER visits
    2. total_admit_days: acute + MHSA + rehab + SNF admit days

    The four admit-day columns are dropped after being summed. Both totals use plain
    addition, so a missing value in any part makes the total missing for that row.

    One input: DataFrame (it is modified in place and returned)
    '''
    unused_columns = ['total_ip_ltach_admit_ct_pmpm', 'total_ip_ltach_admit_days_pmpm',
                      'total_ip_maternity_admit_ct_pmpm', 'total_ip_maternity_admit_days_pmpm']
    admit_day_columns = ['total_ip_acute_admit_days_pmpm', 'total_ip_mhsa_admit_days_pmpm',
                         'total_ip_rehab_admit_days_pmpm', 'total_ip_snf_admit_days_pmpm']

    data.drop(unused_columns, axis = 1, inplace = True)

    ambulance_visits = data['total_ambulance_visit_ct_pmpm']
    er_visits = data['total_er_visit_ct_pmpm']
    data['total_emer_visits'] = ambulance_visits + er_visits

    admit_days = data[admit_day_columns[0]]
    for admit_column in admit_day_columns[1:]:
        admit_days = admit_days + data[admit_column]
    data['total_admit_days'] = admit_days

    data.drop(columns = admit_day_columns, inplace = True)
    return data

In [17]:
def RxEngineering(data, random_state = 42):
    '''
    Engineers summary features from the "rx" pmpm count columns.

    The rx columns are the columns whose name contains both "rx" and "pmpm_ct".
    From them this function adds:

    * PCA1_rx ... PCA5_rx: the first five principal components of the rx columns
    * <column>_rank for every rx column: percentile bucket 1-100 of the column
      (ties are broken by row order)
    * RX_Score / RX_AVG_Score: standardized row sum / row mean of the rank columns
    * RX_Score_clusters: K-Means (25 clusters) label fitted on RX_Score and the
      standardized age

    The raw rx columns are dropped before returning.

    Takes two inputs:
    data: a pandas data frame holding the rx pmpm_ct columns and either
          'est_age_std' or 'est_age'
    random_state: seed passed to PCA and K-Means so repeated runs give the same
                  components and cluster labels (pass None for an unseeded run)

    Returns a new pandas dataframe; the frame passed in is not modified.
    '''
    # Index.intersection is the explicit set operation; "&" between two Index
    # objects is deprecated as a set operation in pandas.
    rx_columns = data.filter(like="rx", axis=1).columns.intersection(
        data.filter(like="pmpm_ct", axis=1).columns).tolist()

    pca = PCA(n_components = 5, random_state = random_state)
    pc = pca.fit_transform(data[rx_columns])

    # Reuse data's index so the concat lines rows up even when the frame does
    # not carry a default 0..n-1 index.
    pca_data = pd.DataFrame(data = pc, index = data.index,
                            columns = ['PCA1_rx', 'PCA2_rx', 'PCA3_rx',
                                       'PCA4_rx', 'PCA5_rx'])
    data = pd.concat([data, pca_data], axis = 1)

    labels = range(1,101)
    rx_list = []
    for column in rx_columns:
        name = column + "_rank"
        # rank(method='first') makes every value unique so qcut can always
        # form 100 distinct bins.
        ranked = pd.qcut(data[column].rank(method = 'first'), q = 100, labels = labels)
        data[name] = ranked.astype(int)
        rx_list.append(name)

    # NOTE: after standardizing, the sum-based and mean-based scores differ only
    # by floating point noise (mean = sum / number of rank columns); both are
    # kept because both column names are part of this function's output.
    data['RX_Score'] = Standardize(data[rx_list].sum(axis=1))
    data['RX_AVG_Score'] = Standardize(data[rx_list].mean(axis=1))

    # 'est_age_std' is normally left behind by EngineerSUBMCC_PMPMcolumns; fall
    # back to computing it locally so this step does not fail with a KeyError
    # when it is run on its own.
    if 'est_age_std' in data.columns:
        age_std = data['est_age_std']
    else:
        age_std = Standardize(data['est_age'])
    cluster_input = pd.DataFrame({'RX_Score': data['RX_Score'], 'est_age_std': age_std})

    model_rx = cluster.KMeans(n_clusters = 25, random_state = random_state)
    data['RX_Score_clusters'] = model_rx.fit_predict(cluster_input)

    data = data.drop(columns = rx_columns)
    return data

In [18]:
# finalize into a function 
def EngineerSUBMCC_PMPMcolumns(data, random_state = 42):
    """
    Groups the SUBMCC PMPM columns and replaces them with summary features.

    Adds, in this order:

    * count<mcc> for each of the 23 kept MCC suffixes: row sum of every column
      whose name contains both the suffix and "pmpm_ct"
    * count<mcc>_rank: percentile bucket 1-100 of that count (ties are broken
      by row order)
    * PCA1_submcc ... PCA30_submcc: first 30 principal components of the columns
      whose name contains both "submcc" and "pmpm_ct"
    * SUBMCC_Score / SUBMCC_AVG_Score: standardized row sum / row mean of the
      rank columns
    * est_age_std: standardized est_age (left in the frame; RxEngineering and
      Clustering read it)
    * SUBMCC_Score_clusters: K-Means (30 clusters) label fitted on SUBMCC_Score
      and est_age_std

    The raw submcc pmpm_ct columns are dropped before returning.

    Two Inputs:

    data: a pandas dataframe (the count and rank columns are written into this
          frame before a new frame is built, so the caller's frame gains them too)
    random_state: seed passed to PCA and K-Means so repeated runs give the same
                  components and cluster labels (pass None for an unseeded run)

    Returns pandas DataFrame with updated columns to replace for all submcc_pmpm_ct
    """

    # drop _ano, _inf, _pre, _hiv, _trm; keep 23 mcc_pmpm columns
    mcc_list = [ "_ben", "_bld", "_brn", "_cad", "_can", "_cer", "_cir", "_dia",
                "_dig", "_end", "_gus", "_hdz", "_inj","_men", "_mus", "_ner","_rar", "_res", "_rsk", "_skn", "_sns", "_sor", "_vco"]

    # None of the columns added below contain "pmpm_ct", so this lookup can be
    # done once instead of on every loop pass.
    pmpm_ct_columns = data.filter(like="pmpm_ct", axis=1).columns

    labels = range(1,101)
    submcc_list = []
    for mcc in mcc_list:
        count = 'count' + mcc
        # Index.intersection is the explicit set operation; "&" between two
        # Index objects is deprecated as a set operation in pandas.
        # NOTE(review): the suffix match is not restricted to submcc columns -
        # any pmpm_ct column containing the suffix is summed; confirm intended.
        mcc_columns = data.filter(like=mcc, axis=1).columns.intersection(pmpm_ct_columns)
        data[count] = data[mcc_columns].sum(axis=1)
        name = count + "_rank"
        ranked = pd.qcut(data[count].rank(method = 'first'), q = 100, labels = labels)
        data[name] = ranked.astype(int)
        submcc_list.append(name)

    columns = data.filter(like="submcc", axis=1).columns.intersection(pmpm_ct_columns).tolist()

    n_components = 30
    pca = PCA(n_components = n_components, random_state = random_state)
    pc = pca.fit_transform(data[columns])

    # Reuse data's index so the concat lines rows up even when the frame does
    # not carry a default 0..n-1 index.
    pca_names = ['PCA{}_submcc'.format(i) for i in range(1, n_components + 1)]
    pca_data = pd.DataFrame(data = pc, index = data.index, columns = pca_names)
    data = pd.concat([data, pca_data], axis = 1)

    data['SUBMCC_Score'] = Standardize(data[submcc_list].sum(axis=1))
    data['SUBMCC_AVG_Score'] = Standardize(data[submcc_list].mean(axis=1))
    data['est_age_std'] = Standardize(data['est_age'])
    model_submcc = cluster.KMeans(n_clusters = 30, random_state = random_state)
    data['SUBMCC_Score_clusters'] = model_submcc.fit_predict(data[['SUBMCC_Score', 'est_age_std']])

    data = data.drop(columns = columns)

    return data

In [19]:
def NormalizeMaxMin(data, columnname):
    """
    Min-max rescales one column of ``data`` in place.

    Two inputs:

    data: a pandas data frame
    columnname: label of the column to rescale

    The column is overwritten with (value - min) / (max - min).

    Returns the rescaled column as read back from ``data``.
    """
    column = data[columnname]
    lowest = column.min()
    value_range = column.max() - lowest

    data[columnname] = (column - lowest) / value_range

    return data[columnname]

In [20]:
def Standardize(data):
    """
    Centers ``data`` on its mean and divides by its standard deviation.

    One input:

    data: any object offering .mean() and .std() plus elementwise arithmetic
          (called with pandas Series in this notebook)

    Returns the standardized values; the input object is not modified.
    """
    center = data.mean()
    spread = data.std()
    return (data - center) / spread

In [21]:
def Clustering(data, random_state = 42):
    """
    Adds fifteen K-Means cluster-label columns built from the engineered scores.

    Standardized ("_std") copies of eight index columns are written into the
    frame first, used as clustering inputs, and dropped again before returning.
    Each label column comes from its own K-Means fit on the feature subset listed
    in ``cluster_specs`` below.

    Two inputs:

    data: a pandas data frame that already holds the score and index columns
          referenced below (modified in place)
    random_state: seed passed to every K-Means fit so the labels are the same on
                  every run (pass None for an unseeded run)

    Returns the same DataFrame object with the *_clusters columns added.

    NOTE(review): the labels are fitted on whatever frame is passed in, so label
    ids from a separate fit on another data set are not guaranteed to refer to
    the same groups - confirm how these are matched against the training data.
    """
    std_sources = ['MobilityIndex', 'StressIndex', 'default_risk', 'est_age',
                   'credit_financial_index', 'life_remain', 'reverse_raf',
                   'weighted_full_health']
    for source in std_sources:
        data[source + '_std'] = Standardize(data[source])

    credit_scores = ['Credit_Bal_Score', 'Credit_Num_Score', 'Credit_HH_Score']
    credit_avg_scores = ['Credit_Bal_AVG_Score', 'Credit_Num_AVG_Score', 'Credit_HH_AVG_Score']
    credit_risk_std = ['credit_financial_index_std', 'default_risk_std', 'est_age_std']

    # (output column, number of clusters, input features) - one K-Means fit per
    # row, run top to bottom.
    cluster_specs = [
        ('Credit_All_clusters', 4, credit_scores),
        ('Credit_All_AVG_clusters', 4, credit_avg_scores),
        ('Credit_Stress_clusters', 3,
         credit_scores + ['est_age_std', 'StressIndex_std', 'life_remain_std', 'reverse_raf_std']),
        ('Credit_RAFRisk_clusters', 3,
         credit_avg_scores + ['life_remain_std', 'reverse_raf_std']),
        ('Credit_Composite_clusters', 3,
         credit_scores + ['default_risk_std', 'credit_financial_index_std', 'StressIndex_std']),
        ('Mobile_Health_clusters', 3,
         ['SUBMCC_Score', 'RX_Score', 'MobilityIndex_std', 'weighted_full_health_std', 'CMS_Score']),
        ('Health_Risk_clusters', 4,
         ['weighted_full_health_std', 'life_remain_std', 'MobilityIndex_std',
          'est_age_std', 'reverse_raf_std']),
        ('Credit_Health_Stress_clusters', 3,
         ['Betos_Score'] + credit_avg_scores + ['StressIndex_std']),
        ('General_Mobility_clusters', 3,
         ['Betos_Score', 'est_age_std', 'CMS_Score', 'MobilityIndex_std']),
        ('Health_Credit_Assesment_clusters', 3,
         ['Betos_Score', 'Credit_Bal_AVG_Score', 'Credit_Num_AVG_Score',
          'est_age_std', 'weighted_full_health_std', 'Credit_HH_AVG_Score',
          'MobilityIndex_std']),
        ('General_Health_clusters', 3,
         ['life_remain_std', 'est_age_std', 'weighted_full_health_std']),
        ('General_HealthMobile_clusters', 3,
         ['life_remain_std', 'MobilityIndex_std', 'est_age_std', 'weighted_full_health_std']),
        ('Health_Scores_clusters', 3,
         ['SUBMCC_Score', 'Betos_Score', 'CMS_Score', 'RX_Score', 'MobilityIndex_std']),
        ('Full_Credit_clusters', 3, credit_scores + credit_risk_std),
        ('Full_AVG_Credit_clusters', 3, credit_avg_scores + credit_risk_std),
    ]

    for cluster_column, n_clusters, features in cluster_specs:
        # A fresh estimator per fit: fit_predict refits from scratch anyway, and
        # this keeps the cluster count next to the column it produces.
        model = cluster.KMeans(n_clusters = n_clusters, random_state = random_state)
        data[cluster_column] = model.fit_predict(data[features])

    # The standardized helpers were only needed as clustering inputs.
    data.drop(columns = [source + '_std' for source in std_sources], inplace = True)

    return data

In [22]:
# Feature-engineering pipeline: every step takes the frame produced by the
# previous one, so the order below is significant (for example RxEngineering
# reads 'est_age_std', which EngineerSUBMCC_PMPMcolumns creates).
health = (
    health
    .pipe(mapStringVariables)
    .pipe(dropMajorityNAcolumns)
    .pipe(FillNaN)
    .pipe(columnBinning)
    .pipe(columnsToDrop)
    .pipe(LabelEncoding)
    .pipe(betosEngineering)
    .pipe(medEngineering)
    .pipe(creditDataEngineering)
    .pipe(CMSEngineering)
    .pipe(healthFactorsEngineering)
    .pipe(hospitalEngineering)
    .pipe(EngineerSUBMCC_PMPMcolumns)
    .pipe(RxEngineering)
    .pipe(DummyVariables, True)
    .pipe(MobilityandStressIndex)
    .pipe(Clustering)
)

Dropped:  hedis_ami
Dropped:  hedis_cmc_ldc_c_control
Dropped:  hedis_cmc_ldc_c_screen


In [23]:
from sklearn.ensemble import IsolationForest
#health = health.drop('transportation_issues', axis = 1)

# Isolation-forest anomaly features computed over every engineered column.
# max_features may not exceed the number of columns in the frame, so the
# hard-coded 765 is capped at the actual width instead of raising a ValueError
# whenever the engineered frame ends up narrower.
# random_state pins the forest so anomaly_decision / anomaly_score are the same
# on every run.
model = IsolationForest(n_jobs = -1, n_estimators = 1000,
                        max_features = min(765, health.shape[1]),
                        random_state = 42)
model.fit(health)

# Score before either result column is written, so both calls see exactly the
# columns the model was fitted on.
anomaly_decision = model.predict(health)
anomaly_score = model.decision_function(health)
health['anomaly_decision'] = anomaly_decision
health['anomaly_score'] = anomaly_score

In [24]:
# Pairwise interaction features. Each new column is named after the expression
# that produces it; all but the last are computed from the raw columns in
# ``original``.
# NOTE(review): these assignments align on the row index, and ``original`` was
# re-indexed by person_id_syn in cell In [2] - confirm ``health`` carries the
# same index at this point, otherwise the columns come out as all-NaN.
health = health.assign(**{
    'betos_m5c_pmpm_ct + betos_o1g_pmpm_ct':
        original['betos_m5c_pmpm_ct'] + original['betos_o1g_pmpm_ct'],
    'betos_o1a_pmpm_ct * cons_n2pmv':
        original['betos_o1a_pmpm_ct'] * original['cons_n2pmv'],
    'total_ambulance_visit_ct_pmpm + total_er_visit_ct_pmpm':
        original['total_ambulance_visit_ct_pmpm'] + original['total_er_visit_ct_pmpm'],
    'cms_partd_ra_factor_amt * cms_tot_partd_payment_amt':
        original['cms_partd_ra_factor_amt'] * original['cms_tot_partd_payment_amt'],
    'cms_ma_risk_score_nbr + cms_tot_partd_payment_amt':
        original['cms_ma_risk_score_nbr'] + original['cms_tot_partd_payment_amt'],
    'credit_hh_autofinance * credit_num_totalallcredit_collections':
        original['credit_hh_autofinance'] * original['credit_num_totalallcredit_collections'],
    'credit_hh_bankcard_severederog + credit_num_autobank':
        original['credit_hh_bankcard_severederog'] + original['credit_num_autobank'],
    'cms_tot_ma_payment_amt * cons_n2pmv':
        original['cms_tot_ma_payment_amt'] * original['cons_n2pmv'],
    'credit_bal_consumerfinance_new * credit_hh_autobank_new':
        original['credit_bal_consumerfinance_new'] * original['credit_hh_autobank_new'],
    # The only interaction built from engineered (not raw) columns.
    'fci_score + weighted_full_health':
        health['fci_score'] + health['weighted_full_health'],
})

In [25]:
# Copy a handful of raw pmpm count columns from ``original`` back into the
# engineered frame (RxEngineering and EngineerSUBMCC_PMPMcolumns drop the raw
# rx / submcc pmpm_ct columns).
# NOTE(review): the copy aligns on the row index - confirm ``health`` and
# ``original`` share one, otherwise these columns come out as all-NaN.
health = health.assign(**{
    raw_column: original[raw_column]
    for raw_column in [
        'rx_overall_pmpm_ct',
        'rx_mail_pmpm_ct',
        'submcc_men_depr_pmpm_ct',
        'rx_gpi2_17_pmpm_ct',
        'submcc_ben_othr_pmpm_ct',
        'rx_gpi2_39_pmpm_ct',
        'submcc_sor_eye_pmpm_ct',
        'rx_branded_pmpm_ct',
        'submcc_men_abus_pmpm_ct',
        'rx_gpi2_43_pmpm_ct',
        'rx_gpi2_49_pmpm_ct',
        'submcc_vco_vac_pmpm_ct',
        'rx_gpi2_75_pmpm_ct',
        'submcc_rsk_smok_pmpm_ct',
    ]
})

In [26]:
# Columns kept for the final holdout file, in output order. Selecting with this
# list raises a KeyError if any of them is missing from the engineered frame.
health_features = ['Credit_Stress_clusters','cms_risk_ma_nbr_rx_combined','med_ambulance_visit_ct_pmpm',
'est_age','cms_low_income_ind','cons_n2pmv','count_vco','cms_disabled_ind','submcc_ner_othr_ind',
'credit_hh_dpd_average', 'cms_risk_adjustment_factor_a_amt_rank','betos_o1a_ind','StressIndex',
'credit_num_totalallcredit_collections','submcc_ben_othr_ind','betos_m5d_pmpm_ct_rank','cons_n2mob',
'PCA18_submcc','betos_y2_ind','betos_m5b_pmpm_ct_rank', 'reverse_raf', 'hedis_dia_hba1c_test', 'count_bld_rank',
'cons_homstat_Renter', 'anomaly_score', 'mabh_seg_H2', 'betos_m5c_pmpm_ct + betos_o1g_pmpm_ct', 
'total_physician_office_visit_ct_pmpm', 'rx_gpi2_90_pmpm_ct_rank', 'PCA4_creditbal', 'count_mus',
'rx_gpi2_17_pmpm_ct_rank', 'betos_o1a_pmpm_ct * cons_n2pmv', 'total_ambulance_visit_ct_pmpm + total_er_visit_ct_pmpm',
'cms_partd_ra_factor_amt * cms_tot_partd_payment_amt', 'total_ambulance_visit_ct_pmpm', 'weighted_full_health',
'med_physician_office_visit_ct_pmpm', 'cons_retail_buyer', 'cons_hhcomp_Min Two People, No Children',
'count_ner_rank', 'total_emer_visits', 'credit_hh_nonmtgcredit_60dpd_rank', 'count_can', 'PCA3_rx', 
'cons_hhcomp_One Person, Children', 'med_outpatient_visit_ct_pmpm', 'cms_ma_risk_score_nbr + cms_tot_partd_payment_amt',
'fci_score', 'cci_score', 'hcc_weighted_sum','rx_overall_pmpm_ct','rx_mail_pmpm_ct','submcc_men_depr_pmpm_ct',
'rx_gpi2_17_pmpm_ct','submcc_ben_othr_pmpm_ct','rx_gpi2_39_pmpm_ct','submcc_sor_eye_pmpm_ct',
'rx_branded_pmpm_ct','submcc_men_abus_pmpm_ct','rx_gpi2_43_pmpm_ct','rx_gpi2_49_pmpm_ct',
'submcc_vco_vac_pmpm_ct','rx_gpi2_75_pmpm_ct','submcc_rsk_smok_pmpm_ct', 'credit_hh_totalallcredit_severederog_rank',
'ccsp_239_ind', 'anomaly_decision','credit_hh_autofinance * credit_num_totalallcredit_collections',
'credit_hh_bankcard_severederog + credit_num_autobank','cms_tot_ma_payment_amt * cons_n2pmv',
'credit_bal_consumerfinance_new * credit_hh_autobank_new', 'fci_score + weighted_full_health']

# .copy() gives ``health`` its own data: the next cell fills missing values in
# place, which on a bare column selection is flagged by pandas as a write to a
# copy of a slice (SettingWithCopyWarning).
health = health[health_features].copy()

In [27]:
# Any value still missing after feature selection is written out as 0.
health = health.fillna(0, axis = 1)
#health['transportation_issues'] = original['transportation_issues']
# The frame's row index is written as the first CSV column.
health.to_csv('data/HOLDOUT_FINAL.csv')