In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import cross_val_score
matplotlib.rcParams.update({'font.size': 16})

# Data Preprocessing

In [26]:
# Columns that preprocess() casts to float64 via convert_obj_cols().
OBJ_COLS = ['IAQ_WG','IAGEGRP2','I_RACETH','NUMBABES','NRWGT','TRMADJMW','FMOTHWGT','FACTOR','FWGT','GAGEFLAG','RACETFLG','PAYFLAG','VSTSFLAG','EDUCFLAG','EMPLFLAG','EDUC2','COKES','COKEQ','COKEN','AMPHS','AMPHQ','AMPHN','METHQ','METHN','THCS','THCQ','THCN','OPS','CODQ','CODN','MORPHQ','MORPHN','THCTEST','COKETEST','AMPHTEST','OPTEST','ANYTEST','THCRT','COKERT','TADJ1','TADJ2','TADJ3','TADJ4','TADJ5','TADJ6','TADJ7','TADJ8','TADJ9','TADJ10','TADJ11','TADJ12','TADJ13','TADJ14','TADJ15','TADJ16','TADJ17','TADJ18','INCFLAG','PSU_VAR']
# Raw survey columns of interest.
# The fifth previous-birth group now reads IQ27LB05 / IQ27OZ05; it used to repeat the
# child-04 weight columns next to IQ2805 / IQ2905.
# NOTE(review): 'IAQ_12B' appears twice in a row; the first occurrence is presumably meant
# to be a different IAQ_12 item -- confirm against the codebook before changing it.
COLS = ['CASEID','AGECALC','IAQ_17FT','IAQ_17IN','IAQ_15','RACEUP','IAQ_35','IAQ_37','IAQ_41','MARITAL1','EMPL1','IAQ_42','INCOME','IAQ_43B','IAQ_43C','PAYUP','IAQ_48','IAQ_50A','IAQ_50B','IAQ_50C','IAQ_50D','IAQ_50E','IAQ_51A','IAQ_51B','IAQ_51C','IAQ_51D','IAQ_51E','IAQ_18A','IAQ_18B','IAQ_18C','IAQ_18D','IAQ_23','IAQ_24A','IAQ_24B','IAQ_24C','IAQ_24D','IAQ_24E','IAQ_25','IAQ_26','IAQ_CNTR','IQ27LB01','IQ27OZ01','IQ2801','IQ2901','IQ27LB02','IQ27OZ02','IQ2802','IQ2902','IQ27LB03','IQ27OZ03','IQ2803','IQ2903','IQ27LB04','IQ27OZ04','IQ2804','IQ2904','IQ27LB05','IQ27OZ05','IQ2805','IQ2905','IAQ_16','IAQ_5','IAQ_WG','IAQ_1','NEWLBS','NEWOZS','NEWGRAMS','IAQ_19A','IAQ_19B','IAQ_19C','IAQ_20A','IAQ_21A','IAQ_20B','IAQ_21B','IAQ_20C','IAQ_21C','IAQ_20D','IAQ_21D','IAQ_20E','IAQ_21E','IAQ_20F','IAQ_21F','IAQ_22','IAQ_8','IAQ_9','IAQ_10','IAQ_11','METRO','IAQ_12B','IAQ_12B','IAQ_12C','IAQ_12D','IAQ_12E','IAQ_12F','IAQ_12G','IAQ_13A','IAQ_13B','IAQ_13C','IAQ_14','ALC3TUP','CIG3TUP','COKA3TUP','CRK3TUP','DONE3TUP','HER3TUP','THC3TUP','METH3TUP','ANY3TUP','ALCPREG','CIGPREG','COKAPREG','CRKPREG','DONEPREG','HERPREG','THCPREG','METHPREG','ALLPREG','ALC3M','CIG3M','COKA3M','CRK3M','DONE3M','HER3M','THC3M','METH3M','ANY3M','MULPREG2']
# Yes/no columns that preprocess() recodes with shift_bool() (2 -> 0, codes above 1 -> -1).
BOOL_COLS = ['IAQ_50C', 'IAQ_51C', 'IAQ_43B','IAQ_43C','IAQ_18A','IAQ_18B','IAQ_18C','IAQ_18D','IQ29_LAST','IAQ_1CALC','IAQ_19A','IAQ_19B','IAQ_19C','IAQ_STDCALC','IAQ_STDAB','METRO','IAQ_12G','IAQ_13A','IAQ_13B','IAQ_13C','IAQ_14','ALC3TUP','CIG3TUP','COKA3TUP','CRK3TUP','DONE3TUP','HER3TUP','THC3TUP','METH3TUP','ANY3TUP','ALCPREG','CIGPREG','COKAPREG','CRKPREG','DONEPREG','HERPREG','THCPREG','METHPREG','ALLPREG','ALC3M','CIG3M','COKA3M','CRK3M','DONE3M','HER3M','THC3M','METH3M','ANY3M', 'PAYUP']
# Wide feature set, including the derived *_LAST / *KG / *CM / *CALC columns.
F_COLS = ['CASEID','AGECALC','IAQ_17CM','IAQ_15KG','RACEUP','IAQ_35','IAQ_41','MARITAL1','EMPL1','IAQ_42','INCOME','IAQ_43B','IAQ_43C','PAYUP','IAQ_18A','IAQ_18B','IAQ_18C','IAQ_18D','IAQ_23','IAQ_24B','IAQ_24C','IAQ_24D','IAQ_24E','IAQ_25','IAQ_26','IAQ_CNTR','IQ27KG_LAST','IQ28_LAST','IQ29_LAST','IAQ_16KG','IAQ_WG','IAQ_1','IAQ_1CALC','NEWKG','IAQ_19A','IAQ_19B','IAQ_19C','IAQ_STDCALC','IAQ_STDAB','IAQ_10','IAQ_11','METRO','IAQ_12G','IAQ_13A','IAQ_13B','IAQ_13C','IAQ_14','ALC3TUP','CIG3TUP','COKA3TUP','CRK3TUP','DONE3TUP','HER3TUP','THC3TUP','METH3TUP','ANY3TUP','ALCPREG','CIGPREG','COKAPREG','CRKPREG','DONEPREG','HERPREG','THCPREG','METHPREG','ALLPREG','ALC3M','CIG3M','COKA3M','CRK3M','DONE3M','HER3M','THC3M','METH3M','ANY3M','MULPREG2']
# Feature set that preprocess() actually returns.
F1_COLS = ['CASEID','AGECALC','IAQ_17CM','IAQ_15KG','RACEUP','IAQ_35','IAQ_41','MARITAL1','EMPL1','IAQ_42','INCOME','IAQ_51C','IAQ_18A','IAQ_18B','IAQ_18C','IAQ_18D','IAQ_23','IAQ_25','IAQ_26','IQ27KG_LAST','IQ29_LAST','IAQ_1CALC','IAQ_19A','IAQ_19B','IAQ_19C','IAQ_STDCALC','IAQ_STDAB','IAQ_10','IAQ_11','METRO','IAQ_12G','ALLPREG','ANY3M']
# Subset of F1_COLS referenced only by the (currently disabled) string cast in preprocess().
F1_TO_STRING = ['RACEUP','MARITAL1','EMPL1','INCOME','IAQ_51C','IAQ_18A','IAQ_18B','IAQ_18C','IAQ_18D','IAQ_25','IQ29_LAST','IAQ_1CALC','IAQ_19A','IAQ_19B','IAQ_19C','IAQ_STDCALC','IAQ_STDAB','IAQ_11']

In [None]:
def convert_obj_cols(data, obj_cols, to):
    '''
    Cast the columns listed in `obj_cols` to dtype `to`.

    The cast columns are written back onto `data` itself (the frame is
    modified in place) and the same frame is returned for chaining.
    '''
    converted = data[obj_cols].astype(to)
    data[obj_cols] = converted
    return data

In [55]:
def remove_not_interviewd(data):
    '''
    Return only the rows whose IAQ_1 value is not 96.

    The original index labels are kept (no re-indexing).
    '''
    keep = data['IAQ_1'].ne(96)
    return data.loc[keep]

In [56]:
def remove_not_questioned(data):
    '''
    Return only the rows whose RACEUP value is not 6.

    The original index labels are kept (no re-indexing).
    '''
    keep = data['RACEUP'].ne(6)
    return data.loc[keep]

In [57]:
def add_last_birth_data(data):
    '''
    Add columns describing the most recent previous birth recorded for each row.

    New columns, initialised to the N/A codes 97 / 97 / 97 / 7:

    IQ27LB_LAST   Baby weight (lbs), last child
    IQ27OZ_LAST   Baby weight (oz), last child
    IQ28_LAST     Mother's age at last baby born
    IQ29_LAST     Was last baby delivered more than 2 weeks early

    For every row the numbered child slots (IQ27LBkk, IQ27OZkk, IQ28kk, IQ29kk for
    kk = 10 .. 01) are scanned from the highest slot down. The first slot whose
    IQ28kk and IQ27LBkk are both outside the invalid codes 96-99 is copied into
    the *_LAST columns. Rows with no valid slot keep the N/A codes.

    `data` is modified in place and returned.
    '''

    data['IQ27LB_LAST'] = 97
    data['IQ27OZ_LAST'] = 97
    data['IQ28_LAST'] = 97
    data['IQ29_LAST'] = 7

    # Assign the previously-last-born baby data to the newly defined columns
    invalid = [96, 97, 98, 99]
    for i in data.index:
        for k in reversed(range(1, 11)):
            slot = '{:02d}'.format(k)
            if (data.at[i, 'IQ28' + slot] in invalid) or (data.at[i, 'IQ27LB' + slot] in invalid):
                continue
            data.at[i, 'IQ27LB_LAST'] = data.at[i, 'IQ27LB' + slot]
            data.at[i, 'IQ27OZ_LAST'] = data.at[i, 'IQ27OZ' + slot]
            data.at[i, 'IQ28_LAST'] = data.at[i, 'IQ28' + slot]
            data.at[i, 'IQ29_LAST'] = data.at[i, 'IQ29' + slot]
            # Stop at the highest valid slot; without this, lower-numbered (earlier)
            # children overwrite the result and the FIRST valid child wins.
            break

    return data

In [58]:
def add_early_birth_flag(data):
    '''
    Derive the IAQ_1CALC flag from IAQ_1.

    IAQ_1CALC is 1 where IAQ_1 <= -14 and 2 otherwise, except that the codes
    96, 97, 98 and 99 in IAQ_1 are carried over unchanged.
    (Presumably IAQ_1 is the delivery date relative to the due date in days,
    making 1 = "two or more weeks early" -- confirm against the codebook.)

    `data` is modified in place and returned.
    '''
    source = data['IAQ_1']
    flag = np.where(source <= -14, 1, 2)
    # Special codes override the early / not-early classification.
    for code in (96, 97, 98, 99):
        flag = np.where(source == code, code, flag)
    data['IAQ_1CALC'] = flag
    return data

In [59]:
def add_std_flag(data):
    '''
    Add two summary flags built from groups of survey items.

    IAQ_STDCALC   1 if any of IAQ_20A .. IAQ_20F equals 1, otherwise 2
    IAQ_STDAB     1 if any of IAQ_21A .. IAQ_21F or IAQ_22 equals 1, otherwise 2

    `data` is modified in place and returned.
    '''
    calc_inputs = ['IAQ_20{}'.format(letter) for letter in 'ABCDEF']
    ab_inputs = ['IAQ_21{}'.format(letter) for letter in 'ABCDEF'] + ['IAQ_22']

    any_calc = data[calc_inputs].eq(1).any(axis=1)
    data['IAQ_STDCALC'] = np.where(any_calc, 1, 2)

    any_ab = data[ab_inputs].eq(1).any(axis=1)
    data['IAQ_STDAB'] = np.where(any_ab, 1, 2)

    return data

In [60]:
def convert_metric(x, y, metric):
    '''
    Convert an imperial (major, minor) measurement to a single metric value.

    x      : feet (height) or pounds (weight)
    y      : inches (height) or ounces (weight)
    metric : 'height' / 'h' to get centimetres, 'weight' / 'w' to get kilograms

    Returns x and y converted and summed.
    Raises ValueError for any other `metric` (this used to surface as an
    UnboundLocalError on the return line).
    '''

    if metric=='height' or metric=='h':
        x_m = x*30.48
        y_m = y*2.54

    elif metric=='weight' or metric=='w':
        # grams per pound / per ounce, divided down to kilograms
        x_m = float(x*453.592)/1000
        y_m = float(y*28.3495)/1000

    else:
        raise ValueError("metric must be 'height'/'h' or 'weight'/'w', got %r" % (metric,))

    return x_m + y_m

In [61]:
def convert_metrics(data):
    '''
    Add metric-unit versions of the imperial height / weight columns.

    New float columns, written onto `data` in place:

    IAQ_15KG      IAQ_15 (lbs) in kg
    IAQ_16KG      IAQ_16 (lbs) in kg
    IAQ_17CM      IAQ_17FT / IAQ_17IN in cm
    IQ27KG_LAST   IQ27LB_LAST / IQ27OZ_LAST in kg
    NEWKG         NEWGRAMS / 1000 when NEWGRAMS < 9996, otherwise
                  NEWLBS / NEWOZS in kg

    Special codes: a source value of 996 (three-digit columns) or 96 (two-digit
    columns) is carried over as 996 / 96; any other non-convertible value
    leaves the default 997 / 97.

    The arithmetic is the same as convert_metric(), applied to whole columns
    at once instead of row by row. Returns `data`.
    '''

    def _lbs_oz_to_kg(lbs, oz):
        # Same constants and operation order as convert_metric(..., 'w').
        return lbs * 453.592 / 1000 + oz * 28.3495 / 1000

    def _ft_in_to_cm(feet, inches):
        # Same constants as convert_metric(..., 'h').
        return feet * 30.48 + inches * 2.54

    def _first_match(rules, default):
        # rules: (mask, value) pairs in priority order; rows matching no mask get `default`.
        masks = [np.asarray(mask, dtype=bool) for mask, _ in rules]
        values = [np.asarray(value, dtype='float64') for _, value in rules]
        return np.select(masks, values, default=default)

    lbs_15 = data['IAQ_15']
    data['IAQ_15KG'] = _first_match(
        [(lbs_15 == 996, 996), (lbs_15 < 996, _lbs_oz_to_kg(lbs_15, 0))], 997.0)

    lbs_16 = data['IAQ_16']
    data['IAQ_16KG'] = _first_match(
        [(lbs_16 == 996, 996), (lbs_16 < 996, _lbs_oz_to_kg(lbs_16, 0))], 997.0)

    feet, inches = data['IAQ_17FT'], data['IAQ_17IN']
    data['IAQ_17CM'] = _first_match(
        [((feet == 96) | (inches == 96), 996),
         ((feet < 96) & (inches < 96), _ft_in_to_cm(feet, inches))], 997.0)

    last_lbs, last_oz = data['IQ27LB_LAST'], data['IQ27OZ_LAST']
    data['IQ27KG_LAST'] = _first_match(
        [((last_lbs == 96) | (last_oz == 96), 96),
         ((last_lbs < 96) & (last_oz < 96), _lbs_oz_to_kg(last_lbs, last_oz))], 97.0)

    # A usable gram measurement takes precedence over the lbs / oz pair.
    grams, new_lbs, new_oz = data['NEWGRAMS'], data['NEWLBS'], data['NEWOZS']
    data['NEWKG'] = _first_match(
        [(grams < 9996, grams / 1000),
         ((new_lbs == 96) | (new_oz == 96), 96),
         ((new_lbs < 96) & (new_oz < 96), _lbs_oz_to_kg(new_lbs, new_oz))], 97.0)

    return data

In [62]:
def impute_missing_values(data, col, thresh, direc, how):
    '''
    Replace the "missing" codes of one column with an imputed value.

    data   : DataFrame; `col` is overwritten on it and the frame is returned
    col    : column to impute
    thresh : value the column is compared against to detect missing codes
    direc  : comparison that marks a value as missing:
             '>=' (or '=>'), '>', '<', '<=' (or '=<'), '=='
    how    : 'mean' or 'median' of the non-missing values, or 'class' / 'zero'
             to fill with 0

    Mean and median both ignore NaN among the non-missing values.
    Raises ValueError for an unknown `direc` or `how` (these used to surface
    as an UnboundLocalError further down).
    '''

    column = data[col]
    comparators = {
        '>=': column.ge, '=>': column.ge,
        '>': column.gt,
        '<': column.lt,
        '==': column.eq,
        '<=': column.le, '=<': column.le,
    }
    if direc not in comparators:
        raise ValueError('unsupported direc %r; expected one of %s' % (direc, sorted(comparators)))
    missing = comparators[direc](thresh)

    observed = column[~missing]
    if how == 'mean':
        val = observed.mean()
    elif how == 'median':
        val = observed.median()
    elif how == 'class' or how == 'zero':
        val = 0
    else:
        raise ValueError("unsupported how %r; expected 'mean', 'median', 'class' or 'zero'" % (how,))

    # where() upcasts an integer column when the fill value is a float, so this
    # does not rely on the silent upcasting of .loc assignment.
    data[col] = column.where(~missing, val)
    return data

In [64]:
def shift_bool(data, bool_cols):
    '''
    Recode yes/no survey columns to 1 / 0 / -1.

    For every column in `bool_cols`: 2 ("No") becomes 0, values <= 1 are kept,
    and everything else (codes above 1, NaN) becomes -1.

    The recoding is label-aligned, so it is safe on frames whose index is not
    0..n-1 (e.g. after rows have been filtered out). The previous
    apply-with-list implementation produced a frame with a fresh 0..n-1 index,
    which misaligned or blanked the values on assignment.

    `data` is modified in place and returned.
    '''
    # "2==No" -> "0==No"
    flags = data[bool_cols].replace(2, 0)
    # "Missing val > 1" -> "Missing val == -1"
    data[bool_cols] = flags.where(flags <= 1, -1)
    return data

In [6]:
def add_dummies(data, dummy_cols):
    '''
    One-hot encode the listed columns that have more than two distinct values.

    Each such column is removed and replaced by its indicator columns
    (prefixed with the column name, first level dropped), appended on the
    right. Columns with at most two distinct values are left untouched.
    Works on a copy; the input frame is not modified.
    '''
    encoded = data.copy()
    multi_valued = [name for name in dummy_cols if len(data[name].unique()) > 2]
    for name in multi_valued:
        indicators = pd.get_dummies(data[name], prefix=name, drop_first=True)
        encoded = encoded.drop(name, axis=1).merge(indicators, left_index=True, right_index=True)
    return encoded

In [65]:
def preprocess(data):
    '''
    Run the full cleaning pipeline on the raw survey frame.

    Steps, in order:
      1. replace blank (' ') cells with 9999 and cast OBJ_COLS to float64
      2. derive the *_LAST, IAQ_1CALC and metric-unit columns
      3. drop rows with IAQ_1 == 96 or RACEUP == 6
      4. derive the STD flags and recode BOOL_COLS to 1 / 0 / -1
      5. impute missing codes column by column (see the plan below)
      6. keep only F1_COLS and renumber the rows 0..n-1

    Returns the resulting DataFrame. One-hot encoding (add_dummies) and the
    string cast of F1_TO_STRING are currently disabled and not applied here.
    '''

    data = data.replace(' ', 9999)
    data = convert_obj_cols(data, OBJ_COLS, 'float64')
    data = add_last_birth_data(data)
    data = add_early_birth_flag(data)
    data = convert_metrics(data)
    data = remove_not_interviewd(data)
    # Copy after filtering so the in-place column assignments below operate on
    # an independent frame rather than on a slice of the unfiltered one.
    data = remove_not_questioned(data).copy()
    data = add_std_flag(data)
    data = shift_bool(data, BOOL_COLS)

    # (column, threshold, comparison, strategy), applied top to bottom.
    imputation_plan = [
        # Mean
        ('IAQ_17CM', 996, '>=', 'mean'),
        ('IAQ_15KG', 996, '>=', 'mean'),
        ('IAQ_23', 96, '>=', 'mean'),
        ('IAQ_16KG', 996, '>=', 'mean'),
        ('IAQ_WG', 99, '>=', 'mean'),
        ('IAQ_1', 96, '==', 'mean'),
        ('NEWKG', 96, '>=', 'mean'),
        ('IAQ_10', 96, '>=', 'mean'),

        ('IAQ_24A', 97, '>=', 'zero'),
        ('IAQ_24B', 97, '>=', 'zero'),
        ('IAQ_24C', 97, '>=', 'zero'),
        ('IAQ_24D', 97, '>=', 'zero'),
        ('IAQ_24E', 97, '>=', 'zero'),
        ('IAQ_26', 97, '>=', 'mean'),
        ('IAQ_CNTR', 97, '>=', 'zero'),
        ('IQ27LB_LAST', 97, '>=', 'mean'),
        ('IQ27OZ_LAST', 97, '>=', 'mean'),
        ('IQ27KG_LAST', 97, '>=', 'mean'),
        ('IQ28_LAST', 97, '>=', 'mean'),

        # Median
        ('IAQ_35', 96, '>=', 'median'),
        ('IAQ_41', 96, '>=', 'median'),
        ('IAQ_42', 96, '>=', 'median'),
        ('INCOME', 96, '>=', 'median'),

        # Zero Class
        ('RACEUP', 6, '>=', 'class'),
        ('MARITAL1', 6, '>=', 'class'),
        ('EMPL1', 6, '>=', 'class'),
        # INCOME was already median-imputed above, so this pass finds nothing to fill.
        ('INCOME', 96, '>=', 'class'),
        ('PAYUP', 6, '>=', 'class'),
        ('IAQ_25', 6, '>=', 'class'),
        ('IAQ_22', 6, '>=', 'class'),
        ('IAQ_11', 6, '>=', 'class'),
        ('MULPREG2', 6, '>=', 'class'),
    ]
    for col, thresh, direc, how in imputation_plan:
        data = impute_missing_values(data, col, thresh, direc, how)

    data = data[F1_COLS]

    # drop=True discards the old index whatever it is called; the previous
    # reset_index().drop(['index']) only worked for an unnamed index.
    return data.reset_index(drop=True)

# Plotting

In [None]:
def plot_propensity(data, bins):
    '''
    Plot overlaid propensity-score histograms for treated and untreated rows.

    data : DataFrame with a 'T' column (0 = untreated, 1 = treated) and a
           'propensity' column with scores in [0, 1]
    bins : number of equal-width bins over [0, 1]

    Bar heights are the fraction of each group falling in the bin (the
    density histogram divided by the number of bins, i.e. multiplied by the
    bin width). Displays the figure; returns nothing.
    '''

    edges = np.linspace(0, 1, num=bins+1)
    T_0 = data[data['T'] == 0]['propensity']
    T_1 = data[data['T'] == 1]['propensity']
    T_0_bins, _ = np.histogram(T_0, bins=edges, density=True)
    T_1_bins, _ = np.histogram(T_1, bins=edges, density=True)
    df = pd.DataFrame(data={'propensity_bin': edges[:-1],
                            'untreated': T_0_bins/T_0_bins.shape[0],
                            'treated': T_1_bins/T_1_bins.shape[0]})

    fig, ax = plt.subplots(figsize=(15, 8))

    alpha = 0.6
    # One bar per bin, exactly as wide as the bin (was hard-coded to 0.05,
    # which is only correct for bins=20).
    bar_width = 1.0/bins

    ax.bar(df['propensity_bin'], df['untreated'], bar_width,
           alpha=alpha,
           color='orange',
           label='Untreated',
           align='edge')

    ax.bar(df['propensity_bin'], df['treated'], bar_width,
           alpha=alpha,
           color='blue',
           label='Treated',
           align='edge')

    ax.set_xlabel('Propensity Bin')
    ax.set_ylabel('Density')
    ax.set_title('Propensity Score')
    ax.minorticks_on()
    ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
def roc_plot(model, X, Y, title='Model ROC'):
    '''
    Plot the ROC curve of a fitted binary classifier and show its AUC.

    model : fitted estimator exposing predict_proba
    X, Y  : features and true binary labels
    title : figure title

    Both the curve and the AUC in the legend are computed from the predicted
    probability of the positive class (column 1 of predict_proba). The AUC
    used to be computed from hard predict() labels, which did not match the
    plotted curve.
    '''
    # calculate the fpr and tpr for all thresholds of the classification
    probs = model.predict_proba(X)
    preds = probs[:,1]
    fpr, tpr, threshold = metrics.roc_curve(Y, preds)
    roc_auc = metrics.roc_auc_score(Y, preds)

    plt.title(title)
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
    plt.legend(loc = 'lower right')
    # chance diagonal
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

# Data Analysis

In [15]:
def odds_ratio_ci(arr, method='simple'):
    '''
    Odds ratio with an approximate 95% confidence interval.

    method == 'simple'
        arr is a single 2x2 table (a, b, c, d). The odds ratio is
        (a*d)/(b*c) and the interval is exp(log(OR) +/- 1.96*SE) with
        SE = sqrt(1/a + 1/b + 1/c + 1/d) (the logit / Woolf interval).
        Returns (OR, (lower, upper), (a, b, c, d)).

    method == 'mh'
        arr is an iterable of 2x2 tables (a_k, b_k, c_k, d_k), one per
        stratum. The odds ratio is the Mantel-Haenszel estimate
        sum(a_k*d_k/n_k) / sum(b_k*c_k/n_k). The interval is test-based,
        OR ** (1 -/+ 1.96/chi), where chi**2 is the continuity-corrected
        Mantel-Haenszel chi-square statistic.
        Empty strata (n_k == 0) are skipped, and strata with a single
        observation add nothing to the variance; both used to raise
        ZeroDivisionError.
        Returns (OR, (lower, upper), arr).

    Raises ValueError for any other `method` (previously returned None).
    '''

    if method == 'simple':

        a, b, c, d = arr[0], arr[1], arr[2], arr[3]

        xi = (a*d)/(b*c)

        L = np.log(xi)

        se_L = np.sqrt(1/a + 1/b + 1/c + 1/d)

        lb = np.exp(L-1.96*se_L)
        ub = np.exp(L+1.96*se_L)

        return xi, (lb, ub), (a,b,c,d)

    if method == 'mh':

        sum_a = 0.0
        sum_ad_n = 0.0
        sum_bc_n = 0.0
        sum_E_a = 0.0
        sum_V_a = 0.0

        for a_k,b_k,c_k,d_k in arr:

            # margins of the stratum's 2x2 table
            n1_k = a_k+c_k
            n0_k = b_k+d_k
            m1_k = a_k+b_k
            m0_k = c_k+d_k

            n_k = n1_k+n0_k

            if n_k == 0:
                # an empty stratum carries no information
                continue

            sum_a += a_k
            sum_ad_n += (a_k*d_k)/n_k
            sum_bc_n += (b_k*c_k)/n_k
            sum_E_a += (n1_k*m1_k)/n_k
            if n_k > 1:
                # with a single observation one margin is 0, so the variance term is 0
                sum_V_a += (n1_k*n0_k*m1_k*m0_k)/(n_k*n_k*(n_k-1))

        xi = sum_ad_n/sum_bc_n

        # continuity-corrected Mantel-Haenszel chi-square
        chi_sq = np.power((np.abs(sum_a-sum_E_a)-0.5), 2)/sum_V_a

        lb = np.exp((1-(1.96/np.sqrt(chi_sq)))*np.log(xi))
        ub = np.exp((1+(1.96/np.sqrt(chi_sq)))*np.log(xi))

        return xi, (lb, ub), arr

    raise ValueError("method must be 'simple' or 'mh', got %r" % (method,))

In [22]:
def index_by_strata(data, k):
    '''
    Split rows into k strata by quantiles of the 'propensity' column.

    data : DataFrame with a 'propensity' column (scores in [0, 1])
    k    : number of strata

    Returns (indices, quants):
      quants  - the k+1 stratum boundaries: 0.0, the k-1 inner quantiles of
                the propensity, and 1.0
      indices - list of k boolean Series; stratum i selects rows with
                quants[i] <= propensity < quants[i+1]. The last stratum also
                includes its upper bound, so a propensity of exactly 1.0 is
                assigned to it instead of being dropped from every stratum.
    '''
    prop = data['propensity']
    bins = np.linspace(0.0, 1.0, k+1)
    quants = [0.0] + list(np.quantile(prop, bins[1:-1])) + [1.0]

    indices = []
    last = len(quants) - 2
    for i in range(len(quants) - 1):
        above = prop >= quants[i]
        below = (prop <= quants[i+1]) if i == last else (prop < quants[i+1])
        indices.append(above & below)
    return indices, quants

In [16]:
def calc_odds_ratio(data, t_col, y_col, method, k=5):
    '''
    Estimate the odds ratio of outcome `y_col` for treatment `t_col`.

    data   : DataFrame with binary (0/1) columns `t_col` and `y_col`; the
             'ipw' and 'stratified' methods also need a 'propensity' column
    method : 'crude'      - plain 2x2 table over all rows
             'ipw'        - inverse-propensity-weighted 2x2 table
             'stratified' - Mantel-Haenszel over k propensity-quantile strata
    k      : number of strata (only used by 'stratified')

    Every method uses the same cell layout:
        a = y=1, t=1    b = y=1, t=0
        c = y=0, t=1    d = y=0, t=0
    so that the odds ratio is (a*d)/(b*c). The 'crude' branch used to fill
    b, c, d in a different order, which turned a*d/(b*c) into the product
    of the two arms' odds instead of their ratio.

    Returns (odds_ratio, (ci_lower, ci_upper), raw_cells), or
    (None, None, None) when `method` is not one of the above.
    '''

    def _count_cells(frame):
        # Unweighted 2x2 counts (a, b, c, d) in the layout documented above.
        y1, y0 = frame[y_col] == 1, frame[y_col] == 0
        t1, t0 = frame[t_col] == 1, frame[t_col] == 0
        return (float((y1 & t1).sum()), float((y1 & t0).sum()),
                float((y0 & t1).sum()), float((y0 & t0).sum()))

    odds = ci = raw = None

    if method == 'crude':

        odds, ci, raw = odds_ratio_ci(_count_cells(data))

    if method == 'ipw':

        '''
        Stratification and Weighting Via the Propensity Score in Estimation of
        Causal Treatment Effects: A Comparative Study
        by Lunceford and Davidian 
        '''

        # NOTE(review): the interval returned here applies the plain 2x2 standard
        # error to weighted pseudo-counts -- treat it as a rough guide only.
        a = (((data[y_col])*(data[t_col]))/(data['propensity'])).sum()
        b = (((data[y_col])*(1-data[t_col]))/(1-data['propensity'])).sum()
        c = (((1-data[y_col])*(data[t_col]))/(data['propensity'])).sum()
        d = (((1-data[y_col])*(1-data[t_col]))/(1-data['propensity'])).sum()

        odds, ci, raw = odds_ratio_ci((a,b,c,d))

    if method == 'stratified':

        '''
        Mantel Haenszel
        '''

        indices, quants = index_by_strata(data, k)
        print(quants)

        abcd_arr = [_count_cells(data[index]) for index in indices]

        odds, ci, raw = odds_ratio_ci(abcd_arr, method='mh')

    return odds, ci, raw

In [None]:
def model_results(model, X, Y, cv=False, folds=10):
    '''
    Print accuracy, F1 and ROC AUC for a binary classifier.

    model : estimator; already fitted when cv is False
    X, Y  : features and true binary labels
    cv    : False - score the model's predictions on (X, Y) as given
            True  - report mean and std of each metric over cross-validation
    folds : number of cross-validation folds (only used when cv is True)

    ROC AUC is computed from continuous scores (predict_proba, else
    decision_function) so that it matches what the 'roc_auc' scorer does in
    the cross-validated branch; hard labels are used only when the model
    offers neither. Prints the results; returns nothing.
    '''

    print('Model Results:')

    if not cv:
        Y_pred = model.predict(X)

        if hasattr(model, 'predict_proba'):
            Y_score = model.predict_proba(X)[:, 1]
        elif hasattr(model, 'decision_function'):
            Y_score = model.decision_function(X)
        else:
            Y_score = Y_pred

        print('Accuracy:', metrics.accuracy_score(Y, Y_pred))
        print('F1 value:', metrics.f1_score(Y, Y_pred))
        print('ROC AUC:', metrics.roc_auc_score(Y, Y_score))

    else:
        for label, scoring in (('Accuracy', 'accuracy'), ('F1', 'f1'), ('ROC AUC', 'roc_auc')):
            score = cross_val_score(model, X, Y, cv=folds, scoring=scoring)
            print('Avg. %s: %0.3f , std: +/- %0.3f' % (label, score.mean(), score.std()))

In [None]:
def binary_oversample(data, y_col, random_state=None):
    '''
    Balance a binary-labelled frame by oversampling the minority class.

    data         : DataFrame
    y_col        : name of the 0/1 label column
    random_state : optional seed passed to DataFrame.sample for reproducibility

    The smaller class is resampled with replacement up to the size of the
    larger one; the larger class is kept as is. (Class sizes used to be read
    positionally from value_counts(), which is ordered by frequency, so
    nothing was balanced when class 1 was the majority.)

    Returns the class-0 rows followed by the class-1 rows.
    Raises ValueError if either class has no rows.
    '''

    data_1 = data[data[y_col] == 1]
    data_0 = data[data[y_col] == 0]
    count_1, count_0 = len(data_1), len(data_0)

    if count_0 == 0 or count_1 == 0:
        raise ValueError('both classes (0 and 1) must be present in %r' % (y_col,))

    if count_1 <= count_0:
        data_1 = data_1.sample(count_0, replace=True, random_state=random_state)
    else:
        data_0 = data_0.sample(count_1, replace=True, random_state=random_state)

    return pd.concat([data_0, data_1], axis=0)