In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Load the competition files. Train and test are indexed by their 'Id' column;
# the sample submission is read with the default integer index.
train = pd.read_csv('data/train.csv', index_col='Id')
test = pd.read_csv('data/test.csv', index_col='Id')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Remember which index labels belong to each split so the combined frame
# can be separated again after feature engineering.
train_id, test_id = train.index, test.index

# Stack both splits row-wise so every transformation is applied to them together.
data = pd.concat([train, test], axis=0)

### Feature engineering:
- create dummy variables for Product_Info_2 (keep everything else as numeric). Note: starting from version 1.5, XGBoost has experimental support for categorical data, available for public testing.
- calculate sum of all Medical_Keyword columns
- for each binary keyword-value pair, calculate the mean of the target variable, then for each observation take the mean and the minimum of the keyword-value-mean targets

In [None]:
# One-hot encode Product_Info_2; all other columns are passed through unchanged.
data = pd.get_dummies(data, columns=['Product_Info_2'], prefix='Product_Info_2')

In [None]:
# Names of the 48 keyword columns: Medical_Keyword_1 ... Medical_Keyword_48.
Medical_Keyword_names = ['Medical_Keyword_' + str(i) for i in range(1, 49)]

# Row-wise total of all keyword columns.
data['Medical_Keyword_sum'] = data[Medical_Keyword_names].sum(axis=1)

In [None]:
# Ordinal target encoding of the Medical_Keyword columns.
#
# For every threshold k in 1..7 a binary target `target_k = (Response > k)` is created.
# Each keyword column is then replaced by the mean of `target_k` among rows sharing the
# same keyword value, and per observation the mean and the minimum of those 48 encoded
# values are stored as `keyword_target_mean_k` / `keyword_target_min_k`.
#
# The group means are fitted on the training rows only. Rows that are not in `train_id`
# have no real Response, so including them would add spurious zeros to `target_k` and
# bias every mean towards 0.
# NOTE(review): the encoding still uses each training row's own label (no out-of-fold
# scheme), so the features are optimistic on the training set.
is_train_row = data.index.isin(train_id)

for value in range(1, 8):
    target_name = 'target_' + str(value)
    data[target_name] = (data['Response'] > value).astype(int)

    train_rows = data.loc[is_train_row]
    encoded = pd.DataFrame({
        # keyword value -> mean target, looked up for every row (train and test)
        col + target_name: data[col].map(train_rows.groupby(by=col)[target_name].mean())
        for col in Medical_Keyword_names
    })

    # `mean` (not `sum`) so the feature matches its name and the description above.
    data['keyword_target_mean_' + str(value)] = encoded.mean(axis=1)
    data['keyword_target_min_' + str(value)] = encoded.min(axis=1)

In [None]:
# Preview the first five rows of the engineered frame (5 is the default for head()).
data.head()

Unnamed: 0_level_0,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_4,Employment_Info_5,Employment_Info_6,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_5,Insurance_History_7,Insurance_History_8,Insurance_History_9,Family_Hist_1,Family_Hist_2,Family_Hist_3,Family_Hist_4,Family_Hist_5,Medical_History_1,Medical_History_2,Medical_History_3,Medical_History_4,Medical_History_5,Medical_History_6,Medical_History_7,Medical_History_8,Medical_History_9,Medical_History_10,Medical_History_11,Medical_History_12,Medical_History_13,Medical_History_14,Medical_History_15,Medical_History_16,Medical_History_17,Medical_History_18,Medical_History_19,Medical_History_20,Medical_History_21,Medical_History_22,Medical_History_23,Medical_History_24,Medical_History_25,Medical_History_26,Medical_History_27,Medical_History_28,Medical_History_29,Medical_History_30,Medical_History_31,Medical_History_32,Medical_History_33,Medical_History_34,Medical_History_35,Medical_History_36,Medical_History_37,Medical_History_38,Medical_History_39,Medical_History_40,Medical_History_41,Medical_Keyword_1,Medical_Keyword_2,Medical_Keyword_3,Medical_Keyword_4,Medical_Keyword_5,Medical_Keyword_6,Medical_Keyword_7,Medical_Keyword_8,Medical_Keyword_9,Medical_Keyword_10,Medical_Keyword_11,Medical_Keyword_12,Medical_Keyword_13,Medical_Keyword_14,Medical_Keyword_15,Medical_Keyword_16,Medical_Keyword_17,Medical_Keyword_18,Medical_Keyword_19,Medical_Keyword_20,Medical_Keyword_21,Medical_Keyword_22,Medical_Keyword_23,Medical_Keyword_24,Medical_Keyword_25,Medical_Keyword_26,Medical_Keyword_27,Medical_Keyword_28,Medical_Keyword_29,Medical_Keyword_30,Medical_Keyword_31,Medical_Keyword_32,Medical_Keyword_33,Medical_Keyword_34,Medical_Keyword_35,Medical_Key
word_36,Medical_Keyword_37,Medical_Keyword_38,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response,Product_Info_2_A1,Product_Info_2_A2,Product_Info_2_A3,Product_Info_2_A4,Product_Info_2_A5,Product_Info_2_A6,Product_Info_2_A7,Product_Info_2_A8,Product_Info_2_B1,Product_Info_2_B2,Product_Info_2_C1,Product_Info_2_C2,Product_Info_2_C3,Product_Info_2_C4,Product_Info_2_D1,Product_Info_2_D2,Product_Info_2_D3,Product_Info_2_D4,Product_Info_2_E1,Medical_Keyword_sum,target_1,keyword_target_mean_1,keyword_target_min_1,target_2,keyword_target_mean_2,keyword_target_min_2,target_3,keyword_target_mean_3,keyword_target_min_3,target_4,keyword_target_mean_4,keyword_target_min_4,target_5,keyword_target_mean_5,keyword_target_min_5,target_6,keyword_target_mean_6,keyword_target_min_6,target_7,keyword_target_mean_7,keyword_target_min_7
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1
2,1,10,0.076923,2,1,1,0.641791,0.581818,0.148536,0.323008,0.028,12,1,0.0,3,,1,2,6,3,1,2,1,1,1,3,1,0.000667,1,1,2,2,,0.598039,,0.526786,4.0,112,2,1,1,3,2,2,1,,3,2,3,3,240.0,3,3,1,1,2,1,2,3,,1,3,3,1,3,2,3,,1,3,1,2,2,1,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,32.329096,0.671625,1,28.415945,0.588901,1,27.802288,0.576012,1,26.929399,0.557951,1,23.646403,0.489238,1,16.847428,0.347117,1,11.983588,0.245507
5,1,26,0.076923,2,3,1,0.059701,0.6,0.131799,0.272288,0.0,1,3,0.0,2,0.0018,1,2,6,3,1,2,1,2,1,3,1,0.000133,1,3,2,2,0.188406,,0.084507,,5.0,412,2,1,1,3,2,2,1,,3,2,3,3,0.0,1,3,1,1,2,1,2,3,,1,3,3,1,3,2,3,,3,1,1,2,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,32.329096,0.671625,1,28.415945,0.588901,1,27.802288,0.576012,0,26.929399,0.557951,0,23.646403,0.489238,0,16.847428,0.347117,0,11.983588,0.245507
6,1,26,0.076923,2,3,1,0.029851,0.745455,0.288703,0.42878,0.03,9,1,0.0,2,0.03,1,2,8,3,1,1,1,2,1,1,3,,3,2,3,3,0.304348,,0.225352,,10.0,3,2,2,1,3,2,2,2,,3,2,3,3,,1,3,1,1,2,1,2,3,,2,2,3,1,3,2,3,,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,32.329096,0.671625,1,28.415945,0.588901,1,27.802288,0.576012,1,26.929399,0.557951,1,23.646403,0.489238,1,16.847428,0.347117,1,11.983588,0.245507
7,1,10,0.487179,2,3,1,0.164179,0.672727,0.205021,0.352438,0.042,9,1,0.0,3,0.2,2,2,8,3,1,2,1,2,1,1,3,,3,2,3,3,0.42029,,0.352113,,0.0,350,2,2,1,3,2,2,2,,3,2,3,3,,1,3,1,1,2,2,2,3,,1,3,3,1,3,2,3,,3,3,1,2,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,32.33126,0.671625,1,28.41879,0.588901,1,27.798326,0.572384,1,26.920258,0.54927,1,23.646475,0.489238,1,16.842735,0.343066,1,11.976867,0.239659
8,1,26,0.230769,2,3,1,0.41791,0.654545,0.23431,0.424046,0.027,9,1,0.0,2,0.05,1,2,6,3,1,2,1,2,1,1,3,,3,2,3,2,0.463768,,0.408451,,,162,2,2,1,3,2,2,2,,3,2,3,3,,1,3,1,1,2,1,2,3,,2,2,3,1,3,2,3,,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,32.329096,0.671625,1,28.415945,0.588901,1,27.802288,0.576012,1,26.929399,0.557951,1,23.646403,0.489238,1,16.847428,0.347117,1,11.983588,0.245507


### Null values

In [None]:
# Fraction of missing entries per column (0.0 - 1.0).
missing_val_count_by_column = data.isnull().sum() / len(data)

# Columns with at least one missing value; 'Response' is excluded from the report.
column_w_missing_val = data.columns[missing_val_count_by_column > 0].tolist()
column_w_missing_val.remove('Response')

print(missing_val_count_by_column.loc[column_w_missing_val].sort_values(ascending=False))

Medical_History_10     0.990423
Medical_History_32     0.981578
Medical_History_24     0.937066
Medical_History_15     0.751270
Family_Hist_5          0.700414
Family_Hist_3          0.572423
Family_Hist_2          0.486898
Insurance_History_5    0.423281
Family_Hist_4          0.326751
Employment_Info_6      0.184987
Medical_History_1      0.149862
Employment_Info_4      0.112653
Employment_Info_1      0.000278
dtype: float64


In [None]:
# Summary statistics for the columns that contain missing values.
data.loc[:, column_w_missing_val].describe()

Unnamed: 0,Employment_Info_1,Employment_Info_4,Employment_Info_6,Insurance_History_5,Family_Hist_2,Family_Hist_3,Family_Hist_4,Family_Hist_5,Medical_History_1,Medical_History_10,Medical_History_15,Medical_History_24,Medical_History_32
count,79124.0,70230.0,64505.0,45645.0,40610.0,33841.0,53285.0,23711.0,67285.0,758.0,19686.0,4981.0,1458.0
mean,0.07793,0.00634,0.363228,0.00171,0.474554,0.498315,0.445338,0.486604,7.928572,143.426121,124.240221,50.449709,11.718107
std,0.082746,0.033368,0.350589,0.006465,0.154993,0.140164,0.163434,0.12918,12.912942,106.499108,98.66838,77.92163,38.257761
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.035,0.0,0.06,0.0004,0.362319,0.411765,0.323944,0.410714,2.0,9.25,18.0,1.0,0.0
50%,0.06,0.0,0.25,0.000933,0.463768,0.519608,0.43662,0.508929,4.0,225.0,117.0,8.0,0.0
75%,0.1,0.0,0.58,0.002,0.57971,0.607843,0.56338,0.580357,9.0,240.0,240.0,63.0,2.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,240.0,240.0,240.0,240.0,240.0


In [None]:
# Replace every missing value with the sentinel -1 (applies to all columns of `data`).
data = data.fillna(-1)

In [None]:
# List every column with its non-null count and dtype.
# `show_counts` replaces the deprecated `null_counts` keyword, which was removed in pandas 2.0.
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79146 entries, 2 to 79129
Data columns (total 167 columns):
 #    Column                 Non-Null Count  Dtype  
---   ------                 --------------  -----  
 0    Product_Info_1         79146 non-null  int64  
 1    Product_Info_3         79146 non-null  int64  
 2    Product_Info_4         79146 non-null  float64
 3    Product_Info_5         79146 non-null  int64  
 4    Product_Info_6         79146 non-null  int64  
 5    Product_Info_7         79146 non-null  int64  
 6    Ins_Age                79146 non-null  float64
 7    Ht                     79146 non-null  float64
 8    Wt                     79146 non-null  float64
 9    BMI                    79146 non-null  float64
 10   Employment_Info_1      79146 non-null  float64
 11   Employment_Info_2      79146 non-null  int64  
 12   Employment_Info_3      79146 non-null  int64  
 13   Employment_Info_4      79146 non-null  float64
 14   Employment_Info_5      79146 non-nul

  """Entry point for launching an IPython kernel.


### Modeling:



In [None]:
# Split the engineered frame back into its train and test rows.
train = data.loc[train_id]
test = data.loc[test_id]

# The seven binary targets target_1 ... target_7.
target_col = ['target_{}'.format(i) for i in range(1, 8)]

# Features are everything except the raw Response and the derived binary targets.
label_cols = ['Response'] + target_col
X_train = train.drop(columns=label_cols)
X_test = test.drop(columns=label_cols)
y_train = train[target_col]

In [None]:
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Base learner cloned for every link of the classifier chain.
base_gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=3,
    # 'auto' meant sqrt(n_features) for the classifier; it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, so spell the equivalent value out.
    max_features='sqrt',
    subsample=0.8,
    # Early stopping: hold out 20% of the training rows and stop once the validation
    # score has not improved for 6 consecutive iterations.
    n_iter_no_change=6,
    validation_fraction=0.2,
    # Fix the seed: row subsampling, feature subsampling and the validation split are
    # all random, so without it every run produces a different model.
    random_state=42,
    verbose=1,
)

In [None]:
# Chain of binary classifiers, one per target column.
#   order=None -> links follow the column order of the label matrix Y
#   cv=None    -> true labels of earlier links are used as features when fitting
chain = ClassifierChain(base_gb, order=None, cv=None)

chain.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.6570           0.0142            1.08m
         2           0.6441           0.0111            1.06m
         3           0.6375           0.0082            1.06m
         4           0.6297           0.0062            1.06m
         5           0.6236           0.0049            1.05m
         6           0.6152           0.0050            1.04m
         7           0.6167           0.0032            1.04m
         8           0.6079           0.0039            1.03m
         9           0.6055           0.0025            1.02m
        10           0.6021           0.0025            1.02m
        20           0.5816           0.0012           57.72s
        30           0.5665           0.0005           54.50s
        40           0.5550           0.0003           51.25s
        50           0.5542           0.0004           48.02s
        60           0.5474           0.0001           44.87s
       

        80           0.3524           0.0002           34.27s
        90           0.3512          -0.0000           32.12s
       100           0.3438           0.0001           30.06s


ClassifierChain(base_estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                          criterion='friedman_mse',
                                                          init=None,
                                                          learning_rate=0.1,
                                                          loss='deviance',
                                                          max_depth=3,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight

###  Search for cutoffs
- the aim is to find the boundaries that maximize the kappa score
- boundaries are initialized according to the original Response distribution of the training dataset

In [None]:
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import fmin_powell

In [None]:
# Continuous training score used by the cutoff search: the row-wise sum of the
# chain's predicted probabilities, shifted by 1.
chain_probs = chain.predict_proba(X_train)
train_preds = chain_probs.sum(axis=1) + 1

def digit(cutoff, preds=None):
    """Bin continuous scores into ordinal classes 1..len(cutoff)+1.

    A score gets class ``i + 1`` for the first index ``i`` with ``score < cutoff[i]``,
    and class ``len(cutoff) + 1`` if it is below none of the cutoffs.

    Parameters
    ----------
    cutoff : sequence of float
        Class boundaries. They do not have to be sorted: taking the running maximum
        reproduces the "first boundary the score is below" rule exactly.
    preds : array-like of float, optional
        Scores to bin. Defaults to the module-level ``train_preds`` so existing
        ``digit(cutoff)`` calls keep working.

    Returns
    -------
    list of int
        One class label per score.
    """
    scores = np.asarray(train_preds if preds is None else preds, dtype=float)
    # Running max makes the boundaries non-decreasing, which searchsorted requires.
    bounds = np.maximum.accumulate(np.asarray(cutoff, dtype=float))
    # side='right' counts the boundaries <= score, i.e. how many `score < bound`
    # checks fail before the first one succeeds.
    return (np.searchsorted(bounds, scores, side='right') + 1).tolist()
    
def train_offset(cutoff):
    """Objective for the cutoff search.

    Bins the training scores with ``digit(cutoff)`` and returns the negated
    quadratic-weighted Cohen's kappa against ``train['Response']``, so that
    minimising this function maximises the kappa.
    """
    predicted = digit(cutoff)
    kappa = cohen_kappa_score(train['Response'], predicted, weights = 'quadratic')
    return -kappa

# Starting guess for the seven class boundaries, refined with Powell's method
# by minimising train_offset (the negated kappa).
cutoff = [1.5, 2.9, 3.1, 4.5, 5.5, 6.1, 7.1]
offsets = fmin_powell(func=train_offset, x0=cutoff, disp=False)

### Final Result

In [None]:
# Score the test rows the same way as the training scores used in the cutoff search:
# row-wise sum of the chain's predicted probabilities, shifted by 1.
prob_test = chain.predict_proba(X_test).sum(axis = 1) + 1

# Bin the scores with the optimised cutoffs. A score gets class i + 1 for the first
# boundary offsets[i] it is below, and the last class if it is below none. The running
# maximum keeps that rule intact even if the optimiser returned boundaries that are
# not sorted; side='right' counts the boundaries <= score.
bounds = np.maximum.accumulate(np.asarray(offsets, dtype=float))
res = (np.searchsorted(bounds, prob_test, side='right') + 1).tolist()

# NOTE(review): assigned by position - assumes sample_submission rows are in the same
# order as X_test; confirm against the Id column.
sample_submission['Response'] = res

In [None]:
# to_csv does not create missing directories, so make sure the target folder exists
# before writing the submission file.
import os

os.makedirs('output', exist_ok=True)
sample_submission.to_csv('output/sklearn_v3.csv', index=False)