In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw training table; its dimensions are kept for later cells.
X_train = pd.read_csv('input/train.csv')
nb_sample, nb_feature = X_train.shape
print((nb_sample, nb_feature))
X_train.head(5)

(58881, 129)


Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,1,A8,26,0.230769,2,3,1,0.492537,0.563636,0.225941,...,0,0,0,0,0,0,0,0,0,7
1,1,D2,26,1.0,3,3,1,0.671642,0.781818,0.32636,...,0,0,0,0,0,0,0,0,0,6
2,1,D2,26,0.230769,2,3,1,0.626866,0.745455,0.393305,...,0,0,0,0,0,0,0,0,0,1
3,1,D4,26,0.230769,2,3,1,0.268657,0.690909,0.246862,...,0,0,0,0,0,0,0,0,0,4
4,1,E1,10,0.230769,2,3,1,0.641791,0.581818,0.466527,...,0,0,1,0,0,0,1,0,0,2


### Plan:
##### a. Based on the EDA, we preprocess the raw training and test data with the following steps:
    1. Delete samples with 14 or 15 missing features;
    2. Delete the features that have more than 70% missing values;
    3. Encode object-type data;
    4. Fill the remaining missing values with the feature mean;
    5. Re-check the variance and delete the 5 features with the lowest variance;
    6. Re-check the correlation and delete the 5 features with the lowest correlation to the target;
##### b. Set up an XGBoost model to check the performance, then select important features using its feature-importance function.
    1. We decided to use 3-fold cross-validation to reduce uncertainty and avoid overfitting;
    2. Try different numbers of important features to test model performance;
##### c. Choose the best features and redo the preprocessing (only steps 1 and 4)
##### d. Predict with the 3 models trained in the 3-fold CV, then combine their predictions by voting

### Data Preprocessing:

##### Delete samples with 14 or 15 missing features

In [3]:
# Drop training samples that have too many missing features.
#
# The per-row missing counts are computed once, on the untouched frame, so the
# row that is tested is always the row that is removed. Dropping rows inside a
# positional loop shifts every later position, so a label-based drop would no
# longer target the row that was just inspected.
# NOTE(review): the section title mentions 14 and 15 missing features, while the
# threshold applied here is 13 or more -- confirm which one is intended.
missing_per_row = X_train.isnull().sum(axis=1)
too_sparse = missing_per_row >= 13
nb_delete_1415 = int(too_sparse.sum())
# Keep the remaining rows and renumber them from 0.
X_train = X_train.loc[~too_sparse].reset_index(drop=True)
print('totally delete: ',nb_delete_1415, 'samples')
X_train.head(5)

totally delete:  29 samples


Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,1,A8,26,0.230769,2,3,1,0.492537,0.563636,0.225941,...,0,0,0,0,0,0,0,0,0,7
1,1,D2,26,1.0,3,3,1,0.671642,0.781818,0.32636,...,0,0,0,0,0,0,0,0,0,6
2,1,D2,26,0.230769,2,3,1,0.626866,0.745455,0.393305,...,0,0,0,0,0,0,0,0,0,1
3,1,D4,26,0.230769,2,3,1,0.268657,0.690909,0.246862,...,0,0,0,0,0,0,0,0,0,4
4,1,E1,10,0.230769,2,3,1,0.641791,0.581818,0.466527,...,0,0,1,0,0,0,1,0,0,2


##### delete directly the features that have more than 70% missing values

In [4]:
import gc

# Features that the EDA flagged as having more than 70% missing values.
sparse_features = [
    'Medical_History_10',
    'Medical_History_32',
    'Medical_History_24',
    'InsuredInfo_8',
    'Medical_History_15',
    'InsuredInfo_9',
    'Family_Hist_5',
]
X_train.drop(columns=sparse_features, inplace=True)
gc.collect()

20

##### encode object-type data

In [5]:
# Show which columns are stored as objects and the values two of them take.
object_columns = X_train.columns[X_train.dtypes == 'object']
print(object_columns)
for name in ('Product_Info_2', 'InsuredInfo_7'):
    print(X_train[name].unique())

Index(['Product_Info_2', 'InsuredInfo_7'], dtype='object')
['A8' 'D2' 'D4' 'E1' 'A3' 'A6' 'B2' 'D1' 'D3' 'A2' 'A1' 'C4' 'B1' 'A5'
 'C3' 'C1' 'A7' 'A4' 'C2']
['Female' 'Male']


In [6]:
from sklearn.preprocessing import LabelEncoder

# InsuredInfo_7 is one-hot encoded; Product_Info_2 is mapped to integer labels.
insured_info_dummies = pd.get_dummies(X_train['InsuredInfo_7'])
X_train = X_train.drop(columns=['InsuredInfo_7']).join(insured_info_dummies)

# `le` stays available after this cell so the fitted mapping can be inspected.
le = LabelEncoder()
X_train['Product_Info_2'] = le.fit_transform(X_train['Product_Info_2'])
X_train.head(5)

Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response,Female,Male
0,1,7,26,0.230769,2,3,1,0.492537,0.563636,0.225941,...,0,0,0,0,0,0,0,7,1,0
1,1,15,26,1.0,3,3,1,0.671642,0.781818,0.32636,...,0,0,0,0,0,0,0,6,0,1
2,1,15,26,0.230769,2,3,1,0.626866,0.745455,0.393305,...,0,0,0,0,0,0,0,1,0,1
3,1,17,26,0.230769,2,3,1,0.268657,0.690909,0.246862,...,0,0,0,0,0,0,0,4,1,0
4,1,18,10,0.230769,2,3,1,0.641791,0.581818,0.466527,...,1,0,0,0,1,0,0,2,1,0


##### Fill missing values by mean for other features;

In [7]:
# One row per column (dtype and name), restricted to columns that still contain NaNs.
column_types = X_train.dtypes.rename_axis('Name').reset_index(name='dtypes')
has_missing = X_train.isnull().any().values
missing_features = column_types.loc[has_missing, ['dtypes', 'Name']]
missing_features

Unnamed: 0,dtypes,Name
11,float64,Employment_Info_1
14,float64,Employment_Info_4
16,float64,Employment_Info_6
27,float64,Insurance_History_5
32,float64,Family_Hist_2
33,float64,Family_Hist_3
34,float64,Family_Hist_4
35,float64,Medical_History_1


In [8]:
# The columns listed in missing_features are all float, so each gap is
# filled with its own column mean.
column_means = X_train[missing_features.Name].mean()
X_train.fillna(column_means, inplace=True)
print('now we have ',X_train.isnull().sum().sum(),' missing value')

now we have  0  missing value


##### See variance again and delete top5 features with the lowest variance

In [9]:
# Rank every column by variance and discard the five that vary the least.
# tabel_var is kept so the same columns can be removed from the test set.
tabel_var = X_train.var().sort_values(ascending=True).head(5)
print(tabel_var)
X_train.drop(columns=tabel_var.index.tolist(), inplace=True)
gc.collect()

Insurance_History_5    0.000031
Employment_Info_4      0.000960
Medical_History_35     0.004108
Medical_History_38     0.004819
Ht                     0.005513
dtype: float64


60

##### See correlation again to delete top5 features with the lowest correlation to the target

In [10]:
# Absolute correlation of every column with the target; the five weakest
# columns are removed. tabel_corr is kept for the test-set cell.
abs_corr_with_target = X_train.corr()[['Response']].abs()
tabel_corr = abs_corr_with_target.sort_values(by=['Response'], ascending=True).head(5)
print(tabel_corr)
X_train.drop(columns=tabel_corr.index.tolist(), inplace=True)
del abs_corr_with_target
gc.collect()

                     Response
Insurance_History_4  0.000594
Medical_History_25   0.000598
Medical_History_36   0.000748
Medical_History_26   0.000777
Insurance_History_1  0.000885


20

In [11]:
print(X_train.shape)
# Move the target column out of the feature table.
y_train = X_train.pop('Response')
gc.collect()

(58852, 113)


20

##### Same operations on  test set

In [12]:
# Apply the training-set preprocessing steps to the prediction set.
X_test = pd.read_csv('input/predict.csv')

# Remove the same sparse features that were removed from the training set.
X_test.drop(
    columns=[
        'Medical_History_10',
        'Medical_History_32',
        'Medical_History_24',
        'InsuredInfo_8',
        'Medical_History_15',
        'InsuredInfo_9',
        'Family_Hist_5',
    ],
    inplace=True,
)

# One-hot encode InsuredInfo_7, as for the training set.
X_test = X_test.join(pd.get_dummies(X_test['InsuredInfo_7']))
del X_test['InsuredInfo_7']

# Reuse `le`, the encoder fitted on the training data. Fitting a new encoder
# on this file would renumber the categories whenever one of them is absent
# here, so the same code would stand for different products in train and test.
# transform() raises ValueError if this file contains a category never seen
# during training.
X_test['Product_Info_2'] = le.transform(X_test['Product_Info_2'])

# Columns of the prediction set that contain missing values.
missing_features = pd.DataFrame(X_test.dtypes,columns=['dtypes'])
missing_features = missing_features.reset_index()
missing_features['Name'] = missing_features['index']
del missing_features['index']
missing_features.reset_index(drop=True,inplace=True)
missing_features = missing_features[X_test.isnull().sum().values!=0]
# NOTE(review): gaps are filled with the prediction set's own means, not the
# training means -- confirm this is intended.
X_test.fillna(X_test.mean()[missing_features.Name],inplace=True)

# Drop the low-variance and low-correlation columns selected on the training set.
for column in tabel_var.index.tolist():
    del X_test[column]
for column in tabel_corr.index.tolist():
    del X_test[column]
print('test set shape:',X_test.shape)
gc.collect()

test set shape: (500, 112)


0

### Data Modeling:

#### Plan:
    a. Separate the training set into a training set and a validation set (used in steps b and c);
    b. Use some simple models such as SVC and KNN to establish a baseline;
    c. Use some advanced models such as random forest;
    d. If the result isn't good enough, we will try XGBoost with cross-validation and voting.

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the samples for validation, stratified on the target.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=27,
)
for split in (X_train, X_validation):
    print(split.shape)

(50024, 112)
(8828, 112)


##### SVC

In [None]:
from sklearn.svm import SVC

# Fitting ran for five minutes without producing a result; the author
# suspected a memory issue.
clf = SVC(C=0.5).fit(X_train, y_train)
validation_score = clf.score(X_validation, y_validation)
print('validation set presicion:',validation_score)
training_score = clf.score(X_train, y_train)
print('training set precision:',training_score)

##### KNN

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# 7-nearest-neighbours baseline; the printed figures are accuracy scores.
clf = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
validation_accuracy = accuracy_score(y_validation, clf.predict(X_validation))
print('validation set presicion:',validation_accuracy)
# y_pred ends up holding the predictions on the training set.
y_pred = clf.predict(X_train)
print('training set precision:',accuracy_score(y_train, y_pred))

validation set presicion: 0.32396918894426824
training set precision: 0.48116903886134654


##### Random forest

In [25]:
# A grid search could be used here to tune the hyperparameters.
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's internal randomness so the scores below are
# reproducible from one run to the next; 27 matches the seed of the
# train/validation split.
clf = RandomForestClassifier(max_depth=10, n_estimators=2000, random_state=27)
clf.fit(X_train,y_train)
# clf.score returns mean accuracy on the given data.
print('validation set presicion:',clf.score(X_validation,y_validation))
print('training set precision:',clf.score(X_train,y_train))

validation set presicion: 0.5111010421386497
training set precision: 0.5674476251399329


##### As we can see, the score is not good enough (the validation score is low, with slight overfitting), so next we will test XGBoost with cross-validation, then select the important features and run it again

##### Xgboost (will be finished later)

In [17]:
# NOTE(review): unfinished draft, kept commented out. Points to resolve before enabling it:
#   - oof_preds is initialised to zeros and never filled, yet it is passed to roc_auc_score;
#   - the cleanup line deletes `clf`, not the `model` trained inside the loop;
#   - num_leaves / min_split_gain look like LightGBM parameter names -- confirm XGBoost accepts them.
# from sklearn.model_selection import KFold
# from sklearn.metrics import roc_auc_score
# from tqdm import tqdm
# from xgboost import XGBClassifier
# import xgboost as xgb

# folds = KFold(n_splits=3, shuffle=True, random_state=123)
# oof_preds = np.zeros(X_train.shape[0])
# # sub_preds = np.zeros(test.shape[0])
# features = X_train.columns.to_list()
# for n_fold, (trn_idx, val_idx) in tqdm(enumerate(folds.split(X_train))):
#     trn_x, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
#     val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]


#     xgb_params ={'objective':"multi:softmax",
#         'n_estimators':900,
#         'learning_rate':0.01,
#         'num_leaves':30,
#         'colsample_bytree':.8,
#         'subsample':.9,
#         'max_depth':15,
#         'reg_alpha':.1,
#         'reg_lambda':.1,
#         'min_split_gain':.01,
#         'min_child_weight':1,
#         'eval_metric':"mlogloss"}
#     d_train = xgb.DMatrix(trn_x, trn_y)
#     d_valid = xgb.DMatrix(val_x, val_y)
# #     d_test = xgb.DMatrix(test)
    
#     watchlist = [(d_train, 'train'), (d_valid, 'valid')]

#     model = xgb.train(xgb_params, d_train,
#                       watchlist, maximize=True, 
#                       verbose_eval=250, early_stopping_rounds=150)
    
#     print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
#     del clf, trn_x, trn_y, val_x, val_y
#     gc.collect()
    
# print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))  