## Health Insurance Cross Sell Prediction 🏠 🏥
Predict Health Insurance Owners' who will be interested in Vehicle Insurance

In [1]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('data//train.csv')
test_df = pd.read_csv('data//test.csv')

train_df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


### Data Exploration

In [2]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [3]:
train_df.describe()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0
mean,190555.0,38.822584,0.997869,26.388807,0.45821,30564.389581,112.034295,154.347397,0.122563
std,110016.836208,15.511611,0.04611,13.229888,0.498251,17213.155057,54.203995,83.671304,0.327936
min,1.0,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,95278.0,25.0,1.0,15.0,0.0,24405.0,29.0,82.0,0.0
50%,190555.0,36.0,1.0,28.0,0.0,31669.0,133.0,154.0,0.0
75%,285832.0,49.0,1.0,35.0,1.0,39400.0,152.0,227.0,0.0
max,381109.0,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [27]:
train_df['Response'].value_counts()

0    334399
1     46710
Name: Response, dtype: int64

We could see the dataset is highly unbalanced - about 90% 0s. We need to fix this

### Data Engineering

Resampled skewed data

In [254]:
from random import sample
from sklearn.utils import shuffle

def resample(df):
    indices_0 = df.index[df['Response'] == 0].tolist()
    indices_1 = df.index[df['Response'] == 1].tolist()
    indices_0 =  sample(indices_0, int(len(indices_1)*0.9))
    #indices_1 =  sample(indices_1, int(len(indices_1)*0.5))
    df_0 = df.iloc[ indices_0]
    df_1 = df.iloc[ indices_1]
    df_resampled = pd.concat([df_0, df_1], ignore_index=True)
    return shuffle(df_resampled, random_state=0)

train_df_1 = resample(train_df)

In [255]:
age = train_df_1['Age'].values
age_range = np.ones(len(age))
age_range[(age>30)&(age<=50)] = 2
age_range[(age>50)&(age<=70)] = 3
age_range[age>70] = 4

train_df_1['Age_range'] = age_range

In [256]:
train_df_1['Response'].value_counts()

1    46710
0    42039
Name: Response, dtype: int64

In [257]:
train_df_2 = pd.get_dummies(train_df_1, columns=['Gender','Vehicle_Age','Vehicle_Damage','Age_range'])
test_df_1 = pd.get_dummies(test_df, columns=['Gender','Vehicle_Age','Vehicle_Damage'])
train_df_2.head()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender_Female,Gender_Male,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_No,Vehicle_Damage_Yes,Age_range_1.0,Age_range_2.0,Age_range_3.0,Age_range_4.0
57999,130483,33,1,30.0,0,28243.0,124.0,290,1,0,1,0,1,0,0,1,0,1,0,0
41498,61060,45,1,17.0,0,33611.0,7.0,41,0,0,1,1,0,0,0,1,0,1,0,0
82988,333172,49,1,35.0,0,26938.0,26.0,194,1,1,0,1,0,0,0,1,0,1,0,0
86417,361389,51,1,28.0,0,34104.0,122.0,246,1,0,1,1,0,0,0,1,0,0,1,0
20658,344891,20,1,8.0,0,35019.0,160.0,226,0,0,1,0,1,0,0,1,1,0,0,0


In [258]:
from sklearn.model_selection import train_test_split

X = train_df_2.drop(['id','Age','Response'],axis=1)
y = train_df_2['Response'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=10)
X_test = test_df_1.drop(['id','Age'],axis=1)

### Modeling

In [276]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, roc_auc_score

scaler = MinMaxScaler()
xgc = XGBClassifier(n_estimators=200, learning_rate=0.08, 
                    reg_lambda = 1, reg_alpha = 1, 
                    gamma=0.3, max_depth=6, eval_metric = 'auc')
pipeline = Pipeline(steps=[('Scale',scaler),('Classifier',xgc)])
model = pipeline.fit(X_train,y_train)

In [271]:
model.score(X_train,y_train)

0.8180691395862442

In [272]:
model.score(X_val,y_val)

0.8065802816901408

In [262]:
def metric(y_true,y_pred):
    
    indices_tp = np.where(y_true==1)
    predict_1 = y_pred[indices_tp]
    tp = sum(predict_1==1)
    fn = sum(predict_1==0)
    
    indices_tn = np.where(y_true==0)
    predict_0 = y_pred[indices_tn]
    tn = sum(predict_0==0)
    fp = sum(predict_0==1)
    
    print('Summary:')
    print('-------')
    print('%3.2f percent of Yes predicted correctly'%(tp/len(predict_1)*100))
    print('%3.2f percent of Yes misclassified as No'%(fn/len(predict_1)*100))
    print('%3.2f percent of No predicted correctly'%(tn/len(predict_0)*100))
    print('%3.2f percent of No misclassified as Yes'%(fp/len(predict_0)*100))
    print('Acc. -- %3.2f percent'%((tp+tn)/(tp+tn+fn+fp)*100))
    print('F1 score -- %2.2f'%(2*tp/(2*tp+fn+fp)))

In [273]:
y_pred = model.predict(X_val)
metric(y_val,y_pred)

Summary:
-------
93.72 percent of Yes predicted correctly
6.28 percent of Yes misclassified as No
66.00 percent of No predicted correctly
34.00 percent of No misclassified as Yes
Acc. -- 80.66 percent
F1 score -- 0.84


In [274]:
roc_auc_score(y_val,y_pred)

0.7985697313724809

### DNN Model

In [180]:
X_in = scaler.fit_transform(X_train)
X_try = scaler.fit_transform(X_val)

In [181]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop

model = Sequential()
model.add(Dense(200, input_shape=(X_train.shape[1],), activation='relu')) 
model.add(Dense(100, activation = 'relu'))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
model.fit(X_in,y_train, batch_size=100, 
          validation_data= (X_try, y_val),
          epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x24ee85bed30>