In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
from importlib import reload
reload(plt)

from scipy.stats import norm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test CSVs from the local ./input directory (train first, then test).
train, test = map(pd.read_csv, ('./input/train.csv', './input/test.csv'))
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,2,1,D3,10,0.076923,2,1,1,0.641791,0.581818,...,0,0,0,0,0,0,0,0,0,8
1,5,1,A1,26,0.076923,2,3,1,0.059701,0.6,...,0,0,0,0,0,0,0,0,0,4
2,6,1,E1,26,0.076923,2,3,1,0.029851,0.745455,...,0,0,0,0,0,0,0,0,0,8
3,7,1,D4,10,0.487179,2,3,1,0.164179,0.672727,...,0,0,0,0,0,0,0,0,0,8
4,8,1,D2,26,0.230769,2,3,1,0.41791,0.654545,...,0,0,0,0,0,0,0,0,0,8


In [3]:
# Compare the (rows, columns) shapes of the raw training and test frames.
train.shape, test.shape

((59381, 128), (19765, 127))

In [4]:
# List the column labels of the training frame.
train.columns

Index(['Id', 'Product_Info_1', 'Product_Info_2', 'Product_Info_3',
       'Product_Info_4', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7',
       'Ins_Age', 'Ht',
       ...
       'Medical_Keyword_40', 'Medical_Keyword_41', 'Medical_Keyword_42',
       'Medical_Keyword_43', 'Medical_Keyword_44', 'Medical_Keyword_45',
       'Medical_Keyword_46', 'Medical_Keyword_47', 'Medical_Keyword_48',
       'Response'],
      dtype='object', length=128)

In [5]:
# Per-column missing-value summary for the training frame:
# 'Total' is the count of nulls, 'Percent' the fraction of rows that are null,
# each sorted from largest to smallest before being joined side by side.
train_null_flags = train.isnull()
total = train_null_flags.sum().sort_values(ascending=False)
percent = (train_null_flags.sum() / train_null_flags.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

Unnamed: 0,Total,Percent
Medical_History_10,58824,0.990620
Medical_History_32,58274,0.981358
Medical_History_24,55580,0.935990
Medical_History_15,44596,0.751015
Family_Hist_5,41811,0.704114
Family_Hist_3,34241,0.576632
Family_Hist_2,28656,0.482579
Insurance_History_5,25396,0.427679
Family_Hist_4,19184,0.323066
Employment_Info_6,10854,0.182786


In [6]:
# Summarise index, dtypes and memory usage of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB


In [7]:
# One-hot encode the categorical Product_Info_2 column of the training frame;
# the indicator columns are named with the 'Product_Info_2' prefix.
categorical_cols = ['Product_Info_2']
train = pd.get_dummies(train, columns=categorical_cols, prefix='Product_Info_2')

In [8]:
# Row-wise sum over the label slice Medical_Keyword_1 .. Medical_Keyword_48,
# stored as a single aggregate feature.
train_keyword_flags = train.loc[:, 'Medical_Keyword_1':'Medical_Keyword_48']
train['Medical_Keywords'] = train_keyword_flags.sum(axis=1)

In [9]:
# Check the shape after one-hot encoding and adding Medical_Keywords.
train.shape

(59381, 147)

In [10]:
# train.drop(train.loc[:, 'Medical_Keyword_1':'Medical_Keyword_48'], inplace=True, axis=1)

In [11]:
# Shape check; the drop in the preceding cell is commented out, so nothing has changed.
train.shape

(59381, 147)

In [12]:
# train.drop(train.loc[:, 'Medical_History_1':'Medical_History_41'], inplace=True, axis=1)

In [13]:
# Interaction feature: BMI multiplied by insured age.
train['BMI_Age'] = train['BMI'] * train['Ins_Age']

# Squared and cubed versions of age, height and weight. The dicts map each new
# column name to its source column; squares are added before cubes so the
# resulting column order is unchanged.
train_squares = {'Ins_Age_sq': 'Ins_Age', 'Ht_sq': 'Ht', 'Wt_sq': 'Wt'}
train_cubes = {'Ins_Age_cu': 'Ins_Age', 'Ht_cu': 'Ht', 'Wt_cu': 'Wt'}
for new_col, src_col in train_squares.items():
    train[new_col] = train[src_col] * train[src_col]
for new_col, src_col in train_cubes.items():
    train[new_col] = train[src_col] * train[src_col] * train[src_col]

In [14]:
# Check the shape after adding the interaction and polynomial columns.
train.shape

(59381, 154)

In [15]:
# Inspect the dtypes after feature engineering.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 154 entries, Id to Wt_cu
dtypes: float64(25), int64(110), uint8(19)
memory usage: 62.2 MB


In [16]:
# Cast the one-hot indicator columns of the training frame to int64 in one
# vectorised step instead of a per-column loop.
# Both 'uint8' and 'bool' are selected because the dtype produced by
# pd.get_dummies depends on the pandas version (bool by default from pandas 2.0);
# bool columns left in place would turn `train.values` into an object array.
# There is deliberately no try/except: a failed cast should surface as an error
# rather than be printed and ignored.
train_indicator_cols = train.select_dtypes(include=['uint8', 'bool']).columns
train = train.astype({col: 'int64' for col in train_indicator_cols})

In [17]:
# Confirm the dtypes after the integer conversion.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 154 entries, Id to Wt_cu
dtypes: float64(25), int64(129)
memory usage: 69.8 MB


In [18]:
# Replace every missing value in the training frame with the sentinel -1
# (rebinding the name rather than mutating in place).
train = train.fillna(-1)

In [19]:
# Re-run the missing-value summary on the filled training frame; the final
# expression is the larger of the two column sums ('Total', 'Percent').
train_null_flags = train.isnull()
total = train_null_flags.sum().sort_values(ascending=False)
percent = (train_null_flags.sum() / train_null_flags.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.sum().max()

0.0

In [20]:
# Feature matrix: every training column except the target, as a NumPy array.
train_feature_mask = train.columns != 'Response'
x_train = train.loc[:, train_feature_mask].values
x_train.shape

(59381, 153)

In [21]:
# Target vector: the Response column as a NumPy array.
target_column = 'Response'
y_train = train[target_column].values

In [22]:
# Per-column missing-value summary for the test frame:
# 'Total' is the count of nulls, 'Percent' the fraction of rows that are null,
# each sorted from largest to smallest before being joined side by side.
test_null_flags = test.isnull()
total = test_null_flags.sum().sort_values(ascending=False)
percent = (test_null_flags.sum() / test_null_flags.count()).sort_values(ascending=False)
missing_data_1 = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data_1

Unnamed: 0,Total,Percent
Medical_History_10,19564,0.989831
Medical_History_32,19414,0.982241
Medical_History_24,18585,0.940299
Medical_History_15,14864,0.752036
Family_Hist_5,13624,0.689299
Family_Hist_3,11064,0.559777
Family_Hist_2,9880,0.499874
Insurance_History_5,8105,0.410068
Family_Hist_4,6677,0.337819
Employment_Info_6,3787,0.191601


In [23]:
# Summarise index, dtypes and memory usage of the raw test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19765 entries, 0 to 19764
Columns: 127 entries, Id to Medical_Keyword_48
dtypes: float64(18), int64(108), object(1)
memory usage: 19.2+ MB


In [24]:
# One-hot encode the categorical Product_Info_2 column of the test frame;
# the indicator columns are named with the 'Product_Info_2' prefix.
categorical_cols = ['Product_Info_2']
test = pd.get_dummies(test, columns=categorical_cols, prefix='Product_Info_2')

In [25]:
# Row-wise sum over the label slice Medical_Keyword_1 .. Medical_Keyword_48,
# stored as a single aggregate feature.
test_keyword_flags = test.loc[:, 'Medical_Keyword_1':'Medical_Keyword_48']
test['Medical_Keywords'] = test_keyword_flags.sum(axis=1)

In [26]:
# Check the shape after one-hot encoding and adding Medical_Keywords.
test.shape

(19765, 146)

In [27]:
# test.drop(test.loc[:, 'Medical_Keyword_1':'Medical_Keyword_48'], inplace=True, axis=1)

In [28]:
# Shape check; the drop in the preceding cell is commented out, so nothing has changed.
test.shape

(19765, 146)

In [29]:
# test.drop(test.loc[:, 'Medical_History_1':'Medical_History_41'], inplace=True, axis=1)

In [30]:
# Interaction feature: BMI multiplied by insured age.
test['BMI_Age'] = test['BMI'] * test['Ins_Age']

# Squared and cubed versions of age, height and weight. The dicts map each new
# column name to its source column; squares are added before cubes so the
# resulting column order is unchanged.
test_squares = {'Ins_Age_sq': 'Ins_Age', 'Ht_sq': 'Ht', 'Wt_sq': 'Wt'}
test_cubes = {'Ins_Age_cu': 'Ins_Age', 'Ht_cu': 'Ht', 'Wt_cu': 'Wt'}
for new_col, src_col in test_squares.items():
    test[new_col] = test[src_col] * test[src_col]
for new_col, src_col in test_cubes.items():
    test[new_col] = test[src_col] * test[src_col] * test[src_col]

In [31]:
# Check the shape after adding the interaction and polynomial columns.
test.shape

(19765, 153)

In [32]:
# Inspect the dtypes after feature engineering.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19765 entries, 0 to 19764
Columns: 153 entries, Id to Wt_cu
dtypes: float64(25), int64(109), uint8(19)
memory usage: 20.6 MB


In [33]:
# Cast the one-hot indicator columns of the test frame to int64 in one
# vectorised step instead of a per-column loop.
# Both 'uint8' and 'bool' are selected because the dtype produced by
# pd.get_dummies depends on the pandas version (bool by default from pandas 2.0);
# bool columns left in place would turn `test.values` into an object array.
# There is deliberately no try/except: a failed cast should surface as an error
# rather than be printed and ignored.
test_indicator_cols = test.select_dtypes(include=['uint8', 'bool']).columns
test = test.astype({col: 'int64' for col in test_indicator_cols})

In [34]:
# Confirm the dtypes after the integer conversion.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19765 entries, 0 to 19764
Columns: 153 entries, Id to Wt_cu
dtypes: float64(25), int64(128)
memory usage: 23.1 MB


In [35]:
# Replace every missing value in the test frame with the sentinel -1
# (rebinding the name rather than mutating in place).
test = test.fillna(-1)

In [36]:
# Re-run the missing-value summary on the filled test frame; the final
# expression is the larger of the two column sums ('Total', 'Percent').
test_null_flags = test.isnull()
total = test_null_flags.sum().sort_values(ascending=False)
percent = (test_null_flags.sum() / test_null_flags.count()).sort_values(ascending=False)
missing_data_1 = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data_1.sum().max()

0.0

In [37]:
# Build the test matrix from exactly the training feature columns, in the
# training order. train and test are one-hot encoded separately, so a plain
# `test.values` would silently misalign features if a Product_Info_2 category
# appeared in only one of the two files. Columns absent from test are filled
# with 0 (correct for a one-hot indicator; NOTE(review): any other missing
# column would also become 0 — confirm that is acceptable), and columns that
# exist only in test are dropped.
train_feature_cols = train.columns[train.columns != 'Response']
x_test = test.reindex(columns=train_feature_cols, fill_value=0).values

In [38]:
# Shape of the test feature matrix; its column count should equal x_train's,
# since the model fitted on x_train is later used to predict on x_test.
x_test.shape

(19765, 153)

In [None]:
import xgboost

# XGBClassifier expects class labels encoded as 0..n_classes-1. The Response
# values shown in the data preview (e.g. 4 and 8) do not start at 0, so encode
# them for training and map the predicted class indices back afterwards.
response_classes, y_train_encoded = np.unique(y_train, return_inverse=True)

# The original passed objective='reg:linear', a regression objective (and a
# deprecated alias) that does not match a classifier. State the multi-class
# objective explicitly instead.
# NOTE(review): gpu_id is kept as in the original; newer xgboost releases
# replace it with the `device` parameter — confirm against the installed version.
# NOTE(review): x_train/x_test still include the Id column as a feature (only
# Response is excluded when x_train is built) — confirm that is intended.
xgb = xgboost.XGBClassifier(n_estimators=400, learning_rate=0.06, gamma=0, subsample=1,
                            colsample_bytree=1, max_depth=7, booster='gbtree', random_state=0,
                            objective='multi:softprob', gpu_id=0)
xgb.fit(x_train, y_train_encoded)

# predict() returns encoded class indices; translate them back to the original labels.
predictions = response_classes[xgb.predict(x_test)]

In [None]:
# Assemble the submission table (test Id alongside the predicted Response)
# and write it to CSV without the index column.
submission_columns = {'Id': test['Id'], 'Response': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('xgboost-prudential-kaggle.csv', index=False)