In [4]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, XGBClassifier

pd.set_option('display.max_rows', 500)

In [14]:
# Importing data
df = pd.read_csv('InsNova_train.csv')
df = df.sample(frac=1.0)
df.loc[:, 'pure_premium'] = df['claim_cost'] / df['exposure']
df.loc[:, 'severity'] = df['claim_cost'] / np.fmax(df['claim_count'], 1)
df.loc[:, 'frequency'] = df['claim_count'] / df['exposure']
    
# Feature engineering
df['large_veh'] = np.where(df['veh_body'].isin(['MIBUS', 'MCARA', 'BUS']), 1, 0)
df['expensive_area'] = np.where(df['area'].isin(['E','F']), 1, 0)
df['expensive_age_risk'] = np.where(df['dr_age'].isin([1, 2]) & (df['veh_value'] > 5.0), 1, 0)
df['expensive_veh'] = np.where(df['veh_value'] > 6.0, 1, 0)
df['severe_veh'] = np.where(df['veh_body'].isin(['HDTOP', 'TRUCK', 'UTE']), 1, 0)
df['young_dr'] = np.where(df['dr_age'] == 1, 1, 0)
df['old_dr'] = np.where(df['dr_age'] > 4.0, 1, 0)
df['new_veh'] = np.where(df['veh_age'] < 2.0, 1, 0)
df['old_veh'] = np.where(df['veh_age'] == 4.0, 1, 0)
df['frequent_area'] = np.where(df['area'].isin(['B','F']), 1, 0)
df['young_dr_old_car'] = np.where((df['dr_age'] == 1) & (df['veh_age'] > 1.0), 1, 0)
df['young_m_old_car'] = np.where((df['dr_age'] == 1) & (df['veh_age'] > 1.0) & (df['gender'] == 'M'), 1, 0)
df['young_f_old_car'] = np.where((df['dr_age'] == 1) & (df['veh_age'] > 1.0) & (df['gender'] == 'F'), 1, 0)
df['frequent_body'] = np.where(df['veh_body'].isin(['BUS', 'COUPE', 'HDTOP', 'MCARA', 'PANVN', 'STNWG']), 1, 0)
df['infrequent_body'] = np.where(df['veh_body'].isin(['MIBUS', 'UTE']), 1, 0)
df['young_m'] = np.where((df['gender'] == 'M') & (df['dr_age'] < 3.0), 1, 0)

cat_cols = ['veh_body',
            'area',
            'gender',
            'large_veh',
            'expensive_area',
            'expensive_age_risk',
            'expensive_veh',
            'severe_veh',
            'young_dr',
            'old_dr',
            'new_veh',
            'old_veh',
            'frequent_area',
            'young_dr_old_car',
            'young_m_old_car',
            'young_f_old_car',
            'frequent_body',
            'infrequent_body',
            'young_m']

# Creating Categorical dataset for LightGBM and CatBoost
for i in cat_cols:
    df[i] = df[i].astype('category').cat.codes

# Splitting into pred/response
response_cols = ['exposure',
                 'claim_ind',
                 'claim_count',
                 'claim_cost',
                 'pure_premium',
                 'severity',
                 'frequency']
X, y = df.drop(response_cols, axis=1), df[response_cols]
X = X.drop('id', axis=1)
X['exposure'] = y['exposure'].copy()
pred_cols = ['veh_value', 'veh_age', 'dr_age'] + cat_cols
X_cont = X[pred_cols]
X_ind = X[pred_cols + ['exposure']]
sev_mask = y['claim_cost'] > 0.0

In [25]:
# Claim Ind
ind_model = XGBClassifier(n_estimators=500,
                          learning_rate=0.02,
                          subsample=0.67,
                          tree_method='hist',
                          scale_pos_weight= (y.shape[0] - y['claim_ind'].sum()) / y['claim_ind'].sum(),
                          max_depth=5,
                          colsample_bytree=0.5,
                          n_jobs=-1)
ind_model.fit(X_ind, y['claim_ind'])
ind_importances = pd.DataFrame({'feature': X_ind.columns, 'gain': ind_model.feature_importances_})
print(ind_importances.sort_values('gain', ascending=False))

               feature      gain
22            exposure  0.113471
0            veh_value  0.062772
16    young_dr_old_car  0.053334
12              old_dr  0.048422
2               dr_age  0.046759
3             veh_body  0.044051
13             new_veh  0.043088
4                 area  0.042747
18     young_f_old_car  0.042241
11            young_dr  0.041623
5               gender  0.039904
19       frequent_body  0.039682
15       frequent_area  0.039280
20     infrequent_body  0.039272
1              veh_age  0.037775
21             young_m  0.037176
10          severe_veh  0.036275
7       expensive_area  0.035854
17     young_m_old_car  0.035687
14             old_veh  0.035265
6            large_veh  0.031842
8   expensive_age_risk  0.027903
9        expensive_veh  0.025577


In [24]:
# Claim Freq
freq_model = XGBRegressor(n_estimators=500,
                          learning_rate=0.02,
                          subsample=0.67,
                          tree_method='hist',
                          objective='count:poisson',
                          max_depth=5,
                          colsample_bytree=0.5,
                          n_jobs=-1)
freq_model.fit(X_cont, y['frequency'], sample_weight=y['exposure'])
freq_importances = pd.DataFrame({'feature': X_cont.columns, 'gain': freq_model.feature_importances_})
print(freq_importances.sort_values('gain', ascending=False))

               feature      gain
0            veh_value  0.069510
12              old_dr  0.066400
16    young_dr_old_car  0.064667
2               dr_age  0.056436
17     young_m_old_car  0.054727
3             veh_body  0.049447
15       frequent_area  0.048848
4                 area  0.047951
11            young_dr  0.044758
5               gender  0.044632
20     infrequent_body  0.044199
18     young_f_old_car  0.043102
19       frequent_body  0.042414
21             young_m  0.042208
1              veh_age  0.041805
13             new_veh  0.040452
10          severe_veh  0.037998
7       expensive_area  0.036930
14             old_veh  0.034344
6            large_veh  0.032656
8   expensive_age_risk  0.029170
9        expensive_veh  0.027349


In [23]:
# Claim Freq (only claims)
freqc_model = XGBRegressor(n_estimators=500,
                          learning_rate=0.02,
                          subsample=0.67,
                          tree_method='hist',
                          objective='count:poisson',
                          max_depth=5,
                          colsample_bytree=0.5,
                          n_jobs=-1)
freqc_model.fit(X_cont.loc[sev_mask, :],
                y.loc[sev_mask, 'frequency'],
                sample_weight=y.loc[sev_mask, 'exposure'])
freqc_importances = pd.DataFrame({'feature': X_cont.columns, 'gain': freqc_model.feature_importances_})
print(freqc_importances.sort_values('gain', ascending=False))

               feature      gain
0            veh_value  0.076680
15       frequent_area  0.063253
18     young_f_old_car  0.054806
2               dr_age  0.053545
3             veh_body  0.052653
4                 area  0.051478
19       frequent_body  0.050993
12              old_dr  0.050972
13             new_veh  0.049098
1              veh_age  0.047706
5               gender  0.044428
14             old_veh  0.044391
21             young_m  0.043188
16    young_dr_old_car  0.042273
10          severe_veh  0.041430
17     young_m_old_car  0.040097
11            young_dr  0.039788
20     infrequent_body  0.037744
7       expensive_area  0.033252
6            large_veh  0.030214
8   expensive_age_risk  0.028228
9        expensive_veh  0.023785


In [22]:
# Claim Severity
sev_model = XGBRegressor(n_estimators=500,
                          learning_rate=0.02,
                          subsample=0.67,
                          tree_method='hist',
                          objective='reg:gamma',
                          max_depth=5,
                          colsample_bytree=0.5,
                          n_jobs=-1)
sev_model.fit(X_cont.loc[sev_mask, :],
              y.loc[sev_mask, 'severity'],
              sample_weight=y.loc[sev_mask, 'claim_count'])
sev_importances = pd.DataFrame({'feature': X_cont.columns, 'gain': sev_model.feature_importances_})
print(sev_importances.sort_values('gain', ascending=False))

               feature      gain
21             young_m  0.072461
8   expensive_age_risk  0.069857
0            veh_value  0.061316
7       expensive_area  0.053120
10          severe_veh  0.052756
1              veh_age  0.051970
4                 area  0.051621
13             new_veh  0.049976
5               gender  0.049283
2               dr_age  0.049118
16    young_dr_old_car  0.047370
19       frequent_body  0.045752
3             veh_body  0.044794
20     infrequent_body  0.038880
12              old_dr  0.038733
17     young_m_old_car  0.037252
14             old_veh  0.036132
18     young_f_old_car  0.035549
15       frequent_area  0.032938
11            young_dr  0.032182
6            large_veh  0.032047
9        expensive_veh  0.016893


In [26]:
# Pure Premium
pp_model = XGBRegressor(n_estimators=500,
                        learning_rate=0.02,
                        subsample=0.67,
                        tree_method='hist',
                        objective='reg:tweedie',
                        tweedie_variance_power=1.7,
                        max_depth=5,
                        colsample_bytree=0.5,
                        n_jobs=-1)
pp_model.fit(X_cont.loc[sev_mask, :],
             y.loc[sev_mask, 'pure_premium'],
             sample_weight=y.loc[sev_mask, 'exposure'])
pp_importances = pd.DataFrame({'feature': X_cont.columns, 'gain': pp_model.feature_importances_})
print(pp_importances.sort_values('gain', ascending=False))

               feature      gain
8   expensive_age_risk  0.075986
0            veh_value  0.065397
21             young_m  0.059138
10          severe_veh  0.056661
17     young_m_old_car  0.054997
4                 area  0.051665
2               dr_age  0.048935
14             old_veh  0.046895
1              veh_age  0.046270
7       expensive_area  0.046056
5               gender  0.046043
3             veh_body  0.045582
13             new_veh  0.043817
15       frequent_area  0.043449
12              old_dr  0.042309
20     infrequent_body  0.041034
19       frequent_body  0.040145
16    young_dr_old_car  0.038818
18     young_f_old_car  0.037785
11            young_dr  0.030242
6            large_veh  0.023579
9        expensive_veh  0.015199
