In [3]:
# Importing packages
from sklearn.experimental import enable_hist_gradient_boosting
from scipy.stats import uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict, cross_val_score, GroupKFold, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel, RFECV, chi2
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.metrics import make_scorer, mean_poisson_deviance, mean_gamma_deviance, classification_report, brier_score_loss
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, RidgeCV, RidgeClassifierCV, PoissonRegressor, GammaRegressor, TweedieRegressor, LogisticRegression, LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, StackingRegressor, HistGradientBoostingRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, LinearSVC, SVR
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

pd.set_option('display.max_rows', 500)

In [4]:
# Importing data
df = pd.read_csv('InsNova_train.csv')
df = df.sample(frac=1.0)
df.loc[:, 'pure_premium'] = df['claim_cost'] / df['exposure']
df.loc[:, 'severity'] = df['claim_cost'] / np.fmax(df['claim_count'], 1)
df.loc[:, 'frequency'] = df['claim_count'] / df['exposure']

# Getting CV inds
n_folds = 20
cv = StratifiedKFold(n_folds, shuffle=True, random_state=123)
df.loc[:, 'fold'] = 0
for fold, (_, test_inds) in enumerate(cv.split(df, df['claim_ind'])):
    df.loc[test_inds, 'fold'] = fold
    
# Feature engineering
df['large_veh'] = np.where(df['veh_body'].isin(['MIBUS', 'MCARA', 'BUS']), 1, 0)
df['expensive_area'] = np.where(df['area'].isin(['E','F']), 1, 0)
df['expensive_age_risk'] = np.where(df['dr_age'].isin([1, 2]) & (df['veh_value'] > 5.0), 1, 0)
df['expensive_veh'] = np.where(df['veh_value'] > 6.0, 1, 0)
df['severe_veh'] = np.where(df['veh_body'].isin(['HDTOP', 'TRUCK', 'UTE']), 1, 0)
df['young_dr'] = np.where(df['dr_age'] == 1, 1, 0)
df['old_dr'] = np.where(df['dr_age'] > 4.0, 1, 0)
df['new_veh'] = np.where(df['veh_age'] < 2.0, 1, 0)
df['old_veh'] = np.where(df['veh_age'] == 4.0, 1, 0)
df['frequent_area'] = np.where(df['area'].isin(['B','F']), 1, 0)
df['young_dr_old_car'] = np.where((df['dr_age'] == 1) & (df['veh_age'] > 1.0), 1, 0)
df['young_m_old_car'] = np.where((df['dr_age'] == 1) & (df['veh_age'] > 1.0) & (df['gender'] == 'M'), 1, 0)
df['young_f_old_car'] = np.where((df['dr_age'] == 1) & (df['veh_age'] > 1.0) & (df['gender'] == 'F'), 1, 0)
df['frequent_body'] = np.where(df['veh_body'].isin(['BUS', 'COUPE', 'HDTOP', 'MCARA', 'PANVN', 'STNWG']), 1, 0)
df['infrequent_body'] = np.where(df['veh_body'].isin(['MIBUS', 'UTE']), 1, 0)
df['young_m'] = np.where((df['gender'] == 'M') & (df['dr_age'] < 3.0), 1, 0)

cat_cols = ['veh_body',
            'area',
            'gender',
            'large_veh',
            'expensive_area',
            'expensive_age_risk',
            'expensive_veh',
            'severe_veh',
            'young_dr',
            'old_dr',
            'new_veh',
            'old_veh',
            'frequent_area',
            'young_dr_old_car',
            'young_m_old_car',
            'young_f_old_car',
            'frequent_body',
            'infrequent_body',
            'young_m']

rating_vars = ['young_dr_old_car',
               'old_dr',
               'young_dr',
               'young_m_old_car',
               'infrequent_body',
               'frequent_body',
               'young_f_old_car',
               'new_veh', 
               'frequent_area',
               'young_m']

# Creating Categorical dataset for LightGBM and CatBoost
df['lm_gender'] = np.where(df['gender'] == 'M', 1, 0)
for i in cat_cols:
    df[i] = df[i].astype('category')
df['dr_age'] = df['dr_age'].astype(np.float64)
df['veh_age'] = df['veh_age'].astype(np.float64) 

# Splitting into pred/response
response_cols = ['fold',
                 'exposure',
                 'claim_ind',
                 'claim_count',
                 'claim_cost',
                 'pure_premium',
                 'severity',
                 'frequency']
X, y = df.drop(response_cols, axis=1), df[response_cols]
X = X.drop('id', axis=1)
X['exposure'] = y['exposure'].copy()
lin_cols = ['veh_value', 'veh_body', 'veh_age', 'lm_gender', 'area', 'dr_age']
boost_cols = ['veh_value', 'veh_body', 'veh_age', 'gender', 'area', 'dr_age']
lin_sev_cols = lin_cols + ['exposure']
boost_sev_cols = boost_cols + ['exposure']

# Importing test set
df_test = pd.read_csv('InsNova_test.csv')
df_test['lm_gender'] = np.where(df_test['gender'] == 'M', 1, 0)
df_test['large_veh'] = np.where(df_test['veh_body'].isin(['MIBUS', 'MCARA', 'BUS']), 1, 0)
df_test['expensive_area'] = np.where(df_test['area'].isin(['E','F']), 1, 0)
df_test['expensive_age_risk'] = np.where(df_test['dr_age'].isin([1, 2]) & (df_test['veh_value'] > 5.0), 1, 0)
df_test['expensive_veh'] = np.where(df_test['veh_value'] > 6.0, 1, 0)
df_test['severe_veh'] = np.where(df_test['veh_body'].isin(['HDTOP', 'TRUCK', 'UTE']), 1, 0)
df_test['young_dr'] = np.where(df_test['dr_age'] == 1, 1, 0)
df_test['old_dr'] = np.where(df_test['dr_age'] > 4.0, 1, 0)
df_test['new_veh'] = np.where(df_test['veh_age'] < 2.0, 1, 0)
df_test['old_veh'] = np.where(df_test['veh_age'] == 4.0, 1, 0)
df_test['frequent_area'] = np.where(df_test['area'].isin(['B','F']), 1, 0)
df_test['young_dr_old_car'] = np.where((df_test['dr_age'] == 1) & (df_test['veh_age'] > 1.0), 1, 0)
df_test['young_m_old_car'] = np.where((df_test['dr_age'] == 1) & (df_test['veh_age'] > 1.0) & (df_test['gender'] == 'M'), 1, 0)
df_test['young_f_old_car'] = np.where((df_test['dr_age'] == 1) & (df_test['veh_age'] > 1.0) & (df_test['gender'] == 'F'), 1, 0)
df_test['frequent_body'] = np.where(df_test['veh_body'].isin(['BUS', 'COUPE', 'HDTOP', 'MCARA', 'PANVN', 'STNWG']), 1, 0)
df_test['infrequent_body'] = np.where(df_test['veh_body'].isin(['MIBUS', 'UTE']), 1, 0)
df_test['young_m'] = np.where((df_test['gender'] == 'M') & (df_test['dr_age'] < 3.0), 1, 0)

for i in cat_cols:
    df_test[i] = df_test[i].astype('category')
    
df_test['dr_age'] = df_test['dr_age'].astype(np.float64)
df_test['veh_age'] = df_test['veh_age'].astype(np.float64) 

In [5]:
# Defining our gini function
def gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred / G_true

In [6]:
# Defining column transformers for later steps          
get_cats = make_column_selector(dtype_include=pd.CategoricalDtype)
get_notcats = make_column_selector(dtype_exclude=pd.CategoricalDtype)
get_notfloats = make_column_selector(dtype_exclude=np.float64)
get_floats = make_column_selector(dtype_include=np.float64)
get_ints = make_column_selector(dtype_include=[np.int32, np.int64])
one_hot = lambda: ColumnTransformer([('one_hot', OneHotEncoder(drop='first', sparse=False), get_cats)], remainder='passthrough')

# Initializing cross validated preds
cv_freq_preds = {}
test_freq_preds = {}
cv_sev_preds = {}
test_sev_preds = {}

In [10]:
# LGBM Classifier
ind_lgbm = LGBMClassifier(n_estimators=750,
                              learning_rate=0.1,
                              num_leaves=31,
                              subsample=0.8,
                              subsample_freq=1,
                              scale_pos_weight=(y.shape[0] - y['claim_ind'].sum()) / y['claim_ind'].sum(),
                              n_jobs=-1)

ginis = []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    ind_lgbm.fit(X_train[boost_sev_cols], y_train['claim_ind'])
    ginis.append(gini(y_test['claim_cost'], ind_lgbm.predict_proba(X_test[boost_sev_cols])[:, 1]))
print(ginis)
print(np.mean(ginis))

[0.190149963934325, -0.013077813242219507, 0.11724726635263655, -0.03613068840671824, 0.07796341189978917, 0.15483696518393744, 0.14191269695658587, -0.1169246443748487, 0.04851382484001185, 0.14148603232032858, 0.14076331839563852, 0.018533033523736413, -0.28679037859982986, -0.1930338204648662, 0.1153294150218813, 0.20055704805396818, 0.09827494790944377, -0.0014097935874787413, -0.03574337349051484, -0.0243584799022371]
0.03690494661617848


In [14]:
df_test.shape

(22629, 25)