In [84]:
# Importing packages
from sklearn.experimental import enable_hist_gradient_boosting
from scipy.stats import uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict, cross_val_score, GroupKFold, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.metrics import make_scorer, mean_poisson_deviance, mean_gamma_deviance
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.linear_model import LinearRegression, RidgeCV, PoissonRegressor, GammaRegressor, TweedieRegressor, LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, LinearSVC, SVR
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

pd.set_option('display.max_rows', 500)

In [4]:
# Importing data
df = pd.read_csv('InsNova_train.csv')
df = df.sample(frac=1.0)
df.loc[:, 'pure_premium'] = df['claim_cost'] / df['exposure']
df.loc[:, 'severity'] = df['claim_cost'] / np.fmax(df['claim_count'], 1)
df.loc[:, 'frequency'] = df['claim_count'] / df['exposure']

# Getting CV inds
n_folds = 10
cv = StratifiedKFold(n_folds, shuffle=True, random_state=123)
df.loc[:, 'fold'] = 0
for fold, (_, test_inds) in enumerate(cv.split(df, df['claim_ind'])):
    df.loc[test_inds, 'fold'] = fold
    
# Feature engineering
df['large_veh'] = np.where(df['veh_body'].isin(['MIBUS', 'MCARA', 'BUS']), 1, 0)
df['expensive_area'] = np.where(df['area'].isin(['E','F']), 1, 0)
df['expensive_age_risk'] = np.where(df['dr_age'].isin([1, 2]) & (df['veh_value'] > 5.0), 1, 0)
df['expensive_veh'] = np.where(df['veh_value'] > 6.0, 1, 0)
df['severe_veh'] = np.where(df['veh_body'].isin(['HDTOP', 'TRUCK', 'UTE']), 1, 0)
df['young'] = np.where(df['dr_age'] == 1, 1, 0)

# Creating Categorical dataset for LightGBM and CatBoost
df['lm_gender'] = np.where(df['gender'] == 'M', 1, 0)
for i in ['veh_body', 'area', 'gender', 'large_veh', 'expensive_area', 'expensive_age_risk', 'expensive_veh', 'severe_veh', 'young']:
    df[i] = df[i].astype('category')
df['dr_age'] = df['dr_age'].astype(np.float64)
df['veh_age'] = df['veh_age'].astype(np.float64) 

# Splitting into pred/response
response_cols = ['fold',
                 'exposure',
                 'claim_ind',
                 'claim_count',
                 'claim_cost',
                 'pure_premium',
                 'severity',
                 'frequency']
X, y = df.drop(response_cols, axis=1), df[response_cols]
X = X.drop('id', axis=1)
X['exposure'] = y['exposure'].copy()
lin_cols = ['veh_value', 'veh_body', 'veh_age', 'lm_gender', 'area', 'dr_age']
boost_cols = ['veh_value', 'veh_body', 'veh_age', 'gender', 'area', 'dr_age']
lin_sev_cols = lin_cols + ['exposure']
boost_sev_cols = boost_cols + ['exposure']

# Importing test set
df_test = pd.read_csv('InsNova_test.csv')
df_test['lm_gender'] = np.where(df_test['gender'] == 'M', 1, 0)
df_test['large_veh'] = np.where(df_test['veh_body'].isin(['MIBUS', 'MCARA', 'BUS']), 1, 0)
df_test['expensive_area'] = np.where(df_test['area'].isin(['E','F']), 1, 0)
df_test['expensive_age_risk'] = np.where(df_test['dr_age'].isin([1, 2]) & (df_test['veh_value'] > 5.0), 1, 0)
df_test['expensive_veh'] = np.where(df_test['veh_value'] > 6.0, 1, 0)
df_test['severe_veh'] = np.where(df_test['veh_body'].isin(['HDTOP', 'TRUCK', 'UTE']), 1, 0)
df_test['young'] = np.where(df_test['dr_age'] == 1, 1, 0)

for i in ['veh_body', 'area', 'gender', 'large_veh', 'expensive_area', 'expensive_age_risk', 'expensive_veh', 'severe_veh', 'young']:
    df_test[i] = df_test[i].astype('category')
    
df_test['dr_age'] = df_test['dr_age'].astype(np.float64)
df_test['veh_age'] = df_test['veh_age'].astype(np.float64) 

In [25]:
# Defining our gini function
def gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred / G_true

In [26]:
# Defining column transformers for later steps          
get_cats = make_column_selector(dtype_include=pd.CategoricalDtype)
get_notcats = make_column_selector(dtype_exclude=pd.CategoricalDtype)
get_notfloats = make_column_selector(dtype_exclude=np.float64)
get_floats = make_column_selector(dtype_include=np.float64)
get_ints = make_column_selector(dtype_include=[np.int32, np.int64])
one_hot = lambda: ColumnTransformer([('one_hot', OneHotEncoder(drop='first', sparse=False), get_cats)], remainder='passthrough')

# Initializing cross validated preds
cv_freq_preds = {}
test_freq_preds = {}
cv_sev_preds = {}
test_sev_preds = {}

In [27]:
# Defining a target encoder
class TargetEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, use_median=True, nonzero=True, min_samples=10):
        self.nonzero = nonzero
        self.use_median = use_median
        self.min_samples = min_samples
        self.categories_ = None
        self.medians_ = None
        self.all_mu = None
        
    def fit(self, X, y=None):
        _X, _y = np.array(X), np.array(y)
        self.categories_ = {i: np.unique(_X[:, i]) for i in range(_X.shape[1])}
        self.mu_ = {k:{i:[] for i in v} for k, v in self.categories_.items()}
        mu_func = np.median if self.use_median else np.mean
        self.all_mu = mu_func(_y[_y > 0.0]) if self.nonzero else mu_func(_y)
        for k, v in self.categories_.items():
            for i in v:
                if self.nonzero:
                    _x = _y[(_y > 0.0) & (_X[:, k] == i)]
                    if _x.shape[0] == 0:
                        self.mu_[k][i] = 0.0
                    elif _x.shape[0] < self.min_samples:
                        self.mu_[k][i] = self.all_mu
                    else:
                        self.mu_[k][i] = mu_func(_x)
                else:
                    _x = _y[_X[:, k] == i]
                    if _x.shape[0] < self.min_samples:
                        self.mu_[k][i] = self.all_mu
                    else:
                        self.mu_[k][i] = mu_func(_x)
        return self
    
    def transform(self, X, y=None):
        _X = np.array(X)
        for i in range(_X.shape[1]):
            _X[:, i] = [self.mu_[i][j] if j in self.mu_.keys() else self.all_mu for j in _X[:, i]]
        return np.array(_X)

# Frequency

In [156]:
# Freq sev predictions
freq_lm = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats),
                                          ('box-cox', PowerTransformer(standardize=False), get_notcats)], remainder='passthrough'),
                             PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
                             StandardScaler(),
                             PoissonRegressor(alpha=1.0, max_iter=1000))

# Getting predictions
ginis, poissons, test_preds, train_preds = [], [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    freq_lm.fit(X_train[lin_cols], y_train['frequency'], poissonregressor__sample_weight=y_train['exposure'])
    train_preds.append(pd.Series(freq_lm.predict(X_test[lin_cols]), index=X_test.index))
    test_preds.append(freq_lm.predict(df_test[lin_cols]))
    poissons.append(mean_poisson_deviance(y_test['frequency'], train_preds[-1]))
    ginis.append(gini(y_test['claim_count'], train_preds[-1] * y_test['exposure']))
print(np.mean(ginis))
print(np.mean(poissons))
test_freq_preds['freq_lm'] = np.mean(test_preds, axis=0)
cv_freq_preds['freq_lm'] = pd.concat(train_preds).loc[y.index]

0.32581236185788
1.3437953419221085


In [178]:
# LGBM freq
csts = {'veh_value': -1, 'dr_age': -1, 'veh_age': 1}
freq_lgbm = LGBMRegressor(n_estimators=250,
                          learning_rate=0.01,
                          subsample=0.8,
                          subsample_freq=1,
                          objective='poisson',
                          monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in boost_cols],
                          monotone_constraints_method='advanced',
                          num_leaves=16,
                          n_jobs=-1)

# Getting predictions
ginis, poissons, test_preds, train_preds = [], [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    freq_lgbm.fit(X_train[boost_cols], y_train['frequency'], sample_weight=y_train['exposure'])
    train_preds.append(pd.Series(freq_lgbm.predict(X_test[boost_cols]), index=X_test.index))
    test_preds.append(freq_lgbm.predict(df_test[boost_cols]))
    poissons.append(mean_poisson_deviance(y_test['frequency'], train_preds[-1]))
    ginis.append(gini(y_test['claim_count'], train_preds[-1] * y_test['exposure']))
print(np.mean(ginis))
print(np.mean(poissons))
test_freq_preds['freq_lgbm'] = np.mean(test_preds, axis=0)
cv_freq_preds['freq_lgbm'] = pd.concat(train_preds).loc[y.index]

0.32734163673833805
1.3447274983494935


In [179]:
# Catboost freq
csts = {'veh_value': -1, 'dr_age': -1, 'veh_age': 1}
freq_cat = CatBoostRegressor(n_estimators=250,
                             learning_rate=0.05,
                             objective='Poisson',
                             max_depth=3,
                             cat_features=[i for i in boost_cols if i not in ['veh_value', 'dr_age', 'veh_age']],
                             monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in boost_cols],
                             verbose=0,
                             thread_count=-1)

# Getting predictions
ginis, poissons, test_preds, train_preds = [], [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    freq_cat.fit(X_train[boost_cols], y_train['frequency'], sample_weight=y_train['exposure'])
    train_preds.append(pd.Series(freq_cat.predict(X_test[boost_cols]), index=X_test.index))
    test_preds.append(freq_cat.predict(df_test[boost_cols]))
    poissons.append(mean_poisson_deviance(y_test['frequency'], train_preds[-1]))
    ginis.append(gini(y_test['claim_count'], train_preds[-1] * y_test['exposure']))
print(np.mean(ginis))
print(np.mean(poissons))
test_freq_preds['freq_cat'] = np.mean(test_preds, axis=0)
cv_freq_preds['freq_cat'] = pd.concat(train_preds).loc[y.index]

0.3266319491597976
1.337405816344505


In [181]:
# Freq sev predictions
csts = {'veh_value': -1, 'dr_age': -1, 'veh_age': 1}
freq_hgb = make_pipeline(ColumnTransformer([('target_enc', OrdinalEncoder(), get_cats)], remainder='passthrough'),
                        HistGradientBoostingRegressor(loss='poisson',
                                                      max_iter=150,
                                                      learning_rate=0.01,
                                                      max_leaf_nodes=16,
                                                      l2_regularization=10.0,
                                                      monotonic_cst=[csts[i] if i in csts.keys() else 0 for i in lin_cols]
                                                     ))

# Getting predictions
ginis, poissons, test_preds, train_preds = [], [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    freq_hgb.fit(X_train[lin_cols], y_train['frequency'], histgradientboostingregressor__sample_weight=y_train['exposure'])
    train_preds.append(pd.Series(freq_hgb.predict(X_test[lin_cols]), index=X_test.index))
    test_preds.append(freq_hgb.predict(df_test[lin_cols]))
    poissons.append(mean_poisson_deviance(y_test['frequency'], train_preds[-1]))
    ginis.append(gini(y_test['claim_count'], train_preds[-1] * y_test['exposure']))
print(np.mean(ginis))
print(np.mean(poissons))
test_freq_preds['freq_hgb'] = np.mean(test_preds, axis=0)
cv_freq_preds['freq_hgb'] = pd.concat(train_preds).loc[y.index]

0.3270711767754437
1.3437223555033369


In [264]:
# Stacking frequency
"""
csts = {'veh_value': 1, 'dr_age': -1}#, **{i: 1 for i in list(cv_freq_preds.keys())}}
X_cv = pd.concat([X[boost_cols], pd.DataFrame(cv_freq_preds)], axis=1)
freq_stack = CatBoostRegressor(n_estimators=250,
                              objective='Poisson',
                              max_depth=3,
                              cat_features=[i for i in X_cv.columns if i in ['veh_body', 'gender', 'area']],
                              monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in X_cv.columns],
                              verbose=0,
                              thread_count=-1)
"""
X_cv = pd.concat([X[lin_cols], pd.DataFrame(cv_freq_preds)], axis=1)
freq_stack = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats),
                                          ('box-cox', PowerTransformer(standardize=False), get_notcats)], remainder='passthrough'),
                             PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
                             StandardScaler(),
                             PoissonRegressor(alpha=0.0, max_iter=1000))
ginis, poissons, test_preds, train_preds = [], [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X_cv.loc[y['fold'] != i, :], X_cv.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    freq_stack.fit(X_train, y_train['frequency'], poissonregressor__sample_weight=y_train['exposure'])
    train_preds.append(pd.Series(freq_stack.predict(X_test), index=X_test.index))
    poissons.append(mean_poisson_deviance(y_test['frequency'], train_preds[-1]))
    ginis.append(gini(y_test['claim_count'], train_preds[-1] * y_test['exposure']))
freq_stack_preds = pd.concat(train_preds).loc[y.index]
print(np.mean(ginis))
print(np.mean(poissons))
freq_stack.fit(X_cv, y['frequency'], poissonregressor__sample_weight=y['exposure'])

0.3316748343767709
1.3315047301414107


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('target_enc',
                                                  TargetEncoder(min_samples=20),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000190B1DF1FD0>),
                                                 ('box-cox',
                                                  PowerTransformer(standardize=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000190B1DF1820>)])),
                ('polynomialfeatures',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('standardscaler', StandardScaler()),
                ('poissonregressor',
                 PoissonRegressor(alpha=0.0, max_iter=1000))])

# Severity

In [183]:
# Freq sev predictions
sev_lm = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats),
                                       ('box-cox', PowerTransformer(standardize=False), get_notcats)], remainder='passthrough'),
                           PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
                           StandardScaler(),
                           GammaRegressor(alpha=5.0, max_iter=1000))

# Getting predictions
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    sev_lm.fit(X_train.loc[sev_mask, lin_sev_cols], y_train.loc[sev_mask, 'severity'], gammaregressor__sample_weight=y_train.loc[sev_mask, 'claim_count'])
    preds = sev_lm.predict(X_test[lin_sev_cols])
    train_preds.append(pd.Series(preds, index=X_test.index))
    test_preds.append(sev_lm.predict(df_test[lin_sev_cols]))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
print(sevs)
print(np.mean(sevs))
test_sev_preds['sev_lm'] = np.mean(test_preds, axis=0)
cv_sev_preds['sev_lm'] = pd.concat(train_preds).loc[y.index]

[3529.0204771797626, 2815.024979999179, 4081.8189032353675, 5172.526737799155, 3272.3520169106987, 4631.933348214872, 2657.1611718854406, 3354.4975732842254, 3898.7425292050725, 2725.341713578022]
3613.84194512918


In [232]:
# Freq sev predictions
csts = {'veh_value': 1, 'dr_age': -1}
sev_lgbm = LGBMRegressor(n_estimators=250,
                         learning_rate=0.001,
                         subsample=0.67,
                         subsample_freq=1,
                         colsample_bytree=0.7,
                         objective='gamma',
                         monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in boost_sev_cols],
                         monotone_constraints_method='advanced',
                         num_leaves=8,
                         n_jobs=-1)

# Getting predictions
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    sev_lgbm.fit(X_train.loc[sev_mask, boost_sev_cols], y_train.loc[sev_mask, 'severity'], sample_weight=y_train.loc[sev_mask, 'claim_count'])
    preds = sev_lgbm.predict(X_test[boost_sev_cols])
    train_preds.append(pd.Series(preds, index=X_test.index))
    test_preds.append(sev_lgbm.predict(df_test[boost_sev_cols]))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
print(sevs)
print(np.mean(sevs))
test_sev_preds['sev_lgbm'] = np.mean(test_preds, axis=0)
cv_sev_preds['sev_lgbm'] = pd.concat(train_preds).loc[y.index]

[3540.615215490788, 2849.6489045031053, 4113.466521714538, 5209.456981145867, 3298.49516049426, 4663.789875459549, 2639.691821437647, 3420.88360107951, 3929.000196972192, 2728.6923337466865]
3639.3740612044144


In [233]:
# Freq sev predictions
csts = {'veh_value': 1, 'dr_age': -1}
sev_cat = CatBoostRegressor(n_estimators=250,
                            subsample=0.67,
                            max_depth=1,
                            cat_features=[i for i in boost_sev_cols if i not in ['veh_value', 'dr_age', 'veh_age', 'exposure']],
                            monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in boost_sev_cols],
                            verbose=0,
                            thread_count=-1)

# Getting predictions
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    sev_cat.fit(X_train.loc[sev_mask, boost_sev_cols], y_train.loc[sev_mask, 'severity'], sample_weight=y_train.loc[sev_mask, 'claim_count'])
    preds = sev_cat.predict(X_test[boost_sev_cols])
    train_preds.append(pd.Series(preds, index=X_test.index))
    test_preds.append(sev_cat.predict(df_test[boost_sev_cols]))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
print(sevs)
print(np.mean(sevs))
test_sev_preds['sev_cat'] = np.mean(test_preds, axis=0)
cv_sev_preds['sev_cat'] = pd.concat(train_preds).loc[y.index]

[3497.7289459117055, 2856.3648559883663, 4043.1903636399293, 5115.251104929736, 3354.740513348025, 4616.050566931161, 2768.2199511293747, 3312.530069127422, 3909.892992766295, 2797.398845694033]
3627.1368209466045


In [186]:
# KNN
knn_sev = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats)
                                      ], remainder='passthrough'),
                    StandardScaler(),
                    BaggingRegressor(KNeighborsRegressor(n_neighbors=25, n_jobs=-1),
                                     n_estimators=25,
                                     max_features=5,
                                     max_samples=0.8))
# Scoring model
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    knn_sev.fit(X_train.loc[sev_mask, lin_sev_cols], y_train.loc[sev_mask, 'severity'])
    preds = knn_sev.predict(X_test[lin_sev_cols])
    train_preds.append(pd.Series(preds, index=X_test.index))
    test_preds.append(knn_sev.predict(df_test[lin_sev_cols]))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
print(sevs)
print(np.mean(sevs))
test_sev_preds['knn_sev'] = np.mean(test_preds, axis=0)
cv_sev_preds['knn_sev'] = pd.concat(train_preds).loc[y.index]

[3529.452078683232, 2847.838486977578, 4072.6913499285947, 5153.532316631541, 3249.1653213926515, 4646.163938449369, 2698.039733432999, 3311.8160483775937, 3887.1469339187784, 2751.8140731014305]
3614.766028089377


In [188]:
# MLP
mlp_sev = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats)
                                           ], remainder='passthrough'),
                    StandardScaler(),
                    TransformedTargetRegressor(MLPRegressor(64, max_iter=15), transformer=StandardScaler()))
# Scoring model
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    mlp_sev.fit(X_train.loc[sev_mask, lin_sev_cols], y_train.loc[sev_mask, 'severity'])
    preds = mlp_sev.predict(X_test[lin_sev_cols])
    train_preds.append(pd.Series(preds, index=X_test.index))
    test_preds.append(mlp_sev.predict(df_test[lin_sev_cols]))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
print(sevs)
print(np.mean(sevs))
test_sev_preds['mlp_sev'] = np.mean(test_preds, axis=0)
cv_sev_preds['mlp_sev'] = pd.concat(train_preds).loc[y.index]



[3586.617710066896, 2894.0394224510237, 4071.463089854397, 5049.901637906729, 3301.8593826943365, 4568.192992506632, 2819.4527305897864, 3305.9229654739447, 3888.9738726335413, 2828.2990422175703]
3631.4722846394857




In [189]:
# Linear SVR
lsvr_sev = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats)
                                      ], remainder='passthrough'),
                    StandardScaler(),
                    TransformedTargetRegressor(LinearSVR(loss='squared_epsilon_insensitive', C=1.0, max_iter=3000), transformer=StandardScaler()))
# Scoring model
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X.loc[y['fold'] != i, :], X.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    lsvr_sev.fit(X_train.loc[sev_mask, lin_sev_cols], y_train.loc[sev_mask, 'severity'], transformedtargetregressor__sample_weight=y_train.loc[sev_mask, 'claim_count'])
    preds = lsvr_sev.predict(X_test[lin_sev_cols])
    train_preds.append(pd.Series(preds, index=X_test.index))
    test_preds.append(lsvr_sev.predict(df_test[lin_sev_cols]))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
print(sevs)
print(np.mean(sevs))
test_sev_preds['lsvr_sev'] = np.mean(test_preds, axis=0)
cv_sev_preds['lsvr_sev'] = pd.concat(train_preds).loc[y.index]

[3565.312306651559, 2835.7191415571842, 4012.126617772698, 5100.035685000249, 3257.9810275375844, 4602.567479627455, 2773.649647787202, 3282.855983943877, 3883.4892718258798, 2772.556702425881]
3608.629386412957


In [244]:
# Stacking all severities
csts = {'veh_value': 1, 'dr_age': -1}#, **{i: 1 for i in list(cv_sev_preds.keys())}}
X_cv = pd.concat([X[lin_sev_cols], pd.DataFrame(cv_sev_preds)], axis=1)
"""
sev_stack = LGBMRegressor(n_estimators=50,
                         learning_rate=0.01,
                         #subsample=0.67,
                         #subsample_freq=1,
                         #colsample_bytree=0.7,
                         #objective='gamma',
                         #monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in X_cv.columns],
                         #monotone_constraints_method='advanced',
                         num_leaves=6,
                         n_jobs=-1)
                         """
sev_stack = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats)], remainder='passthrough'),
                    StandardScaler(),
                    TransformedTargetRegressor(LinearSVR(loss='squared_epsilon_insensitive', C=3.0, max_iter=10000), transformer=StandardScaler()))

# Scoring model
sevs, test_preds, train_preds = [], [], []
for i in range(n_folds):
    X_train, X_test, y_train, y_test = X_cv.loc[y['fold'] != i, :], X_cv.loc[y['fold'] == i, :], y.loc[y['fold'] != i, :], y.loc[y['fold'] == i, :]
    sev_mask = y_train['claim_cost'] > 0.0
    sev_stack.fit(X_train.loc[sev_mask, :], y_train.loc[sev_mask, 'severity'], transformedtargetregressor__sample_weight=y_train.loc[sev_mask, 'claim_count'])
    preds = sev_stack.predict(X_test)
    train_preds.append(pd.Series(preds, index=X_test.index))
    sevs.append(np.sqrt(np.mean((y_test.loc[y_test['claim_cost'] > 0.0, 'severity'] - preds[y_test['claim_cost'] > 0.0]) ** 2)))
sev_stack_preds = pd.concat(train_preds).loc[y.index]
print(sevs)
print(np.mean(sevs))
sev_mask = y['claim_cost'] > 0.0
sev_stack.fit(X_cv.loc[sev_mask, :], y.loc[sev_mask, 'severity'], transformedtargetregressor__sample_weight=y.loc[sev_mask, 'claim_count'])

[3445.6631591031373, 2800.196533799697, 3972.789483369281, 4837.240751045394, 3238.3553134302038, 4544.9229512757165, 2576.4942228369905, 3217.2341980084434, 3869.4024244406605, 2735.197072048798]
3523.7496109358317


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('target_enc',
                                                  TargetEncoder(min_samples=20),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000190B1DF1FD0>)])),
                ('standardscaler', StandardScaler()),
                ('transformedtargetregressor',
                 TransformedTargetRegressor(regressor=LinearSVR(C=3.0,
                                                                loss='squared_epsilon_insensitive',
                                                                max_iter=10000),
                                            transformer=StandardScaler()))])

# Submission

In [266]:
gini(y['claim_cost'], sev_stack_preds * freq_stack_preds * y['exposure'])

0.25098010417921907

In [270]:
sev_preds

array([1367.68410516, 2078.82416217, 1157.41232085, ..., 3327.27750518,
       3844.92316855, 1544.40888525])

In [268]:
# Creating our submission
X_cv_freq = pd.concat([df_test[lin_cols], pd.DataFrame(test_freq_preds)], axis=1)
X_cv_sev = pd.concat([df_test[lin_sev_cols], pd.DataFrame(test_sev_preds)], axis=1)
freq_preds = freq_stack.predict(X_cv_freq)
sev_preds = sev_stack.predict(X_cv_sev)
df_test['claim_cost'] = df_test['exposure'] * freq_preds * sev_preds
df_test['id'] = np.arange(df_test.shape[0])
df_test['id'] = df_test['id'].astype(int)
df_test['id'] += 1
df_test[['id', 'claim_cost']].to_csv('stacked_freq_sev_predictions_3.csv', index=False)

# More Testing

In [18]:
# Testing just using logistic regression with exposure as a feature
from catboost import CatBoostClassifier
ind_cols = ['veh_value', 'veh_body', 'veh_age', 'gender', 'area', 'dr_age', 'exposure']
ind_cat = CatBoostClassifier(n_estimators=500,
                             max_depth=4,
                             subsample=0.67,
                             cat_features=['veh_body', 'gender', 'area'],
                             #monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in boost_cols],
                             auto_class_weights='Balanced',
                             verbose=0,
                             thread_count=-1)

# Getting predictions
probs = cross_val_predict(ind_cat,
                          X[ind_cols],
                          y['claim_ind'],
                          groups=y['fold'],
                          cv=GroupKFold(n_folds),
                          method='predict_proba')

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y['claim_ind'], probs[:,1] > 0.5))

              precision    recall  f1-score   support

           0       0.95      0.63      0.76     21076
           1       0.10      0.58      0.17      1534

    accuracy                           0.63     22610
   macro avg       0.53      0.61      0.47     22610
weighted avg       0.90      0.63      0.72     22610



In [20]:
ind_cat.fit(X[ind_cols], y['claim_ind'])

<catboost.core.CatBoostClassifier at 0x2ace9f56b48>

In [22]:
print(pd.DataFrame({'feature': ind_cols, 'gain': ind_cat.feature_importances_}).sort_values('gain', ascending=False))

     feature       gain
6   exposure  40.110706
0  veh_value  25.975083
4       area  10.205096
1   veh_body   9.757386
5     dr_age   9.138234
2    veh_age   3.022741
3     gender   1.790754


In [38]:
# Quick severity model
lsvr_sev = make_pipeline(ColumnTransformer([('target_enc', TargetEncoder(True, True, 20), get_cats)
                                      ], remainder='passthrough'),
                    StandardScaler(),
                    TransformedTargetRegressor(LinearSVR(loss='squared_epsilon_insensitive', C=1.0, max_iter=3000), transformer=StandardScaler()))
sev_mask = y['claim_cost'] > 0.0
lsvr_sev.fit(X.loc[sev_mask, lin_sev_cols], y.loc[sev_mask, 'severity'], transformedtargetregressor__sample_weight=y.loc[sev_mask, 'claim_count'])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('target_enc',
                                                  TargetEncoder(min_samples=20),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002ACE9F75FC8>)])),
                ('standardscaler', StandardScaler()),
                ('transformedtargetregressor',
                 TransformedTargetRegressor(regressor=LinearSVR(loss='squared_epsilon_insensitive',
                                                                max_iter=3000),
                                            transformer=StandardScaler()))])

In [61]:
# Freq sev predictions
csts = {'veh_value': 1, 'dr_age': -1}
sev_cat = CatBoostRegressor(n_estimators=250,
                            subsample=0.67,
                            max_depth=1,
                            cat_features=[i for i in boost_sev_cols if i not in ['veh_value', 'dr_age', 'veh_age', 'exposure']],
                            monotone_constraints=[csts[i] if i in csts.keys() else 0 for i in boost_sev_cols],
                            verbose=0,
                            thread_count=-1)

sev_mask = y['claim_cost'] > 0.0
sev_cat.fit(X.loc[sev_mask, boost_sev_cols], y.loc[sev_mask, 'severity'], sample_weight=y.loc[sev_mask, 'claim_count'])

<catboost.core.CatBoostRegressor at 0x2acea6db108>

In [81]:
# Adding exponent to boost inequality
preds = (ind_cat.predict_proba(df_test[ind_cols])[:,1] ** 5) * lsvr_sev.predict(df_test[lin_sev_cols])

In [82]:
preds

array([  8.79745768, 114.74007332, 105.43851636, ..., 145.23840301,
        17.13076749,   0.15929425])

In [83]:
# Let's test our submission by ID, something doesn't seem right there
# Creating our submission
submission = df_test[['id']].copy()
submission['id'] = np.arange(submission.shape[0])
submission['id'] = submission['id'].astype(int)
submission['id'] += 1
submission['claim_cost'] = preds
submission[['id', 'claim_cost']].to_csv('yet_another_submission_3.csv', index=False)