In [1]:
# Importing our packages
from time import time
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedKFold, cross_val_predict, cross_val_score, RandomizedSearchCV, cross_validate, KFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.linear_model import TweedieRegressor, RidgeClassifierCV
from sklearn.ensemble import StackingRegressor
from sklearn.calibration import CalibratedClassifierCV
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMRanker, LGBMClassifier
from catboost import CatBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import tensorflow as tf

In [4]:
# Importing all our data
df = pd.read_csv('InsNova_train.csv')
df['pure_premium'] = df['claim_cost'] / df['exposure']
df['pure_premium_rank'] = df['pure_premium'].rank(method='min').astype(np.int64)
df['avg_cost'] = df['claim_cost'] / np.fmax(df['claim_count'], 1)
df['frequency'] = df['claim_count'] / df['exposure']
response_cols = ['exposure', 'claim_ind', 'claim_count', 'claim_cost', 'pure_premium','pure_premium_rank', 'avg_cost', 'frequency']
X, y = df.drop(response_cols, axis=1), df[response_cols]
X = X.drop('id', axis=1)

# Adding a condensed veh_body column
other_bodies = ['TRUCK', 'COUPE', 'MIBUS', 'PANVN', 'BUS', 'RDSTR', 'MCARA', 'CONVT']
X['veh_body2'] = np.where(X['veh_body'].isin(other_bodies), 'OTHER', X['veh_body'])

# Creating Categorical dataset for LightGBM and CatBoost
for i in ['veh_body', 'veh_body2', 'gender', 'area']:
    X[i] = X[i].astype('category')
    
# Defining column transformers for later steps          
get_cats = make_column_selector(dtype_include=pd.CategoricalDtype)
get_floats = make_column_selector(dtype_include=np.float64)
one_hot_scale = ColumnTransformer([('one_hot', OneHotEncoder(drop='first', sparse=False), get_cats),
                                    ('scaler', StandardScaler(), get_floats)],
                                  remainder='passthrough')

In [35]:
def gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.fmax(1.0, np.sum(true_order))
    L_pred = np.cumsum(pred_order) / np.fmax(1.0, np.sum(pred_order))
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

In [4]:
# Defining our cross validation function
def eval_model(model, n_iters=5, cv=10, fit_params=None):
    start_time = time()
    if fit_params is None:
        if hasattr(model, 'named_steps'):
            fit_params={list(model.named_steps.keys())[-1] + '__' + 'sample_weight': y['exposure']}
        else:
            fit_params={'sample_weight': y['exposure']}
        if isinstance(model, CatBoostRegressor):
            fit_params['cat_features'] = ['veh_body', 'area', 'gender']
        if isinstance(model, LGBMRanker):
            fit_params['group'] = [X.shape[0]]
    ginis = cross_val_score(model,
                            X,
                            y['pure_premium'],
                            cv=RepeatedKFold(n_splits=5, n_repeats=5),
                            scoring=make_scorer(gini),
                            fit_params=fit_params,
                            n_jobs=1)
    return np.array(ginis), time() - start_time

def print_eval(result, model_name):
    msg = '{} MRCV Gini: {:.4f} | 90% [{:.4f}, {:.4f}] | {:.1f} secs'
    means = np.random.choice(result[0],
                             (result[0].shape[0], 1000),
                             True)
    means = np.mean(means, axis=0)
    msg = msg.format(model_name,
                     np.mean(means),
                     np.quantile(means, 0.05),
                     np.quantile(means, 0.95),
                     result[1])
    print(msg)

In [61]:
# Dummy Regressor baseline
dummy = DummyRegressor()
dummy_cv = eval_model(dummy)
print_eval(dummy_cv, 'Baseline Model')

Baseline Model MRCV Gini: -0.1555 | 90% [-0.1908, -0.1204] | 0.1 secs


In [186]:
# Baseline Tweedie model
for i in np.linspace(1.01, 1.99, 15):
    tweedie = make_pipeline(one_hot_scale,
                            TweedieRegressor(power=i, alpha=0.001, max_iter=5000, warm_start=True))
    tweedie_cv = eval_model(tweedie)
    print_eval(tweedie_cv, 'Tweedie GLM {:.2f}'.format(i))

Tweedie GLM 1.01 MRCV Gini: 0.1649 | 90% [0.1032, 0.2255] | 69.9 secs
Tweedie GLM 1.08 MRCV Gini: 0.1908 | 90% [0.1142, 0.2613] | 71.3 secs
Tweedie GLM 1.15 MRCV Gini: 0.1947 | 90% [0.1409, 0.2445] | 75.2 secs
Tweedie GLM 1.22 MRCV Gini: 0.1808 | 90% [0.1135, 0.2429] | 63.9 secs
Tweedie GLM 1.29 MRCV Gini: 0.1530 | 90% [0.0888, 0.2076] | 60.6 secs
Tweedie GLM 1.36 MRCV Gini: 0.1868 | 90% [0.1237, 0.2502] | 50.9 secs
Tweedie GLM 1.43 MRCV Gini: 0.1472 | 90% [0.0911, 0.2042] | 41.9 secs


KeyboardInterrupt: 

In [171]:
# Starting with Shallow Extra Trees
et = LGBMRegressor(n_estimators=1000,
                   max_depth=1,
                   learning_rate=1,
                   subsample=0.9,
                   subsample_freq=1,
                   boosting_type='rf',
                   extra_trees=True,
                   objective='tweedie',
                   tweedie_variance_power=1.15,
                   boost_from_average=False,
                   n_jobs=-1)
et_cv = eval_model(et)
print_eval(et_cv, 'Shallow Extra Trees')

Shallow Extra Trees MRCV Gini: 0.2486 | 90% [0.1791, 0.3143] | 11.4 secs


In [166]:
# Starting with Shallow Random Forest
rf = LGBMRegressor(n_estimators=1000,
                   max_depth=1,
                   learning_rate=1,
                   subsample=0.67,
                   subsample_freq=1,
                   boosting_type='rf',
                   objective='tweedie',
                   tweedie_variance_power=1.15,
                   boost_from_average=False,
                   n_jobs=-1)
rf_cv = eval_model(rf)
print_eval(rf_cv, 'Shallow Random Forest')

Shallow Random Forest MRCV Gini: 0.2445 | 90% [0.1796, 0.3154] | 11.0 secs


In [165]:
# Trying LightGBM with Tweedie Objective
lgbm = LGBMRegressor(n_estimators=400,
                     learning_rate=0.001,
                     max_depth=1,
                     extra_trees=True,
                     objective='tweedie',
                     tweedie_variance_power=1.15,
                     boost_from_average=False,
                     n_jobs=-1)
lgbm_cv = eval_model(lgbm)
print_eval(lgbm_cv, 'Shallow LGBM')

Shallow LGBM MRCV Gini: 0.2371 | 90% [0.1828, 0.2940] | 5.3 secs


In [10]:
# Trying LightGBM with Tweedie Objective
lgbm = LGBMRegressor(n_estimators=500,
                     learning_rate=0.001,
                     num_leaves=6,
                     min_child_samples=1,
                     min_child_weight=0.0,
                     subsample=0.67, 
                     subsample_freq=1,
                     objective='tweedie',
                     tweedie_variance_power=1.15,
                     boost_from_average=False,
                     n_jobs=-1)
lgbm_cv = eval_model(lgbm)
print_eval(lgbm_cv, 'Shallow LGBM')

KeyboardInterrupt: 

In [93]:
# Let's look at feature importances
lgbm = LGBMRegressor(n_estimators=500,
                     #learning_rate=0.001,
                     num_leaves=11,
                     #min_child_samples=100,
                     #min_child_weight=0.0,
                     #max_bin=32,
                     #colsample_bytree=3.0 / 7.0,
                     subsample=0.67, 
                     subsample_freq=1,
                     #boosting_type='rf',
                     objective='tweedie',
                     mc=[1,0,1,0,0,-1, 0],
                     mc_method='advanced',
                     #path_smooth=100.0,
                     #cat_smooth=100.0,
                     #min_data_per_group=25,
                     tweedie_variance_power=1.8,
                     boost_from_average=False,
                     n_jobs=-1)
lgbm.fit(X, y['pure_premium'], sample_weight=y['exposure'])
feats = lgbm.feature_importances_ / lgbm.feature_importances_.sum()
print(pd.DataFrame({'Variable': X.columns, 'Feat. Imp.': feats}).sort_values('Feat. Imp.', ascending=False))
lgbm_cv = eval_model(lgbm)
print_eval(lgbm_cv, 'Shallow LGBM')

    Variable  Feat. Imp.
1   veh_body      0.3118
4       area      0.2892
3     gender      0.1638
0  veh_value      0.1156
2    veh_age      0.0688
5     dr_age      0.0412
6  veh_body2      0.0096
Shallow LGBM MRCV Gini: 0.1393 | 90% [0.1030, 0.1758] | 16.7 secs


In [94]:
print(lgbm.predict(X).min(), lgbm.predict(X).max())

8.954764772038274e-06 16023.653424521231


In [123]:
preds = []
for _ in range(15):
    preds.append(cross_val_predict(lgbm,
                                   X,
                                   y['pure_premium'],
                                   fit_params={'sample_weight':y['exposure']},
                                   cv=KFold(5, shuffle=True)))

In [124]:
gini(y['claim_cost'], y['exposure'] * np.vstack(preds).mean(axis=0))

0.1848789435901359

In [68]:
# What if we mask the bottom 50 % of observations?
def gini_mask(y_true, y_pred):
    y_pred = np.where(y_pred < np.quantile(y_pred, 0.5), 0.0, y_pred)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.fmax(1.0, np.sum(true_order))
    L_pred = np.cumsum(pred_order) / np.fmax(1.0, np.sum(pred_order))
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

# Defining our cross validation function
def eval_model_mask(model, n_iters=5, cv=10, fit_params=None):
    start_time = time()
    if fit_params is None:
        if hasattr(model, 'named_steps'):
            fit_params={list(model.named_steps.keys())[-1] + '__' + 'sample_weight': y['exposure']}
        else:
            fit_params={'sample_weight': y['exposure']}
        if isinstance(model, CatBoostRegressor):
            fit_params['cat_features'] = ['veh_body', 'area', 'gender']
        if isinstance(model, LGBMRanker):
            fit_params['group'] = [X.shape[0]]
    ginis = cross_val_score(model,
                            X,
                            y['pure_premium'],
                            cv=RepeatedKFold(n_splits=5, n_repeats=5),
                            scoring=make_scorer(gini_mask),
                            fit_params=fit_params,
                            n_jobs=1)
    return np.array(ginis), time() - start_time

def print_eval_mask(result, model_name):
    msg = '{} MRCV Gini: {:.4f} | 90% [{:.4f}, {:.4f}] | {:.1f} secs'
    means = np.random.choice(result[0],
                             (result[0].shape[0], 1000),
                             True)
    means = np.mean(means, axis=0)
    msg = msg.format(model_name,
                     np.mean(means),
                     np.quantile(means, 0.05),
                     np.quantile(means, 0.95),
                     result[1])
    print(msg)

In [72]:
# Starting with Shallow Extra Trees
lgbm = LGBMRegressor(n_estimators=500,
                     learning_rate=1.0,
                     num_leaves=2,
                     #min_child_samples=100,
                     #min_child_weight=0.0,
                     #max_bin=32,
                     #colsample_bytree=3.0 / 7.0,
                     subsample=0.67, 
                     subsample_freq=1,
                     boosting_type='rf',
                     objective='tweedie',
                     mc=[1,0,1,0,0,-1, 0],
                     mc_method='advanced',
                     #path_smooth=100.0,
                     #cat_smooth=100.0,
                     #min_data_per_group=25,
                     tweedie_variance_power=1.8,
                     boost_from_average=False,
                     n_jobs=-1)
lgbm_cv = eval_model_mask(lgbm)
print_eval_mask(lgbm_cv, 'Shallow Extra Trees')
lgbm_cv = eval_model(lgbm)
print_eval(lgbm_cv, 'Shallow Extra Trees')

Shallow Extra Trees MRCV Gini: 0.2027 | 90% [0.1313, 0.2740] | 8.3 secs
Shallow Extra Trees MRCV Gini: 0.2302 | 90% [0.1844, 0.2805] | 7.3 secs


In [33]:
X_cat = pd.get_dummies(X.copy(), columns=['veh_body', 'gender', 'area'])
mon_dict = {'veh_value': 1, 'dr_age': -1}
mon_constraints = [mon_dict[i] if i in list(mon_dict.keys()) else 0 for i in X_cat.columns]
lgbm = LGBMRegressor(n_estimators=500,
                     learning_rate=0.001,
                     num_leaves=8,
                     min_child_samples=100,
                     min_child_weight=0.0,
                     subsample=0.67, 
                     subsample_freq=1,
                     colsample_bytree=0.25,
                     boosting_type='rf',
                     objective='tweedie',
                     mc=mon_constraints,
                     mc_method='advanced',
                     min_data_per_group=25,
                     tweedie_variance_power=1.15,
                     boost_from_average=False,
                     n_jobs=-1)
lgbm.fit(X_cat, y['pure_premium'], sample_weight=y['exposure'])
feats = lgbm.feature_importances_ / lgbm.feature_importances_.sum()
print(pd.DataFrame({'Variable': X_cat.columns, 'Feat. Imp.': feats}).sort_values('Feat. Imp.', ascending=False))
lgbm_cv = eval_model(lgbm)
print_eval(lgbm_cv, 'Shallow LGBM')

          Variable  Feat. Imp.
0        veh_value    0.136470
2           dr_age    0.106078
1          veh_age    0.099942
12  veh_body_SEDAN    0.069842
16        gender_F    0.054939
13  veh_body_STNWG    0.052309
18          area_A    0.052016
6   veh_body_HBACK    0.043542
7   veh_body_HDTOP    0.040035
17        gender_M    0.040035
20          area_C    0.038282
23          area_F    0.036821
15    veh_body_UTE    0.035944
21          area_D    0.035359
19          area_B    0.035359
22          area_E    0.034775
14  veh_body_TRUCK    0.031268
5   veh_body_COUPE    0.030099
9   veh_body_MIBUS    0.018118
10  veh_body_PANVN    0.008767
3     veh_body_BUS    0.000000
11  veh_body_RDSTR    0.000000
8   veh_body_MCARA    0.000000
4   veh_body_CONVT    0.000000


Traceback (most recent call last):
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 751, in fit
    super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 595, in fit
    self._Booster = train(params, train_set,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\engine.py", line 231, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 1988, in __init__
    _safe_call(_LIB.LGBM_BoosterCreate(
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 55, in _safe_call
    raise LightGBMError(decode_st

Traceback (most recent call last):
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 751, in fit
    super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 595, in fit
    self._Booster = train(params, train_set,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\engine.py", line 231, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 1988, in __init__
    _safe_call(_LIB.LGBM_BoosterCreate(
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 55, in _safe_call
    raise LightGBMError(decode_st

Traceback (most recent call last):
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 751, in fit
    super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 595, in fit
    self._Booster = train(params, train_set,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\engine.py", line 231, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 1988, in __init__
    _safe_call(_LIB.LGBM_BoosterCreate(
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 55, in _safe_call
    raise LightGBMError(decode_st

Shallow LGBM MRCV Gini: nan | 90% [nan, nan] | 1.6 secs


Traceback (most recent call last):
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 751, in fit
    super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\sklearn.py", line 595, in fit
    self._Booster = train(params, train_set,
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\engine.py", line 231, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 1988, in __init__
    _safe_call(_LIB.LGBM_BoosterCreate(
  File "C:\Users\gursk\anaconda3\envs\travelers\lib\site-packages\lightgbm\basic.py", line 55, in _safe_call
    raise LightGBMError(decode_st

In [180]:
# CatBoost
cb = CatBoostRegressor(n_estimators=500,
                       learning_rate=0.0001,
                       max_depth=3,
                       subsample=0.8,
                       objective='Tweedie:variance_power=1.15',
                       boost_from_average=False,
                       verbose=0,
                       thread_count=-1)
cb_cv = eval_model(cb)
print_eval(cb_cv, 'CatBoost')

CatBoost MRCV Gini: 0.1971 | 90% [0.1421, 0.2523] | 78.8 secs


In [181]:
# Prepping a submission
et = LGBMRegressor(n_estimators=1000,
                   max_depth=1,
                   learning_rate=1,
                   subsample=0.9,
                   subsample_freq=1,
                   boosting_type='rf',
                   extra_trees=True,
                   objective='tweedie',
                   tweedie_variance_power=1.15,
                   boost_from_average=False,
                   n_jobs=-1)
et.fit(X, y['pure_premium'], sample_weight=y['exposure'])

LGBMRegressor(boost_from_average=False, boosting_type='rf', extra_trees=True,
              learning_rate=1, max_depth=1, n_estimators=1000,
              objective='tweedie', subsample=0.9, subsample_freq=1,
              tweedie_variance_power=1.15)

In [185]:
# Getting our predicstions
df_test = pd.read_csv('InsNova_test.csv')
X_test = df_test.drop(['exposure', 'id'], axis=1)

# Creating Categorical dataset for LightGBM and CatBoost
for i in ['veh_body', 'gender', 'area']:
    X_test[i] = X_test[i].astype('category')

df_test['claim_cost'] = df_test['exposure'] * et.predict(X_test)
df_test['id'] = np.arange(df_test.shape[0])
df_test['id'] = df_test['id'].astype(int)
df_test['id'] += 1
df_test[['id', 'claim_cost']].to_csv('shallow_et_predictions.csv', index=False)

In [107]:
def get_cats(x):
    return [i for i in np.arange(x.shape[1]) if not isinstance(x[0, i], float) and not isinstance(x[0, i], int)]

def get_floats(x):
    return [i for i in np.arange(x.shape[1]) if isinstance(x[0, i], float)]

one_hot_scale = ColumnTransformer([('one_hot', OneHotEncoder(drop='first', sparse=False), get_cats),
                                   ('scaler', StandardScaler(), get_floats)],
                                    remainder='passthrough')

class ScaledTweedieRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, **kwargs):
        self.model = make_pipeline(one_hot_scale, TweedieRegressor(**kwargs))
        
    def fit(self, X, y, sample_weight):
        self.model.fit(X, y, tweedieregressor__sample_weight=sample_weight)
        return self
    
    def predict(self, X):
        return self.model.predict(X)
    
    def get_params(self, deep=True):
        return self.model.named_steps['tweedieregressor'].get_params(deep)
    
    def set_params(self, **kwargs):
        self.model.named_steps['tweedieregressor'].set_params(**kwargs)
        return self

In [108]:
class TweedieLGBMRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, **kwargs):
        self.model = LGBMRegressor(**kwargs)
        
    def fit(self, X, y, sample_weight):
        _X = pd.DataFrame(X)
        for i in [1, 3, 4]:
            _X.iloc[:, i] = _X.iloc[:, i].astype('category')
        _X.iloc[:, 0] = _X.iloc[:, 0].astype(np.float64)
        _X.iloc[:, 2] = _X.iloc[:, 2].astype(np.int64)
        _X.iloc[:, 5] = _X.iloc[:, 5].astype(np.int64)
        if _X.shape[1] > 6:
            _X.iloc[:, 6] = _X.iloc[:, 6].astype(np.float64)
            _X.iloc[:, 7] = _X.iloc[:, 7].astype(np.float64)
            _X.iloc[:, 8] = _X.iloc[:, 8].astype(np.float64)
        self.model.fit(_X, y, sample_weight=sample_weight)
        return self
    
    def predict(self, X):
        _X = pd.DataFrame(X)
        for i in [1, 3, 4]:
            _X.iloc[:, i] = _X.iloc[:, i].astype('category')
        _X.iloc[:, 0] = _X.iloc[:, 0].astype(np.float64)
        _X.iloc[:, 2] = _X.iloc[:, 2].astype(np.int64)
        _X.iloc[:, 5] = _X.iloc[:, 5].astype(np.int64)
        if _X.shape[1] > 6:
            _X.iloc[:, 6] = _X.iloc[:, 6].astype(np.float64)
            _X.iloc[:, 7] = _X.iloc[:, 7].astype(np.float64)
            _X.iloc[:, 8] = _X.iloc[:, 8].astype(np.float64)
        return self.model.predict(_X)
    
    def get_params(self, deep=True):
        return self.model.get_params(deep)
    
    def set_params(self, **kwargs):
        self.model.set_params(**kwargs)
        return self

In [112]:
# Creating a stacking regressor
base_estimators = [ScaledTweedieRegressor(power=1.15, alpha=0.1, max_iter=1000),
                   TweedieLGBMRegressor(n_estimators=500,
                                          learning_rate=0.001,
                                          max_depth=1,
                                          extra_trees=True,
                                          objective='tweedie',
                                          min_child_samples=2,
                                          subsample=0.67,
                                          subsample_freq=1,
                                          tweedie_variance_power=1.15,
                                          boost_from_average=False,
                                          n_jobs=-1),
                    CatBoostRegressor(n_estimators=300,
                                      learning_rate=0.0001,
                                      min_child_samples=1,
                                      max_depth=1,
                                      subsample=0.8,
                                      objective='Tweedie:variance_power=1.15',
                                      boost_from_average=False,
                                      verbose=0,
                                      cat_features=[1, 3, 4],
                                      thread_count=-1)]
meta_estimator = TweedieLGBMRegressor(n_estimators=1000,
                                       max_depth=1,
                                       learning_rate=1,
                                       min_child_samples=2,
                                       subsample=0.9,
                                       subsample_freq=1,
                                       boosting_type='rf',
                                       objective='tweedie',
                                       tweedie_variance_power=1.15,
                                       boost_from_average=False,
                                       n_jobs=-1)
model = StackingCVRegressor(base_estimators,
                            meta_estimator,
                            cv=15,
                            use_features_in_secondary=True,
                            n_jobs=1)
model.fit(X, y['pure_premium'], sample_weight=y['exposure'])

StackingCVRegressor(cv=15,
                    meta_regressor=TweedieLGBMRegressor(boost_from_average=False,
                                                        boosting_type='rf',
                                                        class_weight=None,
                                                        colsample_bytree=1.0,
                                                        importance_type='split',
                                                        learning_rate=1,
                                                        max_depth=1,
                                                        min_child_samples=2,
                                                        min_child_weight=0.001,
                                                        min_split_gain=0.0,
                                                        n_estimators=1000,
                                                        n_jobs=-1,
                                                        num_leav

In [113]:
# Getting our predicstions
df_test = pd.read_csv('InsNova_test.csv')
X_test = df_test.drop(['exposure', 'id'], axis=1)

# Creating Categorical dataset for LightGBM and CatBoost
for i in ['veh_body', 'gender', 'area']:
    X_test[i] = X_test[i].astype('category')

df_test['claim_cost'] = df_test['exposure'] * model.predict(X_test)
df_test['id'] = np.arange(df_test.shape[0])
df_test['id'] = df_test['id'].astype(int)
df_test['id'] += 1
df_test[['id', 'claim_cost']].to_csv('stacked_predictions2.csv', index=False)

In [8]:
# Transforming data
X = one_hot_scale.fit_transform(X)

In [9]:
# Defining our Tweedie loss
@tf.function
def tweedie_loss(y_true, y_pred):
    p = 1.8
    loss = 2.0 * (tf.pow(y_true, 2-p) / ((1-p) * (2-p)) - y_true * tf.pow(y_pred, 1-p) / (1-p) + tf.pow(y_pred, 2-p) / (2-p))
    return tf.reduce_mean(loss, axis=-1)

In [12]:
from sklearn.metrics import mean_tweedie_deviance
y_preds = np.array([0.1, 0.1, 1.0, 10.0])
y_true = np.array([0.0, 100.0, 20.0, 0.0])
print(mean_tweedie_deviance(y_true, y_preds, power=1.8))
print(tweedie_loss(y_true, y_preds))
# Looks like our implementation is correct

402.92645177075065
tf.Tensor(402.92645177075065, shape=(), dtype=float64)


In [101]:
# Trying simple GLM
def create_model():
    model = tf.keras.Sequential([
        tf.keras.Input((21,)),
        tf.keras.layers.Dense(1,
                              kernel_initializer='zeros',
                              bias_initializer=tf.keras.initializers.constant(np.log(y['pure_premium'].mean())),
                              kernel_regularizer=tf.keras.regularizers.L2(1.0),
                              activation=tf.keras.activations.exponential)
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.001), loss=tweedie_loss)
    return model
tf_model = tf.keras.wrappers.scikit_learn.KerasRegressor(create_model, epochs=20, batch_size=4096, verbose=0)
tf_model_cv = eval_model(tf_model)
print_eval(tf_model_cv, 'Tensorflow')





Tensorflow MRCV Gini: 0.2448 | 90% [0.1893, 0.3008] | 12.2 secs


In [102]:
tweedie_model = TweedieRegressor(power=1.8, alpha=1.0, link='log', max_iter=10000)
#tweedie_model_cv = eval_model(tweedie_model)
#print_eval(tweedie_model_cv, 'Tweedie')

In [103]:
tf_model = tf.keras.wrappers.scikit_learn.KerasRegressor(create_model, epochs=20, batch_size=2048, verbose=0).fit(X, y['pure_premium'])
tweedie_model = TweedieRegressor(power=1.8, alpha=0.0, link='log', max_iter=10000).fit(X, y['pure_premium'])

In [104]:
print(tweedie_model.coef_, tweedie_model.intercept_)
tf_model.model.get_weights()

[-3.17526994e+01  1.25038164e+00  1.17368538e+00  1.05126807e+00
 -4.13737090e-01  2.28885233e-01 -1.63109558e-01  5.93158004e-01
  1.57360318e+00  7.62899124e-01  5.34459086e-01  5.39815702e-01
  2.60458093e-01  1.29114805e-01  2.03070660e-02  1.08488873e+00
 -1.64894121e-01  9.17025052e-01 -2.18726717e-02  1.89307916e-01
 -3.64956350e-01] 5.483571571191364


[array([[-1.6895263e-03],
        [ 7.5927797e-05],
        [-3.4942046e-02],
        [ 8.7109575e-04],
        [-1.9080314e-03],
        [-8.7680127e-03],
        [-1.0869695e-02],
        [-2.1996172e-04],
        [ 2.0410961e-01],
        [-4.8673566e-02],
        [-6.7389361e-03],
        [-2.0518079e-02],
        [-1.2585604e-02],
        [-1.6887544e-02],
        [-6.1889980e-02],
        [ 2.0125842e-01],
        [-4.0973466e-02],
        [ 4.4123136e-02],
        [-8.3352149e-02],
        [ 1.8733530e-01],
        [-2.7435362e-01]], dtype=float32),
 array([6.55971], dtype=float32)]

In [50]:
# Masking bench mark
dummy_mask_cv = cross_validate(DummyClassifier(strategy='prior'),
                               X_transform,
                               y['claim_ind'],
                               cv=RepeatedKFold(n_splits=5, n_repeats=5),
                               verbose=1,
                               scoring=['accuracy',
                                        'balanced_accuracy',
                                        'neg_brier_score',
                                        'average_precision',
                                        'precision',
                                        'recall',
                                        'f1',
                                        'roc_auc']
                             )
for k, v in dummy_mask_cv.items():
    if k in ['fit_time', 'score_time']:
        continue
    print('{}: {:.3f}'.format(k, v.mean()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_star

test_accuracy: 0.932
test_balanced_accuracy: 0.500
test_neg_brier_score: -0.063
test_average_precision: 0.068
test_precision: 0.000
test_recall: 0.000
test_f1: 0.000
test_roc_auc: 0.500


In [60]:
# Testing masking with LogisticRegression
X_transform = one_hot_scale.fit_transform(X)
logit_mask = LogisticRegressionCV(Cs=[0.0001, 0.001, 0.01, 0.1, 1.0],
                                  cv=10,
                                  scoring='f1',
                                  max_iter=1000,
                                  n_jobs=-1)
logit_mask_cv = cross_validate(logit_mask,
                               X_transform,
                               y['claim_ind'],
                               fit_params={'sample_weight': y['exposure']},
                               cv=RepeatedKFold(n_splits=5, n_repeats=5),
                               verbose=1,
                               scoring=['accuracy',
                                        'balanced_accuracy',
                                        'neg_brier_score',
                                        'average_precision',
                                        'precision',
                                        'recall',
                                        'f1',
                                        'roc_auc']
                             )
for k, v in logit_mask_cv.items():
    if k in ['fit_time', 'score_time']:
        continue
    print('{}: {:.3f}'.format(k, v.mean()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_star

test_accuracy: 0.932
test_balanced_accuracy: 0.500
test_neg_brier_score: -0.064
test_average_precision: 0.078
test_precision: 0.000
test_recall: 0.000
test_f1: 0.000
test_roc_auc: 0.539


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   29.9s finished


In [63]:
preds = cross_val_predict(logit_mask, X_transform, y['claim_ind'], fit_params={'sample_weight': y['exposure']}, cv=10)

In [65]:
preds.sum()

0

In [70]:
# Adding masking to our predictions if we don't think there will be a claim
lgbm_mask = LGBMClassifier(n_estimators=1000,
                           learning_rate=1.0,
                           num_leaves=2,
                           boosting_type='rf',
                           subsample=0.5,
                           subsample_freq=1,
                           extra_trees=True,
                           boost_from_average=False,
                           #is_unbalance=True,
                           n_jobs=-1)
lgbm_mask_cv = cross_validate(lgbm_mask,
                               X,
                               y['claim_ind'],
                               fit_params={'sample_weight': y['exposure']},
                               cv=RepeatedKFold(n_splits=5, n_repeats=5),
                               scoring=['accuracy',
                                        'balanced_accuracy',
                                        'neg_brier_score',
                                        'average_precision',
                                        'precision',
                                        'recall',
                                        'f1',
                                        'roc_auc']
                             )
for k, v in lgbm_mask_cv.items():
    if k in ['fit_time', 'score_time']:
        continue
    print('{}: {:.3f}'.format(k, v.mean()))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

test_accuracy: 0.932
test_balanced_accuracy: 0.500
test_neg_brier_score: -0.072
test_average_precision: 0.080
test_precision: 0.000
test_recall: 0.000
test_f1: 0.000
test_roc_auc: 0.540


  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
# Adding masking to our predictions if we don't think there will be a claim
# We want to set high confidence negatives to 0
lgbm_mask = LGBMClassifier(n_estimators=100,
                           learning_rate=1.0,
                           num_leaves=31,
                           subsample=0.67,
                           subsample_freq=1,
                           max_bin=64,
                           min_child_samples=1,
                           n_jobs=-1)
lgbm_mask_cv = cross_validate(lgbm_mask,
                               X,
                               y['claim_ind'],
                               fit_params={'sample_weight': y['exposure']},
                               cv=RepeatedKFold(n_splits=5, n_repeats=5),
                               scoring=['accuracy',
                                        'balanced_accuracy',
                                        'neg_brier_score',
                                        'average_precision',
                                        'precision',
                                        'recall',
                                        'f1',
                                        'roc_auc']
                             )
for k, v in lgbm_mask_cv.items():
    if k in ['fit_time', 'score_time']:
        continue
    print('{}: {:.3f}'.format(k, v.mean()))

test_accuracy: 0.862
test_balanced_accuracy: 0.501
test_neg_brier_score: -0.136
test_average_precision: 0.070
test_precision: 0.072
test_recall: 0.083
test_f1: 0.073
test_roc_auc: 0.509


In [86]:
preds = cross_val_predict(lgbm_mask, X, y['claim_ind'], fit_params={'sample_weight': y['exposure']}, cv=10)
print(y['claim_ind'].mean())
print(preds.mean())

0.06784608580274215
0.083812472357364


In [25]:
# What params tend to work for F1?
params = {'n_estimators': Integer(100, 1000),
          'learning_rate': Real(1e-5, 1.0, 'uniform'),
          'subsample': Real(0.5, 1.0, 'uniform'),
          'num_leaves': Integer(2, 11)}
lgbm_mask = LGBMClassifier(class_weight='balanced', n_jobs=-1)
bayes_mask = BayesSearchCV(lgbm_mask,
                           params,
                           n_iter=32,
                           verbose=1,
                           fit_params={'sample_weight': y['exposure']},
                           cv=10,
                           scoring='f1')
bayes_mask.fit(X, y['claim_ind'])

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.0s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.5s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.7s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.5s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.6s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.4s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.5s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.3s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.1s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.5s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.3s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.9s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.7s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.4s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.9s finished


BayesSearchCV(cv=10, estimator=LGBMClassifier(class_weight='balanced'),
              fit_params={'sample_weight': 0        0.241898
1        0.856523
2        0.417517
3        0.626975
4        0.089770
           ...   
22605    0.909445
22606    0.999321
22607    0.783724
22608    0.841333
22609    0.808868
Name: exposure, Length: 22610, dtype: float64},
              n_iter=32, scoring='f1',
              search_spaces={'learning_rate': Real(low=1e-05, high=1.0, prior='uniform', transform='identity'),
                             'n_estimators': Integer(low=100, high=1000, prior='uniform', transform='identity'),
                             'num_leaves': Integer(low=2, high=11, prior='uniform', transform='identity'),
                             'subsample': Real(low=0.5, high=1.0, prior='uniform', transform='identity')},
              verbose=1)

In [27]:
print(bayes_mask.best_params_)

OrderedDict([('learning_rate', 1.0), ('n_estimators', 100), ('num_leaves', 2), ('subsample', 0.5)])


In [None]:
class MaskedStackingRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, stacker, masker):
        self.stacker = stacker
        self.masker = masker
        
    def fit(self, X, y, sample_weights):
        self.stacker.fit()

# Legacy Code

In [None]:
# Hyperparam tuning models
params = {'tweedie_variance_power': np.linspace(1.01, 1.99, 10)}
model = GridSearchCV(LGBMRegressor(boosting='rf',
                                   n_estimators=250,
                                   subsample=0.67,
                                   max_depth=1,
                                   subsample_freq=1,
                                   objective='tweedie',
                                   n_jobs=-1),
                        params,
                        scoring=make_scorer(gini),
                        cv=RepeatedKFold(n_splits=5, n_repeats=3))
model_cv = cross_val_score(model,
                           X,
                           y['pure_premium'],
                           scoring=make_scorer(gini),
                           cv=RepeatedKFold(n_splits=5, n_repeats=5),
                           verbose=1,
                           n_jobs=1,
                           fit_params={'sample_weight': y['exposure']})
print(np.array(model_cv).round(4))
print(np.mean(model_cv))

In [None]:
# Prepping model submission
params = {'tweedie_variance_power': np.linspace(1.01, 1.99, 10)}
model = GridSearchCV(LGBMRegressor(boosting='rf',
                                   n_estimators=2000,
                                   subsample=0.67,
                                   max_depth=1,
                                   subsample_freq=1,
                                   objective='tweedie',
                                   n_jobs=-1),
                        params,
                        scoring=make_scorer(gini),
                        cv=RepeatedKFold(n_splits=5, n_repeats=3))
model.fit(X, y['pure_premium'], sample_weight=y['exposure'])

In [None]:
# Creating predictions
# Getting our predicstions
df_test = pd.read_csv('InsNova_test.csv')
#df_test = pd.get_dummies(df_test, columns=['veh_body', 'veh_age', 'gender', 'area', 'dr_age'])
X_test = df_test.drop(['exposure', 'id'], axis=1)
for i in ['veh_body', 'gender', 'area']:
    X_test[i] = X_test[i].astype('category')
X_test['veh_value_2'] = X_test['veh_value'] ** 2
df_test['claim_cost'] = df_test['exposure'] * model.best_estimator_.predict(X_test)
df_test['id'] = np.arange(df_test.shape[0])
df_test['id'] = df_test['id'].astype(int)
df_test['id'] += 1
df_test[['id', 'claim_cost']].to_csv('shallow_rf_predictions.csv', index=False)
# Creating Categorical dataset for LightGBM and CatBoost
for i in ['veh_body', 'gender', 'area']:
    X[i] = X[i].astype('category')