In [65]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold, RepeatedKFold, cross_val_score
from sklearn.metrics import make_scorer, auc, mean_squared_error as mse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression, PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [2]:
# Load the training data, one-hot encode the categorical columns, derive the
# modelling targets, and make a stratified 80/20 train/test split.
df = (
    pd.read_csv('InsNova_train.csv')
    .drop(columns='id')
    .pipe(pd.get_dummies, columns=['veh_body', 'veh_age', 'gender', 'area', 'dr_age'])
    .assign(
        # claim cost per unit of exposure
        pure_premium=lambda d: d['claim_cost'] / d['exposure'],
        # cost per claim; claim_count is floored at 1 so zero-claim rows divide by 1
        avg_cost=lambda d: d['claim_cost'] / np.fmax(d['claim_count'], 1),
        # claims per unit of exposure
        frequency=lambda d: d['claim_count'] / d['exposure'],
    )
)

# Everything listed here is response/offset information and is kept out of X.
response_cols = ['exposure', 'claim_ind', 'claim_count', 'claim_cost', 'pure_premium', 'avg_cost', 'frequency']
X = df.drop(columns=response_cols)
y = df[response_cols]

# Stratify on claim_count so both splits see the same claim-count mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y['claim_count'],
)

# Boolean row mask (numpy array) selecting training rows with a positive claim cost.
claim_mask = y_train['claim_cost'].gt(0.0).values
print(
    'Train Counts:',
    y_train['claim_count'].value_counts(),
    'Test Counts:',
    y_test['claim_count'].value_counts(),
    sep='\n',
)

Train Counts:
0    16861
1     1155
2       70
3        2
Name: claim_count, dtype: int64
Test Counts:
0    4215
1     288
2      18
3       1
Name: claim_count, dtype: int64


In [3]:
def gini(y_true, y_pred):
    """Unnormalized Gini-style ranking score of ``y_pred`` against ``y_true``.

    The true values are ordered by descending prediction, turned into a
    cumulative-share (Lorenz-style) curve, and compared with the evenly spaced
    diagonal ``1/n, 2/n, ..., 1``.

    Sign convention: the value returned is ``sum(diagonal - curve)``, so a
    ranking that puts large true values first yields a *negative* number.
    Dividing by ``gini(y_true, y_true)`` (see ``normalized_gini``) gives a
    score where higher is better.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Observed values (e.g. claim cost).
    y_pred : array-like of shape (n_samples,)
        Scores used only to rank the observations.

    Returns
    -------
    float
        The unnormalized score; ``nan`` when ``y_true`` sums to zero.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Coerce up front so lists, pandas Series and ndarrays are all accepted.
    actual = np.array(y_true, dtype=float)
    scores = np.array(y_pred, dtype=float)

    n_samples = actual.shape[0]
    if n_samples != scores.shape[0]:
        raise ValueError(
            'y_true and y_pred must have the same length, got %d and %d'
            % (n_samples, scores.shape[0])
        )
    if n_samples == 0:
        raise ValueError('gini is undefined for empty input')

    # True values ordered from the largest prediction to the smallest.
    # NOTE: tied predictions are ordered by argsort's default algorithm, so
    # the score for tied predictions depends on input order.
    ranked_actual = actual[scores.argsort()[::-1]]

    total = np.sum(ranked_actual)
    if total == 0:
        # Cumulative share is 0/0 everywhere; return nan without the warning.
        return float('nan')

    # Cumulative share of the true total captured so far, versus the diagonal.
    L_pred = np.cumsum(ranked_actual) / total
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # Unnormalized: callers divide by gini(y_true, y_true) to normalize.
    return np.sum(L_ones - L_pred)

In [4]:
def normalized_gini(y_true, y_pred):
    """Return ``gini(y_true, y_pred)`` scaled by the score of a perfect ranking.

    The denominator is the score obtained when the true values rank
    themselves, i.e. ``gini(y_true, y_true)``.
    """
    model_score = gini(y_true, y_pred)
    perfect_score = gini(y_true, y_true)
    return model_score / perfect_score

In [5]:
# Registry of evaluation results, keyed by model name, plus the scoring helper
model_scores = {}
def eval_model(model):
    """Score a fitted model on the train and test splits.

    ``model.predict`` output is multiplied by each row's exposure and compared
    with the observed ``claim_cost`` using both ``gini`` and
    ``normalized_gini``.

    Returns a dict with keys ``'train'``, ``'test'``, ``'norm_train'`` and
    ``'norm_test'``.
    """
    predicted_cost = {
        'train': model.predict(X_train) * y_train['exposure'],
        'test': model.predict(X_test) * y_test['exposure'],
    }
    observed_cost = {
        'train': y_train['claim_cost'],
        'test': y_test['claim_cost'],
    }

    scores = {}
    # Raw scores first, then normalized ones, to keep the key order stable.
    for split in ('train', 'test'):
        scores[split] = gini(observed_cost[split], predicted_cost[split])
    for split in ('train', 'test'):
        scores['norm_' + split] = normalized_gini(observed_cost[split], predicted_cost[split])
    return scores

In [6]:
# Benchmark: an exposure-weighted DummyRegressor fitted on pure premium
dummy = DummyRegressor()
dummy.fit(
    X_train,
    y_train['pure_premium'],
    sample_weight=y_train['exposure'],
)
model_scores['dummy'] = eval_model(dummy)

In [7]:
# Simple linear models, all fitted on pure premium with exposure as sample weight

# Ordinary least squares, no tuning.
linreg = LinearRegression()
linreg.fit(X_train, y_train['pure_premium'], sample_weight=y_train['exposure'])
model_scores['linreg'] = eval_model(linreg)

# Ridge on standardized features; alpha tuned by Bayesian search on normalized Gini.
ridge_search = BayesSearchCV(
    make_pipeline(StandardScaler(), Ridge()),
    {'ridge__alpha': Real(0.001, 10.0, 'uniform')},
    n_iter=16,
    scoring=make_scorer(normalized_gini),
    fit_params={'ridge__sample_weight': y_train['exposure']},
    cv=5,
    n_jobs=-1,
)
ridge_search.fit(X_train, y_train['pure_premium'])
ridge = ridge_search.best_estimator_
model_scores['ridge'] = eval_model(ridge)

# Tweedie GLM on standardized features; penalty and variance power are tuned.
tweedie_search = BayesSearchCV(
    make_pipeline(StandardScaler(), TweedieRegressor()),
    {
        'tweedieregressor__alpha': Real(0.1, 5.0, 'uniform'),
        'tweedieregressor__power': Real(1.001, 1.99, 'uniform'),
    },
    n_iter=32,
    scoring=make_scorer(normalized_gini),
    fit_params={'tweedieregressor__sample_weight': y_train['exposure']},
    cv=5,
    n_jobs=-1,
)
tweedie_search.fit(X_train, y_train['pure_premium'])
tweedie = tweedie_search.best_estimator_
model_scores['tweedie'] = eval_model(tweedie)

In [8]:
# Tree-based methods, starting with a single unconstrained decision tree
dt = DecisionTreeRegressor()
dt.fit(
    X_train,
    y_train['pure_premium'],
    sample_weight=y_train['exposure'],
)
model_scores['decision_tree'] = eval_model(dt)

In [9]:
# Extra-trees ensemble on pure premium, weighted by exposure.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible across kernel restarts.
et = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=123)
et.fit(X_train, y_train['pure_premium'], sample_weight=y_train['exposure'])
model_scores['extra_trees'] = eval_model(et)

In [10]:
# Random forest on pure premium, weighted by exposure.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples, and therefore the score, are reproducible across kernel restarts.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=123)
rf.fit(X_train, y_train['pure_premium'], sample_weight=y_train['exposure'])
model_scores['random_forest'] = eval_model(rf)

In [11]:
# AdaBoost on pure premium, weighted by exposure.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible across kernel restarts.
ad = AdaBoostRegressor(n_estimators=100, random_state=123)
ad.fit(X_train, y_train['pure_premium'], sample_weight=y_train['exposure'])
model_scores['adaboost'] = eval_model(ad)

In [12]:
# Gradient boosting (100 stages) on pure premium, weighted by exposure
gbr = GradientBoostingRegressor(n_estimators=100)
gbr.fit(
    X_train,
    y_train['pure_premium'],
    sample_weight=y_train['exposure'],
)
model_scores['gbm'] = eval_model(gbr)

In [13]:
# Histogram-based gradient boosting (up to 100 iterations), weighted by exposure
hgbr = HistGradientBoostingRegressor(max_iter=100)
hgbr.fit(
    X_train,
    y_train['pure_premium'],
    sample_weight=y_train['exposure'],
)
model_scores['hist_gbm'] = eval_model(hgbr)

In [14]:
# XGBoost with its default objective, fitted on pure premium and weighted by exposure
xgb = XGBRegressor(n_estimators=100, n_jobs=-1)
xgb.fit(
    X_train,
    y_train['pure_premium'],
    sample_weight=y_train['exposure'],
)
model_scores['xgboost'] = eval_model(xgb)

In [15]:
# LightGBM with its default objective, fitted on pure premium and weighted by exposure
lgbm = LGBMRegressor(n_estimators=100, n_jobs=-1)
lgbm.fit(
    X_train,
    y_train['pure_premium'],
    sample_weight=y_train['exposure'],
)
model_scores['lgbm'] = eval_model(lgbm)

In [16]:
# XGBoost with the Tweedie objective, fitted on pure premium and weighted by exposure
xgbt = XGBRegressor(
    n_estimators=100,
    objective='reg:tweedie',
    n_jobs=-1,
)
xgbt.fit(X_train, y_train['pure_premium'], sample_weight=y_train['exposure'])
model_scores['tweedie-xgboost'] = eval_model(xgbt)

In [17]:
# LightGBM with the Tweedie objective, fitted on pure premium and weighted by exposure
lgbmt = LGBMRegressor(
    n_estimators=100,
    objective='tweedie',
    n_jobs=-1,
)
lgbmt.fit(X_train, y_train['pure_premium'], sample_weight=y_train['exposure'])
model_scores['tweedie-lgbm'] = eval_model(lgbmt)

In [51]:
# Tweedie XGBRegressor tuned with Bayesian search (32 candidates, 5-fold CV,
# scored by normalized Gini)
params = dict(
    n_estimators=Integer(50, 750),
    max_depth=Integer(1, 6),
    learning_rate=Real(1e-7, 1.0, 'uniform'),
    gamma=Real(0.0, 20.0, 'uniform'),
    tweedie_variance_power=Real(1.01, 1.99, 'uniform'),
    subsample=Real(0.5, 1.0, 'uniform'),
    colsample_bytree=Real(0.5, 1.0, 'uniform'),
    reg_lambda=Real(0.0, 10.0, 'uniform'),
)

# The estimator uses all cores itself, so the search runs candidates serially.
xgb_opt = BayesSearchCV(
    XGBRegressor(objective='reg:tweedie', tree_method='hist', n_jobs=-1),
    params,
    n_iter=32,
    cv=5,
    scoring=make_scorer(normalized_gini),
    fit_params={'sample_weight': y_train['exposure']},
    n_jobs=1,
    verbose=1,
)
xgb_opt.fit(X_train, y_train['pure_premium'])

print(xgb_opt.best_params_)
model_scores['xgb-tuned'] = eval_model(xgb_opt.best_estimator_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.5s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.6s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.6s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.8s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.8s finished


OrderedDict([('colsample_bytree', 0.8980933014856265), ('gamma', 3.2516185888233133), ('learning_rate', 0.5696664381732708), ('max_depth', 5), ('n_estimators', 503), ('reg_lambda', 3.0339205452104507), ('subsample', 0.9505192100105055), ('tweedie_variance_power', 1.8777680756638517)])


In [55]:
# Tweedie LGBMRegressor tuned with Bayesian search (32 candidates, 5x5 repeated K-fold)
params = dict(
    n_estimators=Integer(50, 750),
    num_leaves=Integer(10, 62),
    learning_rate=Real(1e-7, 1.0, 'uniform'),
    min_split_gain=Real(0.0, 20.0, 'uniform'),
    tweedie_variance_power=Real(1.01, 1.99, 'uniform'),
    subsample=Real(0.5, 1.0, 'uniform'),
    subsample_freq=Integer(0, 5),
    colsample_bytree=Real(0.5, 1.0, 'uniform'),
    reg_lambda=Real(0.0, 10.0, 'uniform'),
)

# NOTE(review): the normalized-Gini scorer is disabled here, so the search
# ranks candidates with the estimator's default score instead; confirm that
# this differs from the XGBoost search on purpose.
lgbm_opt = BayesSearchCV(
    LGBMRegressor(objective='tweedie', n_jobs=-1),
    params,
    n_iter=32,
    cv=RepeatedKFold(5, 5),
    #scoring=make_scorer(normalized_gini),
    fit_params={'sample_weight': y_train['exposure']},
    n_jobs=1,
)
lgbm_opt.fit(X_train, y_train['pure_premium'])

print(lgbm_opt.best_params_)
model_scores['lgbm-tuned'] = eval_model(lgbm_opt.best_estimator_)



OrderedDict([('colsample_bytree', 0.8442454527010317), ('learning_rate', 0.8982509084659792), ('min_split_gain', 14.042854802280587), ('n_estimators', 466), ('num_leaves', 41), ('reg_lambda', 10.0), ('subsample', 0.890180570668657), ('subsample_freq', 1), ('tweedie_variance_power', 1.1669117671912612)])


In [19]:
# Now let's try adding our ensembles of frequency-severity models
def eval_freq_sev_model(freq_model, sev_model, ind_model=None):
    """Score a frequency x severity (optionally x claim-indicator) ensemble.

    Expected claim cost per row is
    ``freq_model.predict(X) * sev_model.predict(X) * exposure`` and, when
    ``ind_model`` is given, additionally multiplied by the indicator model's
    output.

    Parameters
    ----------
    freq_model, sev_model : fitted estimators exposing ``predict``.
    ind_model : fitted estimator, optional
        Claim-indicator model. If it exposes ``predict_proba`` (i.e. it is a
        classifier), the probability in column 1 is used rather than the hard
        0/1 label from ``predict``; with hard labels nearly every row would be
        multiplied by zero and the ranking would collapse into ties.
        Assumes a binary target whose positive class is the second entry of
        ``classes_`` -- TODO confirm for non-0/1 encodings.

    Returns
    -------
    dict
        Keys ``'train'``, ``'test'``, ``'norm_train'``, ``'norm_test'``.
    """
    def _indicator(X):
        # Prefer the claim probability; fall back to predict() for regressors.
        if hasattr(ind_model, 'predict_proba'):
            return ind_model.predict_proba(X)[:, 1]
        return ind_model.predict(X)

    def _expected_cost(X, exposure):
        cost = freq_model.predict(X) * sev_model.predict(X) * exposure
        if ind_model is not None:
            cost = cost * _indicator(X)
        return cost

    train_cost_pred = _expected_cost(X_train, y_train['exposure'])
    test_cost_pred = _expected_cost(X_test, y_test['exposure'])
    return {'train': gini(y_train['claim_cost'], train_cost_pred),
            'test': gini(y_test['claim_cost'], test_cost_pred),
            'norm_train': normalized_gini(y_train['claim_cost'], train_cost_pred),
            'norm_test': normalized_gini(y_test['claim_cost'], test_cost_pred)}

In [29]:
# First with GLMs
# Claim-indicator model: elastic-net logistic regression on standardized
# features, weighted by exposure and tuned with Bayesian search.
# The lower bound of C is a small positive number rather than 0.0:
# LogisticRegression requires C > 0, and the optimizer can propose values
# on the edge of the search space.
binom = BayesSearchCV(make_pipeline(StandardScaler(),
                                    LogisticRegression(penalty='elasticnet', solver='saga', max_iter=500, n_jobs=1)),
                      {'logisticregression__C': Real(1e-3, 5.0, 'uniform'),
                       'logisticregression__l1_ratio': Real(0.0, 1.0, 'uniform'),
                       'logisticregression__class_weight': Categorical([None, 'balanced'])},
                      n_iter=16,
                      fit_params={'logisticregression__sample_weight': y_train['exposure']},
                      cv=5,
                      n_jobs=5).fit(X_train, y_train['claim_ind'])
print(binom.best_params_)

OrderedDict([('logisticregression__C', 2.4932123487012574), ('logisticregression__class_weight', None), ('logisticregression__l1_ratio', 1.0)])




In [30]:
# Claim-frequency model: Poisson GLM on standardized features, weighted by
# exposure, with the penalty strength tuned by Bayesian search.
# max_iter is raised above the default because lbfgs stopped at the iteration
# limit with the default setting (see the ConvergenceWarning in the saved output).
poisson = BayesSearchCV(make_pipeline(StandardScaler(),
                                      PoissonRegressor(max_iter=1000)),
                        {'poissonregressor__alpha': Real(1e-10, 1.0, 'uniform')},
                        n_iter=16,
                        fit_params={'poissonregressor__sample_weight': y_train['exposure']},
                        cv=5,
                        n_jobs=5).fit(X_train, y_train['frequency'])
print(poisson.best_params_)

OrderedDict([('poissonregressor__alpha', 0.04213590920992788)])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [31]:
# Conditional claim-frequency model: same Poisson GLM, but fitted only on the
# training rows with a positive claim cost (claim_mask).
# max_iter is raised above the default because lbfgs stopped at the iteration
# limit with the default setting (see the ConvergenceWarning in the saved output).
poisson2 = BayesSearchCV(make_pipeline(StandardScaler(),
                                      PoissonRegressor(max_iter=1000)),
                        {'poissonregressor__alpha': Real(1e-10, 50.0, 'uniform')},
                        n_iter=16,
                        fit_params={'poissonregressor__sample_weight': y_train['exposure'][claim_mask]},
                        cv=5,
                        n_jobs=5).fit(X_train[claim_mask], y_train['frequency'][claim_mask])
print(poisson2.best_params_)



OrderedDict([('poissonregressor__alpha', 0.9719655830725147)])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [32]:
# Claim-severity model: Gamma GLM on standardized features, fitted on average
# cost per claim for the training rows with a positive claim cost.
# NOTE(review): rows are weighted by exposure here; confirm that exposure
# (rather than claim_count) is the intended severity weight.
gamma = BayesSearchCV(
    make_pipeline(StandardScaler(), GammaRegressor()),
    {'gammaregressor__alpha': Real(1e-9, 10.0, 'uniform')},
    n_iter=16,
    cv=5,
    fit_params={'gammaregressor__sample_weight': y_train['exposure'][claim_mask]},
    n_jobs=5,
)
gamma.fit(X_train[claim_mask], y_train['avg_cost'][claim_mask])
print(gamma.best_params_)



OrderedDict([('gammaregressor__alpha', 0.0002999434191263241)])


In [33]:
# Evaluating our freq-sev models
# Each name is rebound from the search object to its refitted best pipeline.
# getattr(..., default) keeps this cell idempotent: on a second run the names
# already hold the pipelines (which have no best_estimator_) and are kept
# as-is instead of raising AttributeError.
binom = getattr(binom, 'best_estimator_', binom)
poisson = getattr(poisson, 'best_estimator_', poisson)
poisson2 = getattr(poisson2, 'best_estimator_', poisson2)
gamma = getattr(gamma, 'best_estimator_', gamma)
# Two-part model: frequency x severity.
model_scores['freq-sev-glm'] = eval_freq_sev_model(poisson, gamma)
# Three-part model: claim indicator x conditional frequency x severity.
model_scores['ind-freq-sev-glm'] = eval_freq_sev_model(poisson2, gamma, binom)

In [38]:
# Frequency-severity ensemble again, with LightGBM models in place of the GLMs
params = dict(
    n_estimators=Integer(50, 1000),
    num_leaves=Integer(10, 62),
    learning_rate=Real(1e-7, 1.0, 'uniform'),
    min_split_gain=Real(0.0, 20.0, 'uniform'),
    subsample=Real(0.5, 1.0, 'uniform'),
    colsample_bytree=Real(0.5, 1.0, 'uniform'),
    reg_lambda=Real(0.0, 20.0, 'uniform'),
)

# Disabled experiment (three-part model), kept here as a description only:
#   lgbm_ind    - LGBMClassifier(subsample_freq=1, n_jobs=1) searched over the same
#                 `params`, weighted by exposure, fitted on y_train['claim_ind']
#   lgbm_count2 - the Poisson LGBMRegressor below, but fitted only on the
#                 claim_mask rows (weights and target masked the same way)
#   scored as model_scores['ind-freq-sev-lgbm'] =
#                 eval_freq_sev_model(lgbm_count2, lgbm_sev, lgbm_ind)

# Frequency: Poisson objective on claims per unit exposure, all training rows.
lgbm_count = BayesSearchCV(
    LGBMRegressor(objective='poisson', subsample_freq=1, n_jobs=1),
    params,
    n_iter=32,
    cv=5,
    fit_params={'sample_weight': y_train['exposure']},
    n_jobs=-1,
)
lgbm_count.fit(X_train, y_train['frequency'])

# Severity: Gamma objective on average cost per claim, claim rows only.
lgbm_sev = BayesSearchCV(
    LGBMRegressor(objective='gamma', subsample_freq=1, n_jobs=1),
    params,
    n_iter=32,
    cv=5,
    fit_params={'sample_weight': y_train['exposure'][claim_mask]},
    n_jobs=-1,
)
lgbm_sev.fit(X_train[claim_mask], y_train['avg_cost'][claim_mask])

# Keep only the refitted best estimators and score the two-part ensemble.
lgbm_count = lgbm_count.best_estimator_
lgbm_sev = lgbm_sev.best_estimator_
model_scores['freq-sev-lgbm'] = eval_freq_sev_model(lgbm_count, lgbm_sev)

In [56]:
# Flatten model_scores into one row per model, best normalized test Gini first
df_scores = pd.DataFrame(
    [
        {
            'model': name,
            'train_gini': scores['train'],
            'test_gini': scores['test'],
            'train_gini_norm': scores['norm_train'],
            'test_gini_norm': scores['norm_test'],
        }
        for name, scores in model_scores.items()
    ],
    # Explicit column list keeps the layout even when no model has been scored.
    columns=['model', 'train_gini', 'test_gini', 'train_gini_norm', 'test_gini_norm'],
)
df_scores = df_scores.sort_values('test_gini_norm', ascending=False)
print(df_scores[['model', 'train_gini_norm', 'test_gini_norm']])

               model  train_gini_norm  test_gini_norm
3            tweedie         0.235890        0.228799
14        lgbm-tuned         0.904487        0.224042
2              ridge         0.235182        0.221190
1             linreg         0.235204        0.221186
13      tweedie-lgbm         0.641960        0.200642
7           adaboost         0.336690        0.199356
12   tweedie-xgboost         0.299730        0.196280
0              dummy         0.151139        0.181505
10           xgboost         0.327906        0.181505
18         xgb-tuned         0.929870        0.181505
17     freq-sev-lgbm         0.763178        0.100128
9           hist_gbm         0.216058        0.081064
6      random_forest         0.973409        0.027838
5        extra_trees         0.994418       -0.011462
4      decision_tree         0.994418       -0.011746
8                gbm        -0.071910       -0.013380
11              lgbm         0.190758       -0.025777
15      freq-sev-glm        

In [88]:
%matplotlib qt
fig, ax = plt.subplots(figsize=(13, 9))
ind = np.arange(df_scores.shape[0])
width = 0.35
ax.bar(ind, df_scores['train_gini_norm'], width, bottom=0, label='Train MSE')
ax.bar(ind + width, df_scores['test_gini_norm'], width, bottom=0, label='Test MSE')

ax.set_title('Train/Test Normalized Gini by Model')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(df_scores['model'], rotation=90.0)
ax.legend()
ax.autoscale_view()
plt.show()

In [67]:
# We should do CV rather than holding out to test model performance
# Let's just use some hand-tuned lgbm and tweedie models

In [75]:
# Repeated 10-fold CV (5 repeats) of a hand-tuned Tweedie GLM, scored by
# normalized Gini. The splitter is seeded (same seed as the train/test split)
# so the mean score is reproducible and the folds can be matched by any other
# model evaluated with the same splitter settings.
# NOTE(review): the target here is claim_cost, whereas the hold-out
# experiments above fit pure_premium -- confirm this is intentional.
tweedie_glm = make_pipeline(StandardScaler(), TweedieRegressor(alpha=0.1, power=1.9))
tweedie_cv = cross_val_score(tweedie_glm,
                             X,
                             y['claim_cost'],
                             scoring=make_scorer(normalized_gini),
                             cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=123),
                             n_jobs=-1,
                             fit_params={'tweedieregressor__sample_weight': y['exposure']})
print(np.mean(tweedie_cv))

0.10804349444306859


In [87]:
# Repeated 10-fold CV (5 repeats) of a hand-tuned Tweedie LightGBM model,
# scored by normalized Gini. The splitter is seeded (same seed as the
# train/test split) so the mean score is reproducible and the folds can be
# matched by any other model evaluated with the same splitter settings.
# NOTE(review): the target here is claim_cost, whereas the hold-out
# experiments above fit pure_premium -- confirm this is intentional.
lgbm_base = LGBMRegressor(n_estimators=100,
                          num_leaves=31,
                          objective='tweedie',
                          tweedie_variance_power=1.5,
                         )
lgbm_cv = cross_val_score(lgbm_base,
                          X,
                          y['claim_cost'],
                          scoring=make_scorer(normalized_gini),
                          cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=123),
                          n_jobs=-1,
                          fit_params={'sample_weight': y['exposure']})
print(np.mean(lgbm_cv))

-0.0034017891363120967
