In [1]:
# Importing our packages
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import TweedieRegressor

In [2]:
# Load the training data, one-hot encode the categorical columns, derive the
# candidate response columns, then split into features (X) and responses (y)
df = (
    pd.get_dummies(
        pd.read_csv('InsNova_train.csv'),
        columns=['veh_body', 'veh_age', 'gender', 'area', 'dr_age'],
    )
    .assign(
        # claim cost per unit of exposure
        pure_premium=lambda d: d['claim_cost'] / d['exposure'],
        # cost per claim; the denominator is floored at 1 so rows with a
        # claim_count of 0 do not divide by zero
        avg_cost=lambda d: d['claim_cost'] / np.fmax(d['claim_count'], 1),
        # claims per unit of exposure
        frequency=lambda d: d['claim_count'] / d['exposure'],
    )
)
response_cols = ['exposure', 'claim_ind', 'claim_count', 'claim_cost', 'pure_premium', 'avg_cost', 'frequency']
y = df[response_cols]
# Features are everything that is neither a response column nor the row id
X = df.drop(columns=response_cols + ['id'])

In [3]:
def gini(y_true, y_pred):
    """Normalized Gini coefficient of `y_pred` as a ranking of `y_true`.

    The true values are ranked from largest to smallest twice: once by
    themselves and once by the predictions. For each ranking the cumulative
    share of the true values (a Lorenz curve) is compared against the uniform
    diagonal, and the summed gap of the prediction ranking is divided by the
    summed gap of the true ranking.

    Both arguments must expose an identical `.shape`.
    """
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # column 0 holds the true values, column 1 the predictions
    paired = np.array([y_true, y_pred]).transpose()
    # uniform reference curve: 1/n, 2/n, ..., 1
    diagonal = np.linspace(1/n_samples, 1, n_samples)

    def summed_gap(key_col):
        # true values ordered by descending `key_col`
        ranked = paired[paired[:, key_col].argsort()][::-1, 0]
        # Lorenz curve; the denominator is floored at 1.0
        lorenz = np.cumsum(ranked) / np.fmax(1.0, np.sum(ranked))
        return np.sum(diagonal - lorenz)

    gap_true = summed_gap(0)
    gap_pred = summed_gap(1)

    # normalize by the gap of the perfect (true-value) ranking
    return gap_pred/gap_true

In [4]:
def eval_model(model, X, y, f_param='', n_repeats=5, n_splits=5, random_state=0):
    """Score `model` by Gini on repeated, reshuffled out-of-fold predictions.

    The model is fit on `y['pure_premium']` with `y['exposure']` as sample
    weight. Out-of-fold predictions are multiplied by exposure and compared
    against `y['claim_cost']` with `gini`.

    Parameters
    ----------
    model : estimator passed to `cross_val_predict`.
    X : feature matrix.
    y : frame providing the 'pure_premium', 'exposure' and 'claim_cost' columns.
    f_param : prefix put in front of 'sample_weight' in the fit parameters
        (e.g. 'tweedieregressor__' when the regressor sits inside a pipeline).
    n_repeats : number of differently shuffled K-fold partitions to score.
    n_splits : number of folds per partition.
    random_state : seed for the shuffling, so the scores are reproducible.

    Returns
    -------
    list with one Gini score per repeat.
    """
    # An integer `cv` is an unshuffled K-fold for a regressor, so looping over
    # it repeats the same folds every time. RepeatedKFold reshuffles the rows
    # for each repeat instead.
    splitter = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    splits = list(splitter.split(X))
    exposure = np.asarray(y['exposure'])

    scores = []
    # Each consecutive run of `n_splits` splits is one complete partition of
    # the rows, which is what cross_val_predict requires.
    for start in range(0, len(splits), n_splits):
        # NOTE: newer scikit-learn releases rename `fit_params` to `params`.
        preds = cross_val_predict(model, X, y['pure_premium'],
                                  cv=splits[start:start + n_splits],
                                  n_jobs=-1,
                                  fit_params={f_param + 'sample_weight': y['exposure']})
        # pure premium * exposure = predicted claim cost
        scores.append(gini(y['claim_cost'], preds * exposure))
    return scores

In [5]:
# Training a base Tweedie model: pure premium is the target and exposure is
# the sample weight; each fold is scored with the Gini metric.
# Alternative: eval_model(base_model, X, y) scores exposure-scaled
# out-of-fold predictions against claim_cost instead.
base_model = TweedieRegressor(power=1.5)
base_cv = cross_val_score(base_model,
                          X,
                          y['pure_premium'],
                          scoring=make_scorer(gini),
                          # fixed seed so the shuffled folds, and therefore
                          # the mean printed below, are reproducible
                          cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
                          n_jobs=1,
                          fit_params={'sample_weight': y['exposure']})
print(np.mean(base_cv))

0.12823672930929708


In [6]:
# Training a tuned model: grid-search the Tweedie regularisation strength
# (alpha) and variance power on standardized features.
tuned_model = GridSearchCV(make_pipeline(StandardScaler(),
                                         TweedieRegressor()),
                           {'tweedieregressor__alpha': np.linspace(0.1, 1.0, 5),
                            'tweedieregressor__power': np.linspace(1.001, 1.99, 5)},
                           scoring=make_scorer(gini),
                           n_jobs=-1,
                           cv=5)
# Nested CV: the grid search is re-run inside every outer fold.
# Alternative: eval_model(tuned_model, X, y, 'tweedieregressor__') scores
# exposure-scaled out-of-fold predictions against claim_cost instead.
tuned_cv = cross_val_score(tuned_model,
                           X,
                           y['pure_premium'],
                           scoring=make_scorer(gini),
                           # fixed seed so the shuffled outer folds, and the
                           # mean printed below, are reproducible
                           cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
                           n_jobs=1,
                           # the prefix routes the weights to the regressor step
                           fit_params={'tweedieregressor__sample_weight': y['exposure']})
print(np.mean(tuned_cv))
# Wow, that's a pretty lame increase

0.17165983019823725


In [7]:
# Let's just submit with Tweedie: same pipeline and grid as the tuned model,
# but with a 10-fold inner search.
model = GridSearchCV(make_pipeline(StandardScaler(),
                                   TweedieRegressor()),
                     {'tweedieregressor__alpha': np.linspace(0.1, 1.0, 5),
                      'tweedieregressor__power': np.linspace(1.001, 1.99, 5)},
                     scoring=make_scorer(gini),
                     n_jobs=-1,
                     cv=10)
# Nested-CV estimate of the score of the whole search procedure
model_cv = cross_val_score(model,
                           X,
                           y['pure_premium'],
                           scoring=make_scorer(gini),
                           # fixed seed so the shuffled outer folds, and the
                           # mean printed below, are reproducible
                           cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
                           n_jobs=1,
                           fit_params={'tweedieregressor__sample_weight': y['exposure']})

print(np.mean(model_cv))
# Final fit on all training rows; the prefix routes the exposure weights to
# the regressor step of the pipeline
model.fit(X, y['pure_premium'], tweedieregressor__sample_weight=y['exposure'])

0.14448191272278657


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('tweedieregressor',
                                        TweedieRegressor())]),
             n_jobs=-1,
             param_grid={'tweedieregressor__alpha': array([0.1  , 0.325, 0.55 , 0.775, 1.   ]),
                         'tweedieregressor__power': array([1.001  , 1.24825, 1.4955 , 1.74275, 1.99   ])},
             scoring=make_scorer(gini))

In [None]:
# Getting our predictions
df_test = pd.read_csv('InsNova_test.csv')
df_test = pd.get_dummies(df_test, columns=['veh_body', 'veh_age', 'gender', 'area', 'dr_age'])
# The test set is one-hot encoded independently of the training set, so a
# category level present in only one file would give a different set or order
# of columns. Align to the training features: levels unseen in the test set
# become all-zero columns, levels unseen in training are dropped.
X_test = (df_test
          .drop(['exposure', 'id'], axis=1)
          .reindex(columns=X.columns, fill_value=0))
# The model predicts pure premium; multiply by exposure to get claim cost
df_test['claim_cost'] = df_test['exposure'] * model.best_estimator_.predict(X_test)
# NOTE(review): this overwrites the `id` column read from the file with
# 1..n_rows - confirm the submission expects positional ids, not the file's own
df_test['id'] = np.arange(1, df_test.shape[0] + 1)
df_test[['id', 'claim_cost']].to_csv('baseline_predictions.csv', index=False)
tweedie_preds = df_test['claim_cost']