In [1]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score, GroupKFold
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.metrics import make_scorer, mean_poisson_deviance, mean_gamma_deviance
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegressionCV, RidgeCV, PoissonRegressor, GammaRegressor, TweedieRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import statsmodels.api as sm
pd.set_option('display.max_rows', 500)

In [2]:
# Importing all our data
df = pd.read_csv('InsNova_train.csv')
df = df.sample(frac=1.0)
df.loc[:, 'pure_premium'] = df['claim_cost'] / df['exposure']
df.loc[:, 'severity'] = df['claim_cost'] / np.fmax(df['claim_count'], 1)
df.loc[:, 'frequency'] = df['claim_count'] / df['exposure']

# Getting CV inds
cv = StratifiedKFold(15, shuffle=True, random_state=123)
df.loc[:, 'fold'] = 0
for fold, (_, test_inds) in enumerate(cv.split(df, df['claim_ind'])):
    df.loc[test_inds, 'fold'] = fold
    
# Prepping box-cox transformers
box_cox = {}
# Frequency
df.loc[:,'bc_frequency'] = df['frequency'].copy()
box_cox['frequency'] = PowerTransformer(method='box-cox', standardize=False)
df.loc[df['frequency'] > 0.0, 'bc_frequency'] = box_cox['frequency'].fit_transform(df.loc[df['frequency'] > 0.0, 'frequency'].values.reshape(-1, 1)).flatten()
# Severity
df.loc[:,'bc_severity'] = df['severity'].copy()
box_cox['severity'] = PowerTransformer(method='box-cox', standardize=False)
df.loc[df['severity'] > 0.0, 'bc_severity'] = box_cox['severity'].fit_transform(df.loc[df['severity'] > 0.0, 'severity'].values.reshape(-1, 1)).flatten()
# Pure premium
df.loc[:,'bc_pure_premium'] = df['pure_premium'].copy()
box_cox['pure_premium'] = PowerTransformer(method='box-cox', standardize=False)
df.loc[df['pure_premium'] > 0.0, 'bc_pure_premium'] = box_cox['pure_premium'].fit_transform(df.loc[df['pure_premium'] > 0.0, 'pure_premium'].values.reshape(-1, 1)).flatten()

# Splitting into pred/response
sev_mask = df['claim_ind'] == 1
response_cols = ['fold',
                 'exposure',
                 'claim_ind',
                 'claim_count',
                 'claim_cost',
                 'pure_premium',
                 'severity',
                 'frequency',
                 'bc_pure_premium',
                 'bc_severity',
                 'bc_frequency']
X, y = df.drop(response_cols, axis=1), df[response_cols]
X = X.drop('id', axis=1)

# Adding a condensed veh_body column
other_bodies = ['TRUCK', 'COUPE', 'MIBUS', 'PANVN', 'BUS', 'RDSTR', 'MCARA', 'CONVT']
X.loc[:,'veh_body2'] = np.where(X['veh_body'].isin(other_bodies), 'OTHER', X['veh_body'])
X.loc[:,'log_veh_value'] = np.log(X['veh_value'] + 0.1)

# Creating Categorical dataset for LightGBM and CatBoost
for i in ['veh_body', 'veh_body2', 'gender', 'area']:
    X[i] = X[i].astype('category')
X_sev = X[y['claim_cost'] > 0.0]
y_sev = y[y['claim_cost'] > 0.0]
lin_cols = ['veh_body', 'veh_age', 'gender', 'area', 'dr_age', 'log_veh_value']
boost_cols = ['veh_value', 'veh_body', 'veh_age', 'gender', 'area', 'dr_age']
    
# Defining column transformers for later steps          
get_cats = make_column_selector(dtype_include=pd.CategoricalDtype)
one_hot = lambda: ColumnTransformer([('one_hot', OneHotEncoder(drop='first', sparse=False), get_cats)], remainder='passthrough')

# Initializing cross validated preds
cv_preds = {}

In [3]:
preprocessor = make_pipeline(ColumnTransformer([('one_hot', OneHotEncoder(drop='first', sparse=False), get_cats)], remainder='passthrough'),
                             StandardScaler())
_X = preprocessor.fit_transform(X[lin_cols])
_X = sm.add_constant(_X, prepend=False)

In [8]:
model = sm.GLM(y['pure_premium'], _X, family=sm.families.Tweedie(var_power=1.2), freq_weights=y['exposure'])
results = model.fit_regularized(maxiter=200, L1_wt=0.0)

In [9]:
results.predict(_X)

array([1., 1., 1., ..., 1., 1., 1.])

In [10]:
results.params

x1       0.0
x2       0.0
x3       0.0
x4       0.0
x5       0.0
x6       0.0
x7       0.0
x8       0.0
x9       0.0
x10      0.0
x11      0.0
x12      0.0
x13      0.0
x14      0.0
x15      0.0
x16      0.0
x17      0.0
x18      0.0
x19      0.0
x20      0.0
x21      0.0
const    0.0
dtype: float64

In [21]:
results.model.estimate_tweedie_power(results.fittedvalues, low=1.01, high=2.5)

2.102456591998465