In [38]:
############################ PREPARATION ############################

#-------------------------- import packages --------------------------
import random
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from itertools import combinations, product
import pyreadr
import seaborn as sns
import copy

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Multiply, Add, Embedding, Reshape, Concatenate, Dropout, BatchNormalization, Lambda, Layer, CategoryEncoding, Activation
from keras.constraints import Constraint
from keras.callbacks import EarlyStopping
from keras.initializers import Zeros, Constant
from keras.models import clone_model
import keras_tuner as kt
from keras import backend as K
from keras import regularizers
from keras.utils import plot_model
from keras.losses import Poisson, Loss
from keras.metrics import MeanAbsoluteError, RootMeanSquaredError

import xgboost as xgb
from scipy.stats import uniform, randint
from pygam import PoissonGAM, GAM, s, f, l, te # s for spline, f for factor, l for linear, te for tensor product
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer
from interpret.glassbox import ExplainableBoostingRegressor
import joblib

import warnings
warnings.filterwarnings('ignore') # suprress warnings

In [39]:
#-------------------------- for reproducibility --------------------------
seed_value = 2000
keras.utils.set_random_seed(seed_value)
np.random.seed(seed_value)

In [40]:
#-------------------------- import data --------------------------
''' Belgian motor third-party liability dataset from the CAS collection '''
r_data = pyreadr.read_r("data/beMTPL97.rda")
data = r_data['beMTPL97']
data.head() # view data snapshot

Unnamed: 0,id,expo,claim,nclaims,amount,average,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,postcode,long,lat
0,1,1.0,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386
1,2,1.0,0,0,0.0,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386
2,3,1.0,0,0,0.0,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386
3,4,1.0,0,0,0.0,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386


In [41]:
############################ DATA UNDERSTANDING & CLEANING ############################

#-------------------------- high-level checking --------------------------
print(data.shape) # number of rows and columns
print(data.info()) # check data type and missing values

(163212, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163212 entries, 0 to 163211
Data columns (total 18 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   id        163212 non-null  int32   
 1   expo      163212 non-null  float64 
 2   claim     163212 non-null  category
 3   nclaims   163212 non-null  int32   
 4   amount    163212 non-null  float64 
 5   average   18276 non-null   float64 
 6   coverage  163212 non-null  category
 7   ageph     163212 non-null  int32   
 8   sex       163212 non-null  category
 9   bm        163212 non-null  int32   
 10  power     163212 non-null  int32   
 11  agec      163212 non-null  int32   
 12  fuel      163212 non-null  category
 13  use       163212 non-null  category
 14  fleet     163212 non-null  category
 15  postcode  163212 non-null  int32   
 16  long      163212 non-null  float64 
 17  lat       163212 non-null  float64 
dtypes: category(6), float64(5), int32(7)
memory usage: 

In [42]:
#-------------------------- filter claim data --------------------------
claim = data.drop(columns = ["id","claim","amount","average","postcode"]).reset_index(drop = True) # filter positive claims and remove unnecessary columns
print(f"{claim.shape[0]} rows and {claim.shape[1]} columns")
claim.head()

163212 rows and 13 columns


Unnamed: 0,expo,nclaims,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,long,lat
0,1.0,1,TPL,50,male,5,77,12,gasoline,private,0,4.355223,50.845386
1,1.0,0,TPL+,64,female,5,66,3,gasoline,private,0,4.355223,50.845386
2,1.0,0,TPL,60,male,0,70,10,diesel,private,0,4.355223,50.845386
3,1.0,0,TPL,77,male,0,57,15,gasoline,private,0,4.355223,50.845386
4,0.046575,1,TPL,28,female,9,70,7,gasoline,private,0,4.355223,50.845386


In [43]:
#-------------------------- define variable list --------------------------
# get the categorical variables
cat_vars = ["coverage", "sex", "fuel", "use", "fleet"]
num_vars = ["ageph", "bm", "power", "agec", "long", "lat"] # numerical variables
all_vars = ["expo"] + cat_vars + num_vars

In [44]:
#-------------------------- train-test split --------------------------
target_var = "nclaims"
X_train, X_test, y_train, y_test = train_test_split(
    claim[all_vars], claim[target_var], test_size = 0.2, random_state = 1)

In [45]:
#-------------------------- variable encoding --------------------------
# define transformer
ct = make_column_transformer(
    ("passthrough", ["expo"]),
    (OrdinalEncoder(), cat_vars),
    remainder = StandardScaler(),
    verbose_feature_names_out = False
)

# fit & transform
train = ct.fit_transform(X_train)
test = ct.transform(X_test)
feature_names = ct.get_feature_names_out()  # get the columns' names
feature_names

array(['expo', 'coverage', 'sex', 'fuel', 'use', 'fleet', 'ageph', 'bm',
       'power', 'agec', 'long', 'lat'], dtype=object)

In [46]:
# load model if already exists
gbm_name = "gbm_beMTPL97.joblib"
if os.path.exists(gbm_name):
    model_gbm = joblib.load(gbm_name)
    print(f"Model loaded from '{gbm_name}'")
else:
    # initialize XGBoost
    xgb_reg = xgb.XGBRegressor(
        objective = 'count:poisson',
        seed = 2024,
        subsample = 0.8,
        verbosity = 0  # Suppress warnings
    )

    # tuning hyperparamters
    param_dist = {
        'eta': uniform(0.01, 0.2), # learning rate (0.01 to 0.21)
        'max_depth': randint(3, 11), # maximum tree depth (3 to 10)
        'n_estimators': randint(100, 501)  # number of trees (100 to 500)
    }

    # set up random search
    random_search_gbm = RandomizedSearchCV(
        estimator = xgb_reg,
        param_distributions = param_dist,
        n_iter = 50,  # Number of parameter settings sampled
        scoring = 'neg_mean_poisson_deviance',
        cv = 5,
        verbose = 1,
        random_state = 2024,
        n_jobs = -1 # using all processors for parallel processing
    )

    # perform the search
    random_search_gbm.fit(train[:,1:], y_train/train[:,0])
    model_gbm = random_search_gbm.best_estimator_ # get the best model

    # -------------------------- Best Parameters and Score --------------------------
    print("Best Hyperparameters:", random_search_gbm.best_params_)
    print("Best Gamma Log Likehood:", random_search_gbm.best_score_)
    print(f"New model fitted and saved to '{gbm_name}'")
    joblib.dump(model_gbm, gbm_name)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'eta': 0.051003790478857766, 'max_depth': 4, 'n_estimators': 416}
Best Gamma Log Likehood: -0.751399843710188
New model fitted and saved to 'gbm_beMTPL97.joblib'


In [47]:
# -------------------------- evaluation --------------------------
gbm_pred = model_gbm.predict(test[:,1:]) # get the prediction

# get metrics on test data
gbm_nll = Poisson()(y_test, gbm_pred*test[:,0]).numpy()
gbm_rmse = np.sqrt(mse(y_test, gbm_pred*test[:,0]))
gbm_mae = mae(y_test, gbm_pred*test[:,0])
print(f"Poisson loss on test set: {gbm_nll}")
print(f"RMSE on test set: {gbm_rmse}")
print(f"MAE on test set: {gbm_mae}")

Poisson loss on test set: 0.37979942560195923
RMSE on test set: 0.370558859009979
MAE on test set: 0.22443260044576238
