In [7]:
############################ 0. PREPARATION ############################

#-------------------------- import packages --------------------------
import random
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from itertools import combinations, product
import pyreadr
import seaborn as sns
import copy

import tensorflow as tf
import tensorflow_lattice as tfl
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Multiply, Add, Embedding, Reshape, Concatenate, Dropout, BatchNormalization, Lambda, Layer, CategoryEncoding, Activation
from keras.constraints import Constraint
from keras.callbacks import EarlyStopping
from keras.initializers import Zeros, Constant
from keras.optimizers.legacy import Adam, Nadam, RMSprop
from keras.models import clone_model
import keras_tuner as kt
from keras import backend as K
from keras import regularizers
from keras.utils import plot_model
from keras.losses import Poisson, Loss
from keras.metrics import MeanAbsoluteError, RootMeanSquaredError

# import xgboost as xgb
from scipy.stats import uniform, randint
from pygam import GammaGAM, GAM, s, f, l, te # s for spline, f for factor, l for linear, te for tensor product
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer
from interpret.glassbox import ExplainableBoostingRegressor

In [None]:
#-------------------------- import data --------------------------
'''we use the popular French Motor TPL insurance claim data '''
# Read the R data file. The result is indexed below by the R object name.
# NOTE(review): the path is relative, so this presumably needs the kernel's
# working directory to contain the `data/` folder - confirm.
r_data = pyreadr.read_r("data/freMPL1.rda")
# Pull out the `freMPL1` object; `data` is the raw table that the
# preprocessing section filters down to claim rows.
data = r_data['freMPL1']

In [2]:
############################ 1. PREPROCESSING ############################

#-------------------------- filter claim data --------------------------

# Keep only the rows whose claim indicator is set (ClaimInd == 1), drop the
# columns marked as unnecessary (the record begin/end fields and the indicator
# itself, which is constant after the filter), and rebuild the index so that
# `claim` is numbered 0..n-1.
claim = (
    data[data["ClaimInd"] == 1]
    .drop(columns=["RecordBeg", "RecordEnd", "ClaimInd"])
    .reset_index(drop=True)
)
# To preview the result, run `claim.head()` as the last line of a cell of its
# own: Jupyter only renders the final expression of a cell, so a mid-cell
# `claim.head()` produces no output.

# global variables
cont_vars = ["LicAge", "DrivAge", "BonusMalus"]  # continuous variables
ordinal_vars = ["RiskVar", "VehAge", "VehPrice", "VehMaxSpeed"]  # ordinal categorical variables

# nominal categorical variables: every other column of `claim`, in column
# order, once the continuous and ordinal variables, `Exposure` and
# `ClaimAmount` have been set aside
nominal_vars = [
    col
    for col in claim.columns
    if col not in cont_vars + ["Exposure", "ClaimAmount"] + ordinal_vars
]

# all categorical variables, nominal first and ordinal last
cat_vars = nominal_vars + ordinal_vars


#-------------------------- cap outliers for ClaimAmount --------------------------
# Cap ClaimAmount at the box-plot whiskers of its logarithm: log-amounts
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are pulled to the nearest whisker and
# the result is mapped back to the original scale.
# NOTE(review): np.log gives -inf / NaN for amounts <= 0, so this assumes every
# ClaimAmount in `claim` is strictly positive - confirm against the data.
log_claim = np.log(claim['ClaimAmount'])
IQR = log_claim.quantile(0.75) - log_claim.quantile(0.25) # inter-quartile range
lower_whisker = log_claim.quantile(0.25) - 1.5*IQR
upper_whisker = log_claim.quantile(0.75) + 1.5*IQR

# apply caps: a single clip in log space enforces both bounds at once, so each
# value is exponentiated exactly once and the column is written only once.
# `log_claim` itself is left uncapped.
claim['ClaimAmount'] = np.exp(log_claim.clip(lower=lower_whisker, upper=upper_whisker))


# -------------------------- cap outliers for BonusMalus --------------------------
# Values above 150 are capped at 150; no rows are removed. Values at or below
# 150 are left as they are.
claim["BonusMalus"] = claim["BonusMalus"].clip(upper=150)