In [1]:
############################ PREPARATION ############################

#-------------------------- import packages --------------------------
import random
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from itertools import combinations, product
import pyreadr
import seaborn as sns
import copy

import tensorflow as tf
import tensorflow_lattice as tfl
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Multiply, Add, Embedding, Reshape, Concatenate, Dropout, BatchNormalization, Lambda, Layer, CategoryEncoding, Activation
from keras.constraints import Constraint
from keras.callbacks import EarlyStopping
from keras.initializers import Zeros, Constant
from keras.optimizers.legacy import Adam, Nadam, RMSprop
from keras.models import clone_model
import keras_tuner as kt
from keras import backend as K
from keras import regularizers
from keras.utils import plot_model
from keras.losses import Poisson, Loss
from keras.metrics import MeanAbsoluteError, RootMeanSquaredError

# import xgboost as xgb
from scipy.stats import uniform, randint
from pygam import PoissonGAM, GAM, s, f, l, te # s for spline, f for factor, l for linear, te for tensor product
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer
from interpret.glassbox import ExplainableBoostingRegressor

import warnings
# NOTE(review): with no category/module argument this filter hides every warning
# raised from here on (deprecations, numerical/convergence notices, ...);
# consider narrowing it if a warning is ever needed for debugging.
warnings.filterwarnings('ignore') # suppress warnings

In [2]:
#-------------------------- for reproducibility --------------------------
# A single seed shared by every random number generator used in the notebook.
seed_value = 2024

# Seed NumPy's global generator explicitly ...
np.random.seed(seed_value)
# ... and let Keras apply the same seed through its own helper (documented to
# cover Python's `random`, NumPy and TensorFlow in one call).
keras.utils.set_random_seed(seed_value)

In [3]:
#-------------------------- import data --------------------------
# Belgian motor third-party liability dataset from the CAS collection.
# The .rda file is read into a mapping of R object name -> DataFrame; the
# table of interest is stored under the key 'beMTPL16'.
r_data = pyreadr.read_r("data/beMTPL16.rda")
data = r_data['beMTPL16']

# view data snapshot (first five rows)
data.head(5)

Unnamed: 0,insurance_contract,policy_year,exposure,insured_birth_year,vehicle_age,policy_holder_age,driver_license_age,vehicle_brand,vehicle_model,mileage,vehicle_power,catalog_value,claim_value,number_of_liability_claims,number_of_bodily_injury_liability_claims,claim_time,claim_responsibility_rate,driving_training_label,signal
0,C1,1,0.386301,1945,10,9,40,MERCEDES,ME-1245,30000,75,983732,2,0,0,00:00,0,No,0
1,C2,1,0.493151,1941,4,25,24,VOLKSWAGEN,VO-2461,30000,55,510562,8,0,0,07:45,0,No,0
2,C3,1,0.290411,1944,0,2,39,AUDI,AU-967,30000,120,1934768,10,0,0,00:00,0,No,0
3,C4,1,0.336986,1948,1,14,37,LANCIA,LA-2346,30000,51,536755,13,0,0,18:50,0,No,0
4,C5,1,0.219178,1928,3,7,59,CITROEN,CI-1258,30000,54,446725,14,0,0,00:00,100,No,0


In [None]:
############################ DATA UNDERSTANDING & CLEANING ############################

#-------------------------- high-level checking --------------------------
print(data.shape) # number of rows and columns
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" to the output.
data.info() # check data type and missing values

(70791, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70791 entries, 0 to 70790
Data columns (total 19 columns):
 #   Column                                    Non-Null Count  Dtype   
---  ------                                    --------------  -----   
 0   insurance_contract                        70791 non-null  category
 1   policy_year                               70791 non-null  int32   
 2   exposure                                  70791 non-null  float64 
 3   insured_birth_year                        70791 non-null  int32   
 4   vehicle_age                               70791 non-null  int32   
 5   policy_holder_age                         70791 non-null  int32   
 6   driver_license_age                        70791 non-null  int32   
 7   vehicle_brand                             70791 non-null  category
 8   vehicle_model                             70791 non-null  category
 9   mileage                                   70791 non-null  int32   
 10  vehicle_po

In [7]:
# Inspect the contract identifier column (all rows, single column) to see how
# many distinct contracts there are compared with the number of rows.
data.loc[:, "insurance_contract"]

0            C1
1            C2
2            C3
3            C4
4            C5
          ...  
70786    C58720
70787    C58721
70788    C58722
70789    C52079
70790    C58723
Name: insurance_contract, Length: 70791, dtype: category
Categories (58723, object): ['C1', 'C10', 'C100', 'C1000', ..., 'C9996', 'C9997', 'C9998', 'C9999']

In [9]:
# Number of rows recorded for each insurance contract, most frequent first.
# Despite its name, `duplicate_values` holds the count for every contract;
# index it with `duplicate_values > 1` to keep only contracts that repeat.
duplicate_values = data['insurance_contract'].value_counts(sort=True, ascending=False)
duplicate_values

insurance_contract
C30268    6
C11300    6
C13606    5
C569      5
C937      5
         ..
C30437    1
C30438    1
C3044     1
C30440    1
C9999     1
Name: count, Length: 58723, dtype: int64