In [137]:
############################ 0. PREPARATION ############################

#-------------------------- import packages --------------------------
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Multiply, Add
from keras.callbacks import EarlyStopping
from keras.utils import plot_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error as mse
from sklearn.compose import make_column_transformer
from NAM_models import ActivationLayer, FeatureNN, NAM

#-------------------------- import data --------------------------
'''we use the popular French Motor TPL insurance claim data '''
freq = pd.read_csv("data/freMTPL2freq.csv")
sev = pd.read_csv("data/freMTPL2sev.csv")

In [138]:
############################ 1. PREPROCESSING ############################

random.seed(2000)

#-------------------------- merge/filter claim data --------------------------
# complete claim severity data
claimsev = sev.merge(freq, on = 'IDpol', how = 'left')
claimsev = claimsev.drop(columns = ['ClaimNb', 'Exposure'])

# drop ID
claimfreq = freq.drop(columns = "IDpol")


#-------------------------- subsample and split --------------------------
X_train, X_test, y_train, y_test = train_test_split(
    claimfreq.drop("ClaimNb", axis = 1), claimfreq["ClaimNb"], random_state = 2024)

In [139]:
#-------------------------- feature transformation --------------------------
# define transformer
ct = make_column_transformer(
    ("passthrough", ["Exposure"]),
    (OrdinalEncoder(), ["Area", "VehGas"]),
    (OneHotEncoder(), ["VehBrand", "Region"]),
    remainder = StandardScaler(),
    verbose_feature_names_out = False
)


# fit & transform
train = ct.fit_transform(X_train).toarray()
test = ct.transform(X_test).toarray()
feature_names = ct.get_feature_names_out()  # get the columns' names


# number of columns for each feature after transformation
feature_expansion = {} # empty dictionary to store the output
for original_feature in X_train.columns:
    # For each original feature, count how many transformed feature names start with it
    count = sum(fn.startswith(original_feature) for fn in feature_names)
    feature_expansion[original_feature] = count
feature_expansion

{'Exposure': 1,
 'VehPower': 1,
 'VehAge': 1,
 'DrivAge': 1,
 'BonusMalus': 1,
 'VehBrand': 11,
 'VehGas': 1,
 'Area': 1,
 'Density': 1,
 'Region': 21}

In [140]:
############################ 2. NAM ############################

#-------------------------- define functions to create NAM and subnets --------------------------
'''create subnetwork for each feature/group of features'''
def create_subnet(input_shape):
    model = Sequential([
        Dense(64, activation = 'relu'),
        Dense(32, activation = 'relu'),
        Dense(1, activation = "exponential")  # output layer for the subnet, assuming a scalar output
    ])
    return model


'''create NAM'''
def create_nam_model(feature_expansion):
    inputs = [] # store the input layer of each subnet 
    sub_outputs = [] # store the output of each subnetwork
    for column in feature_expansion:
        input_layer = Input(shape = (feature_expansion[column],))
        inputs.append(input_layer)
        
        if column == "Exposure":
            # assume the first input is the exposure, we directly use it without a subnet
            exposure_input = input_layer
        else:
            # create subnet for each feature/group of features
            subnet = create_subnet(feature_expansion[column])
            sub_output = subnet(input_layer)
            sub_outputs.append(sub_output)
    
    # concatenate the outputs of the subnets and sum them
    sum_of_subs = Add()(sub_outputs)
    
    # multiply the exposure input by the sum of subnets' outputs
    final_output = Multiply()([exposure_input, sum_of_subs])
    
    # final model
    model = Model(inputs = inputs, outputs = final_output)
    return model

In [141]:
#-------------------------- fit a NAM to training data --------------------------

model_NAM = create_nam_model(feature_expansion) # create NAM
model_NAM.compile(optimizer = "adam", loss = "poisson", metrics = ['mean_squared_error', 'poisson'])
es = EarlyStopping(patience = 5, restore_best_weights = True, verbose = 1)

# training data need to be split into different arrays, each correponds to input for a particular subnet
start = 0
X_train_split = []
for size in feature_expansion.values():
    end = start + size
    X_train_split.append(train[:, start:end])
    start = end

# fit the model
model_NAM.fit(X_train_split, y_train, epochs = 50, batch_size = 32, callbacks = [es], validation_split = 0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 12: early stopping


0.2061309516429901

In [142]:
#-------------------------- plot model --------------------------
plot_model(model_NAM, show_layer_names = True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [151]:
#-------------------------- evaluate on testing data --------------------------

# split testing data into different sets
start = 0
X_test_split = []
for size in feature_expansion.values():
    end = start + size
    X_test_split.append(test[:, start:end])
    start = end

# evaluation
model_NAM.evaluate(X_test_split, y_test)



[0.20545218884944916, 0.057174332439899445, 0.20545218884944916]

In [144]:
############################ 3. GLM ############################

#-------------------------- preprocessing --------------------------
'''we want to use dummy encoding for VehBrand and Region so 2 columns need
to be removed from both training and test data. These 2 columns are the reference
levels for VehBrand and Region. Choose B12 for VehBrand and Centre for Region.'''

# separate the offset term (or exposure)
offset_train = train[:, 0]
offset_test = test[:, 0]

# index for reference level in feature_names
ref_index = [np.where(feature_names == "VehBrand_B12")[0][0],
             np.where(feature_names == "Region_Centre")[0][0]]
ref_index.append(0) # Exposure

# remove reference levels
train_dummy = np.delete(train, ref_index, axis = 1)
test_dummy = np.delete(test, ref_index, axis = 1)
feature_dummy = [feature for i, feature in enumerate(feature_names) if i not in ref_index]

# add a constant to the model (intercept)
train_dummy = sm.add_constant(train_dummy)
test_dummy = sm.add_constant(test_dummy)

In [145]:
#-------------------------- fit a GLM model --------------------------
model_GLM = sm.GLM(y_train, train_dummy, family = sm.families.Poisson(), offset = offset_train)
results = model_GLM.fit()
# results.summary()

In [146]:
#-------------------------- evaluation --------------------------
GLM_pred = results.predict(test_dummy, offset = offset_test)
mse_GLM = mse(y_test, GLM_pred)
print(mse_GLM)

0.056988671710803546


In [156]:
############################ 4. NEURAL NETWORKS ############################

#-------------------------- preprocessing --------------------------
# separate exposure with other features
train_exposure = train[:,0]
test_exposure = test[:,0]
train_NN = train[:,1:]
test_NN = test[:,1:]

In [152]:
#-------------------------- fit a deep NN --------------------------

'''Define model architecture'''
# input layers
exposure = Input(shape=(1,))
other_inputs = Input(shape = train_NN.shape[1:])

# dense layers
x = Dense(64, "relu")(other_inputs)
x = Dense(64, "relu")(x)
lambda_ = Dense(1, "exponential")(x)

# final output
out = Multiply()([lambda_, exposure])
model_NN = Model([exposure, other_inputs], out)

# regularization
es = EarlyStopping(patience = 5, restore_best_weights = True, verbose = 0)

# compile the model
model_NN.compile(optimizer = "adam",
              loss = "poisson", 
              metrics = ['mean_squared_error', 'poisson'])

# fit
model_NN.fit([train_exposure, train_NN], y_train,
            epochs = 50,  
            batch_size = 32, 
            validation_split = 0.2,
            callbacks = [es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50


<keras.src.callbacks.History at 0x31b6a5050>

In [157]:
#-------------------------- evaluate on test set --------------------------
model_NN.evaluate([test_exposure, test_NN], y_test)



[0.20424488186836243, 0.05705279856920242, 0.20424488186836243]