In [39]:
############################ 0. PREPARATION ############################

#-------------------------- import packages --------------------------
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from itertools import combinations

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Multiply, Add, Embedding, Reshape, Concatenate, Dropout, BatchNormalization, Lambda
from keras.callbacks import EarlyStopping
from keras.initializers import Zeros
from tensorflow.keras.optimizers.legacy import Adam, Nadam # the non-legacy version runs slowly on Mac
import keras_tuner as kt
from keras.utils import plot_model
from tensorflow.keras.losses import Poisson
from scipy.stats import gamma

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error as mse
from sklearn.compose import make_column_transformer
from NAM_models import ActivationLayer, FeatureNN, NAM

from interpret.glassbox import ExplainableBoostingClassifier

#-------------------------- import data --------------------------
'''we use the popular French Motor TPL insurance claim data '''
freq = pd.read_csv("data/freMTPL2freq.csv")
sev = pd.read_csv("data/freMTPL2sev.csv")

In [2]:
############################ 1. PREPROCESSING ############################

#-------------------------- merge/filter claim data --------------------------
# remove outliers
sev_sum = sev.groupby('IDpol').agg(
    ClaimNb_sev = ('IDpol', 'count'),
    Avg_Clamt = ('ClaimAmount', 'mean')
)
sev_sum = sev_sum[np.array(sev_sum["ClaimNb_sev"] <= 4)] # remove outliers and policies that do not appear in freq data (ClaimNb > 16 do not appear in freq data)
freq_sum = freq[np.array(freq["ClaimNb"] <= 4)] # remove outliers

# combine claim frequency and severity
claim = freq_sum.merge(sev_sum, on = 'IDpol', how = 'left')
claim['Avg_Clamt'] = claim['Avg_Clamt'].fillna(0)
claim_final = claim.drop(["IDpol", "ClaimNb_sev"], axis = 1)


#-------------------------- subsample and split --------------------------
# train-test-split
claim_final['ClaimNb_Cat'] = claim_final["ClaimNb"].astype(str)
X_train, X_test, y_train_mult, y_test_mult = train_test_split(
    claim_final.drop(["ClaimNb", "ClaimNb_Cat", "Avg_Clamt"], axis = 1), 
    claim_final[["ClaimNb", "Avg_Clamt"]], 
    stratify = claim_final["ClaimNb_Cat"], # stratify sampling to ensure equal distribution of non-zero claims across train and test sets
    random_state = 2025)

y_train = y_train_mult[["ClaimNb"]].squeeze() # seperate the target for single task and multitask learning
y_test = y_test_mult[["ClaimNb"]].squeeze() # seperate the target for single task and multitask learning
claim_final = claim_final.drop("ClaimNb_Cat", axis = 1)

In [3]:
'''check if claim count is balanced between train and test sets'''
print(sum(y_train)/sum(X_train["Exposure"]))
print(sum(y_test)/sum(X_test["Exposure"]))

0.10054878097157874
0.1002555031162374


In [4]:
#-------------------------- feature transformation --------------------------

# define transformer
ct = make_column_transformer(
    ("passthrough", ["Exposure"]),
    (OrdinalEncoder(), ["Area", "VehGas", "VehBrand", "Region"]),
    remainder = StandardScaler(),
    verbose_feature_names_out = False
)


# transform the data for NAM
train_NAM = ct.fit_transform(X_train)
test_NAM = ct.transform(X_test)
feature_names = ct.get_feature_names_out()  # get the columns' names
print(feature_names)

# # feature summary
# feature_expansion = {} # number of columns for each feature after transformation
# for original_feature in X_train.columns:
#     # For each original feature, count how many transformed feature names start with it
#     count = sum(fn.startswith(original_feature) for fn in feature_names)
#     feature_expansion[original_feature] = count
# print(feature_expansion)

['Exposure' 'Area' 'VehGas' 'VehBrand' 'Region' 'VehPower' 'VehAge'
 'DrivAge' 'BonusMalus' 'Density']


In [5]:
############################ 2. NAM ############################

categorical_var = ["Area", "VehBrand", "Region"]

#-------------------------- define functions to create NAM and subnets --------------------------
'''create subnetwork for each feature/group of features'''
def create_subnet(num_layers, units_per_layer, activation, dropout_rate):
    """create a subnet with configurable layers and neurons."""
    model = Sequential()
    for _ in range(num_layers):
        model.add(Dense(units_per_layer, activation = activation))
        model.add(Dropout(dropout_rate, seed = 2000))
    model.add(Dense(1, activation = 'tanh'))
    return model


'''create NAM'''
def build_NAM(hp):
    inputs = []  # Store input layers
    interaction_inputs = [] # input for modeling pairwise interaction effect
    sub_outputs = []  # Outputs from each subnet

    # hyperparameters shared across all subnets
    num_layers = hp.Int('num_layers', 2, 4)
    units_main = hp.Int('units_main', 10, 25, step = 5) # number of units per layer for main effect
    units_interaction = hp.Int('units_interaction', 15, 30, step = 5) # number of units per layer for interaction effect
    activation = hp.Choice('activation', ['relu', 'leaky_relu', 'swish', 'gelu'])
    dropout_rate = hp.Float('dropout_rate', 0.2, 0.5, step = 0.1)
    embedding_dim = hp.Int("embedding_dimension", 2, 4) # embedding dimension for categorical variables
    lr = hp.Float('learning_rate', min_value = 1e-4, max_value = 1e-2, sampling = 'log') # learning rate for optimizer

    # main effect
    for name in feature_names:
        input_layer = Input(shape = (1,), name = name)
        inputs.append(input_layer)
        
        if name == "Exposure":  # Direct use without a subnet
            exposure_input = input_layer
        elif name in categorical_var:
            # categorical variables will pass through an embedding layer
            embed_layer = Embedding(input_dim = claim_final[name].nunique(), 
                        output_dim = embedding_dim, 
                        name = f"embed_{name}")(input_layer)
            embed_layer_reshape = Reshape(target_shape = (embedding_dim,), name = f"reshape_{name}")(embed_layer)
            interaction_inputs.append(embed_layer_reshape)
            subnet = create_subnet(num_layers, units_main, activation, dropout_rate)
            sub_output = subnet(embed_layer_reshape)
            sub_outputs.append(sub_output)
        else:
            interaction_inputs.append(input_layer)
            subnet = create_subnet(num_layers, units_main, activation, dropout_rate)
            sub_output = subnet(input_layer)
            sub_outputs.append(sub_output)
    
    # interaction effect
    for input_1, input_2 in combinations(interaction_inputs, 2): # we don't include Exposure
        # for categorical input
        interaction_input = Concatenate()([input_1, input_2])
        interaction_subnet = create_subnet(num_layers, units_interaction, activation, dropout_rate)
        interaction_output = interaction_subnet(interaction_input)
        sub_outputs.append(interaction_output)

    # combine subnets' outputs
    subnet_output = Concatenate()(sub_outputs)
    output_layer = Dense(1, activation = "exponential")(subnet_output)

    # multiply output with exposure
    final_output = Multiply()([exposure_input, output_layer])
    model = Model(inputs = inputs, outputs = final_output)
    
    #compile and return model
    model.compile(optimizer = Nadam(learning_rate = lr), loss='poisson', metrics=['mean_squared_error', 'poisson'])
    return model

In [6]:
# -------------------------- fit a NAM to training data --------------------------

# hyperparameter tuning
tuner = kt.RandomSearch(
    build_NAM,
    objective = 'val_loss',
    max_trials = 5,  # Increased trials due to additional hyperparameters
    directory = "hyperparameter_tuning_NAM",
    seed = 2025 # for reproducibility
)

# regularization
es = EarlyStopping(patience = 10, restore_best_weights = True, verbose = 0)

# training data need to be split into different arrays, each correponds to input for a particular subnet
X_train_split = []
for i in range(len(feature_names)):
    X_train_split.append(train_NAM[:, i])

# search for the best model
tuner.search(X_train_split, y_train,
            epochs = 100,  
            batch_size = 1000, 
            validation_split = 0.2,
            callbacks = [es])

# get the best model
model_NAM = tuner.get_best_models()[0]
tuner.results_summary(1)

INFO:tensorflow:Reloading Tuner from hyperparameter_tuning_NAM\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit
Results summary
Results in hyperparameter_tuning_NAM\untitled_project
Showing 1 best trials
<keras_tuner.engine.objective.Objective object at 0x00000264938FAF70>
Trial summary
Hyperparameters:
num_layers: 3
units_main: 20
units_interaction: 25
activation: swish
dropout_rate: 0.4
embedding_dimension: 4
learning_rate: 0.0010799944024561447
Score: 0.2021609991788864


In [7]:
#-------------------------- evaluate on testing data --------------------------

# split testing data into different sets
X_test_split = []
for i in range(len(feature_names)):
    X_test_split.append(test_NAM[:, i])

# evaluation
nam_poisson_loss, nam_mse, _ = model_NAM.evaluate(X_test_split, y_test)
print(f"NAM test set metrics ~ Poisson loss: {nam_poisson_loss:.4f}, MSE: {nam_mse:.4f}")

NAM test set metrics ~ Poisson loss: 0.2019, MSE: 0.0552


In [8]:
############################ 3. NAM - multitask ############################

'''loss function for multitask learning - assume Gamma distribution for claim amount'''
def multitask_loss(y_true, y_pred):
    claim_freq = y_pred[:,0]
    gamma_scale = y_pred[:,1]
    gamma_shape = y_pred[:,2]

    poisson_loss = Poisson()(y_true[:,0], claim_freq)
    
    # Calculating Gamma loss
    epsilon = 1e-7  # to avoid log(0)
    gamma_loss = -(
        (gamma_shape - 1) * tf.math.log(y_true[:, 1]+ epsilon) 
        - y_true[:, 1] / gamma_scale 
        - gamma_shape * tf.math.log(gamma_scale + epsilon)
        - tf.math.lgamma(gamma_shape + epsilon)
    )
    
    sev_weights = tf.where(y_true[:,0] > 0, 0.5, 0.0) # weighting factor for claim severity

    # Weighted average loss
    total_loss = (1 - sev_weights) * poisson_loss + sev_weights * gamma_loss
    return tf.reduce_mean(total_loss)

'''calculate the metric of claim frequency'''
def custom_mse(y_true, y_pred):
    return tf.keras.metrics.mean_squared_error(y_true[:, 0], y_pred[:, 0])

def custom_poisson(y_true, y_pred):
    return Poisson()(y_true[:,0], y_pred[:,0])

'''create multi-task model'''
def build_multitask_NAM(hp):
    inputs = []  # Store input layers
    interaction_inputs = [] # input for modeling pairwise interaction effect
    sub_outputs = []  # Outputs from each subnet

    # hyperparameters shared across all subnets
    num_layers = hp.Int('num_layers', 2, 4)
    units_main = hp.Int('units_main', 10, 25, step = 5) # number of units per layer for main effect
    units_interaction = hp.Int('units_interaction', 15, 30, step = 5) # number of units per layer for interaction effect
    activation = hp.Choice('activation', ['relu', 'leaky_relu', 'swish', 'gelu'])
    dropout_rate = hp.Float('dropout_rate', 0.2, 0.5, step = 0.1)
    embedding_dim = hp.Int("embedding_dimension", 2, 4) # embedding dimension for categorical variables
    lr = hp.Float('learning_rate', min_value = 1e-4, max_value = 1e-2, sampling = 'log') # learning rate for optimizer

    # main effect
    for name in feature_names:
        input_layer = Input(shape = (1,), name = name)
        inputs.append(input_layer)
        
        if name == "Exposure":  # Direct use without a subnet
            exposure_input = input_layer
        elif name in categorical_var:
            # categorical variables will pass through an embedding layer
            embed_layer = Embedding(input_dim = claim_final[name].nunique(), 
                        output_dim = embedding_dim, 
                        name = f"embed_{name}")(input_layer)
            embed_layer_reshape = Reshape(target_shape = (embedding_dim,), name = f"reshape_{name}")(embed_layer)
            interaction_inputs.append(embed_layer_reshape)
            subnet = create_subnet(num_layers, units_main, activation, dropout_rate)
            sub_output = subnet(embed_layer_reshape)
            sub_outputs.append(sub_output)
        else:
            interaction_inputs.append(input_layer)
            subnet = create_subnet(num_layers, units_main, activation, dropout_rate)
            sub_output = subnet(input_layer)
            sub_outputs.append(sub_output)
    
    # interaction effect
    for input_1, input_2 in combinations(interaction_inputs, 2): # we don't include Exposure
        # for categorical input
        interaction_input = Concatenate()([input_1, input_2])
        interaction_subnet = create_subnet(num_layers, units_interaction, activation, dropout_rate)
        interaction_output = interaction_subnet(interaction_input)
        sub_outputs.append(interaction_output)

    # combine subnets' outputs
    subnet_output = Concatenate()(sub_outputs)
    
    # Define separate output neurons for claim frequency, gamma scale, and gamma shape
    claim_freq_output = Dense(1, activation = "exponential", name ='claim_freq')(subnet_output)
    gamma_scale_output = Dense(1, activation = "exponential", name = 'gamma_scale')(subnet_output)
    gamma_shape_output = Dense(1, activation = "exponential", name ='gamma_shape')(subnet_output)

    # multiply claim frequency with exposure
    claim_freq_adjusted = Multiply()([exposure_input, claim_freq_output])

    # concatenate adjusted claim frequency with gamma parameters
    final_output = Concatenate()([claim_freq_adjusted, gamma_scale_output, gamma_shape_output])

    model = Model(inputs = inputs, outputs = final_output)
    
    #compile and return model
    model.compile(optimizer = Nadam(learning_rate = lr), 
                  loss = multitask_loss, 
                   metrics=[custom_mse, custom_poisson])
    return model

In [9]:
# -------------------------- tuning and fit the model to training data --------------------------
# hyperparameter tuning
tuner = kt.RandomSearch(
    build_multitask_NAM,
    objective = 'val_loss',
    max_trials = 5,  # Increased trials due to additional hyperparameters
    directory = "hyperparameter_tuning_multitask_NAM",
    seed = 2025 # for reproducibility
)

# regularization
es = EarlyStopping(patience = 3, restore_best_weights = True, verbose = 0)

# search for the best model
tuner.search(X_train_split, y_train_mult,
            epochs = 100,  
            batch_size = 1000, 
            validation_split = 0.2,
            callbacks = [es])

# get the best model
model_multitask_NAM = tuner.get_best_models()[0]
tuner.results_summary(1)

INFO:tensorflow:Reloading Tuner from hyperparameter_tuning_multitask_NAM\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit
Results summary
Results in hyperparameter_tuning_multitask_NAM\untitled_project
Showing 1 best trials
<keras_tuner.engine.objective.Objective object at 0x00000264938FAB50>
Trial summary
Hyperparameters:
num_layers: 3
units_main: 20
units_interaction: 20
activation: swish
dropout_rate: 0.30000000000000004
embedding_dimension: 2
learning_rate: 0.007836255783976532
Score: 0.29096662998199463


In [10]:
#-------------------------- evaluate on testing data --------------------------
_, nam_multitask_mse, nam_multitask_poisson_loss, = model_multitask_NAM.evaluate(X_test_split, y_test_mult)
print(f"NAM-multitask test set metrics ~ Poisson loss: {nam_multitask_poisson_loss:.4f}, MSE: {nam_multitask_mse:.4f}")

NAM-multitask test set metrics ~ Poisson loss: 0.2035, MSE: 0.0555


In [11]:
############################ 4. GLM ############################

#-------------------------- preprocessing --------------------------

# define transformer
ct = make_column_transformer(
    ("passthrough", ["Exposure"]),
    (OrdinalEncoder(), ["Area", "VehGas"]),
    (OneHotEncoder(), ["VehBrand", "Region"]),
    remainder = StandardScaler(),
    verbose_feature_names_out = False
)

# fit & transform
train = ct.fit_transform(X_train).toarray()
test = ct.transform(X_test).toarray()
feature_names = ct.get_feature_names_out()  # get the columns' names


'''we want to use dummy encoding for VehBrand and Region so 2 columns need
to be removed from both training and test data. These 2 columns are the reference
levels for VehBrand and Region. Choose B12 for VehBrand and Centre for Region.'''

# separate the offset term (or exposure)
offset_train = train[:, 0]
offset_test = test[:, 0]

# index for reference level in feature_names
ref_index = [np.where(feature_names == "VehBrand_B12")[0][0],
             np.where(feature_names == "Region_Centre")[0][0]]
ref_index.append(0) # Exposure

# remove reference levels
train_dummy = np.delete(train, ref_index, axis = 1)
test_dummy = np.delete(test, ref_index, axis = 1)
feature_dummy = [feature for i, feature in enumerate(feature_names) if i not in ref_index]

# add a constant to the model (intercept)
train_dummy = sm.add_constant(train_dummy)
test_dummy = sm.add_constant(test_dummy)

In [12]:
train_dummy

array([[ 1.        ,  4.        ,  1.        , ...,  0.24853382,
        -0.62435238,  0.15282861],
       [ 1.        ,  1.        ,  0.        , ..., -0.24667445,
        -0.62435238, -0.43372675],
       [ 1.        ,  4.        ,  0.        , ...,  0.53150997,
        -0.3689211 ,  0.07786865],
       ...,
       [ 1.        ,  3.        ,  1.        , ..., -1.59081117,
         2.24924957, -0.04857896],
       [ 1.        ,  1.        ,  0.        , ...,  0.74374208,
         0.78051968, -0.43675543],
       [ 1.        ,  3.        ,  1.        , ...,  0.46076593,
        -0.62435238, -0.26588711]])

In [13]:
#-------------------------- fit a GLM model --------------------------
model_GLM = sm.GLM(y_train, train_dummy, family = sm.families.Poisson(), offset = offset_train)
results = model_GLM.fit()
# results.summary()

In [14]:
train_dummy

array([[ 1.        ,  4.        ,  1.        , ...,  0.24853382,
        -0.62435238,  0.15282861],
       [ 1.        ,  1.        ,  0.        , ..., -0.24667445,
        -0.62435238, -0.43372675],
       [ 1.        ,  4.        ,  0.        , ...,  0.53150997,
        -0.3689211 ,  0.07786865],
       ...,
       [ 1.        ,  3.        ,  1.        , ..., -1.59081117,
         2.24924957, -0.04857896],
       [ 1.        ,  1.        ,  0.        , ...,  0.74374208,
         0.78051968, -0.43675543],
       [ 1.        ,  3.        ,  1.        , ...,  0.46076593,
        -0.62435238, -0.26588711]])

In [15]:
#-------------------------- evaluation --------------------------
GLM_pred = results.predict(test_dummy, offset = offset_test)
GLM_mse = mse(y_test, GLM_pred)
print(GLM_mse)

0.05560565395994129


In [16]:
# Calculate the Poisson loss for the GLM model which Keras uses
glm_poisson_loss = keras.losses.Poisson()(y_test, GLM_pred).numpy()
glm_poisson_loss

0.20274714124657348

In [17]:
############################ 5. NEURAL NETWORKS ############################

#-------------------------- preprocessing --------------------------
ct_NN = make_column_transformer(
    ("passthrough", ["Exposure"]),
    (OrdinalEncoder(), ["Area", "VehGas", "VehBrand", "Region"]),
    remainder = StandardScaler(),
    verbose_feature_names_out = False)


# transform the data
train_NN = ct_NN.fit_transform(X_train)
test_NN = ct_NN.transform(X_test)
feature_names_NN = ct_NN.get_feature_names_out()  # get the columns' names


# separate exposure column
exposure_index = np.where(feature_names_NN == "Exposure")[0][0] # index of the Exposure column
train_exposure = train_NN[:,exposure_index]
test_exposure = test_NN[:,exposure_index]


# separate multi-level categorical columns
categorical_var = ["Area", "VehBrand", "Region"]
train_cat = []
test_cat = []
cat_index = []
for i in range(len(categorical_var)):
    index = np.where(feature_names_NN == categorical_var[i])[0][0]
    train_cat.append(train_NN[:,index])
    test_cat.append(test_NN[:,index])
    cat_index.append(index)

                    
# drop exposure and nominal categorical columns
train_others = np.delete(train_NN, [exposure_index] + cat_index, axis = 1)
test_others = np.delete(test_NN, [exposure_index] + cat_index, axis = 1)

In [18]:
#-------------------------- define model architecture --------------------------

'''Define model architecture'''
def build_NN(hp):
    
    exposure = Input(shape=(1,), name = "exposure") # exposure
    cat_inputs = [] # input layers for categorical features
    embedding_layers = [] # embedding layers
    embedding_dim = hp.Int('embedding_dim', 2, 4, step = 1)
    for cat_column in categorical_var:

        # input layers
        cat_input = Input(shape = (1,), name = f"input_{cat_column}")  # assuming categorical vars are single integers
        cat_inputs.append(cat_input)

        # embedding layers
        embed_layer = Embedding(input_dim = claim_final[cat_column].nunique(), 
                                output_dim = embedding_dim, 
                                name = f"embed_{cat_column}")(cat_input)
        embed_layer_reshape = Reshape(target_shape = (embedding_dim,), name = f"reshape_{cat_column}")(embed_layer)

        embedding_layers.append(embed_layer_reshape)
        
    other_inputs = Input(shape = train_others.shape[1:])
    inputs = Concatenate(name = "combined_input")(embedding_layers + [other_inputs])

    # dense layers
    x = inputs
    activation = hp.Choice('activation', ['relu', 'leaky_relu', 'swish', 'gelu'])
    dropout_rate = hp.Float(f'dropout', min_value = 0.2, max_value = 0.5, step = 0.1)
    for i in range(hp.Int('num_dense_layers', 2, 5, step = 2)):
        num_units = hp.Int(f'num_units_{i}', 32, 128, step = 16)
        x = Dense(units = num_units, activation = activation)(x)
        x = Dropout(dropout_rate)(x)
    lambda_ = Dense(1, "exponential")(x)

    # final output
    out = Multiply()([lambda_, exposure])
    model = Model([exposure] + cat_inputs + [other_inputs], out)

    # optimizer
    lr = hp.Float('learning_rate', min_value = 1e-4, max_value = 1e-2, sampling = 'log')
    optimizer = Nadam(learning_rate = lr)
    
    # build model
    model.compile(optimizer = optimizer,
              loss = "poisson", 
              metrics = ['mean_squared_error', 'poisson'])
    
    return model

In [19]:
#-------------------------- tune and fit fit a deep NN --------------------------
# tuner
tuner = kt.RandomSearch(
    build_NN,
    objective = 'val_loss',
    max_trials = 5,  # Increased trials due to additional hyperparameters
    directory = "hyperparameter_tuning_NN",
    seed = 2025 # for reproducibility
)

# regularization
es = EarlyStopping(patience = 10, restore_best_weights = True, verbose = 0)

# search for the best model
tuner.search([train_exposure, train_cat, train_others], y_train,
            epochs = 500,  
            batch_size = 1000, 
            validation_split = 0.2,
            callbacks = [es])

# get the best model
model_NN = tuner.get_best_models()[0]
tuner.results_summary(1)

INFO:tensorflow:Reloading Tuner from hyperparameter_tuning_NN\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit
Results summary
Results in hyperparameter_tuning_NN\untitled_project
Showing 1 best trials
<keras_tuner.engine.objective.Objective object at 0x0000026496725C70>
Trial summary
Hyperparameters:
embedding_dim: 3
activation: relu
dropout: 0.2
num_dense_layers: 2
num_units_0: 112
num_units_1: 64
learning_rate: 0.0021291840426046296
Score: 0.2021438479423523


In [20]:
#-------------------------- plot model --------------------------
plot_model(model_NN, show_layer_names = True, show_shapes = True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [21]:
#-------------------------- evaluate on test set --------------------------
nn_poisson_loss, nn_mse, _ = model_NN.evaluate([test_exposure, test_cat, test_others], y_test)
print(f"NN test set metrics ~ Poisson loss: {nn_poisson_loss:.4f}, MSE: {nn_mse:.4f}")

NN test set metrics ~ Poisson loss: 0.2017, MSE: 0.0551


In [22]:
############################ 6. NAM - sparse ############################

#-------------------------- define functions to create NAM and subnets --------------------------
'''create subnetwork for each feature/group of features'''
def create_subnet(num_layers, units_per_layer, activation, dropout_rate):
    """create a subnet with configurable layers and neurons."""
    model = Sequential()
    for _ in range(num_layers):
        model.add(Dense(units_per_layer, activation = activation))
        model.add(Dropout(dropout_rate, seed = 2000))
    model.add(Dense(1))
    model.add(BatchNormalization())
    return model


'''create NAM'''
def build_NAM_sparse(hp):
    inputs = []  # Store input layers
    interaction_inputs = [] # input for modeling pairwise interaction effect
    sub_outputs = []  # Outputs from each subnet

    # hyperparameters shared across all subnets
    num_layers = hp.Int('num_layers', 2, 4)
    units_main = hp.Int('units_main', 10, 25, step = 5) # number of units per layer for main effect
    units_interaction = hp.Int('units_interaction', 15, 30, step = 5) # number of units per layer for interaction effect
    activation = hp.Choice('activation', ['relu', 'leaky_relu', 'swish', 'gelu'])
    dropout_rate = hp.Float('dropout_rate', 0.2, 0.5, step = 0.1)
    embedding_dim = hp.Int("embedding_dimension", 2, 4) # embedding dimension for categorical variables
    lr = hp.Float('learning_rate', min_value = 1e-4, max_value = 1e-2, sampling = 'log') # learning rate for optimizer
    lambda_val = hp.Float('lambda', min_value = 1e-7, max_value = 1e-3, sampling = 'log')

    # main effect
    for name in feature_names_NN:
        input_layer = Input(shape = (1,), name = name)
        inputs.append(input_layer)
        
        if name == "Exposure":  # Direct use without a subnet
            exposure_input = input_layer
        elif name in categorical_var:
            # categorical variables will pass through an embedding layer
            embed_layer = Embedding(input_dim = claim_final[name].nunique(), 
                        output_dim = embedding_dim, 
                        name = f"embed_{name}")(input_layer)
            embed_layer_reshape = Reshape(target_shape = (embedding_dim,), name = f"reshape_{name}")(embed_layer)
            interaction_inputs.append(embed_layer_reshape)
            subnet = create_subnet(num_layers, units_main, activation, dropout_rate)
            sub_output = subnet(embed_layer_reshape)
            sub_outputs.append(sub_output)
        else:
            interaction_inputs.append(input_layer)
            subnet = create_subnet(num_layers, units_main, activation, dropout_rate)
            sub_output = subnet(input_layer)
            sub_outputs.append(sub_output)
    
    # interaction effect
    for input_1, input_2 in combinations(interaction_inputs, 2): # we don't include Exposure
        # for categorical input
        interaction_input = Concatenate()([input_1, input_2])
        interaction_subnet = create_subnet(num_layers, units_interaction, activation, dropout_rate)
        interaction_output = interaction_subnet(interaction_input)
        sub_outputs.append(interaction_output)

    # combine subnets' outputs
    subnet_output = Concatenate()(sub_outputs)
    final_dense_layer = Dense(1, activation="exponential")
    output_layer = final_dense_layer(subnet_output)

    # multiply output with exposure
    final_output = Multiply()([exposure_input, output_layer])
    model = Model(inputs = inputs, outputs = final_output)
    
    def objective_function(y_true, y_pred):
        base_loss = tf.keras.losses.poisson(y_true, y_pred)
        regularization = lambda_val * tf.reduce_sum(tf.abs(final_dense_layer.kernel))
        return base_loss + regularization

    #compile and return model
    model.compile(optimizer = Nadam(learning_rate = lr), loss = objective_function, metrics=['mean_squared_error', 'poisson'])
    return model

In [23]:
# -------------------------- fit a NAM sparse to training data --------------------------

# hyperparameter tuning
tuner_NAM_sparse = kt.RandomSearch(
    build_NAM_sparse,
    objective = 'val_loss',
    max_trials = 5,  # Increased trials due to additional hyperparameters
    directory = "hyperparameter_tuning_NAM_sparse",
    seed = 2025 # for reproducibility
)

# regularization
es = EarlyStopping(patience = 10, restore_best_weights = True, verbose = 0)

# training data need to be split into different arrays, each correponds to input for a particular subnet
X_train_split = []
for i in range(len(feature_names_NN)):
    X_train_split.append(train_NAM[:, i])

# search for the best model
tuner_NAM_sparse.search(X_train_split, y_train,
            epochs = 100,  
            batch_size = 2000, 
            validation_split = 0.2,
            callbacks = [es])

# get the best model
NAM_sparse = tuner_NAM_sparse.get_best_models()[0]
tuner_NAM_sparse.results_summary(1)

INFO:tensorflow:Reloading Tuner from hyperparameter_tuning_NAM_sparse\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit
Results summary
Results in hyperparameter_tuning_NAM_sparse\untitled_project
Showing 1 best trials
<keras_tuner.engine.objective.Objective object at 0x0000026493378580>
Trial summary
Hyperparameters:
num_layers: 3
units_main: 10
units_interaction: 30
activation: gelu
dropout_rate: 0.2
embedding_dimension: 2
learning_rate: 0.008079775183819825
lambda: 3.369139883399427e-06
Score: 0.20280131697654724


In [24]:
#-------------------------- evaluate on testing data --------------------------
# split testing data into different sets
X_test_split = []
for i in range(len(feature_names_NN)):
    X_test_split.append(test_NAM[:, i])

# evaluation
_, nam_sparse_mse, nam_sparse_poisson_loss = NAM_sparse.evaluate(X_test_split, y_test)
print(f"NAM test set metrics ~ Poisson loss: {nam_sparse_poisson_loss:.4f}, MSE: {nam_sparse_mse:.4f}")

NAM test set metrics ~ Poisson loss: 0.2027, MSE: 0.0552


In [25]:
NAM_sparse.get_weights()[-2]

array([[ 0.06982408],
       [ 0.02478735],
       [ 0.40121156],
       [-0.00322605],
       [ 0.0071771 ],
       [-0.03222786],
       [-0.04049305],
       [-0.11454726],
       [-0.00348934],
       [ 0.05488854],
       [-1.0184373 ],
       [-0.02394445],
       [ 0.06990471],
       [ 0.24423312],
       [-0.08087277],
       [ 0.2297778 ],
       [-0.06123446],
       [-0.09661167],
       [ 0.00657442],
       [ 0.06168557],
       [-0.1518234 ],
       [-0.0592466 ],
       [-0.2413681 ],
       [ 0.0065154 ],
       [-0.14535318],
       [-0.17051622],
       [ 0.3123775 ],
       [-0.07521696],
       [-0.17325313],
       [ 0.06939006],
       [ 0.07103633],
       [ 0.06774147],
       [-0.10647857],
       [-0.13080256],
       [-0.06830701],
       [-0.14871813],
       [-0.07896812],
       [ 0.04126915],
       [ 0.04356374],
       [-0.06790996],
       [ 0.13342215],
       [ 0.00928518],
       [ 0.13726951],
       [ 0.01560248],
       [ 0.04981403]], dtype=flo

In [26]:
############################ 7. EBM ############################

#-------------------------- fit the model --------------------------
# # prepare data
# train_EBM = train_dummy[:,1:]
# test_EBM = test_dummy[:,1:]

# fit the model
model_EBM = ExplainableBoostingClassifier()
model_EBM.fit(train_NN, y_train)


  warn(


In [27]:
#-------------------------- evaluate on testing data --------------------------

# Make predictions
y_pred = model_EBM.predict(test_NN)

# Calculate Mean Squared Error
ebm_mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {ebm_mse}')

# Convert y_test and y_pred to tensors for TensorFlow Poisson loss
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)
y_pred_tensor = tf.convert_to_tensor(y_pred, dtype=tf.float32)

# Calculate Poisson loss using TensorFlow
poisson_loss = Poisson()
ebm_poisson_loss = poisson_loss(y_test_tensor, y_pred_tensor).numpy()
print(f'Poisson Loss (TensorFlow): {ebm_poisson_loss}')

Mean Squared Error: 0.05923858856289933
Poisson Loss (TensorFlow): 0.8562985062599182


In [45]:
############################ 8. CANN ############################

#-------------------------- fit the model --------------------------
'''Define model architecture'''
def build_CANN(hp):
    
    exposure = Input(shape=(1,), name = "exposure") # exposure
    cat_inputs = [] # input layers for categorical features
    embedding_layers = [] # embedding layers
    embedding_dim = hp.Int('embedding_dim', 2, 4, step = 1)
    for cat_column in categorical_var:

        # input layers
        cat_input = Input(shape = (1,), name = f"input_{cat_column}")  # assuming categorical vars are single integers
        cat_inputs.append(cat_input)

        # embedding layers
        embed_layer = Embedding(input_dim = claim_final[cat_column].nunique(), 
                                output_dim = embedding_dim, 
                                name = f"embed_{cat_column}")(cat_input)
        embed_layer_reshape = Reshape(target_shape = (embedding_dim,), name = f"reshape_{cat_column}")(embed_layer)

        embedding_layers.append(embed_layer_reshape)
        
    other_inputs = Input(shape = train_others.shape[1:])
    inputs = Concatenate(name = "combined_input")(embedding_layers + [other_inputs])

    # dense layers
    x = inputs
    activation = hp.Choice('activation', ['relu', 'leaky_relu', 'swish', 'gelu'])
    dropout_rate = hp.Float(f'dropout', min_value = 0.2, max_value = 0.5, step = 0.1)
    for i in range(hp.Int('num_dense_layers', 2, 5, step = 2)):
        num_units = hp.Int(f'num_units_{i}', 32, 128, step = 16)
        x = Dense(units = num_units, activation = activation,
                  kernel_initializer = Zeros(),  
                  bias_initializer = Zeros())(x)
        x = Dropout(dropout_rate)(x)
    lambda_ = Dense(1, activation = 'exponential')(x)
    nn_out = Multiply()([lambda_, exposure])

    # glm skip connection
    glm_input = Input(shape = (1,), name = 'glm_input', dtype=tf.float32)
    glm_layer = Lambda(lambda x: x, name="glm_layer")(glm_input)

    # final output
    final_output = Multiply()([nn_out, glm_input])
    model = Model([exposure] + cat_inputs + [other_inputs] + [glm_input], final_output)

    # optimizer
    lr = hp.Float('learning_rate', min_value = 1e-4, max_value = 1e-2, sampling = 'log')
    optimizer = Nadam(learning_rate = lr)
    
    # build model
    model.compile(optimizer = optimizer,
              loss = "poisson", 
              metrics = ['mean_squared_error', 'poisson'])
    
    return model

In [1]:
#-------------------------- tune and fit fit a CANN--------------------------
# tuner
tuner = kt.RandomSearch(
    build_CANN,
    objective = 'val_loss',
    max_trials = 10,  # Increased trials due to additional hyperparameters
    directory = "hyperparameter_tuning_CANN",
    seed = 2025 # for reproducibility
)

# regularization
es = EarlyStopping(patience = 10, restore_best_weights = True, verbose = 0)

train_glm = results.predict(train_dummy, offset = offset_train)

# search for the best model
tuner.search([train_exposure, train_cat, train_others, train_glm], y_train,
            epochs = 500,  
            batch_size = 1000, 
            validation_split = 0.2,
            callbacks = [es])

# get the best model
model_CANN = tuner.get_best_models()[0]
tuner.results_summary(1)

In [None]:
#-------------------------- evaluate on test set --------------------------
test_glm = results.predict(test_dummy, offset = offset_test)
cann_poisson_loss, cann_mse, _ = model_CANN.evaluate([test_exposure, test_cat, test_others, test_glm], y_test)
print(f"CANN test set metrics ~ Poisson loss: {cann_poisson_loss:.4f}, MSE: {cann_mse:.4f}")

CANN test set metrics ~ Poisson loss: 0.2174, MSE: 0.0564


In [None]:
# Print all the results
print(f"GLM test set metrics \t Poisson loss: {glm_poisson_loss:.4f} \t MSE: {GLM_mse:.4f}")
print(f"NAM test set metrics \t Poisson loss: {nam_poisson_loss:.4f} \t MSE: {nam_mse:.4f}")
print(f"NAM multitask test set metrics \t Poisson loss: {nam_multitask_poisson_loss:.4f} \t MSE: {nam_multitask_mse:.4f}")
print(f"NAM sparse metrics \t Poisson loss: {nam_sparse_poisson_loss:.4f} \t MSE: {nam_sparse_mse:.4f}")
print(f"NN test set metrics \t Poisson loss: {nn_poisson_loss:.4f} \t MSE: {nn_mse:.4f}")
print(f"EBM test set metrics \t Poisson loss: {ebm_poisson_loss:.4f} \t MSE: {ebm_mse:.4f}")
print(f"CANN test set metrics \t Poisson loss: {cann_poisson_loss:.4f} \t MSE: {cann_mse:.4f}")

GLM test set metrics 	 Poisson loss: 0.2027 	 MSE: 0.0556
NAM test set metrics 	 Poisson loss: 0.2019 	 MSE: 0.0552
NAM multitask test set metrics 	 Poisson loss: 0.2035 	 MSE: 0.0555
NAM sparse metrics 	 Poisson loss: 0.2027 	 MSE: 0.0552
NN test set metrics 	 Poisson loss: 0.2017 	 MSE: 0.0551
EBM test set metrics 	 Poisson loss: 0.8563 	 MSE: 0.0592
CANN test set metrics 	 Poisson loss: 0.2174 	 MSE: 0.0564
