### Introduction to Machine Learning in Finance and Insurance (Spring 2024)
# Project 2 - Insurance Claim Prediction - Sandbox

In [1]:
# Import basic libraries
import numpy as np
import matplotlib.pyplot as plt

# Read a csv file using pandas

In [2]:
# Import libraries
# Pandas is a package used for data manipulation (e.g. dataframes, databases, etc)
import pandas as pd

In [3]:
# Read the claims table from disk into a pandas DataFrame.
# The file is parsed with ';' as the field separator and ',' as the decimal mark.
df = pd.read_csv(
    'freMTPL2freq.csv',
    sep=';',
    decimal=',',
)

FileNotFoundError: [Errno 2] No such file or directory: 'freMTPL2freq.csv'

In [None]:
# Inspect the first few rows of the dataframe
# (`head()` limits the output to the first five rows instead of rendering the whole table)
df.head()

# Pre-process dataset features

In [None]:
# Define the pre-processing function for VehAge
# Attention! This is just an example. For your project submission, you must modify this function according to instructions.

def pre_process_VehAge(x):
    """Bucket a vehicle age into two classes.

    Returns 0 when ``0 <= x < 6`` and 1 for every other value
    (this includes negative inputs and NaN).
    """
    return 0 if 0 <= x < 6 else 1

In [None]:
# Exposure is kept as its own column; it travels with the features through the split
Exposure = df['Exposure']

# Log-transform the numeric features
VehPower, DrivAge, BonusMalus = (
    np.log(df[column]) for column in ('VehPower', 'DrivAge', 'BonusMalus')
)

# Bucket VehAge with the pre-processing function, then one-hot encode the buckets
VehAge = pd.get_dummies(df['VehAge'].apply(pre_process_VehAge))

# Re-assemble the dataset by placing the transformed features side by side
# (column-wise concatenation, axis=1), with Exposure as the first column
X = np.float32(
    pd.concat([Exposure, VehPower, VehAge, DrivAge, BonusMalus], axis=1).values
)
# Target labels: claim frequency, i.e. number of claims divided by exposure
y = np.float32(df['ClaimNb'].values / Exposure.values)

# Attention! Since this is an example, we are keeping only some features of the original dataset.
# For your final submission, modify the code accordingly.
# Attention! For the moment, we also keep `Exposure` as the first feature in the dataset, because we want to be able
# to split it during the train-test split together with the rest of the dataset. We will then remove it from the dataset
# before training (see the function `get_train_test_split` below)

# Train a Poisson GLM

In [None]:
# Import libraries
# Poisson Regressor model
from sklearn import linear_model

# sklearn's functions for train-test split and data standardization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# losses
from sklearn.metrics import mean_absolute_error, mean_poisson_deviance, mean_squared_error

In [None]:
# Custom train-test split function to split and return the weights (i.e. feature Exposure)
def get_train_test_split(X, y, test_size=0.1, random_state=5):
    """Split the data into train/test parts and separate the exposure weights.

    Parameters
    ----------
    X : 2-D array
        Feature matrix whose first column (index 0) holds the weights (Exposure).
    y : array
        Targets aligned with the rows of ``X``.
    test_size : float or int, default 0.1
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 5
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test, w_train, w_test
        The feature matrices no longer contain the first column; it is returned
        separately as ``w_train`` / ``w_test``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column 0 carries the exposures: keep it as weights and drop it from the features
    w_train, X_train = X_train[:, 0], X_train[:, 1:]
    w_test, X_test = X_test[:, 0], X_test[:, 1:]

    return X_train, X_test, y_train, y_test, w_train, w_test

In [None]:
# Split into train/test parts; the exposure column comes back separately as weights
X_train, X_test, y_train, y_test, w_train, w_test = get_train_test_split(X, y)

# Standardize features: the scaler is fitted on the training part only
# and the same fitted transformation is then applied to both parts
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train Poisson GLM

# alpha=0 means the model is trained without regularization
glm = linear_model.PoissonRegressor(alpha=0.0)
# Passing Exposure as `sample_weight` makes the fit minimize the *weighted* Poisson deviance
glm.fit(X=X_train, y=y_train, sample_weight=w_train)

In [None]:
# Custom mean Poisson deviance to handle cases of zero y_true or zero y_pred
def weighted_poisson_deviance(y_true, y_pred, sample_weight=None):
    """Weighted mean Poisson deviance that stays finite for zero entries.

    Computes
    ``sum(w * 2 * (y_pred - y_true - y_true*log(y_pred) + y_true*log(y_true))) / sum(w)``
    where the logarithm of any non-positive entry is replaced by 0.

    Parameters
    ----------
    y_true, y_pred : array-like, 1-D, same length
        Observed and predicted values.
    sample_weight : array-like or None, default None
        Per-observation weights. ``None`` means every observation has weight 1
        (previously ``None`` raised a ``TypeError``).

    Returns
    -------
    float
        The weighted mean deviance.
    """
    # Accept lists / pandas objects as well as numpy arrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if sample_weight is None:
        sample_weight = np.ones(len(y_true))
    else:
        sample_weight = np.asarray(sample_weight)

    # log(0) is undefined: leave the log at 0 wherever the value is not positive
    log_y_true = np.zeros(len(y_true))
    true_positive = y_true > 0
    log_y_true[true_positive] = np.log(y_true[true_positive])

    log_y_pred = np.zeros(len(y_pred))
    pred_positive = y_pred > 0
    log_y_pred[pred_positive] = np.log(y_pred[pred_positive])

    loss = (1/np.sum(sample_weight))*np.sum(sample_weight * 2 * (y_pred - y_true - y_true * log_y_pred + y_true * log_y_true))

    return loss

# Function to print metrics
def print_metrics(y_true, y_pred, losses, losses_names, sample_weight=None):
    """Print one ``<name>: <value>`` line per loss function.

    Each callable in ``losses`` is invoked as
    ``loss(y_true, y_pred, sample_weight=sample_weight)`` and labelled with the
    entry of ``losses_names`` at the same position.
    """
    for position, loss_fn in enumerate(losses):
        label = losses_names[position] + ':'
        value = loss_fn(y_true, y_pred, sample_weight=sample_weight)
        print(label, value)


# Metrics to report and their display labels (same order in both lists)
losses_names = ['MAE', 'MSE', 'Poisson-Dev']
losses = [mean_absolute_error, mean_squared_error, weighted_poisson_deviance]

# GLM performance, weighted by exposure, on each part of the split
print('Training data set')
print_metrics(
    y_true=y_train,
    y_pred=glm.predict(X_train),
    losses=losses,
    losses_names=losses_names,
    sample_weight=w_train,
)
print('Test data set')
print_metrics(
    y_true=y_test,
    y_pred=glm.predict(X_test),
    losses=losses,
    losses_names=losses_names,
    sample_weight=w_test,
)

# Train a Poisson feedforward neural network

In [None]:
import keras

In [None]:
# Feed-forward network: two hidden ReLU layers with 10 units each, followed by a
# single output unit with exponential activation
model = keras.Sequential()
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(1, activation='exponential'))

lr = 0.01

# Train with Keras' Poisson loss.
# NOTE(review): per the Keras docs this loss is `y_pred - y_true * log(y_pred)`; it is not
# literally the Poisson deviance but should differ from it only by a factor of 2 and terms
# that do not depend on the prediction - confirm against the Keras reference.
model.compile(
    loss=keras.losses.Poisson(),
    optimizer=keras.optimizers.Adam(learning_rate=lr),
    weighted_metrics=[],
)

In [None]:
# Train with w_train (i.e. Exposure) as sample weights so that the *weighted* loss is minimized
# Attention! The weights are wrapped in a pandas dataframe because of a bug in Keras
# (https://github.com/keras-team/keras/issues/14877)
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=10000,
    sample_weight=pd.Series(w_train).to_frame(),
)

In [None]:
# Training loss per epoch, drawn on a log-scaled y-axis
plt.plot(history.history['loss'], 'b-')
plt.yscale('log')
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel('Epoch')
plt.ylabel('Training loss')
plt.title('Neural network training loss')
plt.show()

In [None]:
# Network predictions flattened to 1-D so they line up with y_test
# (the training-set predictions are left disabled)
#train_preds = model.predict(X_train).ravel()
test_preds = model.predict(X_test).ravel()

In [None]:
# Exposure-weighted metrics of the neural network (the training-set report is left disabled)
# print('Training data set')
# print_metrics(y_train, train_preds, losses, losses_names, sample_weight=w_train)
print('Test data set')
print_metrics(
    y_true=y_test,
    y_pred=test_preds,
    losses=losses,
    losses_names=losses_names,
    sample_weight=w_test,
)