In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

In [3]:
# Fetch OpenML dataset 41214 as a DataFrame (one row per policy, per the
# displayed columns), cast the policy id to int and use it as the index.
# Built as a single chain so re-running the cell always starts from a fresh
# download result rather than from a frame that was mutated in place.
df_freq = (
    fetch_openml(data_id=41214, as_frame=True)
    .data
    .assign(IDpol=lambda frame: frame["IDpol"].astype(int))
    .set_index("IDpol")
)
df_freq

Unnamed: 0_level_0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
IDpol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1.0,0.10000,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
3,1.0,0.77000,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
5,1.0,0.75000,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22
10,1.0,0.09000,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
11,1.0,0.84000,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
...,...,...,...,...,...,...,...,...,...,...,...
6114326,0.0,0.00274,E,4.0,0.0,54.0,50.0,B12,Regular,3317.0,R93
6114327,0.0,0.00274,E,4.0,0.0,41.0,95.0,B12,Regular,9850.0,R11
6114328,0.0,0.00274,D,6.0,2.0,45.0,50.0,B12,Diesel,1323.0,R82
6114329,0.0,0.00274,B,4.0,0.0,60.0,50.0,B12,Regular,95.0,R26


In [4]:
# Fetch OpenML dataset 41215 as a DataFrame, key it by integer policy id and
# collapse it to one row per policy by summing the remaining columns, so that
# it can be joined one-to-one onto the policy-level table.
df_sev = (
    fetch_openml(data_id=41215, as_frame=True)
    .data
    .assign(IDpol=lambda frame: frame["IDpol"].astype(int))
    .set_index("IDpol")
    .groupby(level="IDpol")
    .sum()
)

In [5]:
# Attach the per-policy summed claim amount to the policy table. The left join
# keeps every row of df_freq, so policies with no match in df_sev get NaN.
df = df_freq.join(df_sev, how="left")
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the df["ClaimAmount"] selection: under pandas Copy-on-Write that selection is
# a temporary object, so an in-place fill would leave the NaNs in df untouched.
df["ClaimAmount"] = df["ClaimAmount"].fillna(0)
# Claim count per unit of exposure.
df["ClaimFreq"] = df["ClaimNb"] / df["Exposure"]

In [6]:
# One-hot encode every object/category column; all other columns pass through.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
# dtype=int pins the indicator columns to 0/1 integers on every pandas version.
# Without it, pandas >= 2.0 emits bool indicators; a frame mixing bool and float
# columns converts to an object array, which the statsmodels GLM fitted on
# X_train further down does not accept.
df_dummy = pd.get_dummies(
    df,
    columns=categorical_columns,
    prefix=categorical_columns,
    dtype=int,
)
df_dummy

Unnamed: 0_level_0,ClaimNb,Exposure,VehPower,VehAge,DrivAge,BonusMalus,Density,ClaimAmount,ClaimFreq,Area_A,...,Region_R53,Region_R54,Region_R72,Region_R73,Region_R74,Region_R82,Region_R83,Region_R91,Region_R93,Region_R94
IDpol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.10000,5.0,0.0,55.0,50.0,1217.0,0.0,10.000000,0,...,0,0,0,0,0,1,0,0,0,0
3,1.0,0.77000,5.0,0.0,55.0,50.0,1217.0,0.0,1.298701,0,...,0,0,0,0,0,1,0,0,0,0
5,1.0,0.75000,6.0,2.0,52.0,50.0,54.0,0.0,1.333333,0,...,0,0,0,0,0,0,0,0,0,0
10,1.0,0.09000,7.0,0.0,46.0,50.0,76.0,0.0,11.111111,0,...,0,0,1,0,0,0,0,0,0,0
11,1.0,0.84000,7.0,0.0,46.0,50.0,76.0,0.0,1.190476,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6114326,0.0,0.00274,4.0,0.0,54.0,50.0,3317.0,0.0,0.000000,0,...,0,0,0,0,0,0,0,0,1,0
6114327,0.0,0.00274,4.0,0.0,41.0,95.0,9850.0,0.0,0.000000,0,...,0,0,0,0,0,0,0,0,0,0
6114328,0.0,0.00274,6.0,2.0,45.0,50.0,1323.0,0.0,0.000000,0,...,0,0,0,0,0,1,0,0,0,0
6114329,0.0,0.00274,4.0,0.0,60.0,50.0,95.0,0.0,0.000000,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Target is the raw claim count. Every claim-derived column is removed from the
# feature matrix so the target (or a function of it) cannot leak into X.
claim_columns = ['ClaimNb', 'ClaimAmount', 'ClaimFreq']
y = df_dummy['ClaimNb']
X = df_dummy.drop(columns=claim_columns)

# Seeded 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Learn the standardisation statistics from the training rows only, then apply
# the same transform to both splits so no test information reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Binary target for the first hurdle stage: 1 if the policy has any claim.
y_train_binary = y_train.gt(0).astype(int)

# Hurdle Model

## Stage 1: Binary Model (Logistic Regression)

In [9]:
# Stage 1 of the hurdle model: classify "any claim" vs "no claim" on the
# standardised features. max_iter is raised to 1000 iterations for the solver.
logit_model = LogisticRegression(max_iter=1000)
logit_model.fit(X=X_train_scaled, y=y_train_binary)

## Stage 2: Poisson Model (only policies with ClaimNb > 0 included)

In [13]:
# Stage 2 of the hurdle model: Poisson GLM for the claim count, fitted on the
# unscaled features of training policies that have at least one claim.
# NOTE(review): this is an ordinary (untruncated) Poisson family applied to
# strictly positive counts, and Exposure enters as a plain regressor rather
# than as an offset/exposure term - confirm both choices are intended.
has_claim = y_train > 0
poisson_model = sm.GLM(
    endog=y_train[has_claim],
    exog=X_train[has_claim],
    family=sm.families.Poisson(),
).fit()
print(poisson_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                ClaimNb   No. Observations:                27214
Model:                            GLM   Df Residuals:                    27170
Model Family:                 Poisson   Df Model:                           43
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -28401.
Date:                Sun, 11 Feb 2024   Deviance:                       1408.3
Time:                        22:50:57   Pearson chi2:                 2.13e+03
No. Iterations:                     4   Pseudo R-squ. (CS):          0.0005725
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Exposure           0.0127      0.019      0.

## Prediction

In [19]:
# Stage 1: which test policies does the classifier expect to have a claim?
y_test_pred_binary = logit_model.predict(X_test_scaled)
predicted_positive = y_test_pred_binary > 0

# Scaled rows of the predicted-positive policies (kept under its original name).
X_test_positive = X_test_scaled[predicted_positive]

# Stage 2: the Poisson model was fitted on unscaled features, so give it the
# matching rows of X_test directly. This replaces the former
# scaler.inverse_transform(X_test_positive) round trip, which only
# approximately recovers the raw values and raises in scikit-learn's input
# validation when the classifier flags no policy at all (0-row array).
if predicted_positive.any():
    X_test_positive_raw = X_test.loc[predicted_positive].to_numpy(dtype=float)
    y_test_pred_counts = np.asarray(poisson_model.predict(X_test_positive_raw))
else:
    y_test_pred_counts = np.empty(0)

# NOTE(review): np.ceil always rounds the predicted mean up (e.g. 1.05 -> 2);
# confirm this is preferred over rounding to the nearest integer.
y_test_pred_rounded = np.ceil(y_test_pred_counts)

# Combine both stages: 0 claims unless stage 1 predicted a claim, in which case
# the rounded stage-2 count is used.
y_pred_full = np.zeros(y_test.shape)
y_pred_full[predicted_positive] = y_test_pred_rounded

In [27]:
# Error of the combined hurdle prediction against the true test claim counts.
mae = mean_absolute_error(y_test, y_pred_full)
mse = mean_squared_error(y_test, y_pred_full)
print(f"MAE: {mae}")
# Label fixed: the original format string printed "MSE:, <value>" with a stray comma.
print(f"MSE: {mse}")

MAE: 0.05339852363148308
MSE:, 0.05965207259426414
