In [4]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the claim-frequency data set
df = pd.read_csv("../data/freq_data.csv")

# Retain only rows whose exposure lies in the half-open interval (0, 1]
exposure_ok = df["Exposure"].gt(0) & df["Exposure"].le(1)
df = df.loc[exposure_ok].copy()

# Response variable: number of claims, stored as float
y = df["ClaimNb"].astype(float)

# Untransformed predictor columns
predictor_cols = [
    "Area",
    "VehPower",
    "VehAge",
    "DrivAge",
    "BonusMalus",
    "VehBrand",
    "VehGas",
    "Density",
    "Region",
]
X_raw = df[predictor_cols]

# Dummy-encode the non-numeric columns (first level of each is dropped as
# the reference), then force a float dtype and replace any NaN with 0.0
X = (
    pd.get_dummies(X_raw, drop_first=True)
    .astype(float)
    .fillna(0.0)
)

# Log of exposure, passed to the GLM as an offset
offset = np.log(df["Exposure"].astype(float))

# Preview the first rows of the design matrix, response and offset
X.head(), y.head(), offset[:5]



(   VehPower  VehAge  DrivAge  BonusMalus  Density  Area_B  Area_C  Area_D  \
 0       5.0     0.0     55.0        50.0   1217.0     0.0     0.0     1.0   
 1       5.0     0.0     55.0        50.0   1217.0     0.0     0.0     1.0   
 2       6.0     2.0     52.0        50.0     54.0     1.0     0.0     0.0   
 3       7.0     0.0     46.0        50.0     76.0     1.0     0.0     0.0   
 4       7.0     0.0     46.0        50.0     76.0     1.0     0.0     0.0   
 
    Area_E  Area_F  ...  Region_R53  Region_R54  Region_R72  Region_R73  \
 0     0.0     0.0  ...         0.0         0.0         0.0         0.0   
 1     0.0     0.0  ...         0.0         0.0         0.0         0.0   
 2     0.0     0.0  ...         0.0         0.0         0.0         0.0   
 3     0.0     0.0  ...         0.0         0.0         1.0         0.0   
 4     0.0     0.0  ...         0.0         0.0         1.0         0.0   
 
    Region_R74  Region_R82  Region_R83  Region_R91  Region_R93  Region_R94  
 

In [5]:
# Add intercept and fit Poisson GLM
# has_constant="add" appends a "const" column even if X already holds a
# constant column.
X_glm = sm.add_constant(X, has_constant="add")

# Poisson family with its default log link; log(exposure) enters as an
# offset, i.e. with a coefficient fixed at 1.
glm_model = sm.GLM(
    y,
    X_glm,
    family=sm.families.Poisson(),
    offset=offset
)

glm_results = glm_model.fit()

# Store the fitted expected claim counts on df so that the evaluation cells
# (error metrics, decile table) can read df["glm_pred"] after a fresh
# Restart & Run All. fittedvalues is the fitted mean on the response scale
# (offset included) and is indexed like y, so the assignment aligns rows
# by label.
df["glm_pred"] = glm_results.fittedvalues

glm_results.summary()


0,1,2,3
Dep. Variable:,ClaimNb,No. Observations:,676789.0
Model:,GLM,Df Residuals:,676746.0
Model Family:,Poisson,Df Model:,42.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-143060.0
Date:,"Sat, 29 Nov 2025",Deviance:,216910.0
Time:,19:02:44,Pearson chi2:,1790000.0
No. Iterations:,7,Pseudo R-squ. (CS):,0.01053
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.9572,0.047,-85.059,0.000,-4.048,-3.866
VehPower,0.0138,0.003,4.994,0.000,0.008,0.019
VehAge,-0.0388,0.001,-33.184,0.000,-0.041,-0.036
DrivAge,0.0065,0.000,16.565,0.000,0.006,0.007
BonusMalus,0.0224,0.000,72.927,0.000,0.022,0.023
Density,-1.447e-06,3.69e-06,-0.392,0.695,-8.68e-06,5.79e-06
Area_B,0.0520,0.022,2.391,0.017,0.009,0.095
Area_C,0.0860,0.018,4.762,0.000,0.051,0.121
Area_D,0.1844,0.019,9.528,0.000,0.146,0.222


In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Coerce the observed and predicted claim columns to float
for col in ("ClaimNb", "glm_pred"):
    df[col] = df[col].astype(float)

y_obs, y_hat = df["ClaimNb"], df["glm_pred"]

# Mean absolute error
mae = mean_absolute_error(y_obs, y_hat)

# Root mean squared error, taken as the square root of the MSE
mse = mean_squared_error(y_obs, y_hat)
rmse = np.sqrt(mse)

# Poisson deviance (mean per observation)
def poisson_deviance(y_true, y_pred, eps=1e-9):
    """Return the mean Poisson deviance per observation.

    Evaluates ``2 * mean(y * log(y / mu) - (y - mu))`` using the convention
    ``0 * log(0) = 0``, so a perfect prediction yields exactly ``0.0``.

    Parameters
    ----------
    y_true : array-like
        Observed values; converted to a float array.
    y_pred : array-like
        Predicted means; converted to a float array and floored at ``eps``
        so the logarithm and division stay finite for zero predictions.
    eps : float, default 1e-9
        Lower bound applied to ``y_pred``.

    Returns
    -------
    float
        Mean deviance over all observations. NaN if any ``y_true`` is
        negative or NaN, since the logarithm is undefined there.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.maximum(np.asarray(y_pred, dtype=float), eps)

    # Where y == 0 the term y * log(y / mu) is 0 by convention; substituting
    # a ratio of 1 there gives 0 * log(1) = 0 without perturbing the terms
    # where y > 0 (no epsilon is added to y_true).
    ratio = np.where(y_true == 0, 1.0, y_true / y_pred)

    return 2 * np.mean(y_true * np.log(ratio) - (y_true - y_pred))

# Mean per-observation Poisson deviance of the GLM predictions
poisson_dev = poisson_deviance(y_true=df["ClaimNb"], y_pred=df["glm_pred"])

# Show the three metrics together
(mae, rmse, poisson_dev)


(0.09888754487613945, 0.2390283060282645, 0.3204939503685755)

In [8]:
# Assign each row to one of ten quantile bins of the predicted value
# (integer codes instead of interval labels)
df["decile"] = pd.qcut(df["glm_pred"], q=10, labels=False)

# Per bin: mean observed claims, mean prediction and row count
decile_aggs = {
    "actual_mean": pd.NamedAgg(column="ClaimNb", aggfunc="mean"),
    "predicted_mean": pd.NamedAgg(column="glm_pred", aggfunc="mean"),
    "count": pd.NamedAgg(column="ClaimNb", aggfunc="size"),
}
decile_table = (
    df.groupby("decile", as_index=False)
    .agg(**decile_aggs)
    .sort_values(by="decile")
)

decile_table


Unnamed: 0,decile,actual_mean,predicted_mean,count
0,0,0.022636,0.00363,67679
1,1,0.027999,0.009147,67681
2,2,0.03527,0.017427,67677
3,3,0.042111,0.028806,67679
4,4,0.048139,0.041784,67679
5,5,0.049085,0.054366,67678
6,6,0.054803,0.065796,67679
7,7,0.06389,0.077833,67679
8,8,0.074765,0.092881,67679
9,9,0.113935,0.140962,67679
