In [17]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [18]:
# Load the processed frequency dataset that the claim-frequency GLM is fitted on.
fdf = pd.read_csv(filepath_or_buffer='./data/processed/frequency_data.csv')

In [19]:
# Display the column labels of the loaded frame so the fields referenced by the
# model formula (and the exposure column used for the offset) can be checked.
fdf.columns

Index(['policy_id', 'claim_id', 'exposure', 'claim_count', 'vehicle_age',
       'region', 'channel', 'sum_insured', 'policy_year'],
      dtype='object')

In [20]:
 # Fit a Poisson GLM for claim frequency with exposure as an offset 
# The offset is log(exposure), which is -inf at exposure == 0 and NaN below it.
# Fail fast with a clear message rather than fitting on a corrupted offset.
# Rows with missing exposure are not flagged here (NaN <= 0 is False).
nonpositive_exposure = fdf["exposure"] <= 0
if nonpositive_exposure.any():
    raise ValueError(
        f"{int(nonpositive_exposure.sum())} row(s) have non-positive exposure; "
        "log(exposure) cannot be used as a GLM offset for them."
    )

# Poisson family for claim_count; log(exposure) enters the linear predictor as
# an offset, i.e. a term whose coefficient is fixed rather than estimated.
# NOTE: `model` holds the fitted results object returned by .fit().
model = smf.glm(
    formula="claim_count ~ vehicle_age + region + channel + sum_insured",
    data=fdf,
    family=sm.families.Poisson(),
    offset=np.log(fdf["exposure"]),
).fit()
print(model.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:            claim_count   No. Observations:                44067
Model:                            GLM   Df Residuals:                    44055
Model Family:                 Poisson   Df Model:                           11
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -46953.
Date:                Sat, 27 Dec 2025   Deviance:                       3123.9
Time:                        11:26:15   Pearson chi2:                 3.76e+03
No. Iterations:                     4   Pseudo R-squ. (CS):          3.069e-05
Covariance Type:            nonrobust                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

In [21]:
# Predict claim frequency
# Store the fitted model's prediction for every row as a new column.
# NOTE(review): no offset is passed to predict(); statsmodels presumably falls
# back to an offset of 0 (exposure = 1) when new data is supplied, which would
# make this a claim rate per unit of exposure rather than an expected claim
# count for the row's actual exposure -- confirm against the GLM predict docs.
fdf['predicted_frequency'] = model.predict(exog=fdf)

In [22]:
# Preview the first five rows to eyeball the new predicted_frequency column.
fdf.head(n=5)

Unnamed: 0,policy_id,claim_id,exposure,claim_count,vehicle_age,region,channel,sum_insured,policy_year,predicted_frequency
0,24,1.0,0.966461,1,9,South,Direct,579900,2024,1.16827
1,32,2.0,1.0,1,2,Central,Online,2658400,2019,1.152253
2,46,3.0,1.0,1,10,East,Agent,836800,2024,1.164314
3,55,4.0,0.933607,2,5,West,Online,270800,2021,1.155222
4,55,5.0,0.933607,2,5,West,Online,270800,2021,1.155222


In [26]:
# Persist the frame (including predicted_frequency) without the row index.
fdf.to_csv(path_or_buf='./data/processed/frequency_output.csv', index=False)

In [25]:
# Turn the fitted coefficients into multiplicative relativities: with a log
# link, exp(coef) is the factor applied to the predicted mean per unit change
# in a term. The intercept (base level) is excluded; errors="ignore" keeps this
# working if no "Intercept" entry is present.
# NOTE(review): terms that enter the formula as plain numeric columns (e.g.
# sum_insured) presumably give a per-unit relativity, which is not directly
# comparable to the categorical level relativities on the same axis -- confirm.
coef = model.params.drop("Intercept", errors="ignore")
relativity = np.exp(coef)

# savefig does not create missing directories, so make sure the target exists.
relativity_plot_path = "./visuals/fre_relativity_plots.png"
os.makedirs(os.path.dirname(relativity_plot_path), exist_ok=True)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
relativity.sort_values().plot(kind="barh", ax=ax)
ax.axvline(1, linestyle="--")  # relativity of 1 == no effect vs. the base level
ax.set_title("GLM Risk Relativities")
ax.set_xlabel("Relativity")
ax.set_ylabel("Rating Factors")
fig.tight_layout()
fig.savefig(relativity_plot_path, dpi=300)
plt.close(fig)  # the figure is only written to disk, not shown inline
