Imports

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt

from scripts.utils import add_claim_flags

ImportError: Numba needs NumPy 2.2 or less. Got NumPy 2.3.

Load Data

In [None]:
# Tunable inputs for this cell, named so they are easy to find and change.
RAW_DATA_PATH = '../data/MachineLearningRating_v3.txt'
REFERENCE_YEAR = 2025  # year that VehicleAge is measured against
MODEL_COLUMNS = [
    'TotalClaims',
    'CalculatedPremiumPerTerm',
    'VehicleAge',
    'Gender',
    'Province',
    'HasClaim',
]

# Read the pipe-delimited export; malformed rows are skipped silently
# (on_bad_lines='skip'), so the row count may be lower than the file's.
df = add_claim_flags(
    pd.read_csv(
        RAW_DATA_PATH,
        sep='|',
        engine='python',
        on_bad_lines='skip',
    )
)

# Derive vehicle age from the registration year.
df['VehicleAge'] = REFERENCE_YEAR - df['RegistrationYear']

# Keep only the modelling columns and discard rows with any missing value.
df_model = df[MODEL_COLUMNS].dropna()

Prediction Model

In [None]:
# Claim-severity model: predict TotalClaims using only the policies that had a
# claim. The flag is cast to bool so the row mask works whether HasClaim is
# stored as True/False or as 0/1 (a non-boolean Series used as an indexer is
# interpreted as a list of column labels, not as a row mask).
df_claims = df_model[df_model['HasClaim'].astype(bool)]

# One-hot encode the categorical predictors. dtype=float keeps the design
# matrix purely numeric; recent pandas versions emit boolean dummies by default.
X = pd.get_dummies(
    df_claims[['VehicleAge', 'Gender', 'Province']],
    drop_first=True,
    dtype=float,
)
y = df_claims['TotalClaims']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, all seeded where the estimator is stochastic.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

# Fit each model and record hold-out metrics. `results` maps model name to
# {"model": fitted estimator, "rmse": float, "r2": float}.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated and later removed from scikit-learn,
    # whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)
    results[name] = {"model": model, "rmse": rmse, "r2": r2}
    print(f"{name} -> RMSE: {rmse:.2f}, R2: {r2:.2f}")


SHAP Interpretation for Best Model

In [None]:
# Select the best model by lowest hold-out RMSE rather than hard-coding a name,
# so the explanation always matches the section title ("Best Model").
best_name = min(results, key=lambda model_name: results[model_name]["rmse"])
best_model = results[best_name]["model"]
print(f"Explaining best model by RMSE: {best_name}")

# Explain predictions using SHAP. Passing the bound `predict` function (instead
# of the estimator object) keeps this model-agnostic, so it works whichever of
# the candidates was selected above. X_test doubles as the background data.
explainer = shap.Explainer(best_model.predict, X_test)
shap_values = explainer(X_test)

# Visualize per-feature contribution to the predictions.
shap.plots.beeswarm(shap_values)
