In [None]:
# !pip install scikit-learn xgboost shap

In [None]:
# General & Preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Regression & Classification Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb

# Metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score

# Model Interpretation
import shap

# File: notebooks/04_modeling.ipynb (Python notebook code as Python script for reference)

# Step 1: Keep only the rows whose TotalClaims is strictly positive.
# .copy() detaches the subset from `df` so later edits do not touch the original frame.
df_claims = df.loc[df['TotalClaims'].gt(0)].copy()

# Step 2: Build the regression target and feature matrix (Model 1).
# Target is the claim amount; the listed columns (the target itself plus the
# ID and premium columns named below) are removed from the features.
y_reg = df_claims['TotalClaims']
X_reg = pd.get_dummies(
    df_claims.drop(
        columns=[
            'UnderwrittenCoverID',
            'PolicyID',
            'TotalClaims',
            'TotalPremium',
            'CalculatedPremiumPerTerm',
        ]
    ).fillna(0),  # missing values become 0 before encoding
    drop_first=True,  # drop one dummy level per encoded column
)

# Step 3: Hold out 20% of the rows for evaluation, with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Step 4: Linear Regression
# Fit a LinearRegression model on the training split, predict on the held-out
# split, and print RMSE and R^2 for those predictions.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases.
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

# Step 5: Random Forest Regressor
# Fit a RandomForestRegressor (seeded for reproducibility) on the same training
# split, then print RMSE and R^2 on the held-out split.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases.
print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("Random Forest R2:", r2_score(y_test, y_pred_rf))

# Step 6: Classification model - probability of claim
# Binary target stored on `df` as 'HasClaim': 1 where TotalClaims > 0, else 0.
df['HasClaim'] = df['TotalClaims'].gt(0).astype(int)
y_cls = df['HasClaim']

# Features: every remaining column after removing the listed ones (including
# TotalClaims and the HasClaim target derived from it); missing values are
# filled with 0 and the result is dummy-encoded.
X_cls = pd.get_dummies(
    df.drop(
        columns=[
            'UnderwrittenCoverID',
            'PolicyID',
            'TotalClaims',
            'HasClaim',
            'CalculatedPremiumPerTerm',
        ]
    ).fillna(0),
    drop_first=True,
)

# 80/20 split with the same fixed seed used for the regression split.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression
# Fit on the classification training split and print accuracy and F1 for the
# predicted labels on the held-out split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# max_iter is raised to 1000 to give the solver more iterations to converge.
logr = LogisticRegression(max_iter=1000)
logr.fit(X_train_c, y_train_c)

y_pred_cls = logr.predict(X_test_c)
print(
    "Logistic Regression Accuracy:",
    accuracy_score(y_test_c, y_pred_cls),
)
print(
    "Logistic Regression F1 Score:",
    f1_score(y_test_c, y_pred_cls),
)

# Step 7: SHAP feature importance
# Explain the fitted random forest regressor (`rf`) on the regression test
# split and draw the SHAP summary plot for those rows.
import shap

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(
    shap_values,
    X_test,
)
