### Data Preprocessing

In [1]:
import os
import pandas as pd

# Location of the raw insurance CSV. A raw string keeps the Windows
# backslashes literal. NOTE: this is a machine-specific absolute path and
# must be adjusted when the notebook is run elsewhere.
file_path = r'C:\Users\user\Desktop\insurance-risk-analytics\notebooks\data\Insurance_dataset.csv'

# Fail fast when the file is missing: merely printing a hint would leave
# `df` undefined and every later cell would die with a confusing NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"File not found: {file_path}\n"
        "Check:\n"
        "- File exists at path\n"
        "- Correct extension (.csv/.parquet/.xlsx)\n"
        "- No typos in path"
    )

# low_memory=False makes pandas infer each column's dtype from the whole
# column instead of chunk by chunk.
# NOTE(review): the recorded output of this cell looks like the tail of a
# pandas warning pointing at the read_csv line; presumably a mixed-type
# DtypeWarning - confirm.
df = pd.read_csv(file_path, low_memory=False)
print("Data loaded successfully! Shape:", df.shape)

  df = pd.read_csv(file_path)


Data loaded successfully! Shape: (1000098, 52)


In [2]:
# Parse the transaction timestamp. errors='coerce' turns every value that
# does not match the expected 12-hour format into NaT instead of raising.
missing_before = df['TransactionMonth'].isna().sum()
df['TransactionMonth'] = pd.to_datetime(
    df['TransactionMonth'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

# Surface silent parse failures: coerced values would otherwise flow into the
# year/month features below as NaN without any indication.
n_failed = df['TransactionMonth'].isna().sum() - missing_before
if n_failed:
    print(
        f"Warning: {n_failed} of {len(df)} TransactionMonth values did not match "
        "the expected format and were set to NaT"
    )

# Calendar components used later for feature engineering
df['Transaction_Year'] = df['TransactionMonth'].dt.year
df['Transaction_Month'] = df['TransactionMonth'].dt.month

In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Largest number of distinct values an object column may have and still be
# one-hot encoded; columns above this are dropped rather than expanded.
MAX_DUMMY_LEVELS = 50

# Keep only rows where both the premium and the claim amount are known, so a
# missing claim is never median-imputed and then labelled as "no claim".
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the original.
df = df.dropna(subset=['TotalPremium', 'TotalClaims']).copy()

# Binary target for claim classification: 1 when the claim amount is positive
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# Fill missing numeric values with the column median
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing categorical values with an explicit 'Missing' level
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna('Missing')

# Feature engineering
# Vehicle age in years at transaction time; negative ages are floored at 0.
df['VehicleAge'] = (df['Transaction_Year'] - df['RegistrationYear']).clip(lower=0)
# 1 only for an explicit 'Y'. Any other value (including the 'Missing' level
# filled in above) becomes 0 instead of the NaN a dict-based map would produce;
# the original NewVehicle column is still one-hot encoded below.
# NOTE(review): assumes NewVehicle is coded 'Y'/'N' - confirm against the data.
df['IsNewVehicle'] = (df['NewVehicle'] == 'Y').astype(int)

# Drop identifier / raw date columns that are not used as features
drop_cols = ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'VehicleIntroDate']
df = df.drop(columns=drop_cols, errors='ignore')

# A column that is entirely NaN has a NaN median, so the median fill above
# cannot repair it; remove such columns so no NaN reaches the models.
df = df.dropna(axis=1, how='all')

# One-hot encode low-cardinality categoricals only; high-cardinality object
# columns would each expand into one dummy column per distinct value.
object_cols = df.select_dtypes(include='object').columns
high_card_cols = [col for col in object_cols if df[col].nunique() > MAX_DUMMY_LEVELS]
if high_card_cols:
    print(f"Dropping high-cardinality columns (> {MAX_DUMMY_LEVELS} levels): {high_card_cols}")
    df = df.drop(columns=high_card_cols)

# An explicit integer dtype keeps the dummy columns numeric regardless of the
# pandas version's default dummy dtype.
df = pd.get_dummies(df, drop_first=True, dtype='uint8')


### Split Data for Severity & Classification

In [4]:
# Both targets are removed from every feature matrix, so the severity and
# classification models are trained on exactly the same columns and either
# model can score the same feature frame later on.
target_cols = ['TotalClaims', 'HasClaim']

# --- Severity: only rows where TotalClaims > 0 ---
severity_df = df[df['TotalClaims'] > 0]
X_severity = severity_df.drop(columns=target_cols)
y_severity = severity_df['TotalClaims']

# --- Classification: all rows ---
X_class = df.drop(columns=target_cols)
y_class = df['HasClaim']

# Train-test splits (80/20, fixed seed for reproducibility)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_severity, y_severity, test_size=0.2, random_state=42
)
# stratify keeps the claim / no-claim proportion the same in both splits
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)


### Regression (Severity)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate severity regressors, keyed by the label printed in the report
models_reg = dict(
    Linear=LinearRegression(),
    RandomForest=RandomForestRegressor(random_state=42),
    XGBoost=XGBRegressor(random_state=42),
)

# Fit every candidate on the training split, then report hold-out RMSE and R²
for name in models_reg:
    model = models_reg[name]
    model.fit(X_train_reg, y_train_reg)
    preds = model.predict(X_test_reg)

    mse = mean_squared_error(y_test_reg, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_reg, preds)

    print(f"{name}: RMSE = {rmse:.2f}, R² = {r2:.2f}")


### Classification (HasClaim)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Candidate claim classifiers, keyed by the label printed in the report.
# n_jobs=-1 lets the forest train its trees on all available cores; the
# fixed random_state keeps the result reproducible.
models_cls = {
    'Logistic': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Fit each candidate and report hold-out precision/recall/F1 plus AUC-ROC
for name, model in models_cls.items():
    model.fit(X_train_cls, y_train_cls)
    preds = model.predict(X_test_cls)
    # Probability of the positive class (column 1) for the ranking metric
    proba = model.predict_proba(X_test_cls)[:, 1]
    # zero_division=0 reports 0.0 without a warning when a model never
    # predicts one of the classes (precision would otherwise be undefined).
    report = classification_report(y_test_cls, preds, zero_division=0)
    print(f"\n{name}:\n{report}")
    print(f"AUC-ROC: {roc_auc_score(y_test_cls, proba):.2f}")


### SHAP Interpretation

In [None]:
import shap

def _explain_and_plot(fitted_model, features):
    """Build a SHAP explainer for `fitted_model`, explain `features`, and draw the summary plot.

    Returns the explainer and the SHAP values it produced for `features`.
    """
    explainer = shap.Explainer(fitted_model)
    explanation = explainer(features)
    shap.summary_plot(explanation, features)
    return explainer, explanation


# Severity model: explain the XGBoost regressor on the regression test split
reg_best = models_reg['XGBoost']
explainer_reg, shap_values_reg = _explain_and_plot(reg_best, X_test_reg)

# Claim model: explain the XGBoost classifier on the classification test split
cls_best = models_cls['XGBoost']
explainer_cls, shap_values_cls = _explain_and_plot(cls_best, X_test_cls)


### Risk-Based Premium Formula

In [None]:
# Flat amount added to every expected loss as loading/margin
PREMIUM_LOADING = 1000

# Probability of a claim (positive class) for every policy row
claim_prob = cls_best.predict_proba(X_class)[:, 1]

# Score the severity model on exactly the columns it was trained on; df
# still holds every feature column, in the same row order as X_class.
severity_features = df[X_train_reg.columns]
# A regressor can return negative amounts; a claim cost cannot be negative,
# so floor the predictions at 0 before pricing.
claim_severity = np.clip(reg_best.predict(severity_features), 0, None)

# Risk-based premium = expected loss (probability x severity) + loading
df['RiskBasedPremium'] = (claim_prob * claim_severity) + PREMIUM_LOADING


### Preview Results

In [None]:
# Show the first 10 rows of the premium and claim columns side by side
preview_cols = ['TotalPremium', 'RiskBasedPremium', 'TotalClaims', 'HasClaim']
df.head(10)[preview_cols]
