# PassportCard Insurance Claims Prediction: Model Development

This notebook focuses on developing and evaluating machine learning models for the PassportCard insurance claims prediction project.

## Setup and Imports

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge

import xgboost as xgb

# Plot appearance: whitegrid theme plus default figure size and base font size
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Seed NumPy's global RNG so stochastic steps repeat from run to run
np.random.seed(42)

## Loading Processed Data

We'll load the cleaned data produced in the previous notebook.

In [None]:
# Read the cleaned claims file and parse its two date columns to datetime
claims_data = pd.read_csv('claims_data_clean.csv').assign(
    ServiceDate=lambda frame: pd.to_datetime(frame['ServiceDate']),
    PayDate=lambda frame: pd.to_datetime(frame['PayDate']),
)

# Read the cleaned member attributes
members_data = pd.read_csv('members_data_clean.csv')

# Quick sanity check on what was loaded
print(f"Claims data shape: {claims_data.shape}")
print(f"Members data shape: {members_data.shape}")

## Feature Engineering

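Let's prepare features for modeling: claims are aggregated to one row per member (claim count, mean/total/standard deviation of the paid amount, tenure, and claim frequency) and joined with the member attributes. Note that the target, `FutureClaimAmount`, is **simulated** for demonstration purposes — it is generated from existing features plus random noise — so the scores reported below illustrate the workflow rather than real predictive performance.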

In [None]:
def prepare_features(claims_df, members_df):
    """Build a member-level modeling table from claim and member records.

    Claims are aggregated to one row per ``Member_ID``, enriched with tenure
    and claim-frequency features, left-joined with the member attributes and
    given a *simulated* target column for demonstration purposes.

    Parameters
    ----------
    claims_df : pandas.DataFrame
        One row per claim. Must contain ``Member_ID``, ``TotPaymentUSD`` and
        ``ServiceDate`` (datetime, or any values ``pd.to_datetime`` can parse).
    members_df : pandas.DataFrame
        Member attributes keyed by ``Member_ID``. Must contain ``BMI``, which
        feeds the simulated target.

    Returns
    -------
    pandas.DataFrame
        One row per member that has at least one claim, with the columns
        ``ClaimCount``, ``MeanClaimAmount``, ``TotalClaimAmount``,
        ``ClaimAmountStd``, ``TenureDays``, ``ClaimFrequency``, every column
        of ``members_df`` and the simulated ``FutureClaimAmount`` target.

    Notes
    -----
    Neither input frame is modified, and NumPy's global random state is left
    untouched (the noise comes from a private, fixed-seed generator).
    """
    # assign() returns a new frame, so the caller's data is never mutated;
    # to_datetime is a no-op for columns that are already datetime.
    claims = claims_df.assign(ServiceDate=pd.to_datetime(claims_df['ServiceDate']))

    # Aggregate to member level; named aggregation yields the final column
    # names directly, so no MultiIndex flattening or renaming is needed.
    member_features = (
        claims.groupby('Member_ID')
        .agg(
            ClaimCount=('TotPaymentUSD', 'count'),
            MeanClaimAmount=('TotPaymentUSD', 'mean'),
            TotalClaimAmount=('TotPaymentUSD', 'sum'),
            ClaimAmountStd=('TotPaymentUSD', 'std'),
            FirstClaimDate=('ServiceDate', 'min'),
            LastClaimDate=('ServiceDate', 'max'),
        )
        .reset_index()
    )

    # The sample standard deviation is undefined (NaN) for members with a
    # single claim; treat that as "no observed variation" so downstream
    # estimators that reject NaN can still be fitted.
    member_features['ClaimAmountStd'] = member_features['ClaimAmountStd'].fillna(0.0)

    # Member tenure: days between first and last claim
    member_features['TenureDays'] = (
        member_features['LastClaimDate'] - member_features['FirstClaimDate']
    ).dt.days

    # Claims per 30-day month; members whose claims all fall on one day have
    # zero tenure and are assigned a frequency of 0 instead of dividing by zero.
    member_features['ClaimFrequency'] = np.where(
        member_features['TenureDays'] > 0,
        member_features['ClaimCount'] / (member_features['TenureDays'] / 30),
        0
    )

    # Attach member attributes; a left join keeps every member that has claims
    data = pd.merge(member_features, members_df, on='Member_ID', how='left')

    # Simulated target (a real project would derive it from later claims):
    # a linear mix of existing features plus Gaussian noise. A private
    # RandomState(42) reproduces the same noise as seeding the global RNG
    # with 42, without resetting the global state as a side effect.
    # NOTE(review): a missing BMI propagates NaN into the target - confirm
    # that members_df has no gaps in BMI.
    rng = np.random.RandomState(42)
    data['FutureClaimAmount'] = (
        0.7 * data['MeanClaimAmount'] +
        0.3 * data['ClaimFrequency'] * 100 +
        0.2 * data['BMI'] +
        rng.normal(0, 50, size=len(data))
    )

    # Claim amounts cannot be negative
    data['FutureClaimAmount'] = data['FutureClaimAmount'].clip(lower=0)

    # Raw dates are not model inputs; their information lives in TenureDays
    return data.drop(columns=['FirstClaimDate', 'LastClaimDate'])

# Build the member-level modeling table from the cleaned inputs
modeling_data = prepare_features(claims_df=claims_data, members_df=members_data)

# Report its dimensions, then preview the leading rows
print(f"Modeling data shape: {modeling_data.shape}")
modeling_data.head()

## Data Preparation for Modeling

In [None]:
def prepare_for_modeling(df, target_col='FutureClaimAmount', test_size=0.2, log_transform=True):
    """Split a modeling table into scaled train/test feature sets and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Modeling table containing ``target_col`` and the feature columns.
    target_col : str, default 'FutureClaimAmount'
        Name of the target column.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the held-out share.
    log_transform : bool, default True
        If True, the model target is ``log1p(target_col)``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_cols, log_transform)``.
        The feature frames have their int64/float64 columns scaled with a
        ``RobustScaler`` fitted on the training rows only; ``feature_cols``
        lists the feature names and ``log_transform`` echoes the flag so the
        caller knows which scale the targets are on.

    Notes
    -----
    Columns of other dtypes are passed through unchanged.
    NOTE(review): estimators that need numeric input will require any such
    columns to be encoded first - confirm against the members schema.
    """
    # Work with a copy so the caller's frame is not modified
    data = df.copy()
    log_col = 'Log_' + target_col

    # Optionally model the target on the log1p scale
    if log_transform:
        data[log_col] = np.log1p(data[target_col])
        y_col = log_col
    else:
        y_col = target_col

    # Everything except the targets and the identifier columns is a feature
    excluded = {target_col, log_col, 'Member_ID', 'PolicyID'}
    feature_cols = [col for col in data.columns if col not in excluded]

    X = data[feature_cols]
    y = data[y_col]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # The split returns row selections of X; take explicit copies so that the
    # scaled values can be written back without chained-assignment warnings
    # and with no ambiguity about which object is being modified.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Scale numerical features; the scaler is fitted on the training rows
    # only, so no test-set statistics leak into the transformation.
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    if len(numerical_cols) > 0:  # the scaler rejects an empty column set
        scaler = RobustScaler()  # Robust to outliers
        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    return X_train, X_test, y_train, y_test, feature_cols, log_transform

# Split into train/test sets, scale the features and log-transform the target
X_train, X_test, y_train, y_test, feature_cols, log_transform = prepare_for_modeling(
    df=modeling_data,
    target_col='FutureClaimAmount',
    log_transform=True,
)

# Summarize what came back
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"Log-transformed target: {log_transform}")

## Model Selection

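We'll evaluate several regression models — linear regression, ridge regression, random forest, gradient boosting, and XGBoost — to select the best-performing one for our task. All metrics are reported on the original scale of the target (predictions are transformed back from the log scale before scoring).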

In [None]:
def _mape_above(y_true, y_pred, min_actual):
    """Return MAPE (%) over observations whose actual value exceeds ``min_actual``; NaN if there are none."""
    mask = y_true > min_actual
    if not mask.any():
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100


def evaluate_model(name, model, X_train, y_train, X_test, y_test, log_transform=True,
                   mape_min_actual=10):
    """Fit a model and score it on the training and test sets.

    Parameters
    ----------
    name : str
        Label stored under ``'Model'`` in the results.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train, X_test, y_test
        Training and test features/targets.
    log_transform : bool, default True
        If True, targets and predictions are mapped back with ``expm1``
        before scoring, so every metric is on the original target scale.
    mape_min_actual : float, default 10
        MAPE is computed only over observations whose actual value is
        strictly greater than this, to avoid dividing by very small values.

    Returns
    -------
    tuple
        ``(results, model, y_pred_test_orig)`` where ``results`` is a dict
        with RMSE, MAE, R2 and MAPE for train and test plus the fit duration
        in seconds, ``model`` is the fitted estimator and
        ``y_pred_test_orig`` holds the test predictions on the original scale.
    """
    # Time the fit with a monotonic clock, which is unaffected by wall-clock
    # adjustments while the model trains
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Plain arrays keep every computation below positional: the targets may
    # arrive as pandas Series carrying a shuffled index, whereas predictions
    # are positional arrays.
    y_train_arr = np.asarray(y_train)
    y_test_arr = np.asarray(y_test)

    # Transform back to the original scale if the target was log-transformed
    if log_transform:
        y_pred_train_orig = np.expm1(y_pred_train)
        y_pred_test_orig = np.expm1(y_pred_test)
        y_train_orig = np.expm1(y_train_arr)
        y_test_orig = np.expm1(y_test_arr)
    else:
        y_pred_train_orig = y_pred_train
        y_pred_test_orig = y_pred_test
        y_train_orig = y_train_arr
        y_test_orig = y_test_arr

    # Organize results
    results = {
        'Model': name,
        'RMSE_Train': np.sqrt(mean_squared_error(y_train_orig, y_pred_train_orig)),
        'RMSE_Test': np.sqrt(mean_squared_error(y_test_orig, y_pred_test_orig)),
        'MAE_Train': mean_absolute_error(y_train_orig, y_pred_train_orig),
        'MAE_Test': mean_absolute_error(y_test_orig, y_pred_test_orig),
        'R2_Train': r2_score(y_train_orig, y_pred_train_orig),
        'R2_Test': r2_score(y_test_orig, y_pred_test_orig),
        'MAPE_Train': _mape_above(y_train_orig, np.asarray(y_pred_train_orig), mape_min_actual),
        'MAPE_Test': _mape_above(y_test_orig, np.asarray(y_pred_test_orig), mape_min_actual),
        'Training_Time': train_time
    }

    return results, model, y_pred_test_orig

# Candidate regressors, keyed by the label used in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42)
}

# Accumulators filled by the evaluation loop
results_list, predictions, trained_models = [], {}, {}

# Fit and score every candidate on the same train/test split
for name, model in models.items():
    print(f"Training {name}...")
    result, trained_model, y_pred = evaluate_model(
        name=name,
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        log_transform=log_transform,
    )
    trained_models[name] = trained_model
    predictions[name] = y_pred
    results_list.append(result)
    print(f"  RMSE: {result['RMSE_Test']:.2f}, MAE: {result['MAE_Test']:.2f}, R²: {result['R2_Test']:.3f}")

# One row of metrics per model
results_df = pd.DataFrame(results_list)
results_df