# PassportCard Insurance Claims Prediction

This project develops a machine learning system to predict future insurance claims for PassportCard policyholders.

## Setup and Installation

First, let's install the required packages:

In [1]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm jupyter scipy statsmodels plotly imbalanced-learn

Collecting notebook (from jupyter)
  Using cached notebook-7.3.3-py3-none-any.whl.metadata (10 kB)
Using cached notebook-7.3.3-py3-none-any.whl (13.1 MB)
Installing collected packages: notebook


ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\yaniv\\OneDrive\\Documents\\DS_assignment_passportcard\\venv\\Scripts\\jupyter-notebook.exe' -> 'C:\\Users\\yaniv\\OneDrive\\Documents\\DS_assignment_passportcard\\venv\\Scripts\\jupyter-notebook.exe.deleteme'
Check the permissions.


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge

import xgboost as xgb
import scipy.stats as stats
import os
import warnings

# Plot appearance: seaborn whitegrid theme, then notebook-wide figure defaults
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 12})

# Silence all library warnings for the rest of the session
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so stochastic steps are repeatable
np.random.seed(42)

## Data Loading

Let's load the claims and member data:

In [3]:
# Read the claims table; the path is resolved against the kernel's working directory
claims_data = pd.read_csv('../claims_data_clean.csv')

# Report the frame's dimensions, then preview its leading records
print(f"Claims data shape: {claims_data.shape}")
claims_data.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'claims_data_clean.csv'

In [4]:
# Read the members table; the path is resolved against the kernel's working directory
members_data = pd.read_csv('../members_data_clean.csv')

# Report the frame's dimensions, then preview its leading records
print(f"Members data shape: {members_data.shape}")
members_data.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'members_data_clean.csv'

## Data Exploration and Cleaning

Let's explore the data and perform necessary cleaning.

In [None]:
# Parse the two date columns into datetime64 so they support date arithmetic
claims_data['ServiceDate'] = claims_data['ServiceDate'].pipe(pd.to_datetime)
claims_data['PayDate'] = claims_data['PayDate'].pipe(pd.to_datetime)

# Summary statistics for the claims table (last expression renders as the cell output)
claims_data.describe()

In [None]:
# Plot the distribution of claim amounts on an explicit figure/axes pair
claim_amounts = claims_data['TotPaymentUSD']
fig, ax = plt.subplots(figsize=(14, 8))

# Histogram with a KDE overlay
sns.histplot(claim_amounts, kde=True, bins=50, ax=ax)
ax.set_title('Distribution of Claim Amounts (TotPaymentUSD)', fontsize=16)
ax.set_xlabel('Claim Amount (USD)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

# Summary statistics displayed in the annotation box
mean_val = claim_amounts.mean()
median_val = claim_amounts.median()
skew_val = claim_amounts.skew()
kurtosis_val = claim_amounts.kurtosis()

stats_text = f"Mean: ${mean_val:.2f}\nMedian: ${median_val:.2f}\nSkewness: {skew_val:.2f}\nKurtosis: {kurtosis_val:.2f}"
ax.annotate(
    stats_text,
    xy=(0.75, 0.75),
    xycoords='axes fraction',
    bbox=dict(boxstyle="round,pad=0.5", fc="white", alpha=0.8),
)

ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Missing Value Analysis and Data Cleaning

Let's analyze missing values and handle them appropriately.

In [None]:
def handle_missing_values(df, categorical_strategy='mode', numerical_strategy='knn'):
    """Return a copy of ``df`` with missing values imputed.

    Categorical (``object`` / ``category``) columns are filled with their most
    frequent value. Numerical (``int`` / ``float``) columns are imputed jointly
    with ``KNNImputer(n_neighbors=5)``; infinite values in those columns are
    treated as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    categorical_strategy : str, default 'mode'
        Only ``'mode'`` is implemented; any other value leaves categorical
        columns untouched (their missing counts are still printed).
    numerical_strategy : str, default 'knn'
        Only ``'knn'`` is implemented; any other value leaves numerical
        columns untouched.

    Returns
    -------
    pandas.DataFrame
        The imputed copy of ``df``.

    Notes
    -----
    * A column with no observed value at all cannot be imputed by either
      strategy; it is reported and left as is instead of raising.
    * When KNN imputation runs, every imputed numerical column comes back as
      float because the imputer returns a float array.
    * NOTE(review): every selected numerical column takes part in the
      neighbour distance, including identifier-like columns if any are
      present - confirm this is intended.
    """
    # Work on a copy so the caller's frame is left untouched
    df_processed = df.copy()

    # Split columns by dtype family
    categorical_cols = df_processed.select_dtypes(include=['object', 'category']).columns
    numerical_cols = df_processed.select_dtypes(include=['int', 'float']).columns

    # Handle categorical features
    for col in categorical_cols:
        missing_count = df_processed[col].isna().sum()
        if missing_count == 0:
            continue
        print(f"Column {col}: {missing_count} missing values ({missing_count/len(df_processed)*100:.2f}%)")

        if categorical_strategy == 'mode':
            modes = df_processed[col].mode()
            if modes.empty:
                # All values are missing, so there is no mode to fill with
                print("  - Skipped: column has no observed values")
                continue
            mode_value = modes.iloc[0]
            # Assign back instead of chained `fillna(..., inplace=True)`, which
            # does not update the frame under pandas Copy-on-Write
            df_processed[col] = df_processed[col].fillna(mode_value)
            print(f"  - Filled with mode: {mode_value}")

    # Handle numerical features
    if numerical_strategy == 'knn' and len(numerical_cols) > 0:
        # Convert infinities to NaN *before* counting, so inf-only data is
        # still imputed rather than silently passed through
        numerical_data = df_processed[numerical_cols].replace([np.inf, -np.inf], np.nan)
        num_missing = numerical_data.isna().sum().sum()

        if num_missing > 0:
            print(f"Using KNN imputation for {num_missing} missing numerical values")

            # The imputer cannot fill a column that has no observed values, and
            # dropping it would break the assignment below; leave such columns out
            has_observed = numerical_data.notna().any()
            imputable_cols = list(has_observed[has_observed].index)
            empty_cols = list(has_observed[~has_observed].index)
            if empty_cols:
                print(f"  - Skipped columns with no observed values: {empty_cols}")

            if imputable_cols:
                imputer = KNNImputer(n_neighbors=5)
                df_processed[imputable_cols] = imputer.fit_transform(numerical_data[imputable_cols])

    return df_processed

# Impute both tables: claims first, then members
claims_data_clean, members_data_clean = (
    handle_missing_values(frame) for frame in (claims_data, members_data)
)

# Confirm no missing values remain in either table
print("\nMissing values after imputation:")
print(f"Claims data: {claims_data_clean.isnull().sum().sum()}")
print(f"Members data: {members_data_clean.isnull().sum().sum()}")