# Applied ML - Car Insurance Claim Prediction
The dataset contains information on policyholders, with attributes such as policy tenure, age of the car, age of the car owner, population density of the city, make and model of the car, power, engine type, etc., and a target variable indicating whether the policyholder files a claim in the next 6 months.

## Setting Up

### Importing Relevant Libraries

In [None]:
from pathlib import Path

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, roc_auc_score, log_loss, confusion_matrix

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
import optuna

In [None]:
from process_model import Processor, PartProcessor

In [None]:
import warnings

### Workbook Settings

In [None]:
from IPython.display import display, HTML

def display_scrollable(df, rows=10):
    """Render ``df`` in the notebook as an HTML table.

    The frame is converted with ``DataFrame.to_html`` in notebook mode,
    limited to ``rows`` rows (``max_rows``), with ``max_cols=None`` and no
    border, and the resulting markup is handed to IPython's ``display``.
    """
    table_html = df.to_html(notebook=True, max_rows=rows, max_cols=None, border=0)
    display(HTML(table_html))

In [None]:
# Apply seaborn's "whitegrid" style to the plots in this workbook
sns.set_style(style="whitegrid")

### Importing Relevant Data

In [None]:
# Data locations: change ``dir_path`` if the relative default does not resolve
dir_path = Path("./")  # Insert path here if relative paths fail to work
data_path = dir_path / "data"  # folder the input CSV is read from

In [None]:
# Load the training set, then separate the features from the target
ins_df = pd.read_csv(data_path / "train.csv")
# drop() returns a new frame by default, so ins_df keeps all of its columns
x = ins_df.drop(columns=['policy_id', 'is_claim'])
y = ins_df['is_claim']

## Data Exploration

### Data Structure

In [None]:
# Size of the modelling data.
# Predictors are counted from ``x`` (ins_df without 'policy_id' and
# 'is_claim'): counting the columns of ins_df itself would report the
# identifier and the target as predictors.
n, p = x.shape
print(f"The dataset has {n} observations with {p} predictors")

### Distribution of Y

In [None]:
# Bar chart of how many rows fall under each value of is_claim
fig, ax = plt.subplots(figsize=(4, 3))
sns.countplot(data=ins_df, x='is_claim', ax=ax)

# Title and axis labels
ax.set_title('Distribution of Y (Claims)')
ax.set_xlabel('Claims')
ax.set_ylabel('Count')

# Render the figure
plt.show()

In [None]:
# Number of rows for each value of the target column
ins_df["is_claim"].value_counts()

### Analyzing Qualitative Variables

In [None]:
# Qualitative variables: columns whose dtype is 'object' or 'category'
qualitative_dtypes = ['object', 'category']
categorical_cols = ins_df.select_dtypes(include=qualitative_dtypes).columns

In [None]:
# Preview the first rows of the qualitative columns only
display_scrollable(ins_df[categorical_cols].head())

### Identifying Multicollinearity

#### Perfect or Near Perfect Correlation

In [None]:
# Preview the columns that remain once the qualitative ones are removed
ins_df.drop(columns=categorical_cols).head()

In [None]:
# unlike_categories = columns excluded from the grouping below
# (those not perfectly captured by collinearity with the other columns)
unlike_categories = [
    'policy_id', 'policy_tenure', 'age_of_car', 'age_of_policyholder',
    'area_cluster', 'population_density', 'is_claim',
]
# Count every distinct combination of the remaining columns; the number of
# rows per combination is stored in the 'Count' column.
model_spec_df = ins_df.drop(columns=unlike_categories)
counts_df = model_spec_df.value_counts().reset_index(name='Count')
counts_df.shape

In [None]:
# Show the combination counts, allowing up to 11 rows in the rendered table
display_scrollable(counts_df, rows=11)

In [None]:
# Bar chart of the 'Count' column of counts_df against the car model
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=counts_df, x='model', y='Count', ax=ax)
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
ax.set_title('Counts by Model')
plt.show()

#### Analyzing CrossTabs

In [None]:
# Cross-tabulate car model (rows) against is_claim (columns): raw counts
count_combined = pd.crosstab(ins_df['model'], ins_df['is_claim'])

# Same table normalized per row, i.e. the percentage of each is_claim value
# within a model
row_percentage_combined = pd.crosstab(ins_df['model'], ins_df['is_claim'], normalize='index') * 100

print("\nValue counts for both columns:\n", count_combined)
# Print the row percentages as well: raw counts alone do not let the claim
# share be compared between models with very different numbers of policies.
print("\nRow percentages for both columns:\n", row_percentage_combined.round(2))

As seen above, there are only 11 car models in the dataset. Apart from the columns in `unlike_categories`, every other column takes only 11 distinct combinations of values, one per model. So, given the model, you can always determine the values of those other columns.

### Identifying Skewness

In [None]:
# One histogram per quantitative column, laid out side by side in a single row
skew_cols = ["policy_tenure", "age_of_car", "age_of_policyholder", "population_density"]
fig, axes = plt.subplots(nrows=1, ncols=len(skew_cols), figsize=(5 * len(skew_cols), 5))

for ax, col in zip(axes, skew_cols):
    ins_df[col].plot(kind='hist', bins=10, alpha=0.7, ax=ax)
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')

plt.tight_layout()
plt.show()

## Data Modeling

### Pre-Processing

In [None]:
# Feature column names after removing the perfectly correlated variables:
# "model" is kept as their single representative.
# The [1:-1] slice skips the first and last entries of unlike_categories
# ('policy_id' and 'is_claim').
X = [*unlike_categories[1:-1], "model"]
# NOTE: from here on ``y`` is the target column *name*; it replaces the Series
# that was assigned to ``y`` when the data was imported.
y = "is_claim"

In [None]:
# Column roles handed to the processors: qualitative and quantitative features
X_qual_cols = ["model", "area_cluster"]
X_quant_cols = [
    "policy_tenure",
    "age_of_car",
    "age_of_policyholder",
    "population_density",
]

### Modeling
#### Models

In [None]:
# Logistic Regression: fixed seed, iteration cap raised to 1000
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Hyperparameter values to search over
lr_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],      # inverse of regularization strength
    'penalty': ['l2'],
    'class_weight': [None, 'balanced'],  # with and without class re-weighting
}

In [None]:
# Decision Tree: fixed seed so repeated runs build the same tree
decision_tree = DecisionTreeClassifier(random_state=42)

# Hyperparameter values to search over
dt_grid = {
    'max_depth': [3, 5, 10],          # maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # samples required at a leaf
}

In [None]:
# Random Forest: fixed seed for reproducible forests
random_forest = RandomForestClassifier(random_state=42)

# Hyperparameter values to search over
rf_grid = {
    'n_estimators': [100, 200, 300],     # number of trees
    'max_depth': [5, 10, 15],            # maximum depth of each tree
    'min_samples_split': [2, 5, 10],     # samples required to split a node
    'min_samples_leaf': [1, 2, 4],       # samples required at a leaf
    'class_weight': [None, 'balanced'],  # with and without class re-weighting
}

In [None]:
# K-Nearest Neighbours with scikit-learn's default settings
knn = KNeighborsClassifier()

# Candidate neighbourhood sizes: the odd numbers 3, 5, ..., 39
knn_grid = {
    'n_neighbors': range(3, 40, 2),
}

In [None]:
# Support Vector Machine with scikit-learn's default settings
svm = SVC()

# Hyperparameter values to search over
svm_grid = {
    'C': [0.1, 10],               # regularization parameter (strength is inversely proportional to C)
    'gamma': [0.1, 0.001],        # kernel coefficient (for 'rbf')
    'kernel': ['linear', 'rbf'],  # kernel types to compare
}

#### Base - No Oversampling, Full Models

In [None]:
# Processor is the project's own class (imported from process_model) that
# handles storage of the various models and keeps pre-processing standardized.
# ``y`` is the target column name ("is_claim") at this point, not the Series
# loaded when the data was imported.
base_models = Processor(
    data_df=ins_df, 
    target_col = y, 
    quant_cols=X_quant_cols, 
    qual_cols=X_qual_cols
)

# Split the data into train and test segments (test_size=0.2).
# stratify_by="model" presumably stratifies on the car model column —
# confirm in process_model.
base_models.train_test_split(test_size=0.2, stratify_by="model")

# Standardize the data using StandardScaler
base_models.data_standarization(processor='StandardScaler')

In [None]:
# Base data: run log_reg with lr_grid under the label "Logistic Regression", scored by F1
# (presumably a grid search over lr_grid — confirm in process_model)
base_models.run_model("Logistic Regression", log_reg, lr_grid, scoring="f1")

In [None]:
# Base data: run decision_tree with dt_grid under the label "Decision Tree", scored by F1
base_models.run_model("Decision Tree", decision_tree, dt_grid, scoring="f1")

In [None]:
# Base data: run random_forest with rf_grid under the label "Random Forest", scored by F1
base_models.run_model("Random Forest", random_forest, rf_grid, scoring="f1")

In [None]:
# Base data: run knn with knn_grid under the label "KNN", scored by F1
base_models.run_model("KNN", knn, knn_grid, scoring="f1")

In [None]:
# Base data: run svm with svm_grid under the label "SVM", scored by F1
# NOTE(review): the SVM runs on the oversampled data pass cv=2, while this
# call relies on run_model's default cv — confirm the difference is intended.
base_models.run_model("SVM", svm, svm_grid, scoring="f1")

In [None]:
# Show the results collected for the base models
base_models.get_results()

In [None]:
# Store the results in a pkl file for later analysis
# NOTE(review): assumes ./pkl_store already exists unless dump_pkl creates it — TODO confirm
base_models.dump_pkl("./pkl_store/base.pkl")

#### Oversampling Models - Oversampling, Full Models

In [None]:
# Processor is the project's own class (imported from process_model) that
# handles storage of the various models and keeps pre-processing standardized.
# ``y`` is the target column name ("is_claim") at this point.
baseOS_models = Processor(
    data_df=ins_df, 
    target_col = y, 
    quant_cols=X_quant_cols, 
    qual_cols=X_qual_cols
)

# Split the data into train and test segments (test_size=0.2).
# stratify_by="model" presumably stratifies on the car model column —
# confirm in process_model.
baseOS_models.train_test_split(test_size=0.2, stratify_by="model")

# Standardize the data using StandardScaler
baseOS_models.data_standarization(processor='StandardScaler')

# Oversample the data; this is called after the split and the scaling.
# NOTE(review): presumably only the training part is resampled — confirm in process_model.
baseOS_models.oversample_data()

In [None]:
# Oversampled data: run log_reg with lr_grid under the label "Logistic Regression", scored by F1
baseOS_models.run_model("Logistic Regression", log_reg, lr_grid, scoring="f1")

In [None]:
# Oversampled data: run decision_tree with dt_grid under the label "Decision Tree", scored by F1
baseOS_models.run_model("Decision Tree", decision_tree, dt_grid, scoring="f1")

In [None]:
# Oversampled data: run random_forest with rf_grid under the label "Random Forest", scored by F1
baseOS_models.run_model("Random Forest", random_forest, rf_grid, scoring="f1")

In [None]:
# Oversampled data: run knn with knn_grid under the label "KNN", scored by F1
baseOS_models.run_model("KNN", knn, knn_grid, scoring="f1")

In [None]:
# Oversampled data: run svm with svm_grid under the label "SVM", scored by F1;
# cv=2 is passed explicitly here, unlike the other run_model calls on this processor
baseOS_models.run_model("SVM", svm, svm_grid, scoring="f1", cv=2)

In [None]:
# Show the results collected for the oversampled models
baseOS_models.get_results()

In [None]:
# Store the oversampled-model results in a pkl file for later analysis
# NOTE(review): assumes ./pkl_store already exists unless dump_pkl creates it — TODO confirm
baseOS_models.dump_pkl("./pkl_store/baseOS.pkl")

#### Oversampling in Parts Models - Oversampling, Partition Models

In [None]:
# PartProcessor is the project's own class (imported from process_model).
# strat_col="model" presumably makes it work per car model partition —
# confirm in process_model.
# ``y`` is the target column name ("is_claim") at this point.
partsOS_models = PartProcessor(
    data_df=ins_df, 
    target_col = y,
    strat_col="model",
    quant_cols=X_quant_cols, 
    qual_cols=X_qual_cols
)

# Split the data into train and test segments (test_size=0.2)
partsOS_models.train_test_split(test_size=0.2)

# Standardize the data using StandardScaler
partsOS_models.data_standarization(processor='StandardScaler')

# Oversample the data; this is called after the split and the scaling
partsOS_models.oversample_data()

In [None]:
# Partitioned, oversampled data: run log_reg with lr_grid under the label "Logistic Regression", scored by F1
partsOS_models.run_model("Logistic Regression", log_reg, lr_grid, scoring="f1")

In [None]:
# Partitioned, oversampled data: run decision_tree with dt_grid under the label "Decision Tree", scored by F1
partsOS_models.run_model("Decision Tree", decision_tree, dt_grid, scoring="f1")

In [None]:
# Partitioned, oversampled data: run random_forest with rf_grid under the label "Random Forest", scored by F1
partsOS_models.run_model("Random Forest", random_forest, rf_grid, scoring="f1")

In [None]:
# Partitioned, oversampled data: run knn with knn_grid under the label "KNN", scored by F1
partsOS_models.run_model("KNN", knn, knn_grid, scoring="f1")

In [None]:
# Partitioned, oversampled data: run svm with svm_grid under the label "SVM", scored by F1;
# cv=2 is passed explicitly here, unlike the other run_model calls on this processor
partsOS_models.run_model("SVM", svm, svm_grid, scoring="f1", cv=2)

In [None]:
# Show the results collected for the partitioned, oversampled models
partsOS_models.get_results()

In [None]:
# Store the partitioned-model results in a pkl file for later analysis
# NOTE(review): assumes ./pkl_store already exists unless dump_pkl creates it — TODO confirm
partsOS_models.dump_pkl("./pkl_store/partsOS_models.pkl")

In [None]:
# Preview the first rows of the data frame held by the partition processor
partsOS_models.data_df.head()