# Insurance Claim Prediction - Enhanced Pipeline
Complete ML pipeline with:
- Comprehensive visualizations
- Train/Validation/Test split
- Multiple algorithms comparison
- SMOTE vs Non-SMOTE analysis
- Best model selection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (classification_report, confusion_matrix, f1_score, 
                             roc_auc_score, roc_curve, auc, accuracy_score, 
                             precision_score, recall_score)
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
# Silence every library warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures below: seaborn grid style, 10x6 canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries loaded successfully")

## 1. Load and Explore Raw Data

In [None]:
# Read both raw CSV files from the current working directory.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in ("train_Insurance.csv", "test_Insurance.csv")
)

# Quick structural overview: shapes, column names, dtypes / non-null counts.
for split_name, frame in (("Train", train_raw), ("Test", test_raw)):
    print(f"{split_name} shape: {frame.shape}")
print(f"\nTrain columns: {train_raw.columns.tolist()}")
print("\nTrain info:")
train_raw.info()

### Visualize Missing Values

In [None]:
# Per-column count of missing cells, keeping only columns that have any,
# ordered from most to least affected.
na_per_column = train_raw.isna().sum()
missing = na_per_column.loc[na_per_column > 0].sort_values(ascending=False)

if missing.empty:
    print("No missing values found")
else:
    # One bar per affected column.
    plt.figure(figsize=(10, 5))
    plt.bar(missing.index, missing.values, color='coral')
    plt.title('Missing Values in Training Data', fontsize=14, fontweight='bold')
    plt.xlabel('Columns')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

### Target Variable Distribution

In [None]:
# Class counts of the target, computed once and reused for both charts and
# for the printed summary.
claim_counts = train_raw['Claim'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: share of each class.
claim_counts.plot.pie(
    autopct='%1.1f%%',
    ax=axes[0],
    ylabel='',
    colors=['#66b3ff', '#ff9999'],
    title='Target Distribution (Pie Chart)'
)

# Right panel: absolute counts. The class labels are passed as strings so the
# bars are laid out categorically, and bar_label() attaches each count to its
# own bar. Placing the text at enumerate() positions only lined up with the
# bars when the class values happened to be 0, 1, ... in frequency order.
bars = axes[1].bar(claim_counts.index.astype(str), claim_counts.values,
                   color=['#66b3ff', '#ff9999'])
axes[1].bar_label(bars, fontweight='bold')
axes[1].set_title('Target Distribution (Bar Chart)')
axes[1].set_ylabel('Count')
axes[1].set_xlabel('Claim')

plt.tight_layout()
plt.show()

print("\nClass distribution:")
print(claim_counts)
print("\nClass proportions:")
print(train_raw['Claim'].value_counts(normalize=True))

### Visualize Categorical Features

In [None]:
# Object/category columns, excluding the target and the row identifier.
cat_cols = [
    col
    for col in train_raw.select_dtypes(include=["object", "category"]).columns
    if col not in ["Claim", "Customer Id"]
]

print("Categorical columns:", cat_cols)

n_cols = 2
n_rows = math.ceil(len(cat_cols) / n_cols)

if cat_cols:
    # squeeze=False always yields a 2-D array, so flatten() is safe for any grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # One count plot per categorical column, with the count written on each bar.
    for ax, col in zip(axes, cat_cols):
        sns.countplot(x=col, data=train_raw, ax=ax, palette='Set2')
        ax.set_title(f"Distribution of {col}", fontweight='bold')
        ax.tick_params(axis='x', rotation=45)

        for container in ax.containers:
            ax.bar_label(container)

    # Remove the unused trailing panel(s). Slicing by len(cat_cols) avoids
    # depending on a loop variable that only exists if the loop ran.
    for ax in axes[len(cat_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(0, n_cols) would raise, so skip plotting entirely.
    print("No categorical columns to plot")

### Visualize Numerical Features

In [None]:
# int64/float64 columns, excluding the target and the row identifier.
num_cols = [
    c
    for c in train_raw.select_dtypes(include=["int64", "float64"]).columns.tolist()
    if c not in ["Claim", "Customer Id"]
]

n_cols = 2
n_rows = math.ceil(len(num_cols) / n_cols)

if num_cols:
    # squeeze=False always yields a 2-D array, so flatten() is safe for any grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Histogram + KDE per numeric column; missing values are dropped for plotting only.
    for ax, col in zip(axes, num_cols):
        sns.histplot(train_raw[col].dropna(), kde=True, ax=ax, color='steelblue')
        ax.set_title(f"Distribution of {col}", fontweight='bold')
        ax.set_xlabel(col)

    # Remove the unused trailing panel(s). Slicing by len(num_cols) avoids
    # depending on a loop variable that only exists if the loop ran.
    for ax in axes[len(num_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(0, n_cols) would raise, so skip plotting entirely.
    print("No numerical columns to plot")

## 2. Data Cleaning Pipeline

### Step 1: Remove Customer ID

In [None]:
# The row identifier is removed in place from both datasets.
for frame in (train_raw, test_raw):
    frame.drop(columns=['Customer Id'], inplace=True)

print(f"Columns after removing Customer ID: {train_raw.columns.tolist()}")

### Step 2: Clean NumberOfWindows

In [None]:
# Recode the two textual levels to numbers, then cast the whole column to int.
# The same recoding is applied to train and test.
windows_recode = {'without': 0, '>=10': 10}

for frame in (train_raw, test_raw):
    recoded = frame['NumberOfWindows'].replace(windows_recode)
    frame['NumberOfWindows'] = recoded.astype(int)

print("NumberOfWindows cleaned in both datasets")

### Step 3: Remove Duplicates and Conflicts

In [None]:
# Feature columns = every column except the target.
features = [c for c in train_raw.columns if c != 'Claim']

# Exact duplicates repeat features AND target; "same features" duplicates
# repeat the features only (the target may or may not agree).
n_exact = train_raw.duplicated().sum()
n_same_features = train_raw.duplicated(subset=features).sum()

print(f"Exact duplicates: {n_exact}")
print(f"Duplicates with same features: {n_same_features}")

if n_same_features > 0:
    dups = train_raw[train_raw.duplicated(subset=features, keep=False)]
    # dropna=False keeps groups whose key contains NaN. duplicated() above
    # treats NaN as equal, and missing values are only imputed in a later
    # step, so with the default dropna=True conflicting duplicates that have
    # a missing feature value were silently skipped here.
    conflicts = dups.groupby(features, dropna=False)['Claim'].nunique()
    n_conflicts = (conflicts > 1).sum()
    print(f"Conflicting records: {n_conflicts}")

    if n_conflicts > 0:
        # Feature combinations that appear with more than one target value.
        conflicting_groups = conflicts[conflicts > 1].reset_index()
        before = len(train_raw)
        # Anti-join: keep only rows whose feature combination is NOT in the
        # conflicting set (the merge indicator marks them as 'left_only').
        train_raw = train_raw.merge(conflicting_groups[features], on=features, how='left', indicator=True)
        train_raw = train_raw[train_raw['_merge'] == 'left_only'].drop(columns=['_merge'])
        after = len(train_raw)
        print(f"Removed {before - after} conflicting records")

# Whatever duplicates remain agree on the target; keep one copy of each.
train_raw = train_raw.drop_duplicates()
print(f"Train shape after cleaning: {train_raw.shape}")

### Step 4: Handle Missing Values (Train)

In [None]:
print("Missing values in train:")
print(train_raw.isna().sum())

# Both imputers are fitted on the training frame here and reused on the test
# frame later. NaN is SimpleImputer's default missing-value marker.
# NOTE(review): they are fitted before the train/validation split, so the
# validation rows contribute to the fitted statistics - confirm this is intended.
mf_imputer = SimpleImputer(strategy="most_frequent")
median_imputer = SimpleImputer(strategy="median")

# Garden -> most frequent value, Building Dimension -> median.
garden_filled = mf_imputer.fit_transform(train_raw[["Garden"]])
train_raw[["Garden"]] = garden_filled

dimension_filled = median_imputer.fit_transform(train_raw[["Building Dimension"]])
train_raw[["Building Dimension"]] = dimension_filled

print("\nImputers fitted on train data")
remaining_na = train_raw.isna().sum()
print(f"Missing values after imputation:\n{remaining_na}")

### Step 5: Fill Missing Geo_Code

In [None]:
# Most common Geo_Code per (Settlement, Residential) group, computed from the
# training rows that have a Geo_Code. If mode() came back empty, the group's
# first value is used instead.
mode_geo_train = (
    train_raw.dropna(subset=["Geo_Code"])
    .groupby(["Settlement", "Residential"])["Geo_Code"]
    .agg(lambda codes: codes.mode().iloc[0] if not codes.mode().empty else codes.iloc[0])
    .rename("Geo_Code_mode")
    .reset_index()
)

print("Train Geo_Code modes by Settlement+Residential:")
print(mode_geo_train)

# Attach each row's group mode, use it only where Geo_Code is missing, and
# discard the helper column again (pop removes it while returning its values).
with_group_mode = train_raw.merge(mode_geo_train, on=["Settlement", "Residential"], how="left")
group_mode = with_group_mode.pop("Geo_Code_mode")
with_group_mode["Geo_Code"] = with_group_mode["Geo_Code"].fillna(group_mode)
train_raw = with_group_mode

print(f"\nTrain missing values after Geo_Code fill: {train_raw['Geo_Code'].isna().sum()}")

### Step 6: Clean Geo_Code (drop rows with non-numeric codes)

In [None]:
# True where the string form of Geo_Code consists of numeric characters only.
mask_numeric = train_raw["Geo_Code"].astype(str).str.isnumeric()
n_numeric = mask_numeric.sum()
print(f"Train numeric Geo_Code: {n_numeric}")
print(f"Train alphanumeric Geo_Code: {len(mask_numeric) - n_numeric}")

# Keep only the numeric-coded rows and store the code as an integer.
train_raw = train_raw.loc[mask_numeric].astype({"Geo_Code": int})

print(f"Train shape after Geo_Code cleaning: {train_raw.shape}")

### Step 7: Handle Outliers in Building Dimension

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
before_ax, after_ax = axes

# Left panel: distribution as it is before any treatment.
sns.boxplot(x=train_raw['Building Dimension'], ax=before_ax, color='orange')
before_ax.set_title('Building Dimension - Before Outlier Treatment', fontweight='bold')

# Tukey fences: 1.5 * IQR beyond the first and third quartiles.
dimension = train_raw['Building Dimension']
Q1 = dimension.quantile(0.25)
Q3 = dimension.quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"Building Dimension - Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Bounds: [{lower}, {upper}]")

outliers_before = (dimension < lower) | (dimension > upper)
print(f"Outliers found: {outliers_before.sum()}")

# Values outside the fences are pulled back onto the nearest fence.
train_raw['Building Dimension'] = dimension.clip(lower=lower, upper=upper)

# Right panel: the same column after clipping.
sns.boxplot(x=train_raw['Building Dimension'], ax=after_ax, color='skyblue')
after_ax.set_title('Building Dimension - After Outlier Treatment', fontweight='bold')

plt.tight_layout()
plt.show()

print("Outliers clipped")

### Step 8: Scale Numerical Features

In [None]:
cols_to_scale = ['Building Dimension', 'NumberOfWindows']
scaler = RobustScaler()

# One row per scaled column: left = before scaling, right = after.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, col in zip(axes[:, 0], cols_to_scale):
    sns.histplot(train_raw[col], kde=True, ax=ax, color='coral', bins=30)
    ax.set_title(f"{col} - Before Scaling", fontweight='bold')

# The scaler is fitted on the training frame and reused for the test frame later.
scaled_values = scaler.fit_transform(train_raw[cols_to_scale])
train_raw[cols_to_scale] = scaled_values

for ax, col in zip(axes[:, 1], cols_to_scale):
    sns.histplot(train_raw[col], kde=True, ax=ax, color='steelblue', bins=30)
    ax.set_title(f"{col} - After RobustScaler", fontweight='bold')

plt.tight_layout()
plt.show()

print("RobustScaler fitted and applied to train data")

### Step 9: Encode Categorical Variables

In [None]:
train_transformed = train_raw.copy()

# Two-level columns become 0/1 integers via an explicit per-column mapping.
binary_encodings = {
    "Building_Painted": {'N': 1, 'V': 0},
    "Building_Fenced": {'N': 1, 'V': 0},
    "Garden": {'V': 1, 'O': 0},
}
for column, mapping in binary_encodings.items():
    train_transformed[column] = train_transformed[column].map(mapping).astype('int32')

# Multi-level columns become dummy columns; the first level of each is dropped.
train_transformed = pd.get_dummies(
    train_transformed,
    columns=["Settlement", "Building_Type"],
    drop_first=True,
    dtype='int32',
)

# The target encoder is fitted here and reused on the test target later.
le_claim = LabelEncoder()
train_transformed["Claim"] = le_claim.fit_transform(train_transformed["Claim"])

# Move the target to the last column.
cols = train_transformed.columns.drop("Claim").tolist() + ["Claim"]
train_transformed = train_transformed[cols]

print("Train data encoded")
print(f"Train shape: {train_transformed.shape}")
print(f"Train columns: {train_transformed.columns.tolist()}")

### Step 10: Correlation Analysis

In [None]:
# Pairwise correlations of the numeric columns; only the Claim column is kept,
# ordered from most positive to most negative.
df_corr = train_transformed.corr(numeric_only=True)
corr_with_claim = df_corr.loc[:, ["Claim"]].sort_values("Claim", ascending=False)

print("Correlation with Claim:")
print(corr_with_claim)

# Single-column heatmap; figure height grows with the number of rows.
plt.figure(figsize=(6, len(corr_with_claim) * 0.4))
sns.heatmap(corr_with_claim, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title('Feature Correlation with Target (Claim)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Hand-picked columns removed from the modelling frame (the same list is
# applied to the test frame later).
cols_to_drop = [
    'Building_Painted',
    'Geo_Code',
    'YearOfObservation',
    'Building_Type_Non-combustible',
]

train_transformed = train_transformed.drop(columns=cols_to_drop).reset_index(drop=True)

print(f"\nFinal train shape: {train_transformed.shape}")
print(f"Final train columns: {train_transformed.columns.tolist()}")

## 3. Apply Same Transformations to Test Data

In [None]:
# Replays the training preprocessing on the test frame using the objects
# fitted on train (imputers, Geo_Code modes, IQR bounds, scaler, label encoder).
print("Applying transformations to test data...")

# Imputation with the train-fitted imputers.
test_raw[["Garden"]] = mf_imputer.transform(test_raw[["Garden"]])
test_raw[["Building Dimension"]] = median_imputer.transform(test_raw[["Building Dimension"]])

# Missing Geo_Code filled from the train-derived group modes.
test_raw = test_raw.merge(mode_geo_train, on=["Settlement", "Residential"], how="left")
test_raw["Geo_Code"] = test_raw["Geo_Code"].fillna(test_raw["Geo_Code_mode"])
test_raw = test_raw.drop(columns=["Geo_Code_mode"])

# Same row filter as train: keep rows whose Geo_Code is purely numeric.
# NOTE(review): Geo_Code is in cols_to_drop, so this filter only shrinks the
# evaluation set - confirm that dropping test rows here is intended.
mask_numeric_test = test_raw["Geo_Code"].astype(str).str.isnumeric()
test_raw = test_raw[mask_numeric_test].copy()
test_raw["Geo_Code"] = test_raw["Geo_Code"].astype(int)

# Clip with the IQR bounds computed on train. The training column was clipped
# before the scaler was fitted, so leaving test unclipped fed the scaler and
# the models values outside the range they were fitted on.
test_raw['Building Dimension'] = test_raw['Building Dimension'].clip(lower, upper)

test_raw[cols_to_scale] = scaler.transform(test_raw[cols_to_scale])

test_transformed = test_raw.copy()
test_transformed["Building_Painted"] = test_transformed["Building_Painted"].map({'N': 1, 'V': 0}).astype('int32')
test_transformed["Building_Fenced"] = test_transformed["Building_Fenced"].map({'N': 1, 'V': 0}).astype('int32')
test_transformed["Garden"] = test_transformed["Garden"].map({'V': 1, 'O': 0}).astype('int32')

# Create ALL dummy levels here and let the reindex below pick the ones train
# kept. With drop_first=True the dropped level depends on which categories
# happen to be present in test, which can differ from train.
test_transformed = pd.get_dummies(test_transformed, columns=["Settlement", "Building_Type"], dtype='int32')
test_transformed["Claim"] = le_claim.transform(test_transformed["Claim"])

# Report any train column test cannot supply before it is zero-filled.
missing_in_test = [c for c in train_transformed.columns if c not in test_transformed.columns]
if missing_in_test:
    print(f"Warning: columns absent from test, filled with 0: {missing_in_test}")

# Align to the final train layout: same columns, same order (Claim last).
# This also removes cols_to_drop and the baseline dummy levels.
cols = list(train_transformed.columns)
test_transformed = test_transformed.reindex(columns=cols, fill_value=0)
test_transformed = test_transformed.reset_index(drop=True)

print(f"Final test shape: {test_transformed.shape}")
print(f"Test columns match train: {list(train_transformed.columns) == list(test_transformed.columns)}")

## 4. Split Data: Train/Validation from train_Insurance, Test from test_Insurance

In [None]:
# Features / target from the processed training frame.
X_train_full = train_transformed.drop(columns='Claim')
y_train_full = train_transformed['Claim']

# 80/20 stratified split into train and validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

# The held-out test set comes from the separately processed test frame.
X_test, y_test = test_transformed.drop(columns='Claim'), test_transformed['Claim']

banner = "=" * 60
print(banner)
print("DATA SPLIT SUMMARY")
print(banner)
for label, X_part in (("Train set", X_train), ("Validation set", X_val), ("Test set (final)", X_test)):
    print(f"{label}: {X_part.shape[0]} samples")
for label, y_part in (("Train", y_train), ("Validation", y_val), ("Test", y_test)):
    print(f"\n{label} target distribution:\n{y_part.value_counts()}")

# Class balance of each split, one panel per split.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
sets = [('Train', y_train), ('Validation', y_val), ('Test', y_test)]

# The notebook labels encoded class 0 as 'No Claim' and class 1 as 'Claim'.
class_labels = ['No Claim', 'Claim']

for ax, (name, y_data) in zip(axes, sets):
    # value_counts() orders by frequency, not by class. Reindexing by class
    # guarantees the first bar is class 0 and the second is class 1, so the
    # fixed labels above always describe the right bar (and a class that is
    # absent from a split shows up as 0 rather than breaking the plot).
    counts = y_data.value_counts().reindex([0, 1], fill_value=0)
    bars = ax.bar(class_labels, counts.values, color=['#66b3ff', '#ff9999'])
    ax.bar_label(bars, fontweight='bold')
    ax.set_title(f'{name} Set Distribution', fontweight='bold')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## 5. Define Models

In [None]:
# Candidate classifiers, keyed by display name. Every model that accepts
# class_weight uses 'balanced'; GradientBoostingClassifier is given none.
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=500, class_weight='balanced', random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, max_depth=10, class_weight='balanced', random_state=42
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, random_state=42
    ),
    'SVM': SVC(
        kernel='rbf', class_weight='balanced', probability=True, random_state=42
    ),
}

print("Models defined:")
for name in models:
    print(f"  - {name}")

## 6. Train and Evaluate WITHOUT SMOTE

In [None]:
banner = "=" * 60
print(banner)
print("TRAINING WITHOUT SMOTE (class_weight='balanced')")
print(banner)

# (display label, result key) pairs; the order drives both the per-model
# printout and the summary table columns.
metric_labels = [
    ('Accuracy', 'accuracy'),
    ('F1-Score', 'f1_score'),
    ('ROC-AUC', 'roc_auc'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
]

results_no_smote = {}

for name, model in models.items():
    print("\n" + banner)
    print(f"Training: {name}")
    print(banner)

    # Fit on the original (non-resampled) training split.
    model.fit(X_train, y_train)

    # Hard predictions and positive-class probabilities on the validation split.
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]

    scores = {
        'accuracy': accuracy_score(y_val, y_val_pred),
        'f1_score': f1_score(y_val, y_val_pred),
        'roc_auc': roc_auc_score(y_val, y_val_proba),
        'precision': precision_score(y_val, y_val_pred),
        'recall': recall_score(y_val, y_val_pred),
    }

    # Keep the fitted model together with its scores and raw predictions.
    results_no_smote[name] = {
        'model': model,
        **scores,
        'y_val_pred': y_val_pred,
        'y_val_proba': y_val_proba,
    }

    for label, key in metric_labels:
        print(f"Validation {label}: {scores[key]:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, y_val_pred))

print("\n" + banner)
print("WITHOUT SMOTE - SUMMARY")
print(banner)

# One row per model, one column per metric.
summary_table = {'Model': list(results_no_smote)}
for label, key in metric_labels:
    summary_table[label] = [entry[key] for entry in results_no_smote.values()]
summary_no_smote = pd.DataFrame(summary_table)
print(summary_no_smote.to_string(index=False))

## 7. Train and Evaluate WITH SMOTE

In [None]:
print("="*60)
print("TRAINING WITH SMOTE")
print("="*60)

# Oversample the training split only; validation and test stay untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"\nOriginal train set: {X_train.shape[0]} samples")
print(f"After SMOTE: {X_train_smote.shape[0]} samples")
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

# Class balance before and after resampling.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('Before SMOTE', y_train), ('After SMOTE', y_train_smote)]

for ax, (title, y_data) in zip(axes, panels):
    # value_counts() orders by frequency (arbitrary when the classes tie, as
    # they do after SMOTE). Reindexing by class pins bar 0 to class 0 and
    # bar 1 to class 1 so the fixed tick labels below are always correct.
    counts = pd.Series(y_data).value_counts().reindex([0, 1], fill_value=0)
    counts.plot(kind='bar', ax=ax, color=['#66b3ff', '#ff9999'])
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('Count')
    ax.set_xticklabels(['No Claim', 'Claim'], rotation=0)

plt.tight_layout()
plt.show()

banner = "=" * 60

# Fresh, unfitted estimators for the SMOTE run. They mirror the earlier
# candidates but omit class_weight.
smote_model_factories = {
    'Logistic Regression': lambda: LogisticRegression(max_iter=500, random_state=42),
    'Random Forest': lambda: RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Gradient Boosting': lambda: GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': lambda: SVC(probability=True, random_state=42, kernel='rbf'),
}

# (display label, result key) pairs; the order drives both the per-model
# printout and the summary table columns.
metric_labels = [
    ('Accuracy', 'accuracy'),
    ('F1-Score', 'f1_score'),
    ('ROC-AUC', 'roc_auc'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
]

results_smote = {}

for name in models:
    print("\n" + banner)
    print(f"Training: {name} (WITH SMOTE)")
    print(banner)

    # Fit a new estimator on the resampled training data.
    model = smote_model_factories[name]()
    model.fit(X_train_smote, y_train_smote)

    # Evaluate on the untouched validation split.
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]

    scores = {
        'accuracy': accuracy_score(y_val, y_val_pred),
        'f1_score': f1_score(y_val, y_val_pred),
        'roc_auc': roc_auc_score(y_val, y_val_proba),
        'precision': precision_score(y_val, y_val_pred),
        'recall': recall_score(y_val, y_val_pred),
    }

    # Keep the fitted model together with its scores and raw predictions.
    results_smote[name] = {
        'model': model,
        **scores,
        'y_val_pred': y_val_pred,
        'y_val_proba': y_val_proba,
    }

    for label, key in metric_labels:
        print(f"Validation {label}: {scores[key]:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, y_val_pred))

print("\n" + banner)
print("WITH SMOTE - SUMMARY")
print(banner)

# One row per model, one column per metric.
summary_table = {'Model': list(results_smote)}
for label, key in metric_labels:
    summary_table[label] = [entry[key] for entry in results_smote.values()]
summary_smote = pd.DataFrame(summary_table)
print(summary_smote.to_string(index=False))

## 8. Comparison: SMOTE vs No SMOTE

In [None]:
banner = "=" * 60
print(banner)
print("COMPREHENSIVE COMPARISON: SMOTE vs NO SMOTE")
print(banner)

# (column prefix, result key, axis label) for the three compared metrics.
metric_specs = [
    ('F1', 'f1_score', 'F1-Score'),
    ('ROC_AUC', 'roc_auc', 'ROC-AUC'),
    ('Accuracy', 'accuracy', 'Accuracy'),
]

# Side-by-side table: one row per model, a No_SMOTE / With_SMOTE column pair
# per metric.
comparison_table = {'Model': list(models)}
for prefix, key, _ in metric_specs:
    comparison_table[f'{prefix}_No_SMOTE'] = [results_no_smote[m][key] for m in models]
    comparison_table[f'{prefix}_With_SMOTE'] = [results_smote[m][key] for m in models]
comparison_df = pd.DataFrame(comparison_table)

# Improvement = score with SMOTE minus score without.
for prefix in ('F1', 'ROC_AUC'):
    comparison_df[f'{prefix}_Improvement'] = (
        comparison_df[f'{prefix}_With_SMOTE'] - comparison_df[f'{prefix}_No_SMOTE']
    )

print(comparison_df.to_string(index=False))

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

x = np.arange(len(models))
width = 0.35

# First three panels (row-major order): grouped bars, one pair per model.
for ax, (prefix, _, label) in zip(axes.flat, metric_specs):
    ax.bar(x - width/2, comparison_df[f'{prefix}_No_SMOTE'], width, label='No SMOTE', color='skyblue')
    ax.bar(x + width/2, comparison_df[f'{prefix}_With_SMOTE'], width, label='With SMOTE', color='coral')
    ax.set_xlabel('Model')
    ax.set_ylabel(label)
    ax.set_title(f'{label} Comparison', fontweight='bold', fontsize=12)
    ax.set_xticks(x)
    ax.set_xticklabels(comparison_df['Model'], rotation=15, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

# Fourth panel: F1 change per model, green for a gain and red otherwise.
delta_ax = axes[1, 1]
colors = ['green' if val > 0 else 'red' for val in comparison_df['F1_Improvement']]
delta_ax.bar(comparison_df['Model'], comparison_df['F1_Improvement'], color=colors)
delta_ax.set_xlabel('Model')
delta_ax.set_ylabel('F1-Score Improvement')
delta_ax.set_title('F1-Score Improvement with SMOTE', fontweight='bold', fontsize=12)
delta_ax.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
delta_ax.tick_params(axis='x', rotation=15)
delta_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Select Best Model and Configuration

In [None]:
banner = "=" * 60
print(banner)
print("BEST MODEL SELECTION")
print(banner)

# Highest validation F1 within each configuration; each result is a
# (model name, result dict) pair.
best_f1_no_smote = max(results_no_smote.items(), key=lambda item: item[1]['f1_score'])
best_f1_smote = max(results_smote.items(), key=lambda item: item[1]['f1_score'])

for tag, (candidate_name, candidate) in (('WITHOUT', best_f1_no_smote), ('WITH', best_f1_smote)):
    print(f"\nBest model {tag} SMOTE: {candidate_name}")
    print(f"  F1-Score: {candidate['f1_score']:.4f}")
    print(f"  ROC-AUC: {candidate['roc_auc']:.4f}")

# SMOTE wins only on a strictly higher F1; a tie goes to the non-SMOTE model.
smote_wins = best_f1_smote[1]['f1_score'] > best_f1_no_smote[1]['f1_score']
winner_name, winner = best_f1_smote if smote_wins else best_f1_no_smote

best_overall_name = winner_name
best_overall_model = winner['model']
best_config = 'WITH SMOTE' if smote_wins else 'WITHOUT SMOTE'
# Training data that belongs to the winning configuration.
X_train_best, y_train_best = (
    (X_train_smote, y_train_smote) if smote_wins else (X_train, y_train)
)

print("\n" + banner)
print(f"FINAL BEST MODEL: {best_overall_name} ({best_config})")
print(banner)

## 10. Final Evaluation on Test Set

In [None]:
banner = "=" * 60
print(banner)
print("FINAL TEST SET EVALUATION")
print(f"Model: {best_overall_name} ({best_config})")
print(banner)

# Predictions of the already-fitted winning model on the held-out test set.
y_test_pred = best_overall_model.predict(X_test)
y_test_proba = best_overall_model.predict_proba(X_test)[:, 1]

test_acc = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_proba)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)

print("\nTest Set Performance:")
# Labels are left-justified to 11 characters so the values line up.
for label, value in (
    ("Accuracy:", test_acc),
    ("F1-Score:", test_f1),
    ("ROC-AUC:", test_roc_auc),
    ("Precision:", test_precision),
    ("Recall:", test_recall),
):
    print(f"  {label:<11}{value:.4f}")

class_names = ['No Claim', 'Claim']

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=class_names))

cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(cm)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
cm_ax, roc_ax = axes

# Left panel: confusion matrix heatmap.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
            xticklabels=class_names, yticklabels=class_names)
cm_ax.set_title('Confusion Matrix - Test Set', fontweight='bold', fontsize=12)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')

# Right panel: ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
roc_auc_calc = auc(fpr, tpr)

roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_calc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title(f'ROC Curve - {best_overall_name}', fontweight='bold', fontsize=12)
roc_ax.legend(loc="lower right")
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Predictions Summary

In [None]:
# Row-level view of the test predictions: true label, predicted label,
# positive-class probability, and whether the prediction was right.
actual_labels = y_test.values
predictions_df = pd.DataFrame({
    'Actual': actual_labels,
    'Predicted': y_test_pred,
    'Probability_Claim': y_test_proba,
    'Correct': actual_labels == y_test_pred,
})

is_correct = predictions_df['Correct']
n_correct = is_correct.sum()

print("\n" + "=" * 60)
print("PREDICTION SUMMARY")
print("=" * 60)
print(f"Total predictions: {len(predictions_df)}")
print(f"Correct predictions: {n_correct}")
print(f"Incorrect predictions: {len(predictions_df) - n_correct}")
print(f"Accuracy: {is_correct.mean():.4f}")

print("\nFirst 20 predictions:")
print(predictions_df.head(20).to_string(index=False))

## 12. Final Comparison Summary

In [None]:
wide_banner = "=" * 80
print("\n" + wide_banner)
print("FINAL SUMMARY - ALL MODELS AND CONFIGURATIONS")
print(wide_banner)

# (table column, result key) pairs in output column order.
summary_metrics = [
    ('F1-Score', 'f1_score'),
    ('ROC-AUC', 'roc_auc'),
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
]

# One row per (configuration, model): every non-SMOTE row first, then every
# SMOTE row, each block in the order of the models dict.
summary_rows = []
for configuration, results in (('No SMOTE', results_no_smote), ('With SMOTE', results_smote)):
    for name in models:
        row = {'Model': name, 'Configuration': configuration}
        for column, key in summary_metrics:
            row[column] = results[name][key]
        summary_rows.append(row)

# Rank all eight runs by validation F1, best first.
final_summary = pd.DataFrame(summary_rows).sort_values('F1-Score', ascending=False)
print(final_summary.to_string(index=False))

print("\n" + wide_banner)
print(f"WINNING MODEL: {best_overall_name} ({best_config})")
print(f"Final Test F1-Score: {test_f1:.4f}")
print(f"Final Test ROC-AUC: {test_roc_auc:.4f}")
print(wide_banner)