# Model Developement

Thabang Ndhlovu, 2024

In [1]:
import numpy as np
import pandas as pd
from pycaret.classification import *

import warnings
warnings.filterwarnings("ignore") 

## Data Propressing

In [2]:
df = pd.read_excel("datasets/insurance_claims_clean_and_features.xlsx")

In [3]:
y = (df["fraud_reported"] == "Y").astype(int)
X = df.drop(["fraud_reported", "policy_bind_date", "incident_date"], axis=1)
X = pd.get_dummies(X)
X = X.astype(float)


## Model Development with Pycaret

In [4]:
_ = setup(data=X, target=y)

Unnamed: 0,Description,Value
0,Session id,8010
1,Target,fraud_reported
2,Target type,Binary
3,Original data shape,"(980, 1156)"
4,Transformed data shape,"(980, 1156)"
5,Transformed train set shape,"(686, 1156)"
6,Transformed test set shape,"(294, 1156)"
7,Numeric features,1155
8,Preprocess,True
9,Imputation type,simple


In [5]:
model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8541,0.8731,0.7574,0.6981,0.7215,0.6239,0.6289,0.822
lightgbm,Light Gradient Boosting Machine,0.8468,0.8698,0.6621,0.7112,0.6787,0.5794,0.585,0.832
dt,Decision Tree Classifier,0.8454,0.7796,0.6504,0.7239,0.6754,0.5756,0.5844,0.269
xgboost,Extreme Gradient Boosting,0.8352,0.8585,0.639,0.6994,0.6593,0.5521,0.5592,0.92
catboost,CatBoost Classifier,0.8309,0.8577,0.6985,0.656,0.6728,0.5599,0.5632,10.934
ada,Ada Boost Classifier,0.8045,0.8113,0.4857,0.6633,0.5476,0.4289,0.4449,0.451
et,Extra Trees Classifier,0.7696,0.8475,0.2129,0.6592,0.3006,0.2051,0.2604,0.455
lda,Linear Discriminant Analysis,0.7609,0.8332,0.0654,0.5333,0.1138,0.0755,0.1328,0.562
ridge,Ridge Classifier,0.7536,0.0,0.0353,0.2333,0.0606,0.033,0.0524,0.231
dummy,Dummy Classifier,0.7536,0.5,0.0,0.0,0.0,0.0,0.0,0.44


In [6]:
tuned_model = tune_model(model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8261,0.8767,0.5294,0.6923,0.6,0.4914,0.4986
1,0.7681,0.7579,0.5294,0.5294,0.5294,0.3756,0.3756
2,0.7101,0.7115,0.3529,0.4,0.375,0.1873,0.1879
3,0.7971,0.7907,0.5294,0.6,0.5625,0.4311,0.4325
4,0.7826,0.8518,0.6471,0.55,0.5946,0.4474,0.4502
5,0.7971,0.7975,0.3529,0.6667,0.4615,0.3508,0.3778
6,0.8235,0.857,0.5625,0.6429,0.6,0.4874,0.4892
7,0.7353,0.7716,0.3529,0.4615,0.4,0.234,0.2375
8,0.8235,0.8512,0.5294,0.6923,0.6,0.4894,0.4966
9,0.8088,0.8178,0.3529,0.75,0.48,0.381,0.4216


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [7]:
tuned_model

## Feature Importance

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Define the number of top features to select
top_features = [5, 10, 15, 20, 25]

for k in top_features:
    # Select top k features based on f_classif score
    selector = SelectKBest(f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    
    # Get the selected feature names
    selected_feature_names = X.columns[selector.get_support()]
    print(f"\nTop {k} Selected Features:")
    print(selected_feature_names.to_list())
    
    # Train the model with selected features
    tuned_model.fit(X_train_selected, y_train)
    
    # Make predictions using the model with selected features
    y_pred = tuned_model.predict(X_test_selected)
    
    # Calculate the accuracy of the model with selected features
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with Top {k} Features: {accuracy:.2%}")


Top 5 Selected Features:
['insured_hobbies_chess', 'insured_hobbies_cross-fit', 'incident_severity_Major Damage', 'incident_severity_Minor Damage', 'incident_severity_Total Loss']
Accuracy with Top 5 Features: 84.18%

Top 10 Selected Features:
['total_claim_amount', 'property_claim', 'vehicle_claim', 'total_claims', 'insured_hobbies_chess', 'insured_hobbies_cross-fit', 'incident_severity_Major Damage', 'incident_severity_Minor Damage', 'incident_severity_Total Loss', 'authorities_contacted_Police']
Accuracy with Top 10 Features: 83.16%

Top 15 Selected Features:
['total_claim_amount', 'property_claim', 'vehicle_claim', 'total_claims', 'insured_hobbies_chess', 'insured_hobbies_cross-fit', 'incident_type_Parked Car', 'incident_type_Vehicle Theft', 'incident_severity_Major Damage', 'incident_severity_Minor Damage', 'incident_severity_Total Loss', 'incident_severity_Trivial Damage', 'authorities_contacted_Other', 'authorities_contacted_Police', 'auto_make_Mercedes']
Accuracy with Top 15 F

In [19]:
k = 5

selector = SelectKBest(f_classif, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]
print(f"\nTop {k} Selected Features:")
print(selected_feature_names.to_list())

# Train the model with selected features
tuned_model.fit(X_train_selected, y_train)

# Make predictions using the model with selected features
y_pred = tuned_model.predict(X_test_selected)

# Calculate the accuracy of the model with selected features
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with Top {k} Features: {accuracy:.2%}")

# Perform permutation importance to analyse feature contribution to fraud
perm_importance = permutation_importance(tuned_model, X_test_selected, y_test, n_repeats=10, random_state=42)

# Get the mean feature importance scores
mean_importance_scores = perm_importance.importances_mean

# Print the feature importance scores
print("\nFeature Contribution to Fraud:")
for feature, score in zip(selected_feature_names, mean_importance_scores):
    print(f"{feature}: {score:.4f}")


Top 5 Selected Features:
['insured_hobbies_chess', 'insured_hobbies_cross-fit', 'incident_severity_Major Damage', 'incident_severity_Minor Damage', 'incident_severity_Total Loss']
Accuracy with Top 5 Features: 84.18%

Feature Contribution to Fraud:
insured_hobbies_chess: 0.0658
insured_hobbies_cross-fit: 0.0209
incident_severity_Major Damage: 0.2281
incident_severity_Minor Damage: -0.0020
incident_severity_Total Loss: -0.0010


The analysis of the top 5 features and their contribution to predicting fraudulent insurance claims provides valuable insights, highlighting the importance of considering the insured person's hobbies and the severity of the incident, particularly major damage, in detecting and preventing fraud, which can help optimise risk assessment and claims processing strategies.

## Save Model

In [63]:
model = tuned_model.fit(X[selected_feature_names], y)

In [64]:
save_model(model, "fraud_claims_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['months_as_customer', 'age',
                                              'policy_number',
                                              'policy_deductable',
                                              'policy_annual_premium',
                                              'umbrella_limit', 'insured_zip',
                                              'capital-gains', 'capital-loss',
                                              'incident_hour_of_the_day',
                                              'number_of_vehicles_involved',
                                              'bodily_injuries', 'witnesses',
                                              'total_...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss

In [65]:
predict_proba = model.predict_proba(X[selected_feature_names])

In [67]:
df['predict_proba'] = predict_proba[:, 1]

In [69]:
df.to_excel("datasets/insurance_claims_clean_and_features_final.xlsx", index=False)