In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import joblib
from imblearn.over_sampling import SMOTE


In [None]:

# Location of the raw CSV for the automobile insurance fraud dataset.
url = "https://github.com/dsrscientist/Data-Science-ML-Capstone-Projects/blob/master/Automobile_insurance_fraud.csv?raw=true"
# Read the remote CSV into the working DataFrame that the later cells use.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:

# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)


In [None]:

# Summary statistics using pandas' default column selection.
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:

# Number of missing values in each column (isna is an alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:

# One histogram per column that DataFrame.hist can plot, on a 12x10 inch figure.
hist_figsize = (12, 10)
df.hist(figsize=hist_figsize)
plt.show()

In [None]:

# Pairwise correlations are only defined for numeric data. Selecting the
# numeric/bool columns explicitly keeps this cell working on pandas >= 2.0,
# where DataFrame.corr() no longer silently skips non-numeric columns and
# raises when it cannot convert them to float.
numeric_corr = df.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.show()

In [None]:

# Basic cleaning, applied in place on the shared working frame `df`.

# A column with no observed values has no mean, so mean imputation below would
# leave it entirely NaN; drop such columns instead of passing NaNs downstream.
df.dropna(axis=1, how="all", inplace=True)

# Remove exact duplicate rows.
df.drop_duplicates(inplace=True)

# Impute missing numeric values with the column mean. numeric_only=True keeps
# DataFrame.mean() from raising TypeError on non-numeric columns (pandas >= 2.0
# no longer drops them silently); non-numeric columns are left untouched.
df.fillna(df.mean(numeric_only=True), inplace=True)


In [None]:

# Integer-encode every object-dtype column in place (this includes the target
# column if it is object-typed, since it is still part of `df` here).
categorical_features = df.select_dtypes(include=['object']).columns

# Keep each fitted encoder, keyed by column name, so the category <-> integer
# mapping can be inspected (encoder.classes_), inverted, or re-applied to new
# data; creating throwaway encoders would lose that mapping.
label_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder

In [None]:

# Separate the target column from the remaining feature columns.
target_column = 'fraud_reported'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of the target the same in both splits, so the minority class is not under- or
# over-represented in the test set by chance. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:

# Oversample the training split with SMOTE; the test split is left untouched.
# X_train / y_train are overwritten with the resampled data, so re-running this
# cell operates on the already-resampled split rather than the original one.
# NOTE(review): this runs before the StandardScaler cell, so SMOTE sees
# unscaled features - confirm that ordering is intended.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X=X_train, y=y_train)

In [None]:

# Standardise features: learn the scaling statistics from the training split
# only, then apply that same fitted transform to both splits. Both names are
# overwritten with the scaled arrays.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# Logistic Regression
# Fit on the training split, then evaluate on the held-out test split.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_log = logistic_model.predict(X_test)
# ROC AUC measures ranking quality, so it needs a continuous score: use the
# predicted probability of the positive class (second column, classes_[1]).
# Hard 0/1 predictions would collapse the ROC curve to a single threshold.
y_proba_log = logistic_model.predict_proba(X_test)[:, 1]

print("Logistic Regression - Classification Report:\n", classification_report(y_test, y_pred_log))
print("Logistic Regression - ROC AUC Score:", roc_auc_score(y_test, y_proba_log))

In [None]:

# Decision Tree Classifier
# random_state pins the estimator's internal randomness so results are
# reproducible from run to run.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
# ROC AUC measures ranking quality, so it needs a continuous score: use the
# predicted probability of the positive class (second column, classes_[1])
# rather than the hard 0/1 predictions.
y_proba_tree = tree_model.predict_proba(X_test)[:, 1]

print("Decision Tree - Classification Report:\n", classification_report(y_test, y_pred_tree))
print("Decision Tree - ROC AUC Score:", roc_auc_score(y_test, y_proba_tree))

In [None]:

# Random Forest Classifier
# random_state pins the bootstrap sampling and feature selection so results are
# reproducible from run to run.
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)
y_pred_forest = forest_model.predict(X_test)
# ROC AUC measures ranking quality, so it needs a continuous score: use the
# predicted probability of the positive class (second column, classes_[1])
# rather than the hard 0/1 predictions.
y_proba_forest = forest_model.predict_proba(X_test)[:, 1]

print("Random Forest - Classification Report:\n", classification_report(y_test, y_pred_forest))
print("Random Forest - ROC AUC Score:", roc_auc_score(y_test, y_proba_forest))

In [None]:

# Gradient Boosting Classifier
# random_state pins the estimator's internal randomness so results are
# reproducible from run to run.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
# ROC AUC measures ranking quality, so it needs a continuous score: use the
# predicted probability of the positive class (second column, classes_[1])
# rather than the hard 0/1 predictions.
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]

print("Gradient Boosting - Classification Report:\n", classification_report(y_test, y_pred_gb))
print("Gradient Boosting - ROC AUC Score:", roc_auc_score(y_test, y_proba_gb))

In [None]:

# XGBoost Classifier
# Fit on the training split, then evaluate on the held-out test split.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
# ROC AUC measures ranking quality, so it needs a continuous score: use the
# predicted probability of the positive class (second column, classes_[1])
# rather than the hard 0/1 predictions.
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

print("XGBoost - Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("XGBoost - ROC AUC Score:", roc_auc_score(y_test, y_proba_xgb))


In [None]:

# Exhaustive search over 3 x 3 x 3 = 27 XGBoost settings, each scored by
# 5-fold cross-validated ROC AUC on the training data.
# NOTE(review): if X_train is the SMOTE-resampled split from the earlier cell,
# synthetic samples end up in the validation folds too and the CV scores may be
# optimistic - confirm, or resample inside each fold.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

print("Best parameters for XGBoost Classifier:", grid.best_params_)


In [None]:

# Persist the best estimator found by the grid search to disk.
# NOTE(review): only the model is saved here; the fitted `scaler` is not -
# confirm whether inference code needs it persisted as well.
model_path = 'best_insurance_fraud_model_improved.pkl'
joblib.dump(value=grid.best_estimator_, filename=model_path)
