In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, auc, average_precision_score, classification_report
from sklearn.metrics import det_curve, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, DetCurveDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import model_dev.preprocessing as proc

# Show every column when wide DataFrames are displayed.
# Uses the public pd.set_option entry point instead of the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

In [None]:
# Read the raw claims table; the path is relative to the notebook's working directory.
claims_path = 'carclaims.csv'
df = pd.read_csv(claims_path)
print(df.shape)
df.head()

In [None]:
# Separate the predictors from the target column.
# PolicyNumber is removed together with the label (presumably a row identifier - confirm).
target_col = 'FraudFound'
X = df.drop(columns=['PolicyNumber', target_col])
y = df[target_col]
print(X.shape, y.shape)
X.head()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train.shape, X_test.shape

In [None]:
# Replace the row labels inherited from the original frame with a fresh 0..n-1 index
# on every split (rebinding the names rather than mutating in place).
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Learn the label -> integer mapping from the training target only,
# then apply that same mapping to both splits.
label_encode = LabelEncoder().fit(y_train)
y_train_enc = label_encode.transform(y_train)
y_test_enc = label_encode.transform(y_test)

#### Variables

In [None]:
# --- Column groups used by the preprocessing steps ---------------------------

# Calendar-like columns (months and day of week).
TEMP_VAR = ['Month', 'MonthClaimed', 'DayOfWeek']

NUM_VAR = ['DriverRating']

# Nominal columns that get one-hot encoded.
ONE_HOT_CAT_VAR = [
    'Make', 'PolicyType', 'MaritalStatus', 'VehicleCategory',
    'BasePolicy', 'AgentType', 'WitnessPresent',
    'PoliceReportFiled', 'Fault', 'Sex', 'AccidentArea',
]

# Ordered categorical columns, each mapped to integers with its own table below.
NUM_CAR_VAR = ['NumberOfCars']
DAYS_CLAIM_VAR = ['Days:Policy-Claim']
ORDINAL_CAT_VAR = NUM_CAR_VAR + DAYS_CLAIM_VAR

# --- Value -> integer lookup tables ------------------------------------------

# 'Jan' -> 1 ... 'Dec' -> 12
MONTH_MAP = {
    month: number
    for number, month in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# Monday is 0 and Sunday is 6.
DAY_MAP = {
    'Sunday': 6,
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
}

NUM_CAR_MAP = {
    '1 vehicle': 1,
    '2 vehicles': 2,
    '3 to 4': 3,
    '5 to 8': 4,
    'more than 8': 5,
}

DAYS_CLAIM_MAP = {
    'none': 1,
    '8 to 15': 2,
    '15 to 30': 3,
    'more than 30': 4,
}

In [None]:
# Columns fed to the model. The order is significant: it is the column order
# of the frames handed to the pipeline.
FEATURES = [
    'Make', 'PolicyType', 'BasePolicy',
    'MaritalStatus', 'Sex',
    'VehicleCategory', 'AgentType',
    'WitnessPresent', 'PoliceReportFiled', 'AccidentArea',
    'NumberOfCars', 'Fault',
    'Month', 'MonthClaimed', 'DayOfWeek',
    'Days:Policy-Claim',
    'DriverRating',
]

In [None]:
# Restrict both splits to the modelling columns, in FEATURES order.
X_train, X_test = X_train[FEATURES], X_test[FEATURES]

X_train.shape, X_test.shape

#### Pipeline

In [None]:
# Column transformer for the Random Forest: one-hot encode the nominal columns
# and pass every other column through unchanged (no scaling is applied here).
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist')
cat_transform = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, ONE_HOT_CAT_VAR)],
    remainder='passthrough',
)
# Return DataFrames rather than NumPy arrays from transform().
cat_transform = cat_transform.set_output(transform='pandas')

In [None]:
# clean -> map -> cos/sin -> drop -> onehot -> sampling -> classify
#
# The pipeline must be imblearn's Pipeline (imported as ImbPipeline):
# scikit-learn's Pipeline only accepts transformers (fit + transform) as
# intermediate steps and raises TypeError for RandomOverSampler, which exposes
# fit_resample instead. The imblearn pipeline applies the sampler while fitting
# only, so predict/predict_proba run on un-resampled rows.
pipeline = ImbPipeline([
    ('clean', proc.CleanTransform(variable=TEMP_VAR)),
    # Months: first two temporal columns; day of week: the last one.
    ('map_month', proc.MapTransform(variable=TEMP_VAR[:2], mappings=MONTH_MAP)),
    ('map_day', proc.MapTransform(variable=[TEMP_VAR[-1]], mappings=DAY_MAP)),
    ('map_num', proc.MapTransform(variable=NUM_CAR_VAR, mappings=NUM_CAR_MAP)),
    ('map_claim', proc.MapTransform(variable=DAYS_CLAIM_VAR, mappings=DAYS_CLAIM_MAP)),
    ('cos_sin', proc.CoSineTransform(variable=TEMP_VAR)),
    ('drop', proc.DropTransform(variable=TEMP_VAR)),
    ('one_hot', cat_transform),
    ('sampler', RandomOverSampler(random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=300, criterion='gini', random_state=42))
])

In [None]:
# Fit every pipeline step on the training split with the integer-encoded target.
pipeline.fit(X_train, y=y_train_enc)

#### Evaluation

In [None]:
# Hard class predictions from the fitted pipeline, training split first, then test.
pred_train, pred_test = (pipeline.predict(split) for split in (X_train, X_test))

Accuracy

In [None]:
# Fraction of training rows whose predicted label equals the encoded target.
acc_train = accuracy_score(y_true=y_train_enc, y_pred=pred_train)
print(f'train accuracy: {acc_train}')

In [None]:
# Fraction of held-out rows whose predicted label equals the encoded target.
acc_test = accuracy_score(y_true=y_test_enc, y_pred=pred_test)
print(f'test accuracy: {acc_test}')

Classification report

In [None]:
# classification_report returns a plain string; print it so the table renders
# with real line breaks instead of being shown as an escaped repr.
print(classification_report(y_train_enc, pred_train, target_names=['Not Fraud', 'Fraud']))

In [None]:
# classification_report returns a plain string; print it so the table renders
# with real line breaks instead of being shown as an escaped repr.
print(classification_report(y_test_enc, pred_test, target_names=['Not Fraud', 'Fraud']))

ROC Curve and AUC

In [None]:
# Score the training split once and reuse the result: column 1 of predict_proba
# is the probability of class 1, matching pos_label=1 below. Previously the
# forest was re-evaluated for each metric.
scores_train = pipeline.predict_proba(X_train)[:, 1]

fpr_train, tpr_train, thresholds_train = roc_curve(y_train_enc, scores_train, pos_label=1)
roc_auc_train = roc_auc_score(y_train_enc, scores_train)

print(f'false positive: {fpr_train}\ntrue positive: {tpr_train}\nthreshold: {thresholds_train}')
# Area computed two ways: from the curve points, and directly from the scores.
print(f'auc: {auc(fpr_train, tpr_train)}')
print(f'roc_auc: {roc_auc_train}')

# .plot() draws the curve now and returns the display, which is reused in the summary grid.
roc_tr_plot = RocCurveDisplay(fpr=fpr_train, tpr=tpr_train, roc_auc=roc_auc_train, pos_label=1).plot()

In [None]:
# Score the test split once and reuse the result: column 1 of predict_proba
# is the probability of class 1, matching pos_label=1 below. Previously the
# forest was re-evaluated for each metric.
scores_test = pipeline.predict_proba(X_test)[:, 1]

fpr_test, tpr_test, thresholds_test = roc_curve(y_test_enc, scores_test, pos_label=1)
roc_auc_test = roc_auc_score(y_test_enc, scores_test)

print(f'false positive: {fpr_test}\ntrue positive: {tpr_test}\nthreshold: {thresholds_test}')
# Area computed two ways: from the curve points, and directly from the scores.
print(f'auc: {auc(fpr_test, tpr_test)}')
print(f'roc_auc: {roc_auc_test}')

# .plot() draws the curve now and returns the display, which is reused in the summary grid.
roc_ts_plot = RocCurveDisplay(fpr=fpr_test, tpr=tpr_test, roc_auc=roc_auc_test, pos_label=1).plot()

Precision-Recall curve

In [None]:
# Score the training split once; both the curve and the average precision
# use the same class-1 probabilities (previously computed twice).
scores_train = pipeline.predict_proba(X_train)[:, 1]

precision_train, recall_train, _ = precision_recall_curve(y_train_enc, scores_train, pos_label=1)
avg_train = average_precision_score(y_train_enc, scores_train, pos_label=1)

# .plot() draws the curve now and returns the display, which is reused in the summary grid.
prc_tr_plot = PrecisionRecallDisplay(
    precision=precision_train,
    recall=recall_train,
    average_precision=avg_train,
    estimator_name='RandomForestClassifier',
    pos_label=1,
).plot()

In [None]:
# Score the test split once; both the curve and the average precision
# use the same class-1 probabilities (previously computed twice).
scores_test = pipeline.predict_proba(X_test)[:, 1]

precision_test, recall_test, _ = precision_recall_curve(y_test_enc, scores_test, pos_label=1)
avg_test = average_precision_score(y_test_enc, scores_test, pos_label=1)

# .plot() draws the curve now and returns the display, which is reused in the summary grid.
prc_ts_plot = PrecisionRecallDisplay(
    precision=precision_test,
    recall=recall_test,
    average_precision=avg_test,
    estimator_name='RandomForestClassifier',
    pos_label=1,
).plot()

Detection Error Tradeoff (DET) curve

In [None]:
# Detection-error-tradeoff points (false-positive rate vs false-negative rate)
# for each split, computed from the class-1 probabilities.
fpr_dt_train, fnr_dt_train, treshold_dt_train = det_curve(
    y_train_enc, pipeline.predict_proba(X_train)[:, 1], pos_label=1
)
fpr_dt_test, fnr_dt_test, treshold_dt_test = det_curve(
    y_test_enc, pipeline.predict_proba(X_test)[:, 1], pos_label=1
)

# The displays are only constructed here; nothing is drawn until .plot() is called on them.
det_display_options = dict(estimator_name='RandomForestClassifier', pos_label=1)
dt_tr_plot = DetCurveDisplay(fpr=fpr_dt_train, fnr=fnr_dt_train, **det_display_options)
dt_ts_plot = DetCurveDisplay(fpr=fpr_dt_test, fnr=fnr_dt_test, **det_display_options)

In [None]:
# Summary grid: one row per split (train on top, test below), one column per
# curve type (ROC, precision-recall, DET).
# plt.subplots(2, 3) returns a 2x3 array of Axes, so it is unpacked row by row;
# the former 3x3 grid could not be unpacked into six names and raised ValueError.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(12, 10))

panels = [
    (ax1, roc_tr_plot, 'ROC (train)'),
    (ax2, prc_tr_plot, 'Precision-Recall (train)'),
    (ax3, dt_tr_plot, 'DET (train)'),
    (ax4, roc_ts_plot, 'ROC (test)'),
    (ax5, prc_ts_plot, 'Precision-Recall (test)'),
    (ax6, dt_ts_plot, 'DET (test)'),
]
for panel_ax, curve, panel_title in panels:
    curve.plot(ax=panel_ax)
    # Titles make each panel identifiable, since train and test curves look alike.
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()

Confusion matrix

In [None]:
# Counts of actual vs predicted classes on the training split, drawn as a heat map.
cm_train = confusion_matrix(y_true=y_train_enc, y_pred=pred_train)
ConfusionMatrixDisplay(confusion_matrix=cm_train).plot()

In [None]:
# Counts of actual vs predicted classes on the held-out split, drawn as a heat map.
cm_test = confusion_matrix(y_true=y_test_enc, y_pred=pred_test)
ConfusionMatrixDisplay(confusion_matrix=cm_test).plot()

Save the fitted pipeline

In [None]:
# Serialize the fitted pipeline (preprocessing + classifier) next to the notebook.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(pipeline, filename='rf_pipeline.joblib')