In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import preprocessing as proc

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Show every column when a DataFrame is rendered (None removes the column cap).
# Uses the public `pd.set_option` entry point rather than reaching it through the
# incidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [None]:
# Read the car-claims CSV from the working directory, report its
# (rows, columns) size, and preview the first rows.
df = pd.read_csv(filepath_or_buffer='carclaims.csv')
print(df.shape)
df.head(n=5)

In [None]:
# Separate the target column from the candidate predictors.
# 'PolicyNumber' is removed along with the target (presumably a row
# identifier with no predictive meaning — confirm against the data).
y = df['FraudFound']
X = df.drop(columns=['PolicyNumber', 'FraudFound'])
print(X.shape, y.shape)
X.head()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
(X_train.shape, X_test.shape)

In [None]:
# Give all four splits a fresh 0..n-1 index; the shuffled original row
# labels are discarded (drop=True) rather than kept as a column.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Encode the target labels as integers. The encoder learns its classes from
# the training labels only; the same fitted mapping is then applied to both
# the training and the test labels.
label_encode = LabelEncoder().fit(y_train)
y_train, y_test = label_encode.transform(y_train), label_encode.transform(y_test)

In [None]:
# --- Column groups ---------------------------------------------------------
# Calendar-like columns: the first two hold month names, the last two weekday names.
TEMP_VAR = ['Month', 'MonthClaimed', 'DayOfWeek', 'DayOfWeekClaimed']

NUMERICAL = ['Year', 'Deductible']

ONE_HOT_CATEGORICAL = [
    'Make', 'PolicyType', 'MaritalStatus', 'BasePolicy', 'Fault', 'Sex', 'AccidentArea',
]

# One single-column list per ordinal feature, so each can be paired with its own map.
AGE_OF_VEH_VAR = ['AgeOfVehicle']
AGE_OF_POL_VAR = ['AgeOfPolicyHolder']
VEH_PRICE_VAR = ['VehiclePrice']
ADD_CHANGE_VAR = ['AddressChange-Claim']
NUM_SUPP_VAR = ['NumberOfSuppliments']
PAST_CLAIM_VAR = ['PastNumberOfClaims']

# All ordinal features together, in the same order as the single-column lists above.
ORDINAL_CATEGORICAL = (
    AGE_OF_VEH_VAR + AGE_OF_POL_VAR + VEH_PRICE_VAR
    + ADD_CHANGE_VAR + NUM_SUPP_VAR + PAST_CLAIM_VAR
)

# --- Category -> integer lookups ---------------------------------------------
# Month abbreviations numbered 1..12.
MONTH_MAP = {
    month: number
    for number, month in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}
# Weekdays numbered Monday=0 .. Sunday=6 (Sunday is listed first, as before).
DAY_MAP = {
    'Sunday': 6,
    **{
        day: number
        for number, day in enumerate(
            ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
        )
    },
}

# Vehicle age is ranked in reverse: 'new' gets the highest code (8), 'more than 7' the lowest (1).
AGE_OF_VEH_MAP = {
    label: 8 - offset
    for offset, label in enumerate(
        ['new', '2 years', '3 years', '4 years',
         '5 years', '6 years', '7 years', 'more than 7']
    )
}
# The remaining ordinal features are ranked upwards from 1 in the listed order.
AGE_OF_POL_MAP = {
    label: rank
    for rank, label in enumerate(
        ['16 to 17', '18 to 20', '21 to 25', '26 to 30', '31 to 35',
         '36 to 40', '41 to 50', '51 to 65', 'over 65'],
        start=1,
    )
}
VEH_PRICE_MAP = {
    label: rank
    for rank, label in enumerate(
        ['less than 20,000', '20,000 to 29,000', '30,000 to 39,000',
         '40,000 to 59,000', '60,000 to 69,000', 'more than 69,000'],
        start=1,
    )
}

ADD_CHANGE_MAP = {
    label: rank
    for rank, label in enumerate(
        ['no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years'],
        start=1,
    )
}
NUM_SUPP_MAP = {
    label: rank
    for rank, label in enumerate(['none', '1 to 2', '3 to 5', 'more than 5'], start=1)
}
PAST_CLAIM_MAP = {
    label: rank
    for rank, label in enumerate(['none', '1', '2 to 4', 'more than 4'], start=1)
}

In [None]:
# Columns fed to the model, in the order they are selected from the splits.
FEATURES = [
    # policy
    'BasePolicy', 'PolicyType',
    # vehicle and accident
    'Make', 'AccidentArea', 'Fault', 'AgeOfVehicle', 'VehiclePrice',
    # calendar
    'Year', 'Month', 'MonthClaimed', 'DayOfWeek', 'DayOfWeekClaimed',
    # policy holder
    'Sex', 'MaritalStatus', 'AgeOfPolicyHolder',
    # claim history and terms
    'Deductible', 'AddressChange-Claim', 'NumberOfSuppliments', 'PastNumberOfClaims',
]

In [None]:
# Keep only the modelling columns, in the order given by FEATURES.
# `.copy()` makes each result an independent frame: columns of X_train / X_test
# are assigned to in a later cell, and doing that on a bare column-subset
# selection raises pandas' SettingWithCopyWarning.
X_train = X_train[FEATURES].copy()
X_test = X_test[FEATURES].copy()

In [None]:
# (step name, columns, category -> integer lookup) for every column group that
# is encoded with proc.MapTransform. TEMP_VAR[:2] are the month columns and
# TEMP_VAR[2:] the weekday columns.
mapped_groups = [
    ('map_month', TEMP_VAR[:2], MONTH_MAP),
    ('map_day', TEMP_VAR[2:], DAY_MAP),
    ('age_veh', AGE_OF_VEH_VAR, AGE_OF_VEH_MAP),
    ('age_pol', AGE_OF_POL_VAR, AGE_OF_POL_MAP),
    ('veh_price', VEH_PRICE_VAR, VEH_PRICE_MAP),
    ('add_change', ADD_CHANGE_VAR, ADD_CHANGE_MAP),
    ('num_supp', NUM_SUPP_VAR, NUM_SUPP_MAP),
    ('past_claim', PAST_CLAIM_VAR, PAST_CLAIM_MAP),
]

# Mapped groups first, then one-hot encoding of the nominal columns; any column
# not named here is passed through unchanged.
col_transform = ColumnTransformer(
    transformers=[
        *[
            (step_name, proc.MapTransform(variable=columns, mappings=lookup), columns)
            for step_name, columns, lookup in mapped_groups
        ],
        (
            'hot_cat',
            OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'),
            ONE_HOT_CATEGORICAL,
        ),
    ],
    remainder='passthrough',
)
# Emit DataFrames (with column names) instead of bare arrays.
col_transform = col_transform.set_output(transform='pandas')

# The calendar columns can contain the literal string '0' (presumably a
# missing-value placeholder — confirm against the data). Substitute a valid
# label so the month/day lookups can encode it: 'Jan' for the month columns,
# 'Monday' for the weekday columns.
placeholder_fill = {
    **dict.fromkeys(TEMP_VAR[:2], 'Jan'),
    **dict.fromkeys(TEMP_VAR[2:], 'Monday'),
}
for feature, fill_value in placeholder_fill.items():
    X_train[feature] = X_train[feature].replace('0', fill_value)
    X_test[feature] = X_test[feature].replace('0', fill_value)

# Stand-alone dry run of the encode -> undersample -> reindex stages, outside
# the pipeline, to inspect the intermediate result.
sampler = RandomUnderSampler(random_state=42, sampling_strategy={0:6000})

xtrainsamp = col_transform.fit_transform(X_train)
xtrainsamp, ytrainsamp = sampler.fit_resample(xtrainsamp, y_train)


reindexed = proc.ResetIndexTransform().fit_transform(xtrainsamp)
# A bare expression in the middle of a cell is never rendered, so show the
# frame explicitly — and only a preview, not the whole table.
display(reindexed.head())

# Cell output: True if any resampled label is NaN.
np.any(np.isnan(ytrainsamp))

In [None]:
# Names of the encoded month / weekday columns as referenced by the two steps
# that follow the column transformer (assumes the transformer prefixes each
# output column with '<step name>__' — confirm against its output).
cyclical_columns = [
    'map_month__Month', 'map_month__MonthClaimed',
    'map_day__DayOfWeek', 'map_day__DayOfWeekClaimed',
]

# Full training pipeline: clean -> encode -> undersample class 0 -> reset index
# -> SMOTE oversampling -> CoSineTransform on the calendar columns -> drop those
# columns -> random forest.
pipeline = Pipeline(steps=[
    ('clean', proc.CleanTransform(variable=TEMP_VAR)),
    ('transform', col_transform),
    ('undersamp', RandomUnderSampler(random_state=42, sampling_strategy={0:5000})),
    ('reindex', proc.ResetIndexTransform()),
    ('oversamp', SMOTE(random_state=42)),
    ('cos_sin_transform', proc.CoSineTransform(list(cyclical_columns))),
    ('drop', proc.DropTransform(list(cyclical_columns))),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Walk the pipeline one step at a time on the TRAINING split, feeding each step
# the output of the previous one (as Pipeline.fit does), and report the shape
# and NaN count after every step.
#
# Earlier this loop handed the raw, un-split X / y to every step in isolation,
# called `.shape` on the (X, y) tuple returned by `fit_resample`, and called
# `fit_transform` on the classifier — so most steps only printed an error that
# had nothing to do with the real pipeline.
X_debug, y_debug = X_train, y_train

for name, step in pipeline.named_steps.items():
    print(f'[DEBUG]: Checking step {name}...')

    try:
        if hasattr(step, 'fit_resample'):
            # Samplers return both the resampled features and labels.
            X_debug, y_debug = step.fit_resample(X_debug, y_debug)
        elif hasattr(step, 'fit_transform'):
            X_debug = step.fit_transform(X_debug, y_debug)
        else:
            # The final estimator neither transforms nor resamples.
            print('Skipped: step has no fit_transform / fit_resample')
            continue
    except Exception as err:
        print(f'Error in step: {err}')
        # Later steps depend on this step's output, so stop at the first failure.
        break

    print(f'Shape: {X_debug.shape}')
    print(f'Number of NaNs: \n{pd.DataFrame(X_debug).isna().sum()}')

In [None]:
# Fit the whole pipeline on the training split only; the fitted pipeline is
# the cell's output.
pipeline.fit(X_train, y_train)

In [None]:
# Predict on both splits with the fitted pipeline: the training predictions
# show how well the model fits, the test predictions how well it generalises.
pred_train, pred_test = (pipeline.predict(split) for split in (X_train, X_test))

Classification report

In [None]:
# Per-class precision / recall / F1 on the training split.
train_report = classification_report(
    y_true=y_train,
    y_pred=pred_train,
    target_names=['Not Fraud', 'Fraud'],
)
print(train_report)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
test_report = classification_report(
    y_true=y_test,
    y_pred=pred_test,
    target_names=['Not Fraud', 'Fraud'],
)
print(test_report)

GridSearchCV

In [None]:
# Candidate per-class sample counts to search over.
candidate_sizes = (9000, 7000, 5000, 3000)

# Under-sampling candidates: target number of class-0 rows.
us_strategy = [{0: size} for size in candidate_sizes]

# Over-sampling candidates: target number of class-1 rows.
os_strategy = [{1: size} for size in candidate_sizes]

In [None]:
# Search space keyed by '<pipeline step>__<parameter>': every under-sampling
# candidate is combined with every over-sampling candidate.
grid_params = dict(
    undersamp__sampling_strategy=us_strategy,
    oversamp__sampling_strategy=os_strategy,
)

In [None]:
# Exhaustive 5-fold search over the resampling strategies.
# Without `scoring` the search ranks candidates by accuracy, which on a heavily
# imbalanced target rewards predicting the majority class and so cannot tell
# the resampling strategies apart in a useful way. Rank by F1 of the positive
# class instead (label 1, reported as 'Fraud' elsewhere in this notebook).
# error_score='raise' surfaces a failing candidate instead of scoring it NaN.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=grid_params,
    scoring='f1',
    n_jobs=-1,
    cv=5,
    error_score='raise',
)
clf.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and keep the corresponding estimator.
print(clf.best_params_)
# NOTE(review): the output recorded on the next line looks stale — its keys use
# step names ('oversampler' / 'undersampler') that differ from this pipeline's
# 'oversamp' / 'undersamp', and {0: 200000} is not one of the values in
# `us_strategy`. Re-record it after re-running the search.
#Output -> {'oversampler__sampling_strategy': {1: 5000}, 'undersampler__sampling_strategy': {0: 200000}}
hp_best_pipeline  = clf.best_estimator_
print(hp_best_pipeline)

In [None]:
# `clf.best_estimator_` has already been refit on the full training split by
# GridSearchCV (refit=True is the default), so it can predict directly; fitting
# it again here would only repeat that work.
predictions_hp = hp_best_pipeline.predict(X_test)
# Same class names as the earlier reports so the tables are directly comparable.
print(classification_report(y_test, predictions_hp, target_names=['Not Fraud', 'Fraud']))

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,  # Limit depth to prevent overfitting
    min_samples_leaf=5,  # Minimum samples per leaf
    min_samples_split=10,  # Minimum samples to split a node
    max_features='sqrt',  # Use fewer features per split
    random_state=42
)
# X_train still holds the raw string categories, which a random forest cannot
# be fitted on directly. Train the regularised forest behind a fresh copy of the
# tuned cleaning / encoding / resampling steps instead. `clone` leaves
# `hp_best_pipeline` untouched; `rf` itself becomes the fitted final step.
rf_pipeline = clone(hp_best_pipeline).set_params(classifier=rf)
rf_pipeline.fit(X_train, y_train)


# Classifier hyper-parameter grid, keyed for the pipeline's 'classifier' step.
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}