In [None]:
import pandas as pd
import preprocessing as proc

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

pd.pandas.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('carclaims.csv')
print(df.shape)
df.head()

(15420, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,PolicyNumber,RepNumber,Deductible,DriverRating,Days:Policy-Accident,Days:Policy-Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,"more than 69,000",1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,"more than 69,000",2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,"more than 69,000",3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,"20,000 to 29,000",4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,"more than 69,000",5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


In [3]:
X = df.drop(['PolicyNumber', 'FraudFound'], axis=1)
y = df['FraudFound']
print(X.shape, y.shape)
X.head()

(15420, 31) (15420,)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,RepNumber,Deductible,DriverRating,Days:Policy-Accident,Days:Policy-Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,"more than 69,000",12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,"more than 69,000",15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,"more than 69,000",7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,"20,000 to 29,000",4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,"more than 69,000",3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, 
                                                   random_state=42,
                                                   stratify=y)
X_train.shape, X_test.shape

((12336, 31), (3084, 31))

In [5]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [6]:
label_encode = LabelEncoder()
y_train = label_encode.fit_transform(y_train)
y_test = label_encode.transform(y_test)

In [7]:
TEMP_VAR = ['Month', 'MonthClaimed', 'DayOfWeek', 'DayOfWeekClaimed']

NUMERICAL = ['Year', 'Deductible']

ONE_HOT_CATEGORICAL = ['Make', 'PolicyType', 'MaritalStatus',
                       'BasePolicy','Fault', 'Sex', 'AccidentArea']
ORDINAL_CATEGORICAL = ['AgeOfVehicle', 'AgeOfPolicyHolder', 'VehiclePrice',
                       'AddressChange-Claim','NumberOfSuppliments', 'PastNumberOfClaims']

AGE_OF_VEH_VAR = ['AgeOfVehicle']
AGE_OF_POL_VAR = ['AgeOfPolicyHolder']
VEH_PRICE_VAR = ['VehiclePrice']
ADD_CHANGE_VAR = ['AddressChange-Claim']
NUM_SUPP_VAR = ['NumberOfSuppliments']
PAST_CLAIM_VAR = ['PastNumberOfClaims']

MONTH_MAP = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
DAY_MAP = {'Sunday': 6, 'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5}

AGE_OF_VEH_MAP = {'new': 8, '2 years': 7, '3 years': 6, '4 years': 5,
                  '5 years': 4, '6 years': 3, '7 years': 2, 'more than 7': 1}
AGE_OF_POL_MAP = {'16 to 17': 1, '18 to 20': 2, '21 to 25': 3, '26 to 30': 4,
                  '31 to 35': 5, '36 to 40': 6, '41 to 50': 7, '51 to 65': 8,
                  'over 65': 9}
VEH_PRICE_MAP = {'less than 20,000': 1, '20,000 to 29,000': 2,
                 '30,000 to 39,000': 3, '40,000 to 59,000': 4,
                 '60,000 to 69,000': 5, 'more than 69,000': 6}

ADD_CHANGE_MAP = {'no change': 1, 'under 6 months': 2, '1 year': 3,
                  '2 to 3 years': 4, '4 to 8 years': 5}
NUM_SUPP_MAP = {'none': 1, '1 to 2': 2, '3 to 5': 3, 'more than 5': 4}
PAST_CLAIM_MAP = {'none': 1, '1': 2, '2 to 4': 3, 'more than 4': 4}

In [8]:
FEATURES = [
    'BasePolicy', 'PolicyType', 'Make', 'AccidentArea', 'Fault', 'AgeOfVehicle', 'VehiclePrice', 
    'Year', 'Month', 'MonthClaimed', 'DayOfWeek', 'DayOfWeekClaimed', 'Sex', 'MaritalStatus', 'AgeOfPolicyHolder',
    'Deductible','AddressChange-Claim', 'NumberOfSuppliments', 'PastNumberOfClaims'
]

In [9]:
X_train = X_train[FEATURES]
X_test = X_test[FEATURES]

In [10]:
col_transform = ColumnTransformer(
    transformers=[
    ('map_month', proc.MapTransform(variable=TEMP_VAR[:2], mappings=MONTH_MAP), TEMP_VAR[:2]),
    ('map_day', proc.MapTransform(variable=TEMP_VAR[2:], mappings=DAY_MAP), TEMP_VAR[2:]),
    ('age_veh', proc.MapTransform(variable=AGE_OF_VEH_VAR, mappings=AGE_OF_VEH_MAP), AGE_OF_VEH_VAR),
    ('age_pol', proc.MapTransform(variable=AGE_OF_POL_VAR, mappings=AGE_OF_POL_MAP), AGE_OF_POL_VAR),
    ('veh_price', proc.MapTransform(variable=VEH_PRICE_VAR, mappings=VEH_PRICE_MAP), VEH_PRICE_VAR),
    ('add_change', proc.MapTransform(variable=ADD_CHANGE_VAR, mappings=ADD_CHANGE_MAP), ADD_CHANGE_VAR),
    ('num_supp', proc.MapTransform(variable=NUM_SUPP_VAR, mappings=NUM_SUPP_MAP), NUM_SUPP_VAR),
    ('past_claim', proc.MapTransform(variable=PAST_CLAIM_VAR, mappings=PAST_CLAIM_MAP), PAST_CLAIM_VAR),
    ('hot_cat', OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), ONE_HOT_CATEGORICAL)
    ], remainder='passthrough'
).set_output(transform='pandas')

In [11]:
pipeline = Pipeline([
    ('clean', proc.CleanTransform(variable=TEMP_VAR)),
    ('transform', col_transform),
    ('undersamp', RandomUnderSampler(random_state=42)),
    ('reindex', proc.ResetIndexTransform()),
    ('oversamp', SMOTE(random_state=42)),
    ('cos_sin_transform', proc.CoSineTransform(['map_month__Month', 'map_month__MonthClaimed',
       'map_day__DayOfWeek', 'map_day__DayOfWeekClaimed'])),
    ('drop', proc.DropTransform(['map_month__Month', 'map_month__MonthClaimed',
       'map_day__DayOfWeek', 'map_day__DayOfWeekClaimed'])),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [12]:
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [46]:
pred_train = pipeline.predict_proba(X_train)[:, 1]
pred_train = (pred_train >= 0.65).astype('int')

pred_test = pipeline.predict_proba(X_test)[:, 1]
pred_test = (pred_test >= 0.65).astype('int')

Classification report

In [47]:
print(classification_report(y_train, pred_train, target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       1.00      0.80      0.89     11598
       Fraud       0.24      0.98      0.38       738

    accuracy                           0.81     12336
   macro avg       0.62      0.89      0.63     12336
weighted avg       0.95      0.81      0.86     12336



In [48]:
print(classification_report(y_test, pred_test, target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.97      0.78      0.87      2899
       Fraud       0.15      0.61      0.24       185

    accuracy                           0.77      3084
   macro avg       0.56      0.69      0.55      3084
weighted avg       0.92      0.77      0.83      3084



GridSearchCV

In [53]:
grid_params = {
    'undersamp__sampling_strategy': [0.3, 0.4, 0.5],
    'oversamp__sampling_strategy': [0.6, 0.7, 0.8]
}

In [54]:
clf = GridSearchCV(estimator=pipeline, param_grid=grid_params, n_jobs=-1, cv=5, error_score='raise')
clf.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [55]:
print(clf.best_params_)

{'oversamp__sampling_strategy': 0.6, 'undersamp__sampling_strategy': 0.3}


In [56]:
sampling_pipeline  = clf.best_estimator_
print(sampling_pipeline)

Pipeline(steps=[('clean',
                 CleanTransform(variable=['Month', 'MonthClaimed', 'DayOfWeek',
                                          'DayOfWeekClaimed'])),
                ('transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('map_month',
                                                  MapTransform(mappings={'Apr': 4,
                                                                         'Aug': 8,
                                                                         'Dec': 12,
                                                                         'Feb': 2,
                                                                         'Jan': 1,
                                                                         'Jul': 7,
                                                                         'Jun': 6,
                                                                         'Mar': 3,
           

In [57]:
sampling_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [68]:
predictions_hp = sampling_pipeline.predict_proba(X_test)[:, 1]
predictions_hp = (predictions_hp >= 0.4).astype('int')
print(classification_report(y_test, predictions_hp))

              precision    recall  f1-score   support

           0       0.97      0.82      0.88      2899
           1       0.16      0.55      0.25       185

    accuracy                           0.80      3084
   macro avg       0.56      0.68      0.57      3084
weighted avg       0.92      0.80      0.85      3084



In [239]:
update_pipeline = Pipeline([
    ('clean', proc.CleanTransform(variable=TEMP_VAR)),
    ('transform', col_transform),
    ('undersamp', RandomUnderSampler(random_state=42, sampling_strategy=0.2)),
    ('reindex', proc.ResetIndexTransform()),
    ('oversamp', SMOTE(random_state=42, sampling_strategy=0.75)),
    ('cos_sin_transform', proc.CoSineTransform(['map_month__Month', 'map_month__MonthClaimed',
       'map_day__DayOfWeek', 'map_day__DayOfWeekClaimed'])),
    ('drop', proc.DropTransform(['map_month__Month', 'map_month__MonthClaimed',
       'map_day__DayOfWeek', 'map_day__DayOfWeekClaimed'])),
    ('classifier', RandomForestClassifier(n_estimators=600, random_state=42, oob_score=True, class_weight='balanced', 
                                          min_samples_leaf=5, min_samples_split=15, max_depth=40))
])

In [None]:
param_grid = {
    'classifier__n_estimators': [600, 700, 800],
}

In [210]:
grid_search = GridSearchCV(estimator=update_pipeline, param_grid=param_grid, scoring='f1', 
                           n_jobs=-1, cv=5)
grid_search.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [211]:
print(grid_search.best_params_)

{'classifier__n_estimators': 600}


In [212]:
classifier_pipeline = grid_search.best_estimator_
classifier_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
pred_tr = classifier_pipeline.predict(X_train)
print(classification_report(y_true=y_train, y_pred=pred_tr))

              precision    recall  f1-score   support

           0       0.97      0.88      0.93     11598
           1       0.25      0.63      0.36       738

    accuracy                           0.87     12336
   macro avg       0.61      0.75      0.64     12336
weighted avg       0.93      0.87      0.89     12336



In [None]:
pred_tst = classifier_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred_tst))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      2899
           1       0.19      0.45      0.27       185

    accuracy                           0.85      3084
   macro avg       0.58      0.66      0.59      3084
weighted avg       0.92      0.85      0.88      3084

