In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import preprocessing as proc

# Show every column when a DataFrame is displayed (None disables the column limit).
# Call the public `pd.set_option` directly; `pd.pandas` is not a documented
# attribute of the pandas package and should not be relied on.
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw claims table from the working directory, report its
# (rows, columns) shape, and preview the first five rows.
df = pd.read_csv('carclaims.csv')
print(df.shape)
df.head(n=5)

In [None]:
# Target column first, then the feature frame: everything except the target
# itself and the 'PolicyNumber' column.
y = df['FraudFound']
X = df.drop(columns=['PolicyNumber', 'FraudFound'])
print(X.shape, y.shape)
X.head()

In [None]:
# 80/20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
(X_train.shape, X_test.shape)

In [None]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on all four pieces (the old index is discarded, not kept as a column).
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)

In [None]:
# Learn the label -> integer encoding from the training target only, then
# apply that same fitted encoder to both the training and the test target.
label_encode = LabelEncoder().fit(y_train)
y_train_enc = label_encode.transform(y_train)
y_test_enc = label_encode.transform(y_test)

#### Variables

In [None]:
# --- Column groups ----------------------------------------------------------
# Each *_VAR list names the column(s) handed to one pipeline step.
MONTH_VAR = ['Month', 'MonthClaimed']
DAY_VAR = ['DayOfWeek']

# Calendar columns given to the clean / cos-sin / drop steps of the pipeline.
TEMP_SIN_COS = MONTH_VAR + DAY_VAR

# Columns given to the OneHotEncoder.
ONE_HOT_CATEGORICAL = [
    'Make',
    'PolicyType',
    'VehicleCategory',
    'BasePolicy',
    'AgentType',
    'Fault',
    'Sex',
    'AccidentArea',
]

DEDUCTIBLE_VAR = ['Deductible']
AGE_OF_VEH_VAR = ['AgeOfVehicle']
AGE_OF_POL_VAR = ['AgeOfPolicyHolder']
VEH_PRICE_VAR = ['VehiclePrice']
DAYS_ACC_VAR = ['Days:Policy-Accident']
ADD_CHANGE_VAR = ['AddressChange-Claim']
NUM_SUPP_VAR = ['NumberOfSuppliments']
PAST_CLAIM_VAR = ['PastNumberOfClaims']
DAYS_CLAIM_VAR = ['Days:Policy-Claim']

# --- Value -> integer code mappings -----------------------------------------
# Unless noted otherwise, labels are listed in order and receive codes 1, 2, 3, ...

# Month abbreviations -> 1..12.
MONTH_MAP = {
    name: number
    for number, name in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# Day names -> 0..6 with Monday = 0 and Sunday = 6.
DAY_MAP = {
    'Sunday': 6,
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
}

# Deductible amounts -> 1..4.
DEDUCTIBLE_MAP = {
    amount: code
    for code, amount in enumerate((300, 400, 500, 700), start=1)
}

# Vehicle age: codes run downward, from 'new' = 8 to 'more than 7' = 1.
AGE_OF_VEH_MAP = dict(zip(
    ['new', '2 years', '3 years', '4 years',
     '5 years', '6 years', '7 years', 'more than 7'],
    range(8, 0, -1),
))

# Policy-holder age bands -> 1..9.
AGE_OF_POL_MAP = {
    band: code
    for code, band in enumerate(
        ['16 to 17', '18 to 20', '21 to 25', '26 to 30', '31 to 35',
         '36 to 40', '41 to 50', '51 to 65', 'over 65'],
        start=1,
    )
}

# Vehicle price bands -> 1..6.
VEH_PRICE_MAP = {
    band: code
    for code, band in enumerate(
        ['less than 20,000', '20,000 to 29,000', '30,000 to 39,000',
         '40,000 to 59,000', '60,000 to 69,000', 'more than 69,000'],
        start=1,
    )
}

# 'Days:Policy-Accident' bands -> 1..5.
DAYS_ACC_MAP = {
    band: code
    for code, band in enumerate(
        ['none', '1 to 7', '8 to 15', '15 to 30', 'more than 30'],
        start=1,
    )
}

# 'AddressChange-Claim' bands -> 1..5.
ADD_CHANGE_MAP = {
    band: code
    for code, band in enumerate(
        ['no change', 'under 6 months', '1 year',
         '2 to 3 years', '4 to 8 years'],
        start=1,
    )
}

# 'NumberOfSuppliments' bands -> 1..4.
NUM_SUPP_MAP = {
    band: code
    for code, band in enumerate(
        ['none', '1 to 2', '3 to 5', 'more than 5'],
        start=1,
    )
}

# 'PastNumberOfClaims' bands -> 1..4.
PAST_CLAIM_MAP = {
    band: code
    for code, band in enumerate(
        ['none', '1', '2 to 4', 'more than 4'],
        start=1,
    )
}

# 'Days:Policy-Claim' bands -> 1..4.
# NOTE(review): unlike DAYS_ACC_MAP there is no '1 to 7' key here -- confirm
# that value never occurs in this column.
DAYS_CLAIM_MAP = {
    band: code
    for code, band in enumerate(
        ['none', '8 to 15', '15 to 30', 'more than 30'],
        start=1,
    )
}

In [None]:
# Columns kept for modelling, in the order they are selected from
# X_train / X_test.
FEATURES = [
    'BasePolicy', 'PolicyType', 'VehicleCategory', 'Fault',
    'PastNumberOfClaims', 'Make', 'NumberOfSuppliments', 'Deductible',
    'VehiclePrice', 'AgeOfVehicle', 'Days:Policy-Accident', 'Days:Policy-Claim',
    'Sex', 'AgeOfPolicyHolder', 'Month', 'MonthClaimed',
    'DayOfWeek', 'AgentType', 'AddressChange-Claim', 'AccidentArea',
]

In [None]:
# Restrict both splits to the modelling columns, in FEATURES order, then show
# the resulting shapes.
X_train, X_test = X_train[FEATURES], X_test[FEATURES]

(X_train.shape, X_test.shape)

#### Pipeline

In [None]:
# End-to-end fraud model:
#   calendar features -> ordinal maps -> one-hot encoding -> oversampling -> classifier.
#
# Built with imbalanced-learn's Pipeline (ImbPipeline = imblearn.pipeline.Pipeline):
# scikit-learn's Pipeline requires every intermediate step to implement
# `transform`, which RandomOverSampler does not (it implements `fit_resample`).
# The imblearn pipeline applies the sampler while fitting only, so prediction
# on new data is never resampled.
#
# NOTE(review): the `proc.*` transformers are assumed to take and return a
# DataFrame and to address columns by name -- confirm in preprocessing.py.
pipeline = ImbPipeline([
    # --- calendar columns: clean, map to integers, encode, drop the originals ---
    ('clean', proc.CleanTransform(variable=TEMP_SIN_COS)),
    ('map_month', proc.MapTransform(variable=MONTH_VAR, mappings=MONTH_MAP)),
    ('map_day', proc.MapTransform(variable=DAY_VAR, mappings=DAY_MAP)),
    ('cossin', proc.CoSineTransform(variable=TEMP_SIN_COS)),
    ('drop', proc.DropTransform(variable=TEMP_SIN_COS)),

    # --- ordinal columns: label -> integer code ---
    ('map_deduct', proc.MapTransform(variable=DEDUCTIBLE_VAR, mappings=DEDUCTIBLE_MAP)),
    ('map_ageofvehicle', proc.MapTransform(variable=AGE_OF_VEH_VAR, mappings=AGE_OF_VEH_MAP)),
    ('map_ageofholder', proc.MapTransform(variable=AGE_OF_POL_VAR, mappings=AGE_OF_POL_MAP)),
    ('map_vehicleprice', proc.MapTransform(variable=VEH_PRICE_VAR, mappings=VEH_PRICE_MAP)),
    ('map_daysaccident', proc.MapTransform(variable=DAYS_ACC_VAR, mappings=DAYS_ACC_MAP)),
    ('map_addresschange', proc.MapTransform(variable=ADD_CHANGE_VAR, mappings=ADD_CHANGE_MAP)),
    # The mapping must be the NUM_SUPP_MAP dict, not the NUM_SUPP_VAR column list.
    ('map_supplement', proc.MapTransform(variable=NUM_SUPP_VAR, mappings=NUM_SUPP_MAP)),
    ('map_pastclaim', proc.MapTransform(variable=PAST_CLAIM_VAR, mappings=PAST_CLAIM_MAP)),
    ('map_daysclaim', proc.MapTransform(variable=DAYS_CLAIM_VAR, mappings=DAYS_CLAIM_MAP)),

    # --- nominal columns: one-hot encode ---
    # Runs after every name-based step because ColumnTransformer returns an
    # array, not a DataFrame. remainder='passthrough' keeps the already-numeric
    # columns; the default ('drop') would discard everything except the
    # one-hot output.
    ('onehot', ColumnTransformer(
        transformers=[
            ('hot_cat', OneHotEncoder(handle_unknown='infrequent_if_exist'), ONE_HOT_CATEGORICAL),
        ],
        remainder='passthrough',
    )),

    # --- rebalance the classes (training only), then classify ---
    ('sampler', RandomOverSampler(random_state=42)),
    # Seeded like the split and the sampler so a re-run reproduces the model.
    ('classifier', RandomForestClassifier(random_state=42)),
], verbose=True)