# **Imports and Other Requirements**


In [1]:
from google.colab import drive
drive.mount("/gdrive")
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import VarianceThreshold

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, SCORERS

from sklearn.model_selection import GridSearchCV, cross_validate

import warnings
warnings.filterwarnings('ignore')



# **EDA and Data Cleaning**



In [3]:
proj_dir = "/gdrive/My Drive/CIS_508/Colab Notebooks/Projects/2.Fraud Detection/"
train = pd.read_csv(proj_dir+"train.csv")
test = pd.read_csv(proj_dir+"test.csv")
print(train.shape, test.shape)

(2999, 32) (12918, 32)


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 32 columns):
MONTH                   2999 non-null object
WEEKOFMONTH             2999 non-null int64
DAYOFWEEK               2999 non-null object
MAKE                    2999 non-null object
ACCIDENTAREA            2999 non-null object
DAYOFWEEKCLAIMED        2999 non-null object
MONTHCLAIMED            2999 non-null object
WEEKOFMONTHCLAIMED      2999 non-null int64
SEX                     2999 non-null object
MARITALSTATUS           2999 non-null object
AGE                     2999 non-null int64
FAULT                   2999 non-null object
POLICYTYPE              2999 non-null object
VEHICLECATEGORY         2999 non-null object
VEHICLEPRICE            2999 non-null object
REPNUMBER               2999 non-null int64
DEDUCTIBLE              2999 non-null int64
DRIVERRATING            2999 non-null int64
DAYS_POLICY_ACCIDENT    2999 non-null object
DAYS_POLICY_CLAIM       2999 non-null object

In [5]:
train.describe()

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR
count,2999.0,2999.0,2999.0,2999.0,2999.0,2999.0,2999.0
mean,2.78126,2.671224,40.055352,8.511837,407.302434,2.496832,1995.114038
std,1.286055,1.261614,13.497026,4.601437,41.847258,1.118365,0.606007
min,1.0,1.0,0.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,4.0,400.0,1.0,1995.0
50%,3.0,3.0,38.0,9.0,400.0,2.0,1995.0
75%,4.0,4.0,49.0,12.0,400.0,3.0,1995.0
max,5.0,5.0,80.0,16.0,700.0,4.0,1996.0


In [6]:
train.sample(10)

Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,AGE,FAULT,POLICYTYPE,VEHICLECATEGORY,VEHICLEPRICE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,DAYS_POLICY_ACCIDENT,DAYS_POLICY_CLAIM,PASTNUMBEROFCLAIMS,AGEOFVEHICLE,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
113,Jun,3,Saturday,Mazda,Urban,Monday,Jun,3,Male,Married,35,Policy_Holder,Sport-Collision,Sport,more_than_69000,11,400,3,more_than_30,more_than_30,none,3_years,31_to_35,No,No,External,none,no_change,1-vehicle,1994,Collision,Yes
527,Jul,4,Thursday,Honda,Urban,Friday,Jul,4,Female,Single,34,Policy_Holder,Sedan-Liability,Sport,more_than_69000,10,400,2,more_than_30,more_than_30,more_than_4,new,31_to_35,No,No,External,none,no_change,1-vehicle,1994,Liability,No
1932,Sep,1,Saturday,VW,Rural,Monday,Sep,1,Male,Single,26,Third_Party,Sedan-Liability,Sport,40000_to_59000,14,400,1,more_than_30,more_than_30,2_to_4,7_years,31_to_35,No,No,External,more_than_5,no_change,1-vehicle,1995,Liability,No
1050,Sep,3,Wednesday,Pontiac,Rural,Wednesday,Oct,1,Male,Married,32,Policy_Holder,Sedan-Collision,Sedan,20000_to_29000,8,400,4,more_than_30,more_than_30,none,6_years,31_to_35,No,No,External,none,no_change,1-vehicle,1994,Collision,No
495,Dec,4,Friday,Pontiac,Urban,Monday,Dec,4,Male,Married,29,Policy_Holder,Sedan-Collision,Sedan,20000_to_29000,8,400,2,more_than_30,more_than_30,1,6_years,31_to_35,No,No,External,more_than_5,no_change,1-vehicle,1995,Collision,No
1006,Mar,5,Tuesday,Chevrolet,Urban,Tuesday,Apr,2,Male,Married,53,Policy_Holder,Sedan-All_Perils,Sedan,20000_to_29000,4,400,1,more_than_30,more_than_30,none,more_than_7,41_to_50,No,No,External,1_to_2,no_change,1-vehicle,1994,All_Perils,No
222,Mar,1,Sunday,VW,Urban,Monday,Mar,3,Female,Single,40,Policy_Holder,Sedan-All_Perils,Sedan,20000_to_29000,9,400,1,more_than_30,more_than_30,none,7_years,36_to_40,No,No,External,none,1_year,1-vehicle,1995,All_Perils,Yes
1951,Feb,1,Wednesday,Toyota,Urban,Friday,Feb,4,Male,Married,55,Policy_Holder,Sedan-Collision,Sedan,30000_to_39000,3,400,3,more_than_30,more_than_30,none,7_years,41_to_50,Yes,No,External,1_to_2,no_change,1-vehicle,1996,Collision,No
2499,Jan,3,Wednesday,Mazda,Urban,Thursday,Jan,3,Male,Married,43,Policy_Holder,Sedan-Collision,Sedan,20000_to_29000,7,400,4,more_than_30,more_than_30,1,more_than_7,36_to_40,No,No,External,3_to_5,no_change,1-vehicle,1996,Collision,No
2587,Sep,1,Saturday,Pontiac,Urban,Monday,Sep,1,Male,Married,48,Policy_Holder,Sedan-All_Perils,Sedan,30000_to_39000,4,400,3,more_than_30,more_than_30,none,more_than_7,41_to_50,No,No,External,none,no_change,1-vehicle,1996,All_Perils,No


In [7]:
train['NUMBEROFSUPPLIMENTS'].value_counts()

none           1367
more_than_5     774
1_to_2          479
3_to_5          379
Name: NUMBEROFSUPPLIMENTS, dtype: int64

In [8]:
train.isna().sum()

MONTH                   0
WEEKOFMONTH             0
DAYOFWEEK               0
MAKE                    0
ACCIDENTAREA            0
DAYOFWEEKCLAIMED        0
MONTHCLAIMED            0
WEEKOFMONTHCLAIMED      0
SEX                     0
MARITALSTATUS           0
AGE                     0
FAULT                   0
POLICYTYPE              0
VEHICLECATEGORY         0
VEHICLEPRICE            0
REPNUMBER               0
DEDUCTIBLE              0
DRIVERRATING            0
DAYS_POLICY_ACCIDENT    0
DAYS_POLICY_CLAIM       0
PASTNUMBEROFCLAIMS      0
AGEOFVEHICLE            0
AGEOFPOLICYHOLDER       0
POLICEREPORTFILED       0
WITNESSPRESENT          0
AGENTTYPE               0
NUMBEROFSUPPLIMENTS     0
ADDRESSCHANGE_CLAIM     0
NUMBEROFCARS            0
YEAR                    0
BASEPOLICY              0
FRAUDFOUND              0
dtype: int64

In [9]:
train['REPNUMBER'].unique()

array([ 4,  9,  8,  1, 14, 16, 13,  5, 12,  7, 10, 11,  2,  3,  6, 15])

In [10]:
train.sample(5)

Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,AGE,FAULT,POLICYTYPE,VEHICLECATEGORY,VEHICLEPRICE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,DAYS_POLICY_ACCIDENT,DAYS_POLICY_CLAIM,PASTNUMBEROFCLAIMS,AGEOFVEHICLE,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
1106,May,2,Wednesday,Pontiac,Rural,Thursday,May,2,Male,Married,61,Policy_Holder,Sedan-Liability,Sport,20000_to_29000,3,400,3,more_than_30,more_than_30,2_to_4,more_than_7,51_to_65,No,No,External,more_than_5,no_change,1-vehicle,1995,Liability,No
1707,Dec,4,Sunday,VW,Urban,Thursday,Dec,5,Male,Single,35,Policy_Holder,Sedan-Liability,Sport,30000_to_39000,6,400,4,more_than_30,more_than_30,none,5_years,31_to_35,No,No,External,none,no_change,1-vehicle,1995,Liability,No
1820,Feb,3,Tuesday,Toyota,Urban,Wednesday,Feb,3,Female,Married,55,Policy_Holder,Sedan-Collision,Sedan,20000_to_29000,10,400,1,more_than_30,more_than_30,none,7_years,41_to_50,No,No,External,more_than_5,no_change,1-vehicle,1995,Collision,No
55,Mar,3,Sunday,Toyota,Rural,Tuesday,Mar,4,Male,Married,45,Policy_Holder,Sedan-All_Perils,Sedan,20000_to_29000,7,400,3,more_than_30,more_than_30,2_to_4,more_than_7,36_to_40,No,No,External,more_than_5,no_change,3_to_4,1994,All_Perils,Yes
1747,Sep,2,Monday,Mazda,Urban,Monday,Sep,2,Male,Single,44,Policy_Holder,Sedan-Liability,Sport,20000_to_29000,15,400,3,more_than_30,more_than_30,none,7_years,36_to_40,No,No,External,3_to_5,no_change,1-vehicle,1995,Liability,No


In [13]:
all_cols = train.columns[:-1]
num_cols = ['AGE', 'DEDUCTIBLE']
#ordinal_cols = ['VEHICLEPRICE', 'DRIVERRATING', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'NUMBEROFSUPPLIMENTS', 'NUMBEROFCARS']
cardinal_cols = [x for x in all_cols if x not in num_cols]

print(len(num_cols), num_cols)
print(len(cardinal_cols), cardinal_cols)

2 ['AGE', 'DEDUCTIBLE']
29 ['MONTH', 'WEEKOFMONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA', 'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'WEEKOFMONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY', 'VEHICLEPRICE', 'REPNUMBER', 'DRIVERRATING', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED', 'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS', 'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'YEAR', 'BASEPOLICY']


In [14]:
# Ordinal Encoder for ordinal columns
"""
oe = OrdinalEncoder()
ordinal_df = pd.DataFrame(oe.fit_transform(train[ordinal_cols]), columns = ["O_"+x for x in ordinal_cols])
ordinal_df.head()
ordinal_df.shape
"""

'\noe = OrdinalEncoder()\nordinal_df = pd.DataFrame(oe.fit_transform(train[ordinal_cols]), columns = ["O_"+x for x in ordinal_cols])\nordinal_df.head()\nordinal_df.shape\n'

In [15]:
# One Hot Encoder for cardinal columns
ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
cardinal_df = pd.DataFrame(ohe.fit_transform(train[cardinal_cols]), columns = ohe.get_feature_names())
cardinal_df.head()
cardinal_df.shape

(2999, 165)

In [16]:
X_train = pd.concat([cardinal_df, train[num_cols]], axis = 1)
X_train.shape

(2999, 167)

In [17]:
# Label Encoder for Target column
le = LabelEncoder()
label_df = pd.DataFrame(le.fit_transform(train['FRAUDFOUND']), columns = ['TARGET'])
label_df.head()
label_df.shape

(2999, 1)

In [18]:
Y_train = label_df.copy()
Y_train.shape
Y_train.head()

Unnamed: 0,TARGET
0,1
1,1
2,1
3,1
4,1


# **Using SMOTE to handle unbalanced target class**

1.   X_train and Y_train are original data points
2.   X and Y are resampled data points

In [19]:
Y_train['TARGET'].value_counts()

0    2600
1     399
Name: TARGET, dtype: int64

In [20]:
sm = SMOTE(ratio = 0.99)
X,Y = sm.fit_resample(X_train, Y_train)
print(X.shape, Y.shape)

(5174, 167) (5174,)


In [21]:
print(Counter(Y))

Counter({0: 2600, 1: 2574})


In [22]:
X = pd.DataFrame(X, columns = X_train.columns)
Y = pd.DataFrame(Y, columns = Y_train.columns)
X.sample(10)

Unnamed: 0,x0_Apr,x0_Aug,x0_Dec,x0_Feb,x0_Jan,x0_Jul,x0_Jun,x0_Mar,x0_May,x0_Nov,x0_Oct,x0_Sep,x1_1,x1_2,x1_3,x1_4,x1_5,x2_Friday,x2_Monday,x2_Saturday,x2_Sunday,x2_Thursday,x2_Tuesday,x2_Wednesday,x3_Accura,x3_BMW,x3_Chevrolet,x3_Dodge,x3_Ford,x3_Honda,x3_Mazda,x3_Mecedes,x3_Mercury,x3_Nisson,x3_Pontiac,x3_Porche,x3_Saab,x3_Saturn,x3_Toyota,x3_VW,...,x19_6_years,x19_7_years,x19_more_than_7,x19_new,x20_16_to_17,x20_18_to_20,x20_21_to_25,x20_26_to_30,x20_31_to_35,x20_36_to_40,x20_41_to_50,x20_51_to_65,x20_over_65,x21_No,x21_Yes,x22_No,x22_Yes,x23_External,x23_Internal,x24_1_to_2,x24_3_to_5,x24_more_than_5,x24_none,x25_1_year,x25_2_to_3_years,x25_4_to_8_years,x25_no_change,x25_under_6_months,x26_1-vehicle,x26_2-vehicles,x26_3_to_4,x26_5_to_8,x27_1994,x27_1995,x27_1996,x28_All_Perils,x28_Collision,x28_Liability,AGE,DEDUCTIBLE
2089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,25.0,400.0
1831,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,50.0,400.0
3408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.440618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559382,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559382,0.440618,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.440618,0.0,0.559382,0.0,0.0,1.0,0.0,1.0,0.0,0.0,56.321855,400.0
2608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,55.0,400.0
4523,0.0,0.0,0.0,0.0,0.0,0.0,0.789078,0.0,0.0,0.0,0.210922,0.0,0.0,0.789078,0.0,0.0,0.210922,0.210922,0.0,0.0,0.0,0.0,0.789078,0.0,0.210922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789078,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.789078,0.210922,0.0,1.0,0.0,0.0,60.789078,400.0
2752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,33.0,400.0
4751,0.0,0.0,0.0,0.340497,0.0,0.0,0.0,0.0,0.0,0.0,0.659503,0.0,0.0,0.0,0.340497,0.0,0.659503,0.659503,0.0,0.340497,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.659503,0.340497,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,60.340497,400.0
4558,0.0,0.526745,0.0,0.0,0.0,0.0,0.0,0.473255,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.526745,0.0,0.0,0.473255,0.0,0.0,0.0,0.0,0.0,0.526745,0.0,0.0,0.0,0.0,0.473255,0.0,0.0,0.0,0.0,0.0,...,0.473255,0.0,0.526745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.526745,0.473255,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.473255,0.526745,0.0,1.0,0.0,53.473255,400.0
1514,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,43.0,400.0
2100,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,26.0,400.0


In [23]:
X.shape, Y.shape

((5174, 167), (5174, 1))

# **Model Building**

## Model 1: Random Forest Classifier - With Class weights to handle imbalanced target classes

In [24]:
rf1 = RandomForestClassifier(n_estimators= 200, 
                             max_depth = 18, 
                             min_samples_split = 5, 
                             max_features = 45, 
                             class_weight = {0:1, 1:7}, 
                             warm_start = True, 
                             random_state= 6)

rf1.fit(X_train, Y_train)
rf1_train_pred = pd.DataFrame( rf1.predict(X_train), columns = ['TARGET'])

print("\n--------------Training Metrics-----------------\n")
print("Confusion Matrix:\n", confusion_matrix(Y_train, rf1_train_pred))
print("\nAUC:", round(roc_auc_score(Y_train, rf1_train_pred),4))

scores = cross_validate(rf1, X_train, Y_train, scoring='roc_auc', cv=5, return_estimator = True)
rf1_val_score = np.mean(scores['test_score'])

print("\n--------------Validation Metrics-----------------\n")
print("AUC:", round(rf1_val_score,4))
print("5-Fold scores", scores['test_score'])


--------------Training Metrics-----------------

Confusion Matrix:
 [[2577   23]
 [   0  399]]

AUC: 0.9956

--------------Validation Metrics-----------------

AUC: 0.825
5-Fold scores [0.86644231 0.88677885 0.80310096 0.71847356 0.85012171]


## Model 2: Random Forest Classifier - With SMOTE to handle imbalanced target classes

In [25]:
rf2 = RandomForestClassifier(n_estimators= 200, 
                             max_depth = 18, 
                             min_samples_split = 5, 
                             max_features = 45, 
                             warm_start = True, 
                             random_state = 6)

rf2.fit(X,Y)
rf2_train_pred = pd.DataFrame( rf2.predict(X), columns = ['TARGET'])

print("\n--------------Training Metrics-----------------\n")
print("Confusion Matrix:\n", confusion_matrix(Y, rf2_train_pred))
print("\nAUC:", round(roc_auc_score(Y, rf2_train_pred),4))

scores = cross_validate(rf2, X, Y, scoring='roc_auc', cv=5, return_estimator = True)
rf2_val_score = np.mean(scores['test_score'])


print("\n--------------Validation Metrics-----------------\n")
print("AUC:", round(rf2_val_score,4))
print("5-Fold scores", scores['test_score'])


--------------Training Metrics-----------------

Confusion Matrix:
 [[2597    3]
 [  21 2553]]

AUC: 0.9953

--------------Validation Metrics-----------------

AUC: 0.9825
5-Fold scores [0.91525019 0.99803211 0.9999888  0.99997386 0.99901601]


***Hyper-parameter tuning for Model 2***

In [26]:
params = {
          'n_estimators':[100, 150, 200],
          'max_depth':[10, 15, 20],
          'min_samples_split':[10, 20, 30],
          'max_features':[25, 35, 45],
          'warm_start':[ True, False]
}

gs_rf2 = GridSearchCV(rf2, params, scoring = 'roc_auc', cv = 5, return_train_score = True)
gs_rf2.fit(X,Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=18,
                                              max_features=45,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=5,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False, random_state=6,
                                         

In [27]:
print("----------------Hyperparameter tuning results---------------")
print("\nBest Parameters", gs_rf2.best_params_)
print("\nBest Estimator", gs_rf2.best_estimator_)

----------------Hyperparameter tuning results---------------

Best Parameters {'max_depth': 20, 'max_features': 25, 'min_samples_split': 10, 'n_estimators': 200, 'warm_start': True}

Best Estimator RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features=25,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=6, verbose=0,
                       warm_start=True)


## Model 3: Model 2 Best Estimator with SMOTE data

In [28]:
# Best estimator from hyperparameter tuning
rf3 = RandomForestClassifier(**gs_rf2.best_params_)
rf3.fit(X,Y)

rf3_train_pred = pd.DataFrame(rf3.predict(X), columns = ['TARGET'])

print("\n--------------Training Metrics-----------------\n")
print("Confusion Matrix:\n", confusion_matrix(Y, rf3_train_pred))
print("\nAUC:", round(roc_auc_score(Y, rf3_train_pred),4))

scores = cross_validate(rf3, X, Y, scoring='roc_auc', cv=5, return_estimator = True)
rf3_val_score = np.mean(scores['test_score'])


print("\n--------------Validation Metrics-----------------\n")
print("AUC:", round(rf3_val_score,4))
print("5-Fold scores", scores['test_score'])


--------------Training Metrics-----------------

Confusion Matrix:
 [[2591    9]
 [  68 2506]]

AUC: 0.9851

--------------Validation Metrics-----------------

AUC: 0.982
5-Fold scores [0.91320762 0.99820015 1.         0.99997386 0.99840991]


In [29]:
# Important features of the model
rf3_feature_imp = pd.DataFrame({'Variable':X.columns,
              'Importance':rf3.feature_importances_}).sort_values('Importance', ascending=False)

rf3_feature_imp[rf3_feature_imp['Importance']>0.004]

Unnamed: 0,Variable,Importance
159,x27_1994,0.117848
160,x27_1995,0.08134
164,x28_Liability,0.058856
76,x11_Sedan-Liability,0.047587
73,x10_Third_Party,0.046522
72,x10_Policy_Holder,0.038357
161,x27_1996,0.032152
83,x12_Sport,0.031744
82,x12_Sedan,0.028378
74,x11_Sedan-All_Perils,0.022736


## Model 4: XGB Classifier - With SMOTE data points

In [30]:
xgb1 = XGBClassifier(max_depth = 5, 
                    n_estimators = 500, 
                    random_state = 6)
xgb1.fit(X,Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=6,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [31]:
xgb1_train_pred = pd.DataFrame(xgb1.predict(X), columns = ['TARGET'])

print("\n--------------Training Metrics-----------------\n")
print("Confusion Matrix:\n", confusion_matrix(Y, xgb1_train_pred))
print("\nAUC:", round(roc_auc_score(Y, xgb1_train_pred),4))

scores = cross_validate(xgb1, X, Y, scoring='roc_auc', cv=5, return_estimator = True)
xgb1_val_score = np.mean(scores['test_score'])


print("\n--------------Validation Metrics-----------------\n")
print("AUC:", round(xgb1_val_score,4))
print("5-Fold scores", scores['test_score'])


--------------Training Metrics-----------------

Confusion Matrix:
 [[2600    0]
 [   0 2574]]

AUC: 1.0

--------------Validation Metrics-----------------

AUC: 0.9786
5-Fold scores [0.89282674 0.99997386 1.         0.99999627 0.99998878]


In [32]:
feature_imp2 = pd.DataFrame( data = {'Features':X.columns, 'XGB_Importance':xgb1.feature_importances_ , 'RF_Importance':rf3.feature_importances_ }).sort_values(['XGB_Importance', 'RF_Importance'], ascending = False)
feature_imp2[ feature_imp2['XGB_Importance'] > 0.0001]

Unnamed: 0,Features,XGB_Importance,RF_Importance
164,x28_Liability,0.102116,0.058856
159,x27_1994,0.100384,0.117848
72,x10_Policy_Holder,0.070525,0.038357
151,x25_2_to_3_years,0.055857,0.010773
10,x0_Oct,0.039279,0.012342
...,...,...,...
125,x19_4_years,0.000583,0.001040
68,x9_Divorced,0.000507,0.000047
144,x23_External,0.000488,0.000375
150,x25_1_year,0.000475,0.000360


In [33]:
feature_imp2['Avg_Imp'] = 0.5 * (feature_imp2['XGB_Importance'] + feature_imp2['RF_Importance'])
feature_imp2[ feature_imp2['Avg_Imp'] > 0.0001].sort_values('Avg_Imp', ascending = False)

Unnamed: 0,Features,XGB_Importance,RF_Importance,Avg_Imp
159,x27_1994,0.100384,0.117848,0.109116
164,x28_Liability,0.102116,0.058856,0.080486
72,x10_Policy_Holder,0.070525,0.038357,0.054441
160,x27_1995,0.018404,0.081340,0.049872
151,x25_2_to_3_years,0.055857,0.010773,0.033315
...,...,...,...,...
145,x23_Internal,0.000000,0.000260,0.000130
33,x3_Nisson,0.000000,0.000248,0.000124
131,x20_16_to_17,0.000000,0.000241,0.000120
71,x9_Widow,0.000000,0.000231,0.000115


# **Model Selection**

## Pre-processing Test data

In [34]:
test.shape

(12918, 32)

In [35]:
test.columns

Index(['MONTH', 'WEEKOFMONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA',
       'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'WEEKOFMONTHCLAIMED', 'SEX',
       'MARITALSTATUS', 'AGE', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY',
       'VEHICLEPRICE', 'REPNUMBER', 'DEDUCTIBLE', 'DRIVERRATING',
       'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS',
       'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED',
       'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS',
       'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'YEAR', 'BASEPOLICY',
       'FRAUDFOUND'],
      dtype='object')

In [36]:
Xtest = test.drop(columns = ['FRAUDFOUND'])
Ytest = test['FRAUDFOUND']
Xtest.shape, Ytest.shape

((12918, 31), (12918,))

In [37]:
cardinal_df2 = pd.DataFrame(ohe.transform(Xtest[cardinal_cols]), columns = ohe.get_feature_names())
label_df2 = le.transform(Ytest)
cardinal_df2.shape

(12918, 165)

In [38]:
Xtest = pd.concat([Xtest, cardinal_df2], axis = 1)
Xtest.drop(columns = cardinal_cols, inplace = True)
Ytest = label_df2.copy()
Xtest.shape, Ytest.shape

((12918, 167), (12918,))

## Model Evaluation

In [39]:
rf3_test_pred = rf3.predict(Xtest)

print("\n--------------Test Metrics-----------------\n")
print("Confusion Matrix:\n", confusion_matrix(Ytest, rf3_test_pred))
print("\nAUC:", round(roc_auc_score(Ytest, rf3_test_pred),4))


--------------Test Metrics-----------------

Confusion Matrix:
 [[11134  1286]
 [  459    39]]

AUC: 0.4874


In [42]:
Xtest = Xtest[ X_train.columns]

xgb1_test_pred = xgb1.predict(Xtest)

print("\n--------------Test Metrics-----------------\n")
print("Confusion Matrix:\n", confusion_matrix(Ytest, xgb1_test_pred))
print("\nAUC:", round(roc_auc_score(Ytest, xgb1_test_pred),4))


--------------Test Metrics-----------------

Confusion Matrix:
 [[11470   950]
 [   67   431]]

AUC: 0.8945
