In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance


import warnings
warnings.filterwarnings("ignore")

In [2]:
#feature engineering v1: mapp data to numerical without one-hot encoding
data = pd.read_excel('data_new.xlsx')
data

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,Date,DateClaimed,Date_Diff
0,12,5,3,Honda,Urban,2,1,1,Female,Single,...,No,External,none,1 year,3 to 4,1994,Liability,1994-12-29,1994-01-05,-358
1,1,3,3,Honda,Urban,1,1,4,Male,Single,...,No,External,none,no change,1 vehicle,1994,Collision,1994-01-20,1994-01-25,5
2,10,5,5,Honda,Urban,4,11,2,Male,Married,...,No,External,none,no change,1 vehicle,1994,Collision,1994-10-29,1994-11-11,13
3,6,2,6,Toyota,Rural,5,7,1,Male,Married,...,No,External,more than 5,no change,1 vehicle,1994,Liability,1994-06-12,1994-07-02,20
4,1,5,1,Honda,Urban,2,2,2,Female,Single,...,No,External,none,no change,1 vehicle,1994,Collision,1994-02-01,1994-02-09,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15414,11,4,5,Toyota,Urban,2,11,5,Male,Married,...,No,External,none,no change,1 vehicle,1996,Collision,1996-11-23,1996-12-04,11
15415,11,5,4,Pontiac,Urban,5,12,1,Male,Married,...,No,External,more than 5,no change,3 to 4,1996,Liability,1996-11-29,1996-12-07,8
15416,11,5,4,Toyota,Rural,5,12,1,Male,Single,...,No,External,1 to 2,no change,1 vehicle,1996,Collision,1996-11-29,1996-12-07,8
15417,12,1,1,Toyota,Urban,4,12,2,Female,Married,...,No,External,more than 5,no change,1 vehicle,1996,All Perils,1996-12-03,1996-12-13,10


In [3]:
VehiclePrice_mapping = {
    'more than 69000': np.mean([69000, 100000]),          # OR [69000,float(inf)]
    '20000 to 29000': np.mean([20000, 29000]),
    '30000 to 39000': np.mean([30000, 39000]),
    'less than 20000': np.mean([0, 20000]),
    '40000 to 59000': np.mean([40000, 59000]),
    '60000 to 69000': np.mean([60000, 69000])
}
data['VehiclePrice'] = data['VehiclePrice'].map(VehiclePrice_mapping)

Days_Policy_Accident_mapping = {
    'more than 30': np.mean([30, 50]),
    '15 to 30': np.mean([15, 30]),
    'none': 0,
    '1 to 7': np.mean([1, 7]),
    '8 to 15': np.mean([8, 15])
}
data['Days_Policy_Accident'] = data['Days_Policy_Accident'].map(Days_Policy_Accident_mapping)

Days_Policy_Claim_mapping = {
    'more than 30': np.mean([30, 50]),
    '15 to 30': np.mean([15, 30]),
    '8 to 15': np.mean([8, 15]),
    'none': 0
}
data['Days_Policy_Claim'] = data['Days_Policy_Claim'].map(Days_Policy_Claim_mapping)

PastNumberOfClaims_mapping = {
    'none': 0,
    '1': 1,
    '2 to 4': np.mean([2, 4]),
    'more than 4': 5
}
data['PastNumberOfClaims'] = data['PastNumberOfClaims'].map(PastNumberOfClaims_mapping)

AgeOfVehicle_mapping = {
    '3 years': 3,
    '6 years': 6,
    '7 years': 7,
    'more than 7': 8,
    '5 years': 5,
    'new': 1,
    '4 years': 4,
    '2 years': 2
}
data['AgeOfVehicle'] = data['AgeOfVehicle'].map(AgeOfVehicle_mapping)

AgeOfPolicyHolder_mapping = {
    '26 to 30': np.mean([26, 30]),
    '31 to 35': np.mean([31, 35]),
    '41 to 50': np.mean([41, 50]),
    '51 to 65': np.mean([51, 65]),
    '21 to 25': np.mean([21, 25]),
    '36 to 40': np.mean([36, 40]),
    '16 to 17': np.mean([16, 17]),
    'over 65': np.mean([65, 75]),
    '18 to 20': np.mean([18, 20])
}
data['AgeOfPolicyHolder'] = data['AgeOfPolicyHolder'].map(AgeOfPolicyHolder_mapping)

NumberOfSuppliments_mapping = {
    'none': 0,
    'more than 5': 6,
    '3 to 5': np.mean([3, 5]),
    '1 to 2': np.mean([1, 2])
}
data['NumberOfSuppliments'] = data['NumberOfSuppliments'].map(NumberOfSuppliments_mapping)

AddressChange_Claim_mapping = {
    '1 year': 1,
    'no change': 0,
    '4 to 8 years': np.mean([4, 8]),
    '2 to 3 years': np.mean([2, 3]),
    'under 6 months': np.mean([0, 0.5])
}
data['AddressChange_Claim']= data['AddressChange_Claim'].map(AddressChange_Claim_mapping)

NumberOfCars_mapping = {
    '3 to 4': np.mean([3, 4]),
    '1 vehicle': 1,
    '2 vehicles': 2,
    '5 to 8': np.mean([5, 8]),
    'more than 8': np.mean([8, 10])
}
data['NumberOfCars']= data['NumberOfCars'].map(NumberOfCars_mapping)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15419 entries, 0 to 15418
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Month                 15419 non-null  int64         
 1   WeekOfMonth           15419 non-null  int64         
 2   DayOfWeek             15419 non-null  int64         
 3   Make                  15419 non-null  object        
 4   AccidentArea          15419 non-null  object        
 5   DayOfWeekClaimed      15419 non-null  int64         
 6   MonthClaimed          15419 non-null  int64         
 7   WeekOfMonthClaimed    15419 non-null  int64         
 8   Sex                   15419 non-null  object        
 9   MaritalStatus         15419 non-null  object        
 10  Age                   15419 non-null  int64         
 11  Fault                 15419 non-null  object        
 12  PolicyType            15419 non-null  object        
 13  VehicleCategory 

In [5]:
data.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,Date,DateClaimed,Date_Diff
0,12,5,3,Honda,Urban,2,1,1,Female,Single,...,No,External,0.0,1.0,3.5,1994,Liability,1994-12-29,1994-01-05,-358
1,1,3,3,Honda,Urban,1,1,4,Male,Single,...,No,External,0.0,0.0,1.0,1994,Collision,1994-01-20,1994-01-25,5
2,10,5,5,Honda,Urban,4,11,2,Male,Married,...,No,External,0.0,0.0,1.0,1994,Collision,1994-10-29,1994-11-11,13
3,6,2,6,Toyota,Rural,5,7,1,Male,Married,...,No,External,6.0,0.0,1.0,1994,Liability,1994-06-12,1994-07-02,20
4,1,5,1,Honda,Urban,2,2,2,Female,Single,...,No,External,0.0,0.0,1.0,1994,Collision,1994-02-01,1994-02-09,8


In [6]:
data = data.drop(['Date', 'DateClaimed'], axis = 1)

In [7]:
#define a function for one-hot encoding and scaling
def encodeNscale(df):
    scaler = StandardScaler()
    encoder = OneHotEncoder(sparse=False)

    ohe_cols = data.select_dtypes(include=['object']).columns
    df_encoded = pd.DataFrame(encoder.fit_transform(df[ohe_cols]), columns=encoder.get_feature_names_out(ohe_cols))
    df = pd.concat([df, df_encoded], axis=1)
    df = df.drop(ohe_cols, axis=1)

    scaled_col = list(df.columns)
    scaled_col.remove('FraudFound_P')
    scaled_ds = pd.DataFrame(scaler.fit_transform(df[scaled_col].copy()),columns= scaled_col)
    return scaled_ds

In [8]:
#split X, y & train, validation, test
y = data['FraudFound_P']
X = encodeNscale(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [9]:
n_samples = len(data)
n_classes = 2
weight_0 = n_samples/(n_samples * np.bincount(y)[0])
weight_1 = n_samples/(n_samples * np.bincount(y)[1])
print(weight_0)
print(weight_1)

6.898454746136865e-05
0.0010834236186348862


In [10]:
model = LogisticRegression(class_weight={0:weight_0, 1:weight_1})
model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)
test_accuracy = model.score(X_test, y_test)
print("Testing Accuracy:", test_accuracy)
print(classification_report(y_test, model.predict(X_test)))

Training Accuracy: 0.6116740980948521
Testing Accuracy: 0.6206225680933852
              precision    recall  f1-score   support

           0       0.99      0.60      0.75      2885
           1       0.13      0.89      0.23       199

    accuracy                           0.62      3084
   macro avg       0.56      0.75      0.49      3084
weighted avg       0.93      0.62      0.71      3084



In [11]:
conf_matrix = confusion_matrix(y_test, model.predict(X_test))
print(conf_matrix)
tp = conf_matrix[1, 1]
fn = conf_matrix[1, 0]
recall = tp/(tp + fn)
print("Recall rate:", recall)

[[1737 1148]
 [  22  177]]
Recall rate: 0.8894472361809045


In [23]:
feature_importance = model.coef_
most_important_feature_index = feature_importance.argmax()
most_important_feature_name = X.columns[most_important_feature_index]
print('Most important feature index:', most_important_feature_index)
print('Most important feature name:', most_important_feature_name)
print('Most important feature coefficient:', feature_importance[0][14])

Most important feature index: 49
Most important feature name: Fault_Policy Holder
Most important feature coefficient: -0.02982806081942889


In [25]:
#model tuning
solver = ['liblinear', 'newton-cg', 'lbfgs']
C = [1, 5, 10]
max_iter = [50, 100]
for s in solver:
    for c in C:
        for i in max_iter:
            model = LogisticRegression(class_weight={0:weight_0, 1:weight_1}, solver=s, C=c, max_iter=i)
            model.fit(X_train, y_train)

            print("Solver: ", s)
            print("C: ", c)
            print("Max_iteration: ", i)

            train_accuracy = model.score(X_train, y_train)
            print("Training Accuracy:", train_accuracy)
            val_accuracy = model.score(X_val, y_val)
            print("Validation Accuracy:", val_accuracy)
            print(classification_report(y_val, model.predict(X_val)))

Solver:  liblinear
C:  1
Max_iteration:  50
Training Accuracy: 0.5436765301986218
Validation Accuracy: 0.542359140656668
              precision    recall  f1-score   support

           0       0.99      0.52      0.68      2339
           1       0.10      0.93      0.17       128

    accuracy                           0.54      2467
   macro avg       0.54      0.73      0.43      2467
weighted avg       0.95      0.54      0.66      2467

Solver:  liblinear
C:  1
Max_iteration:  100
Training Accuracy: 0.5436765301986218
Validation Accuracy: 0.542359140656668
              precision    recall  f1-score   support

           0       0.99      0.52      0.68      2339
           1       0.10      0.93      0.17       128

    accuracy                           0.54      2467
   macro avg       0.54      0.73      0.43      2467
weighted avg       0.95      0.54      0.66      2467

Solver:  liblinear
C:  5
Max_iteration:  50
Training Accuracy: 0.5982975273611674
Validation Accuracy: 

In [26]:
#since we focus on recall, the best parameter set is
#solver = liblinear, C = 5, max_iter = 50

#run on test set
model = LogisticRegression(class_weight={0:weight_0, 1:weight_1}, solver='liblinear', C=5, max_iter=50)
model.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)
test_accuracy = model.score(X_test, y_test)
print("Testing Accuracy:", test_accuracy)
print(classification_report(y_test, model.predict(X_test)))    

Training Accuracy: 0.5982975273611674
Testing Accuracy: 0.6102464332036317
              precision    recall  f1-score   support

           0       0.99      0.59      0.74      2885
           1       0.13      0.92      0.23       199

    accuracy                           0.61      3084
   macro avg       0.56      0.76      0.49      3084
weighted avg       0.94      0.61      0.71      3084



In [71]:
#Adjust threshold using Youden's Index
model = LogisticRegression(class_weight={0:weight_0, 1:weight_1}, solver='liblinear', C=10, max_iter=50)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
fpr, tpr, thresholds = roc_curve(y_val, y_val_pred[:, 1])
j_score = tpr-fpr
optimal_threshold = thresholds[np.argmax(j_score)]
print("Youden's Index:", max(j_score))
print("thresholds:", optimal_threshold)

y_test_pred = model.predict_proba(X_test)
preds = np.where(y_test_pred[:, 1]>optimal_threshold, 1, 0)
test_accuracy = accuracy_score(y_test, preds)
print("Test Accuracy:", test_accuracy)
print(classification_report(y_test, preds))

#recall and f1-score improved compared to baseline model

Youden's Index: 0.5386884085079093
thresholds: 0.5056284237242623
Test Accuracy: 0.6384565499351491
              precision    recall  f1-score   support

           0       0.99      0.62      0.76      2885
           1       0.14      0.88      0.24       199

    accuracy                           0.64      3084
   macro avg       0.56      0.75      0.50      3084
weighted avg       0.93      0.64      0.73      3084

