<a href="https://colab.research.google.com/github/yeyi-alice/Vehicle-Insurance-Fraud-Detection/blob/main/knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [360]:
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(123)


In [361]:
path = "data_cleaned.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,Date,DateClaimed,Date_Diff
0,12,5,3,Honda,Urban,2,1,1,Female,Single,...,No,External,none,1 year,3 to 4,1994,Liability,1994-12-29,1994-01-05,-358
1,1,3,3,Honda,Urban,1,1,4,Male,Single,...,No,External,none,no change,1 vehicle,1994,Collision,1994-01-20,1994-01-25,5
2,10,5,5,Honda,Urban,4,11,2,Male,Married,...,No,External,none,no change,1 vehicle,1994,Collision,1994-10-29,1994-11-11,13
3,6,2,6,Toyota,Rural,5,7,1,Male,Married,...,No,External,more than 5,no change,1 vehicle,1994,Liability,1994-06-12,1994-07-02,20
4,1,5,1,Honda,Urban,2,2,2,Female,Single,...,No,External,none,no change,1 vehicle,1994,Collision,1994-02-01,1994-02-09,8


In [362]:
#remove policy number column (irrelevant)
data.drop(['PolicyNumber'], axis=1, inplace=True)
data.drop(['Date'], axis=1, inplace=True)
data.drop(['DateClaimed'], axis=1, inplace=True)

#create 2 new columns, category and coverage based on existing PolicyType column
def extract_categories(policytype):
    category, coverage = policytype.split(' - ')
    return category, coverage

# Apply the custom function to create two new columns
data[['PolicyCategory', 'PolicyCoverage']] = data['PolicyType'].apply(extract_categories).apply(pd.Series)

data.drop(['PolicyType'], axis=1, inplace=True)
data.drop(['BasePolicy'], axis=1, inplace=True)
data.drop(['Month', 'WeekOfMonth', 'DayOfWeek', 'DayOfWeekClaimed',	'MonthClaimed',	'WeekOfMonthClaimed'], axis=1, inplace=True)  #, 'DayOfWeekClaimed',	'MonthClaimed',	'WeekOfMonthClaimed'

In [363]:
#train test split 80%
df = data
col_names = df.columns
feature_cols = [col for col in col_names if col not in ["FraudFound_P"]]
label_col = "FraudFound_P"
X = df[feature_cols]
y = df[label_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [364]:
#Convert numerical variables to categorical variables, excluding Age and Date_Diff

#'Month', 'WeekOfMonth', 'DayOfWeek', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed',
numerical_cols = ['RepNumber', 'Deductible', 'DriverRating', 'Year'] #, 'Year'
X_train[numerical_cols] = X_train[numerical_cols].astype('category')
X_test[numerical_cols] = X_test[numerical_cols].astype('category')


In [365]:
#bin numerical feature 'Age'

bin_edges = [0, 18, 35, 50, 65, 80, 100]
bin_labels = ['0-18', '19-35', '36-50', '51-65', '66-80', '80-100']
X_train['Age'] = pd.cut(X_train['Age'], bins=bin_edges, labels=bin_labels, right=False)
X_test['Age'] = pd.cut(X_test['Age'], bins=bin_edges, labels=bin_labels, right=False)

In [366]:
# Scale Date_Diff

scaler = StandardScaler()
X_train['Date_Diff'] = scaler.fit_transform(X_train[['Date_Diff']]) # fit scaler on the training set and transform
X_test['Date_Diff'] = scaler.transform(X_test[['Date_Diff']]) #transform testing set

In [367]:
# Prepare to Encode

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import make_pipeline

#'Month', 'WeekOfMonth', 'DayOfWeek',
#'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed',

unchanged_cols_names = ['AccidentArea',
                       'Sex' , 'Fault', 'RepNumber', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'Date_Diff']

# List of columns for one-hot encoding
ohe_cols = ['Make', 'MaritalStatus', 'PolicyCategory', 'PolicyCoverage', 'VehicleCategory', 'Deductible', 'Year'] #, 'Year'

# List of columns for ordinal encoding
ordinal_cols = ['Age', 'DriverRating', 'VehiclePrice', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars']

#categories for ordinal encoder
age_cat = ['0-18', '19-35', '36-50', '51-65', '66-80', '80-100']
driverrating_cat = ['1', '2', '3', '4']
vprice_cat = ['more than 69000', '20000 to 29000', '30000 to 39000', 'less than 20000', '40000 to 59000', '60000 to 69000']
days_pol_acc_cat = ['none', '1 to 7', '8 to 15', '15 to 30', 'more than 30']
Days_Policy_Claim_cat = ['8 to 15', 'more than 30', '15 to 30' ]
PastNumberOfClaims_cat = ['none', '1', '2 to 4', 'more than 4']
AgeOfVehicle_cat = ['new', '2 years', '3 years',  '4 years', '5 years', '6 years', '7 years', 'more than 7' ]
AgeOfPolicyHolder_cat = [ '16 to 17', '18 to 20', '21 to 25', '26 to 30', '31 to 35', '36 to 40', '41 to 50', '51 to 65', 'over 65']
NumberOfSuppliments_cat = ['none', '1 to 2', '3 to 5', 'more than 5']
AddressChange_Claim_cat = ['no change', 'under 6 months', '1 year','2 to 3 years', '4 to 8 years']
NumberOfCars_cat = ['1 vehicle', '2 vehicles', '3 to 4', '5 to 8', 'more than 8']


In [368]:
# Create encoders
ohe = OneHotEncoder(sparse_output=False)
ordinal_encoder = OrdinalEncoder(categories=[age_cat, driverrating_cat, vprice_cat, days_pol_acc_cat, Days_Policy_Claim_cat, PastNumberOfClaims_cat,
                                            AgeOfVehicle_cat, AgeOfPolicyHolder_cat, NumberOfSuppliments_cat, AddressChange_Claim_cat, NumberOfCars_cat])

# Create a ColumnTransformer
feature_transform = ColumnTransformer(
    transformers=[
        ('one_hot', ohe, ohe_cols),
        ('ordinal', ordinal_encoder, ordinal_cols)
    ],
    remainder='passthrough'
)


In [369]:
# Fit and transform on training set
X_train_fea = feature_transform.fit_transform(X_train)
X_test_fea = feature_transform.transform(X_test)

# Get the feature names for the one-hot encoded columns
ohe_feature_names = feature_transform.named_transformers_['one_hot'].get_feature_names_out(input_features=ohe_cols)

# Combine feature names for the one-hot encoded and ordinal columns amd binary columns and numerical columns
all_feature_names = list(ohe_feature_names) + ordinal_cols + unchanged_cols_names

# Create DataFrames with the transformed features and proper column names
X_train_fea = pd.DataFrame(X_train_fea, columns=all_feature_names)
X_test_fea = pd.DataFrame(X_test_fea, columns=all_feature_names)

In [370]:
#binary encoding: Sex, AccidentArea, Fault, PoliceReportFiled, WitnessPresent, AgentType

binary_column_list = ['Sex', 'AccidentArea', 'Fault', 'PoliceReportFiled', 'WitnessPresent', 'AgentType']
# X_train_binary = X_train[binary_column_list]
# X_train_binary
X_train_fea['Sex'] = X_train_fea['Sex'].map({'Female': 1, 'Male': 0})
X_test_fea['Sex'] = X_test_fea['Sex'].map({'Female': 1, 'Male': 0})
X_train_fea['AccidentArea'] = X_train_fea['AccidentArea'].map({'Urban': 1, 'Rural': 0})
X_test_fea['AccidentArea'] = X_test_fea['AccidentArea'].map({'Urban': 1, 'Rural': 0})
X_train_fea['Fault'] = X_train_fea['Fault'].map({'Policy Holder': 1, 'Third Party': 0})
X_test_fea['Fault'] = X_test_fea['Fault'].map({'Policy Holder': 1, 'Third Party': 0})
X_train_fea['PoliceReportFiled'] = X_train_fea['PoliceReportFiled'].map({'Yes': 1, 'No': 0})
X_test_fea['PoliceReportFiled'] = X_test_fea['PoliceReportFiled'].map({'Yes': 1, 'No': 0})
X_train_fea['WitnessPresent'] = X_train_fea['WitnessPresent'].map({'Yes': 1, 'No': 0})
X_test_fea['WitnessPresent'] = X_test_fea['WitnessPresent'].map({'Yes': 1, 'No': 0})
X_train_fea['AgentType'] = X_train_fea['AgentType'].map({'External': 1, 'Internal': 0})
X_test_fea['AgentType'] = X_test_fea['AgentType'].map({'External': 1, 'Internal': 0})

Model fitting on raw train and test data

In [371]:
# Fit KNN model (initial)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report

knn = KNeighborsClassifier(n_neighbors=5)
model = knn.fit(X_train_fea, y_train)
knn_predict = knn.predict(X_test_fea)

knn_conf_matrix = confusion_matrix(y_test, knn_predict)

knn_train_acc_score = accuracy_score(y_train, knn.predict(X_train_fea))
knn_test_acc_score = accuracy_score(y_test, knn_predict)

print("test confusion matrix")
print(knn_conf_matrix)
print("\n")
print("Train Accuracy of k-NN Classification:",knn_train_acc_score*100,'\n')
print("Test Accuracy of k-NN Classification:",knn_test_acc_score*100,'\n')
print(classification_report(y_test, knn_predict))

test confusion matrix
[[2876    9]
 [ 195    4]]


Train Accuracy of k-NN Classification: 94.44669639237941 

Test Accuracy of k-NN Classification: 93.3852140077821 

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2885
           1       0.31      0.02      0.04       199

    accuracy                           0.93      3084
   macro avg       0.62      0.51      0.50      3084
weighted avg       0.90      0.93      0.91      3084



In [372]:
#Balance the data using SMOTE
from imblearn.over_sampling import SMOTE
import seaborn as sns

OS_SMOTE = SMOTE()
X_train_fea_os, y_train_os = OS_SMOTE.fit_resample(X_train_fea, y_train)

# oversample_plot = y_train_os.value_counts().reset_index()
# oversample_plot.columns = ['Labels', 'FraudFound_P']
# print(oversample_plot)
# sns.barplot(x='Labels', y='FraudFound_P', data=oversample_plot)
# plt.title("Status after upsampling")

knn = KNeighborsClassifier(n_neighbors=5)
model = knn.fit(X_train_fea_os, y_train_os)
knn_predict = knn.predict(X_test_fea)

knn_conf_matrix = confusion_matrix(y_test, knn_predict)

knn_train_acc_score = accuracy_score(y_train_os, knn.predict(X_train_fea_os))
knn_test_acc_score = accuracy_score(y_test, knn_predict)

print("test confusion matrix")
print(knn_conf_matrix)
print("\n")
# print("Train Accuracy of k-NN Classification:",knn_train_acc_score*100,'\n')
print("Test Accuracy of k-NN Classification:",knn_test_acc_score*100,'\n')
print(classification_report(y_test, knn_predict))

tn, fp, fn, tp = knn_conf_matrix.ravel()

# Calculate Sensitivity and Specificity
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Calculate Youden Index
youden_index = sensitivity + specificity - 1
print("Youden Index: ", youden_index)

test confusion matrix
[[2092  793]
 [ 110   89]]


Test Accuracy of k-NN Classification: 70.71984435797665 

              precision    recall  f1-score   support

           0       0.95      0.73      0.82      2885
           1       0.10      0.45      0.16       199

    accuracy                           0.71      3084
   macro avg       0.53      0.59      0.49      3084
weighted avg       0.90      0.71      0.78      3084

Youden Index:  0.1723661635735001


In [373]:
# Tune KNN model
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


# Define the parameter grid for k
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15]}

# Create a k-NN classifier
knn2 = KNeighborsClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn2, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_fea_os, y_train_os)

# Print the best parameters and corresponding accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}".format(grid_search.best_score_))

# Evaluate on the test set
best_knn = grid_search.best_estimator_
test_accuracy = best_knn.score(X_test_fea, y_test)
print("Test Accuracy: {:.2f}".format(test_accuracy))

Best Parameters:  {'n_neighbors': 3}
Best Accuracy: 0.88
Test Accuracy: 0.75


In [374]:
# Fit KNN model after Grid Search and SMOTE. k = 3
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report

knn = KNeighborsClassifier(n_neighbors=3)
model = knn.fit(X_train_fea_os, y_train_os)
knn_predict = knn.predict(X_test_fea)

knn_conf_matrix = confusion_matrix(y_test, knn_predict)

knn_train_acc_score = accuracy_score(y_train_os, knn.predict(X_train_fea_os))
knn_test_acc_score = accuracy_score(y_test, knn_predict)

print("test confusion matrix")
print(knn_conf_matrix)
print("\n")
print("Test Accuracy of k-NN Classification:",knn_test_acc_score*100,'\n')
print(classification_report(y_test, knn_predict))

test confusion matrix
[[2232  653]
 [ 122   77]]


Test Accuracy of k-NN Classification: 74.87029831387808 

              precision    recall  f1-score   support

           0       0.95      0.77      0.85      2885
           1       0.11      0.39      0.17       199

    accuracy                           0.75      3084
   macro avg       0.53      0.58      0.51      3084
weighted avg       0.89      0.75      0.81      3084



In [377]:
#Tune using threshold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Train KNN model
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train_fea_os, y_train_os)

# Obtain predicted probabilities for each class
y_probabilities = knn_model.predict_proba(X_test_fea)

# Adjust threshold and make predictions
threshold = 0.3
y_pred_adjusted = (y_probabilities[:, 1] > threshold).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)

# Calculate sensitivity and specificity
sensitivity = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])

# Calculate Youden Index
youden_index = sensitivity + specificity - 1

# Display the results
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Youden Index: {youden_index:.4f}")

# Print the classification report for additional metrics
print("Classification Report with Adjusted Threshold:")
print(classification_report(y_test, y_pred_adjusted))
print(conf_matrix)
print("accuracy: ", accuracy_score(y_test, y_pred_adjusted))

Sensitivity: 0.5578
Specificity: 0.6579
Youden Index: 0.2157
Classification Report with Adjusted Threshold:
              precision    recall  f1-score   support

           0       0.96      0.66      0.78      2885
           1       0.10      0.56      0.17       199

    accuracy                           0.65      3084
   macro avg       0.53      0.61      0.48      3084
weighted avg       0.90      0.65      0.74      3084

[[1898  987]
 [  88  111]]
accuracy:  0.6514267185473411
