# Basic data import and pre-processing

In [None]:
# Basic data import and pre-processing
# Imports

import glob
import string
import ast

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
from scipy import interp
from itertools import cycle
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier, export_graphviz 
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

#metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix, f1_score, recall_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve

In [None]:
# Load the insurance-fraud workbook into `df`.
# The location can be overridden without editing the notebook by setting the
# FRAUD_DATA_PATH environment variable; when it is unset the original
# hard-coded path is used, so existing setups keep working.
import os

import pandas as pd

DATA_PATH = os.environ.get(
    'FRAUD_DATA_PATH',
    r'/Users/sunqiaoyubing/Downloads/I_Fraud.xlsx',
)
df = pd.read_excel(DATA_PATH)

# Show the size and a short preview instead of printing the whole frame.
print(df.shape)
df.head()

In [None]:
# Ordinal (integer) encoding of the categorical columns.
# NOTE: this is NOT one-hot encoding -- OrdinalEncoder replaces each listed
# column with a single integer-coded column; no indicator columns are created.
import category_encoders as ce

# Every column is listed exactly once so that no column is handed to the
# encoder more than one time.
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
    'MonthClaimed', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType',
    'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident',
    'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
    'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
    'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy',
]
encoder = ce.OrdinalEncoder(cols=categorical_cols)

# This overwrites `df` with its encoded version; re-run the loading cell first
# if this cell has to be executed again on the raw data.
df = encoder.fit_transform(df)
df.info()

In [None]:
# Separate the label column ('FraudFound_P') from the predictor columns.
y_all = df['FraudFound_P']
X_all = df.drop(columns=['FraudFound_P'])

In [None]:
# Split the data into train (80%) and test (20%) partitions.
from sklearn.model_selection import train_test_split

# Stratify on the label so both partitions keep the same positive-class share
# (the two percentages printed below); random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=25, stratify=y_all
)

print(f'''% Positive class in Train = {np.round(y_train.value_counts(normalize=True)[1] * 100, 2)}
% Positive class in Test  = {np.round(y_test.value_counts(normalize=True)[1] * 100, 2)}''')

# Modelling

## Logistic Regression

In [None]:
# Unpenalised logistic regression (statsmodels), used for its coefficient table.
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so a constant column is
# appended explicitly; without it the model is forced through the origin.
logit_model = sm.Logit(y_train, sm.add_constant(X_train))
result = logit_model.fit()
print(result.summary2())

In [None]:
# Grid search cross validation over the regularisation strength and penalty type.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# l1 = lasso, l2 = ridge
grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}

# The 'liblinear' solver handles both l1 and l2. The default solver ('lbfgs')
# rejects l1, which makes every l1 candidate in the grid fail to fit.
logreg = LogisticRegression(solver="liblinear")
logreg_cv = GridSearchCV(logreg, grid, cv=10)
logreg_cv.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
# Final logistic-regression model: ridge (l2) penalty with C = 0.01
# (presumably the values selected by the grid search -- confirm against its output).
logreg = LogisticRegression(C=0.01, penalty='l2')
logreg.fit(X_train, y_train)

In [None]:
# Hard class predictions on the held-out set (reused by the metric cells that
# follow), plus the model's accuracy on that same set.
y_pred = logreg.predict(X_test)
print(
    'Accuracy of logistic regression classifier on test set: {:.2f}'.format(
        logreg.score(X_test, y_test)
    )
)

In [None]:
# Text summary of the main classification metrics for the logistic regression.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# ROC curve for the logistic regression on the test set.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Positive-class probabilities drive both the curve and its area. Scoring the
# hard 0/1 predictions instead would report the area of a single-threshold
# curve, which does not match the curve drawn below.
logreg_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, logreg_proba)
fpr, tpr, thresholds = roc_curve(y_test, logreg_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve: Logistic Regreesion model for Fraud Detection ')
plt.legend(loc="lower right")
# Save the figure to the working directory before showing it.
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Share of rows labelled 1 in the train and test partitions, as percentages
# rounded to two decimals.
print(f'''% Positive class in Train = {np.round(y_train.value_counts(normalize=True)[1] * 100, 2)}
% Positive class in Test  = {np.round(y_test.value_counts(normalize=True)[1] * 100, 2)}''')

In [None]:
# Confusion matrix and headline metrics for the logistic regression.
from sklearn.metrics import recall_score

# Evaluate
print(f'Accuracy = {accuracy_score(y_test, y_pred):.2f}\nRecall = {recall_score(y_test, y_pred):.2f}\n')

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.title('Confusion Matrix (without Resampling)', size=16)
# fmt='d' prints the cell counts as whole numbers; seaborn's default number
# format abbreviates larger counts (e.g. into scientific notation).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true classes on rows and predicted classes on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label');

In [None]:
# Average precision (AP) of the logistic regression, computed from its
# positive-class probabilities on the test set.
y_score = logreg.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_true=y_test, y_score=y_score)
print(average_precision)

In [None]:
# Points of the precision-recall curve for the current y_score ...
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# ... and the area under that curve, with recall on x and precision on y.
auc_precision_recall = auc(x=recall, y=precision)
print(auc_precision_recall)

In [None]:
# Precision-recall plot for the logistic regression, titled with the AP value
# computed earlier in this section.
disp = plot_precision_recall_curve(estimator=logreg, X=X_test, y=y_test)
disp.ax_.set_title(
    'Binary class Precision-Recall curve: '
    'AP={0:0.2f}'.format(average_precision)
)

In [None]:
# Matthews correlation coefficient (MCC) of the logistic-regression predictions.
from sklearn.metrics import matthews_corrcoef

matthews_corrcoef(y_true=y_test, y_pred=y_pred)

## Decision Tree

In [None]:
# Decision tree with the Gini criterion, limited to depth 10 and at least
# 5 samples per leaf. random_state is fixed (same seed as the train/test
# split) so that re-running the notebook rebuilds the same tree.
clf_gini = DecisionTreeClassifier(
    criterion='gini', max_depth=10, min_samples_leaf=5, random_state=25
)

# fit the model
clf_gini.fit(X_train, y_train)

In [None]:
# Predict with the fitted decision tree and draw it.
from sklearn import tree

# Test-set predictions from the tree fitted in the previous cell; the metric
# cells that follow reuse y_pred.
y_pred = clf_gini.predict(X_test)

plt.figure(figsize=(16,13))
# Draw the already-fitted tree. Refitting here would replace the model after
# y_pred was computed, so later cells could score a different tree than the
# one that produced y_pred.
tree.plot_tree(clf_gini);

In [None]:
# Text summary of the main classification metrics for the decision tree.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Evaluate: confusion matrix and headline metrics for the decision tree.
print(f'Accuracy = {accuracy_score(y_test, y_pred):.2f}\nRecall = {recall_score(y_test, y_pred):.2f}\n')

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.title('Confusion Matrix (without Resampling)', size=16)
# fmt='d' prints the cell counts as whole numbers; seaborn's default number
# format abbreviates larger counts (e.g. into scientific notation).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true classes on rows and predicted classes on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label');

In [None]:
# Average precision (AP) of the decision tree, computed from its
# positive-class probabilities on the test set.
y_score = clf_gini.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_true=y_test, y_score=y_score)
print(average_precision)

In [None]:
# Points of the precision-recall curve for the current y_score ...
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# ... and the area under that curve, with recall on x and precision on y.
auc_precision_recall = auc(x=recall, y=precision)
print(auc_precision_recall)

In [None]:
# Precision-recall plot for the decision tree, titled with the AP value
# computed earlier in this section.
disp = plot_precision_recall_curve(estimator=clf_gini, X=X_test, y=y_test)
disp.ax_.set_title(
    'Binary class Precision-Recall curve: '
    'AP={0:0.2f}'.format(average_precision)
)

In [None]:
# Matthews correlation coefficient (MCC) of the decision-tree predictions.
from sklearn.metrics import matthews_corrcoef

matthews_corrcoef(y_true=y_test, y_pred=y_pred)

## Random Forest

In [None]:
# Random forest with default hyper-parameters. random_state is fixed (same
# seed as the train/test split) so that re-running the notebook rebuilds the
# same forest and reproduces the metrics below.
rf = RandomForestClassifier(random_state=25)
rf.fit(X_train, y_train)

In [None]:
# Class predictions from the random forest on the held-out set; the metric
# cells that follow reuse y_pred.
y_pred = rf.predict(X_test)
print(y_pred)


In [None]:
# Text summary of the main classification metrics for the random forest.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Evaluate: confusion matrix and headline metrics for the random forest.
print(f'Accuracy = {accuracy_score(y_test, y_pred):.2f}\nRecall = {recall_score(y_test, y_pred):.2f}\n')

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.title('Confusion Matrix (without Resampling)', size=16)
# fmt='d' prints the cell counts as whole numbers; seaborn's default number
# format abbreviates larger counts (e.g. into scientific notation).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true classes on rows and predicted classes on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label');

In [None]:
# Average precision (AP) of the random forest, computed from its
# positive-class probabilities on the test set.
y_score = rf.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_true=y_test, y_score=y_score)
print(average_precision)

In [None]:
# Points of the precision-recall curve for the current y_score ...
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# ... and the area under that curve, with recall on x and precision on y.
auc_precision_recall = auc(x=recall, y=precision)
print(auc_precision_recall)

In [None]:
# Precision-recall plot for the random forest, titled with the AP value
# computed earlier in this section.
disp = plot_precision_recall_curve(estimator=rf, X=X_test, y=y_test)
disp.ax_.set_title(
    'Binary class Precision-Recall curve: '
    'AP={0:0.2f}'.format(average_precision)
)

In [None]:
# Matthews correlation coefficient (MCC) of the random-forest predictions.
from sklearn.metrics import matthews_corrcoef

matthews_corrcoef(y_true=y_test, y_pred=y_pred)

## Support Vector Machine

In [None]:
# RBF-kernel support vector classifier trained on standardised features.
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVMs are sensitive to feature scale, so the features are standardised inside
# a pipeline: the scaler is fitted on the training data only and then applied
# automatically whenever clf_svm predicts or scores. The pipeline exposes the
# same predict / decision_function interface the later cells rely on.
clf_svm = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf'))
clf_svm.fit(X_train,y_train)

In [None]:
# Class predictions from the SVM on the held-out set, followed by the text
# summary of the main classification metrics.
y_pred = clf_svm.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Continuous decision scores from the SVM for the test set; the
# average-precision and precision-recall cells below use them as y_score.
y_score = clf_svm.decision_function(X=X_test)
print(y_score)

In [None]:
# Evaluate: confusion matrix and headline metrics for the SVM.
print(f'Accuracy = {accuracy_score(y_test, y_pred):.2f}\nRecall = {recall_score(y_test, y_pred):.2f}\n')

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.title('Confusion Matrix (without Resampling)', size=16)
# fmt='d' prints the cell counts as whole numbers; seaborn's default number
# format abbreviates larger counts (e.g. into scientific notation).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true classes on rows and predicted classes on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label');

In [None]:
# Average precision (AP) of the SVM, computed from the decision scores stored
# in y_score by the earlier cell of this section.
average_precision = average_precision_score(y_true=y_test, y_score=y_score)
print(average_precision)

In [None]:
# Points of the precision-recall curve for the current y_score ...
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# ... and the area under that curve, with recall on x and precision on y.
auc_precision_recall = auc(x=recall, y=precision)
print(auc_precision_recall)

In [None]:
# Precision-recall plot for the SVM, titled with the AP value computed
# earlier in this section.
disp = plot_precision_recall_curve(estimator=clf_svm, X=X_test, y=y_test)
disp.ax_.set_title(
    'Binary class Precision-Recall curve: '
    'AP={0:0.2f}'.format(average_precision)
)

In [None]:
# Matthews correlation coefficient (MCC) of the SVM predictions.
from sklearn.metrics import matthews_corrcoef

matthews_corrcoef(y_true=y_test, y_pred=y_pred)