In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot

In [43]:
# import data
df = pd.read_csv('creditcard.csv')

# rename class name by label
class_names = {0: 'Not Fraud', 1: 'Fraud'}
df['Class'] = df['Class'].apply(lambda x: class_names.get(x))

# count the number of fraud in dataset
df.Class.value_counts()
num_frauds = len(df[df['Class'] == 'Fraud'])
num_not_frauds = len(df[df['Class'] == 'Not Fraud'])
proba_fraud = num_frauds/(num_frauds + num_not_frauds)
print(f'Proba of fraud: {proba_fraud}')

Proba of fraud: 0.001727485630620034


In [44]:
# split data into train and test
from sklearn.model_selection import train_test_split

Y = df['Class']
X = df.drop(['Class'], axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=420)


In [46]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
# create models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, \
    RandomForestClassifier, ExtraTreesClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('SVC', SVC()))
models.append(('Ada', AdaBoostClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
models.append(('MLP', MLPClassifier()))
models.append(('RFC', RandomForestClassifier()))
models.append(('ETC', ExtraTreesClassifier()))
models.append(('KNN', KNeighborsClassifier()))

In [None]:
# train models
from sklearn.model_selection import KFold, cross_val_score

def train_models_with_kfold(num_folds, scoring):
    print(f"Training with: {scoring}")
    results, names = [], []
    for name, model in models:
        kfold = KFold(n_splits=num_folds)
        cv_results = cross_val_score(model, X_train_scaled, Y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    return results, names

results_accuracy, names = train_models_with_kfold(10, 'accuracy')
results_recall, names = train_models_with_kfold(10, 'recall')



In [None]:
# compare algorithms
import matplotlib.pyplot as plt

fig = plt.figure()
fig.subtitle('Algorithm Comparison')
plt.boxplot(results_accuracy)
plt.show()

In [49]:
# Test performance on best model - LDA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)
# Metric 1 - accuracy score
accuracy_lda = accuracy_score(predictions, Y_test)
print(accuracy_lda)
# Metric 2 - Classification Report
classification_report_lda = classification_report(predictions, Y_test)
print(classification_report_lda)
# Metric 3 - Confusion Matrix
confusion_matrix_lda = confusion_matrix(predictions, Y_test)
print(confusion_matrix_lda)

0.9995084442259752
              precision    recall  f1-score   support

       Fraud       0.82      0.89      0.85        90
   Not Fraud       1.00      1.00      1.00     56872

    accuracy                           1.00     56962
   macro avg       0.91      0.94      0.93     56962
weighted avg       1.00      1.00      1.00     56962

[[   80    10]
 [   18 56854]]
