In [0]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 25 20:36:32 2020

@author: yash
"""

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import balanced_accuracy_score
import datetime as dt
import numpy as np
from sklearn.preprocessing import StandardScaler


import pandas as pd

# Load the transactions table from the working directory.
dataset = pd.read_csv('creditcard.csv')

# The first 30 columns are used as features and column index 30 as the label.
X = dataset.iloc[:, 0:30].values
Y = dataset.iloc[:, 30].values

# Stratified 75/25 split so both partitions keep the same class ratio.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, shuffle=True, random_state=42, stratify=Y
)

# Candidate inverse-regularisation strengths for the search below.
C_values = [0.05, 0.1, 0.25, 0.5]
max_bal_acc = 0
best_c = 0

# Fit the scaler on the training partition only, then apply it to both, so no
# test-set statistics leak into the scaling.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# NOTE(review): C is selected by its score on X_test, so max_bal_acc is an
# optimistic estimate; a validation split or cross-validation on the training
# partition would give an unbiased choice.
for c in C_values:
    model = LogisticRegression(max_iter=500, C=c)
    model.fit(X_train, Y_train)

    # Predict once per candidate; the previous extra model.score(...) call
    # discarded its result and only repeated this prediction pass.
    Y_pred = model.predict(X_test)

    bal_acc = balanced_accuracy_score(Y_test, Y_pred)
    # Strict '>' keeps the first (smallest) C when candidates tie.
    if bal_acc > max_bal_acc:
        max_bal_acc = bal_acc
        best_c = c

print("The maximum balaced accuracy acheived is: {}".format(max_bal_acc))
print("The best value of C is:{}".format(best_c))




The maximum balaced accuracy acheived is: 0.8088446074601294
The best value of C is:0.05


As we have identified the value of C that achieves the maximum balanced accuracy, we will use it and experiment with different data splits to identify the best split.

In [0]:
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, accuracy_score

# Per-split result accumulators: entry i of every list belongs to splits[i].
train_time = []
test_time = []
accuracy = []
precision = []
recall = []
auc = []
balanced = []
f1 = []
splits = []
models = []
roc = []
pr = []
conf_mat = []

# for each split percent (10%, 20%, ..., 50% of the data held out for testing)
for split in range(10, 51, 10):
    splits.append(split)
    # NOTE(review): stratify=None means the number of positive samples in the
    # test partition varies from split to split; the earlier C search used
    # stratify=Y. Confirm whether the unstratified split is intentional.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=split / 100, shuffle=True, random_state=42, stratify=None
    )

    # Scale using statistics from the training partition only.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # train -- the timer wraps model construction and fitting only
    start1 = dt.datetime.now()
    model = LogisticRegression(max_iter=500, C=best_c)
    model.fit(X_train, Y_train)
    end1 = dt.datetime.now()
    models.append(model)

    # test -- the timer wraps prediction only, not reporting or curve building
    start2 = dt.datetime.now()
    Y_pred = model.predict(X_test)
    end2 = dt.datetime.now()

    # Continuous score for the positive class (second column follows
    # model.classes_ order). Threshold-based curves and ROC AUC need scores,
    # not hard labels: with hard labels the curves have a single operating
    # point and the AUC just reproduces balanced accuracy.
    Y_score = model.predict_proba(X_test)[:, 1]

    print(classification_report(Y_test, Y_pred))
    fpr, tpr, _ = roc_curve(Y_test, Y_score)
    roc.append([tpr, fpr])  # for roc curve (stored as [tpr, fpr])
    p, r, _ = precision_recall_curve(Y_test, Y_score)
    pr.append([p, r])   # for pr curve
    cfm = confusion_matrix(Y_test, Y_pred)
    print(cfm)
    print("----------")
    conf_mat.append(cfm)   # for confusion matrix later

    # score
    train_time.append(end1 - start1)
    test_time.append(end2 - start2)
    accuracy.append(accuracy_score(Y_test, Y_pred))
    precision.append(precision_score(Y_test, Y_pred))
    recall.append(recall_score(Y_test, Y_pred))
    auc.append(roc_auc_score(Y_test, Y_score))
    balanced.append(balanced_accuracy_score(Y_test, Y_pred))
    f1.append(f1_score(Y_test, Y_pred))

# Summary of every metric, one section per split size.
for split_pct, acc, prec, rec, auc_val, bal, f1_val in zip(
    splits, accuracy, precision, recall, auc, balanced, f1
):
    print("----------")
    print("Logistic Regression at {}% split:".format(split_pct))
    print("Accuracy score = {}".format(acc))
    print("Precision score = {}".format(prec))
    print("Recall score = {}".format(rec))
    print("AUC_ROC score = {}".format(auc_val))
    print("Balanced accuracy score = {}".format(bal))
    print("F1 score = {}".format(f1_val))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28435
           1       0.83      0.63      0.72        46

    accuracy                           1.00     28481
   macro avg       0.91      0.82      0.86     28481
weighted avg       1.00      1.00      1.00     28481

[[28429     6]
 [   17    29]]
----------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.86      0.57      0.69        98

    accuracy                           1.00     56962
   macro avg       0.93      0.79      0.84     56962
weighted avg       1.00      1.00      1.00     56962

[[56855     9]
 [   42    56]]
----------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.88      0.62      0.73       136

    accuracy                           1.00     85443
   macro avg       0.94      0.81      0.86 

The balanced accuracy scores from the above results do not follow a consistent trend: they increase and then decrease as the test size grows. Moreover, the accuracy of each of these models is almost a perfect score (100%).

On the basis of these results, we can conclude that the model is overfitted and is not suitable for classification.

Also, from the above results, we can say that almost every new transaction that needs classification will have a higher probability of getting classified as a legitimate transaction. This is due to the high imbalance of the training dataset. 

To overcome this issue, we need to bring the data of the fraudulent and non-fraudulent transactions to equal or almost equal percentages.

As we cannot generate fraud data, Under Sampling is performed.

The results of Logistic Regression on Under Sampled data are analyzed in a different file. Please find the file in the submission.