In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import scipy as sp
import pandas as pd
import csv
import random as rn
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import scipy.io as sio
import math

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.mlab as mlab
from sklearn import preprocessing

In [2]:
data = pd.read_csv('creditcard.csv')
df_ccf = pd.DataFrame(data)

In [3]:
number_fraud = len(data[data.Class == 1])
number_no_fraud = len(data[data.Class == 0])
print('Total cases: {}'.format(number_no_fraud + number_fraud))
print('Number of fraud: {}'.format(number_fraud))
print('Number of no fraud: {}'.format(number_no_fraud))

Total cases: 284807
Number of fraud: 492
Number of no fraud: 284315


In [4]:
df_train_1 = df_ccf[df_ccf['Class'] == 1]
df_train_0 = df_ccf[df_ccf['Class'] == 0]
len_df_train_1 = len(df_train_1)
print('Number of frauds in this training dataset: {}'.format(len_df_train_1))
# Append similar number of non-fraud samples into this training dataset
df_sample = df_train_0.sample(500)
df_train = df_train_1.append(df_sample)
# Mix our dataset
df_train = df_train.sample(frac=1)

Number of frauds in this training dataset: 492


In [5]:
# Drop Time and the Class (label)
X = df_train.drop(['Time', 'Class'],axis=1)
Y = df_train['Class'] # We create our label
X = np.asarray(X)
Y = np.asarray(Y)

## Partition 2 (80/20 Split)

### SVM

In [6]:
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size = 0.2 ,random_state = 42)

In [7]:
# Calculate error given feature vectors X and labels Y.
def calc_error(X, Y, classifier):
    Y_pred = classifier.predict(X)   
    e = 1 - accuracy_score(Y, Y_pred) 
    return e

In [8]:
C_list = [0.1, 1, 10, 100, 1000]
opt_e_training = 1.0   # Optimal training error.
opt_classifier = None  # Optimal classifier.
opt_C          = None  # Optimal C.

for C in C_list:
    # Create a linear SVM classifier.
    classifier = svm.LinearSVC(penalty='l2', loss='hinge', C=C, max_iter=10000)
    
    # Use the classifier to fit the training set (use X_train, Y_train).
    classifier.fit(X_train_val, Y_train_val)

    # Obtain the weights and bias from the linear SVM classifier.
    W = classifier.coef_[0]
    b = classifier.intercept_[0]
    
    # Show decision boundary, training error and test error.
    print('C = {}'.format(C))
    print('Decision boundary: {:.3f}x0+{:.3f}x1+{:.3f}=0'.format(W[0],W[1],b))
    #vis(X_train_val, Y_train_val, W, b)
    e_training = calc_error(X_train_val, Y_train_val, classifier)
    print('Training accuracy: {}'.format(1 - e_training))
    print('\n\n\n')
    
    # Judge if it is the optimal one.
    if e_training < opt_e_training:
        opt_e_training = e_training
        opt_classifier = classifier
        opt_C = C

C = 0.1
Decision boundary: -0.120x0+-0.194x1+-1.169=0
Training accuracy: 0.9482976040353089




C = 1
Decision boundary: -0.209x0+-0.348x1+-1.356=0
Training accuracy: 0.9129886506935687




C = 10
Decision boundary: 0.155x0+0.084x1+-1.910=0
Training accuracy: 0.9344262295081968




C = 100
Decision boundary: 2.002x0+3.968x1+-6.014=0
Training accuracy: 0.9003783102143758




C = 1000
Decision boundary: 3.159x0+8.215x1+-8.780=0
Training accuracy: 0.78562421185372






In [9]:
# Obtain the weights and bias from the best linear SVM classifier.
opt_W = opt_classifier.coef_[0]
opt_b = opt_classifier.intercept_[0]
print('Best parameter C* = {}'.format(opt_C))
print('Decision boundary: {:.3f}x0+{:.3f}x1+{:.3f}=0'.format(opt_W[0],opt_W[1],opt_b))
#vis(X_test, Y_test, opt_W, opt_b)
print('Testing Accuracy: {}'.format(1 - calc_error(X_test, Y_test, opt_classifier)))

Best parameter C* = 0.1
Decision boundary: -0.120x0+-0.194x1+-1.169=0
Testing Accuracy: 0.9296482412060302


### Logistic Regression

In [10]:
classifier = LogisticRegressionCV()
param = [
  {'penalty': ['l1'], 'solver': ['liblinear']},
  {'penalty': ['l2'], 'solver': ['newton-cg']},
 ]
logit = GridSearchCV(classifier, param, cv=5, return_train_score=True)
logit.fit(X_train_val, Y_train_val)
logit_train = logit.cv_results_['mean_train_score']
logit_test = logit.cv_results_['mean_test_score']

test_accX = logit.best_estimator_.predict(X_test) == Y_test
print('Best parameter: {}'.format(logit.best_params_))
#print(sum(test_accX))
test_acc = sum(test_accX)/len(test_accX)
print('Training accuracy: {}'.format(logit_train))
print('Validation score: {}'.format(logit_test))
print('Testining accuracy: {}'.format(test_acc))

Best parameter: {'penalty': 'l1', 'solver': 'liblinear'}
Training accuracy: [0.94640602 0.95144837]
Validation score: [0.9407314 0.9407314]
Testining accuracy: 0.9547738693467337


### Random Forest

In [11]:
param = {"max_features":[0.05,0.1,0.2,0.3,0.4,0.6,0.8,1],"n_estimators":[1024]}
classifier = RandomForestClassifier()

rf = GridSearchCV(classifier,param,cv=5, return_train_score=True)
rf.fit(X_train_val, Y_train_val)
rf_train = rf.cv_results_['mean_train_score']
rf_test = rf.cv_results_['mean_test_score']

test_accX = rf.best_estimator_.predict(X_test) == Y_test

#print(sum(test_accX))
test_acc = sum(test_accX)/len(test_accX)
print('Best parameter: {}'.format(rf.best_params_))
print('Training accuracy: {}'.format(rf_train))
print('Validation score: {}'.format(rf_test))
print('Testining accuracy: {}'.format(test_acc))

Best parameter: {'max_features': 0.1, 'n_estimators': 1024}
Training accuracy: [1. 1. 1. 1. 1. 1. 1. 1.]
Validation score: [0.92812106 0.93568726 0.93442623 0.93568726 0.9331652  0.9331652
 0.92812106 0.92812106]
Testining accuracy: 0.9597989949748744


### Decision Tree

In [12]:
param  = {"criterion":["entropy"],'max_depth':[1,2,3,4,5]}
classifier = DecisionTreeClassifier()

dt = GridSearchCV(classifier,param,cv=5, return_train_score=True)
dt.fit(X_train_val, Y_train_val)
dt_train = dt.cv_results_['mean_train_score']
dt_test = dt.cv_results_['mean_test_score']

test_accX = dt.best_estimator_.predict(X_test) == Y_test
#print(sum(test_accX))
test_acc = sum(test_accX)/len(test_accX)
print('Best parameter: {}'.format(dt.best_params_))
print('Training accuracy: {}'.format(dt_train))
print('Validation score: {}'.format(dt_test))
print('Testining accuracy: {}'.format(test_acc))

Best parameter: {'criterion': 'entropy', 'max_depth': 5}
Training accuracy: [0.91078219 0.91456767 0.93473956 0.94766984 0.96469013]
Validation score: [0.90920555 0.90290038 0.90163934 0.90416141 0.91424968]
Testining accuracy: 0.9095477386934674
