In [14]:
# Credit Card Fraud Detection
# Dataset Link: https://www.kaggle.com/dalpozz/creditcardfraud
# Dataset Size: 68 MB

import os, sys
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support

In [15]:
def load_data(csv_data,ratio=0.1):
    """Read the credit-card CSV, print class-balance statistics, and return it.

    Parameters
    ----------
    csv_data : str or file-like
        Anything accepted by ``pandas.read_csv``. The data must contain a
        ``Class`` column in which the value 1 marks a positive sample.
    ratio : float, optional
        Unused by this function; kept only so existing callers that pass it
        keep working.

    Returns
    -------
    pandas.DataFrame
        The unmodified frame as read from ``csv_data``.
    """
    # Read the csv and extract all the required info
    dataframe = pd.read_csv(csv_data)
    num_samples = dataframe.shape[0]
    num_pos = int((dataframe['Class'] == 1).sum())
    num_neg = num_samples - num_pos

    # Guard the percentage: a header-only CSV has zero rows and would
    # otherwise raise ZeroDivisionError.
    pos_percent = num_pos * 100 / num_samples if num_samples else 0.0

    # Print the statistics of the data
    print("Data statitics:")
    print(". # of samples        : {}".format(num_samples))
    # Every column except one (the label) is counted as a feature.
    print(". # of features       : {}".format(dataframe.shape[1]-1))
    print(". # of +ve/-ve samples: {}/{}".format(num_pos,num_neg))

    message = ("*MESSAGE* As you can see, out dataset is highly skewed: "
               "only {0:.3f}% of data is +ve".format(pos_percent))
    print(message)

    return dataframe

In [16]:
# Relative path: the CSV is expected next to the notebook / in the working directory.
csv_data = "creditcard.csv"
# load_data prints the class-balance statistics and returns the full DataFrame.
data = load_data(csv_data)
# Last expression of the cell: preview the first rows as rich output.
data.head()

Data statitics:
. # of samples        : 284807
. # of features       : 30
. # of +ve/-ve samples: 492/284315
*MESSAGE* As you can see, out dataset is highly skewed: only 0.173% of data is +ve


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [17]:
def preprocess_FollowUnderSampling(data, neg_per_pos=3, random_state=None):
    """Standardise ``Amount`` and undersample the non-fraud class.

    The input frame is left untouched: the scaled amount is added to a derived
    copy as ``NormalisedAmount`` and the ``Time`` and ``Amount`` columns are
    dropped from that copy. All rows with ``Class == 1`` are kept, together
    with a random subset of at most ``neg_per_pos`` times as many rows with
    ``Class == 0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Time``, ``Amount`` and ``Class``.
    neg_per_pos : int, optional
        Number of non-fraud rows to keep per fraud row (default 3, the value
        that used to be hard-coded).
    random_state : int or None, optional
        Seed for the undersampling shuffle. ``None`` (default) draws from
        NumPy's global random state, as before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X_undersampled`` (every column except ``Class``) and
        ``Y_undersampled`` (a single-column frame holding ``Class``). Fraud
        rows come first, followed by the sampled non-fraud rows.

    Raises
    ------
    ValueError
        If ``neg_per_pos`` is negative.
    """
    if neg_per_pos < 0:
        raise ValueError("neg_per_pos must be non-negative")

    # NOTE(review): the scaler is fitted on every row, i.e. before any
    # train/test split is made downstream.
    scaled_amount = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    prepared = data.assign(NormalisedAmount=scaled_amount.ravel()).drop(['Time','Amount'],axis=1)

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Undersampling steps. Row *positions* are used together with .iloc so the
    # selection is correct whatever the frame's index looks like.
    class_values = prepared['Class'].values
    fraud_positions = np.flatnonzero(class_values == 1)
    not_fraud_positions = np.flatnonzero(class_values == 0)
    shuffled_not_fraud = not_fraud_positions[rng.permutation(len(not_fraud_positions))]
    kept_not_fraud = shuffled_not_fraud[:neg_per_pos * len(fraud_positions)]

    undersampled_positions = np.concatenate([fraud_positions, kept_not_fraud])
    undersampled_data = prepared.iloc[undersampled_positions, :]

    is_label_column = undersampled_data.columns == 'Class'
    X_undersampled = undersampled_data.loc[:, ~is_label_column]
    Y_undersampled = undersampled_data.loc[:, is_label_column]
    return X_undersampled,Y_undersampled

In [18]:
# Build the balanced working set: scaled amount, Time/Amount dropped, and the
# non-fraud class randomly undersampled to 3 rows per fraud row.
X,Y = preprocess_FollowUnderSampling(data)

In [19]:
def get_tr_tst(X,Y,ratio=0.10,random_state=None):
    """Randomly split features and labels into train and test arrays.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Feature rows.
    Y : pandas.DataFrame, pandas.Series or array-like
        One label per row of ``X``; it is flattened to 1-D.
    ratio : float, optional
        Fraction of rows placed in the test set; the test size is
        ``int(ratio * n_rows)``.
    random_state : int or None, optional
        Seed for the shuffle. ``None`` (default) draws from NumPy's global
        random state, as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(tr_data, tr_lb, tst_data, tst_lb)``; the label arrays are 1-D.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not describe the same number of samples.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; np.asarray works on
    # both old and new pandas and also accepts plain arrays.
    features = np.asarray(X)
    labels = np.asarray(Y).reshape(-1)
    if features.shape[0] != labels.size:
        raise ValueError("X and Y must contain the same number of samples")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = rng.permutation(features.shape[0])
    split_index = int(ratio*indices.size)
    tst_idx, tr_idx = indices[:split_index], indices[split_index:]

    tst_data = features[tst_idx,:]
    tst_lb = labels[tst_idx]
    tr_data = features[tr_idx,:]
    tr_lb = labels[tr_idx]

    print("Statitics:")
    print(". # of train/test samples: {}/{}".format(tr_lb.size,tst_lb.size))
    return tr_data,tr_lb,tst_data,tst_lb

In [20]:
# Randomly hold out the default 10% of the undersampled rows as the test set.
tr_data,tr_lb,tst_data,tst_lb = get_tr_tst(X,Y)

Statitics:
. # of train/test samples: 1772/196


In [21]:
def rbf_svm(tr_data,tr_lb,tst_data,tst_lb,random_state=None):
    """Grid-search ``C`` for an RBF-kernel SVM and print test-set metrics.

    ``C`` is searched over ``10, 100, 1000, 10000`` using five stratified
    shuffle splits (10% validation each) of the training data. The refitted
    best model is then evaluated on the test data. Nothing is returned; the
    best ``C`` with its CV score, the test accuracy, and the binary
    precision/recall/F-score tuple are printed.

    Parameters
    ----------
    tr_data, tr_lb : array-like
        Training features and their 1-D labels.
    tst_data, tst_lb : array-like
        Test features and their 1-D labels (``tst_lb`` must expose ``.size``).
    random_state : int or None, optional
        Seed for the cross-validation splitter. ``None`` (default) keeps the
        previous unseeded behaviour.
    """
    mySVM = SVC(kernel='rbf')
    param_grid = {'C': np.logspace(1,4,4)}
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=random_state)
    clf = GridSearchCV(mySVM, param_grid=param_grid, cv=cv, n_jobs=-1)
    clf.fit(tr_data,tr_lb)
    message = ("*MESSAGE* Best C: {}, Score: {}".format(clf.best_params_['C'],clf.best_score_))
    print(message)

    pred = clf.predict(tst_data)
    # Vectorised replacement for the element-by-element counting loop;
    # count_nonzero returns a plain int, so the printed value is unchanged.
    num_correct = np.count_nonzero(pred == tst_lb)
    print("Accuracy: {}".format(num_correct/tst_lb.size))
    print(precision_recall_fscore_support(tst_lb,pred,average='binary'))

In [22]:
# Tune C for the RBF SVM on the training split and print the held-out test metrics.
rbf_svm(tr_data,tr_lb,tst_data,tst_lb)

*MESSAGE* Best C: 10.0, Score: 0.9707865168539326
Accuracy: 0.9744897959183674
(0.96078431372549022, 0.94230769230769229, 0.95145631067961167, None)
