In [31]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import matplotlib.pyplot as plt 
import io
import keras.backend as K
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold


# Import data

In [32]:
# Store the input data in a dataframe
fraud_data = pd.read_csv("creditcard.csv")


# Preprocess data

In [36]:
# shuffle the dataframe so that the inputs are in a random order
df = fraud_data.sample(frac=1).reset_index(drop=True)
#
# Scale the time -1 and 1, since the rest of the features are scaled
robust_scaler = RobustScaler()
scaled_amount = robust_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
scaled_time = robust_scaler.fit_transform(df['Time'].values.reshape(-1,1))
df.drop(['Time','Amount'], axis=1, inplace=True)

# Insert into beginning of df
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Show the result
df.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-0.279746,-0.14784,-1.411934,1.368292,0.915822,-0.503541,0.827001,1.292923,0.345922,0.546339,...,0.507386,-0.278143,-0.212725,0.119392,-1.107971,-0.35144,0.176095,0.557571,0.120358,0
1,-0.109271,-0.204749,1.173458,-0.115721,0.464002,0.813438,-0.527219,-0.279255,-0.206228,0.098494,...,-0.246839,-0.21977,-0.515834,0.056917,0.071978,0.320416,0.303436,-0.018508,0.007147,0
2,-0.279746,0.53894,2.094671,-0.17623,-1.509882,0.052969,0.383945,-0.285671,0.015893,-0.082719,...,-0.218559,-0.315788,-0.837633,0.201649,-1.115447,-0.20581,0.247585,-0.080147,-0.082402,0
3,-0.16768,-0.454869,-1.334343,1.326229,0.95846,-0.342571,0.344399,1.810734,-0.643068,-0.503493,...,-0.615301,1.558601,-0.471577,0.493029,-1.067489,-1.045921,0.046425,0.138835,0.090314,0
4,0.252917,-0.079735,1.096984,-0.598923,1.518756,1.115981,-1.269266,0.660641,-1.044155,0.306317,...,-0.039566,-0.107764,0.1357,-0.15528,-0.025546,0.465917,0.512095,0.049825,0.028509,0


In [38]:
# Split the data into training (80%) and testing (20%)
train, test = train_test_split(df, test_size=0.2)

# Split data into features and labels
train_features = np.array(train.values[:,:30])
train_labels = np.array(train.values[:,-1])
test_features = np.array(test.values[:,:30])
test_labels = np.array(test.values[:,-1])

print(train_features.shape)
print(test_features.shape)


(227845, 30)
(56962, 30)


# Set how much we care about each mis-classification data

In [29]:
# Set How much we value each mis-classification
cost_FalsePositive = 3
cost_FalseNegative = amount
cost_TruePositive = 0
cost_TrueNegative = 0

cost_mat = np.array([cost_FalsePositive * np.ones(df.shape[0]), cost_FalseNegative, 
                     cost_TruePositive * np.ones(df.shape[0]), 
                     cost_TrueNegative * np.ones(df.shape[0])]).T

X = df.iloc[:, :-1]
y = df.iloc[:, -1]
sc = StandardScaler()
X = sc.fit_transform(X)
    
kf = KFold(n_splits=5)
kf.get_n_splits(X)
X_train_l, X_test_l = [], []
y_train_l, y_test_l = [], []
cost_mat_train_l, cost_mat_test_l = [], []
for train_index, test_index in kf.split(X):
    X_train_l.append(X[train_index, :])
    X_test_l.append(X[test_index, :])
    y_train_l.append(y.iloc[train_index])
    y_test_l.append(y.iloc[test_index])
    cost_mat_train_l.append(cost_mat[train_index, :])
    cost_mat_test_l.append(cost_mat[test_index, :])


print(cost_mat_train_l[1][-1])
print(cost_mat_train_l[2][-1])

[ 3.   39.99  0.    0.  ]
[ 3.   39.99  0.    0.  ]


In [9]:
# Create custom loss function
# Keras does not allow custom arguments to be passed in to the loss function

def create_y_input(y_train, c_FN):
    y_str = pd.Series(y_train).reset_index(drop=True).apply(lambda x: str(int(x)))
    c_FN_str = pd.Series(c_FN).reset_index(drop=True).apply(lambda x: '0' *
                        (5-len(str(int(x)))) + str(int(x)))
    return y_str + '.' + c_FN_str
    
def csnn_loss(c_FP, c_TP, c_TN):
    def loss_function(y_input, y_pred):
        y_true = K.round(y_input)
        c_FN = (y_input - y_true) * 1e5
        eps = 0.0001
        y_pred = K.minimum(1.0 - eps, K.maximum(0.0 + eps, y_pred))
        cost = y_true * (K.log(y_pred) * c_FN + K.log(1 - y_pred) * c_TP)
        cost += (1 - y_true) * (K.log(1 - y_pred) * c_FP + K.log(y_pred) * c_TN)
        return - K.mean(cost, axis=-1)
    return loss_function


        

# Create the model

In [12]:
model = Sequential()

print(train_features.shape[0])
model.add(Dense(units=50, kernel_initializer='uniform', input_dim=train_features.shape[1], activation='relu'))
model.add(Dropout(.2))
model.add(Dense(units=25, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(.2))
model.add(Dense(15, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

# Including the custom loss fxn
model.compile(optimizer='adam', loss=csnn_loss(cost_FalsePositive, cost_TruePositive, cost_TrueNegative),
        metrics=['accuracy'])


227845


# Train the model

In [23]:
for i, (X_train, X_test, y_train, cost_mat_train) in enumerate(zip(X_train_l,  X_test_l, y_train_l, cost_mat_train_l)): 
    print('ANN Cost Sensitive ' + str(i + 1) + '/' + str("1") + ' ...')
    
    print("Cost mat_train {}".format(cost_mat_train.shape))
    cost_FN_train = cost_mat_train[:, 1]
    print("Cost fn_train {}".format(cost_FN_train.shape))
    y_input = create_y_input(train_labels, cost_FN_train).apply(float)
    
    print(train_features.shape)
    print(y_input.shape)
    
    model.compile(optimizer='adam', loss=csnn_loss(cost_FalsePositive, cost_TruePositive, cost_TrueNegative),
        metrics=['accuracy'])
    
    print(train_features.shape)
    print(y_input.shape)
    model.fit(train_features, y_input, batch_size=50, epochs=2, verbose=1)



ANN Cost Sensitive 1/1 ...
Cost mat_train (227845, 4)
Cost fn_train (227845,)
(227845, 30)
(227845,)
(227845, 30)
(227845,)
Epoch 1/2
Epoch 2/2
ANN Cost Sensitive 2/1 ...
Cost mat_train (227845, 4)
Cost fn_train (227845,)
(227845, 30)
(227845,)
(227845, 30)
(227845,)
Epoch 1/2
Epoch 2/2
ANN Cost Sensitive 3/1 ...
Cost mat_train (227846, 4)
Cost fn_train (227846,)
(227845, 30)
(227846,)
(227845, 30)
(227846,)


ValueError: Input arrays should have the same number of samples as target arrays. Found 227845 input samples and 227846 target samples.

In [None]:
# Get model accuracy 
scores = model.evaluate(test_features, test_labels)
print('\n')
print('accuracy=',scores[1])

In [None]:
# Get predictions from model
output = model.predict_classes(test_features)

In [None]:
# Show confusion matrix
y_actu = pd.Series(test_labels, name='Actual')
y_pred = pd.Series(np.ndarray.flatten(output), name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print(df_confusion)

