In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, recall_score, f1_score, log_loss
import matplotlib.pyplot as plt

# Load the card-transaction records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='card_transdata.csv')

# Render a preview of the table, followed by its (rows, columns) dimensions.
display(df)
print(df.shape)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0


(1000000, 8)


In [36]:
# Show every row that has at least one missing value in any column.
# An empty result means the data set has no missing entries.
df.loc[df.isnull().any(axis='columns')]

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud


In [37]:
# Shuffle the rows. The seed is fixed so that the row order -- and therefore
# the ordered train/test split taken later -- is the same on every fresh run.
df = df.sample(frac=1, random_state=42)

# Features are every column except the label; the label is the 'fraud' column.
# Selecting by name (instead of by position) keeps this correct even if the
# column order of the CSV changes. y is kept as a single-column DataFrame.
X = df.drop(columns=['fraud'])
y = df[['fraud']]
display(X)
display(y)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
626092,2.323620,1.109998,1.813340,1.0,0.0,0.0,1.0
644756,20.841316,0.219515,0.294976,1.0,1.0,0.0,1.0
426735,0.500845,2.327406,0.881049,0.0,0.0,0.0,1.0
726564,13.083926,1.272575,0.233884,1.0,1.0,0.0,1.0
404752,13.678061,2.755713,0.629303,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
368,32.756166,0.866699,0.715640,1.0,0.0,0.0,1.0
470502,10.838216,0.131953,0.615450,1.0,1.0,0.0,1.0
248647,4.603676,0.496718,1.619058,1.0,0.0,0.0,0.0
907104,3.050237,0.741465,0.569645,1.0,0.0,0.0,1.0


Unnamed: 0,fraud
626092,0.0
644756,0.0
426735,0.0
726564,0.0
404752,0.0
...,...
368,0.0
470502,0.0
248647,0.0
907104,0.0


In [38]:
# Split into 80% training / 20% test data.
# The rows are taken in their current order (shuffle=False): the first 80% become
# the training split and the remaining 20% the test split. random_state is
# intentionally not passed, because scikit-learn only uses it when shuffling,
# so supplying it here had no effect.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False)

# Fail loudly if features and labels ever get out of step, instead of relying
# only on a visual comparison of the printed heads below.
assert X_train.index.equals(y_train.index), "training features and labels are misaligned"
assert X_test.index.equals(y_test.index), "test features and labels are misaligned"

print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))
print(X_train.head()) # the row labels here should match those of y_train below
print(y_train.head())

800000
800000
200000
200000
        distance_from_home  distance_from_last_transaction  \
626092            2.323620                        1.109998   
644756           20.841316                        0.219515   
426735            0.500845                        2.327406   
726564           13.083926                        1.272575   
404752           13.678061                        2.755713   

        ratio_to_median_purchase_price  repeat_retailer  used_chip  \
626092                        1.813340              1.0        0.0   
644756                        0.294976              1.0        1.0   
426735                        0.881049              0.0        0.0   
726564                        0.233884              1.0        1.0   
404752                        0.629303              1.0        1.0   

        used_pin_number  online_order  
626092              0.0           1.0  
644756              0.0           1.0  
426735              0.0           1.0  
726564            

In [42]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits so no test-set information is used
# when fitting the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [43]:
# Model 1 (baseline)
#   - two hidden layers: 64 units, then 32 units
#   - logistic-sigmoid activation
#   - trained with stochastic gradient descent
model1 = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='logistic',
    solver='sgd',
    max_iter=1000,
    random_state=42,
)

# Fit on the training split. The labels live in a single-column DataFrame,
# so they are flattened to a 1-D array first.
model1.fit(X_train, y_train.to_numpy().ravel())
 

In [44]:
# Model 2
# Identical to the first model apart from the activation function, which is
# switched to 'identity'.
model2 = MLPClassifier(
    activation='identity',
    hidden_layer_sizes=(64, 32),
    solver='sgd',
    random_state=42,
    max_iter=1000,
)

# Fit on the training split, passing the labels as a flat 1-D array.
model2.fit(X_train, y_train.values.reshape(-1))

In [45]:
# Model 3
# Same as the first model except with only one hidden layer of 64 neurons.
# Note the trailing comma: (64,) is a one-element tuple, whereas (64) is just
# the integer 64 wrapped in parentheses.
model3 = MLPClassifier(hidden_layer_sizes=(64,), activation='logistic',
                       solver='sgd', random_state=42, max_iter=1000)

# Train the model (labels flattened from a single-column DataFrame to 1-D)
model3.fit(X_train, y_train.values.ravel())

In [46]:
# Evaluate model1 on both the training split and the held-out test split.

# Hard class predictions, used for accuracy / recall / F1
train_pred = model1.predict(X_train)
test_pred = model1.predict(X_test)

# Class probabilities, required by the log-loss metric
train_pred_proba = model1.predict_proba(X_train)
test_pred_proba = model1.predict_proba(X_test)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
# Sensitivity is the recall of the positive class (label 1) ...
train_sensitivity = recall_score(y_train, train_pred, average='binary', pos_label=1)
test_sensitivity = recall_score(y_test, test_pred, average='binary', pos_label=1)
# ... and specificity is the recall of the negative class (label 0).
train_specificity = recall_score(y_train, train_pred, average='binary', pos_label=0)
test_specificity = recall_score(y_test, test_pred, average='binary', pos_label=0)
train_f1 = f1_score(y_train, train_pred, average='binary')
test_f1 = f1_score(y_test, test_pred, average='binary')
train_logloss = log_loss(y_train, train_pred_proba)
test_logloss = log_loss(y_test, test_pred_proba)

# Build the summary table directly in its final layout: one row per split and
# one column per metric. Constructing it this way (rather than transposing a
# frame that mixes the metric names with the scores) keeps every column numeric,
# so the values display with consistent precision and stay usable for arithmetic.
metric = pd.DataFrame(
    [
        [train_accuracy, train_sensitivity, train_specificity, train_f1, train_logloss],
        [test_accuracy, test_sensitivity, test_specificity, test_f1, test_logloss],
    ],
    index=['Training', 'Test'],
    columns=pd.Index(
        ['Accuracy', 'Sensitivity', 'Specificity', 'F1 Score', 'Log Loss'],
        name='Metrics'),
)
display(metric)

Metrics,Accuracy,Sensitivity,Specificity,F1 Score,Log Loss
Training,0.995315,0.968821,0.997854,0.973092,0.015563
Test,0.995365,0.968483,0.997935,0.973308,0.015362


In [47]:
# Evaluate model2 on both the training split and the held-out test split.

# Hard class predictions, used for accuracy / recall / F1
train_pred = model2.predict(X_train)
test_pred = model2.predict(X_test)

# Class probabilities, required by the log-loss metric
train_pred_proba = model2.predict_proba(X_train)
test_pred_proba = model2.predict_proba(X_test)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
# Sensitivity is the recall of the positive class (label 1) ...
train_sensitivity = recall_score(y_train, train_pred, average='binary', pos_label=1)
test_sensitivity = recall_score(y_test, test_pred, average='binary', pos_label=1)
# ... and specificity is the recall of the negative class (label 0).
train_specificity = recall_score(y_train, train_pred, average='binary', pos_label=0)
test_specificity = recall_score(y_test, test_pred, average='binary', pos_label=0)
train_f1 = f1_score(y_train, train_pred, average='binary')
test_f1 = f1_score(y_test, test_pred, average='binary')
train_logloss = log_loss(y_train, train_pred_proba)
test_logloss = log_loss(y_test, test_pred_proba)

# Build the summary table directly in its final layout: one row per split and
# one column per metric. Constructing it this way (rather than transposing a
# frame that mixes the metric names with the scores) keeps every column numeric,
# so the values display with consistent precision and stay usable for arithmetic.
metric = pd.DataFrame(
    [
        [train_accuracy, train_sensitivity, train_specificity, train_f1, train_logloss],
        [test_accuracy, test_sensitivity, test_specificity, test_f1, test_logloss],
    ],
    index=['Training', 'Test'],
    columns=pd.Index(
        ['Accuracy', 'Sensitivity', 'Specificity', 'F1 Score', 'Log Loss'],
        name='Metrics'),
)
display(metric)

Metrics,Accuracy,Sensitivity,Specificity,F1 Score,Log Loss
Training,0.959433,0.610547,0.992862,0.724667,0.133848
Test,0.959065,0.607988,0.992627,0.721597,0.135543


In [48]:
# Evaluate model3 on both the training split and the held-out test split.

# Hard class predictions, used for accuracy / recall / F1
train_pred = model3.predict(X_train)
test_pred = model3.predict(X_test)

# Class probabilities, required by the log-loss metric
train_pred_proba = model3.predict_proba(X_train)
test_pred_proba = model3.predict_proba(X_test)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
# Sensitivity is the recall of the positive class (label 1) ...
train_sensitivity = recall_score(y_train, train_pred, average='binary', pos_label=1)
test_sensitivity = recall_score(y_test, test_pred, average='binary', pos_label=1)
# ... and specificity is the recall of the negative class (label 0).
train_specificity = recall_score(y_train, train_pred, average='binary', pos_label=0)
test_specificity = recall_score(y_test, test_pred, average='binary', pos_label=0)
train_f1 = f1_score(y_train, train_pred, average='binary')
test_f1 = f1_score(y_test, test_pred, average='binary')
train_logloss = log_loss(y_train, train_pred_proba)
test_logloss = log_loss(y_test, test_pred_proba)

# Build the summary table directly in its final layout: one row per split and
# one column per metric. Constructing it this way (rather than transposing a
# frame that mixes the metric names with the scores) keeps every column numeric,
# so the values display with consistent precision and stay usable for arithmetic.
metric = pd.DataFrame(
    [
        [train_accuracy, train_sensitivity, train_specificity, train_f1, train_logloss],
        [test_accuracy, test_sensitivity, test_specificity, test_f1, test_logloss],
    ],
    index=['Training', 'Test'],
    columns=pd.Index(
        ['Accuracy', 'Sensitivity', 'Specificity', 'F1 Score', 'Log Loss'],
        name='Metrics'),
)
display(metric)

Metrics,Accuracy,Sensitivity,Specificity,F1 Score,Log Loss
Training,0.994216,0.956842,0.997797,0.96659,0.019166
Test,0.99436,0.957882,0.997847,0.967361,0.018971
