In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE

### define some functions.

In [4]:
def feature_label_split(data, column_name):
    """Split dataset to features and labels."""
    
    labels = np.array(data[column_name])
    features = np.array(data.drop(column_name, axis=1))
    
    print("features shape: {} rows, {} columns".format(features.shape[0], features.shape[1]))
    print("labels shape: {} rows".format(labels.shape[0]))
    
    return features, labels


def report(y_true, y_predict):
    """Show model performance report.
    
    This includes confusion matrix and recall/precision
    """
    
    # build confusion matrix as DataFrame.
    cm = confusion_matrix(y_true, y_predict)
    print("CONFUSION MATRIX")
    print("----------------")
    print("True Positive: {}\nFalse Positive: {}".format(cm[0][0], cm[0][1]))
    print("False Negative: {}\nTrue Negative: {}".format(cm[1][0], cm[1][1]))
    print("\n")

    # recall/precision.
    print("PRECISION/RECALL")
    print("----------------")
    print(classification_report(y_true, y_predict))
    
    print("\nEND")

### import data.

In [2]:
fraud_data = pd.read_csv("data/creditcard.csv")
fraud_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### split data to train/test.

In [5]:
# split features and labels.
features, labels = feature_label_split(fraud_data, "Class")

# split training and test data.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.3
)

print("Train Dataset: ", train_features.shape, train_labels.shape)
print("Test Dataset: ", test_features.shape, test_labels.shape)

features shape: 284807 rows, 30 columns
labels shape: 284807 rows
Train Dataset:  (199364, 30) (199364,)
Test Dataset:  (85443, 30) (85443,)


### train the model and make predictions

In [6]:
%%time
# initialise random forest.
forest = RandomForestClassifier(n_estimators=500, random_state=42)

# train the model.
forest.fit(train_features, train_labels)

# make predictions.
predictions = forest.predict(test_features)

CPU times: user 10min 31s, sys: 1.41 s, total: 10min 32s
Wall time: 10min 35s


### report accuracy score.

In [59]:
report(test_labels, predictions)

CONFUSION MATRIX
----------------
True Positive: 85277
False Positive: 7
False Negative: 44
True Negative: 115


PRECISION/RECALL
----------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.94      0.72      0.82       159

    accuracy                           1.00     85443
   macro avg       0.97      0.86      0.91     85443
weighted avg       1.00      1.00      1.00     85443



### oversampling using SMOTE

In [74]:
x_train = fraud_data.drop("Class", axis=1).copy()
y_train = fraud_data["Class"].copy()

print(len(y_train))
print(y_train.value_counts())

284807
0    284315
1       492
Name: Class, dtype: int64


In [77]:
smote = SMOTE(random_state=42)
smote_features, smote_labels = smote.fit_sample(x_train, y_train)


In [78]:
smote_train_data, smote_test_data, smote_train_labels, smote_test_labels = train_test_split(
    smote_features,
    smote_labels,
    test_size=0.3
)

print("Train Dataset: ", smote_train_data.shape, smote_train_labels.shape)
print("Test Dataset: ", smote_test_data.shape, smote_test_labels.shape)

Train Dataset:  (398041, 30) (398041,)
Test Dataset:  (170589, 30) (170589,)


### retrain random forest model with dataset using SMOTE

In [79]:
%%time

# create the model.
smote_forest = RandomForestClassifier(n_estimators=500, random_state=42)

# train the model.
smote_forest.fit(smote_train_data, smote_train_labels)

CPU times: user 22min 55s, sys: 3.44 s, total: 22min 59s
Wall time: 23min 5s


In [80]:
smote_pred = smote_forest.predict(smote_test_data)

In [82]:
report(smote_test_labels, smote_pred)

CONFUSION MATRIX
----------------
True Positive: 85538
False Positive: 25
False Negative: 4
True Negative: 85022


PRECISION/RECALL
----------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85563
           1       1.00      1.00      1.00     85026

    accuracy                           1.00    170589
   macro avg       1.00      1.00      1.00    170589
weighted avg       1.00      1.00      1.00    170589

