In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [2]:
dataset = pd.read_csv("dataset/PS_20174392719_1491204439457_log.csv")
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
len(dataset)

6362620

# Check whether the dataset contains null values

In [4]:
dataset.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

# Check how many frauds are in the dataset

In [5]:
dataset.isFraud.value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

## There are 8213 frauds in the 6362620 simulation cases

# Creating type2 -- CC (Customer to Customer) / CM (Customer to Merchant) to replace nameOrig and nameDest

In [6]:
dataset.drop(["step", "isFlaggedFraud"], axis = 1, inplace=True)

In [7]:
dataset.head()

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [8]:
def combine_first_char(row):
    return row["nameOrig"][0] + row["nameDest"][0]

In [9]:
dataset["type2"] = dataset.apply(combine_first_char, axis = 1)

In [10]:
dataset.head()

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,type2
0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,CM
1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,CM
2,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,CC
3,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,CC
4,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,CM


In [11]:
dataset.drop(["nameOrig", "nameDest"], axis=1, inplace=True)

In [12]:
dataset.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type2
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,CM
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,CM
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,CC
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,CC
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,CM


In [13]:
dataset = dataset[["type", "type2", "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest", "isFraud"]]
dataset.head()

Unnamed: 0,type,type2,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,PAYMENT,CM,9839.64,170136.0,160296.36,0.0,0.0,0
1,PAYMENT,CM,1864.28,21249.0,19384.72,0.0,0.0,0
2,TRANSFER,CC,181.0,181.0,0.0,0.0,0.0,1
3,CASH_OUT,CC,181.0,181.0,0.0,21182.0,0.0,1
4,PAYMENT,CM,11668.14,41554.0,29885.86,0.0,0.0,0


In [14]:
dataset.rename(columns={"oldbalanceOrg": "oldBalanceOrigin",
                        "newbalanceOrig": "newBalanceOrigin",
                        "oldbalanceDest": "oldBalanceReceiver",
                        "newbalanceDest": "newBalanceReceiver"}, 
               inplace=True)

In [15]:
dataset.head()

Unnamed: 0,type,type2,amount,oldBalanceOrigin,newBalanceOrigin,oldBalanceReceiver,newBalanceReceiver,isFraud
0,PAYMENT,CM,9839.64,170136.0,160296.36,0.0,0.0,0
1,PAYMENT,CM,1864.28,21249.0,19384.72,0.0,0.0,0
2,TRANSFER,CC,181.0,181.0,0.0,0.0,0.0,1
3,CASH_OUT,CC,181.0,181.0,0.0,21182.0,0.0,1
4,PAYMENT,CM,11668.14,41554.0,29885.86,0.0,0.0,0


# Check the most common type of transaction that is fraud

In [16]:
fraud = dataset[dataset.isFraud == 1].reset_index(drop=True)
fraud.head()

Unnamed: 0,type,type2,amount,oldBalanceOrigin,newBalanceOrigin,oldBalanceReceiver,newBalanceReceiver,isFraud
0,TRANSFER,CC,181.0,181.0,0.0,0.0,0.0,1
1,CASH_OUT,CC,181.0,181.0,0.0,21182.0,0.0,1
2,TRANSFER,CC,2806.0,2806.0,0.0,0.0,0.0,1
3,CASH_OUT,CC,2806.0,2806.0,0.0,26202.0,0.0,1
4,TRANSFER,CC,20128.0,20128.0,0.0,0.0,0.0,1


In [17]:
fraud.type.value_counts()

type
CASH_OUT    4116
TRANSFER    4097
Name: count, dtype: int64

In [18]:
fraud.type2.value_counts()

type2
CC    8213
Name: count, dtype: int64

## The most common are CASH_OUT & TRANSFER which were made from Customer to Customer transactions

# Due to there is a massive class imbalance issue, we will select from the dataset based on the info we found in fraud dataset in order for the model to learn effectively

In [19]:
filtered_dataset_non_fraud = dataset[dataset["type"].isin(["CASH_OUT", "TRANSFER"]) & (dataset["type2"] == "CC") & (dataset["isFraud"] == 0)].reset_index(drop=True)

In [20]:
filtered_dataset_non_fraud.isFraud.value_counts()

isFraud
0    2762196
Name: count, dtype: int64

In [21]:
random_dataset_non_fraud = filtered_dataset_non_fraud.sample(n=8213, random_state=42)

In [22]:
balanced_dataset = pd.concat([fraud, random_dataset_non_fraud], axis=0)

In [23]:
len(balanced_dataset)

16426

In [24]:
balanced_dataset.isFraud.value_counts()

isFraud
1    8213
0    8213
Name: count, dtype: int64

# Encoding String to Numeric

In [25]:
type_class = np.array(pd.factorize(balanced_dataset.type)[1])
type_class

array(['TRANSFER', 'CASH_OUT'], dtype=object)

In [26]:
balanced_dataset.type = pd.factorize(balanced_dataset.type)[0]

In [27]:
balanced_dataset.drop("type2", axis=1, inplace=True)

In [28]:
balanced_dataset.head()

Unnamed: 0,type,amount,oldBalanceOrigin,newBalanceOrigin,oldBalanceReceiver,newBalanceReceiver,isFraud
0,0,181.0,181.0,0.0,0.0,0.0,1
1,1,181.0,181.0,0.0,21182.0,0.0,1
2,0,2806.0,2806.0,0.0,0.0,0.0,1
3,1,2806.0,2806.0,0.0,26202.0,0.0,1
4,0,20128.0,20128.0,0.0,0.0,0.0,1


# Splitting the data for training and validation and perform normalization

In [29]:
X = balanced_dataset.drop("isFraud", axis = 1).to_numpy()
y = balanced_dataset.isFraud.to_numpy()

In [30]:
X_train, X_val, y_train, y_val = train_test_split(X, 
                                                  y, 
                                                  test_size=0.2, 
                                                  random_state=42)

In [31]:
len(X_train), len(y_train), len(X_val), len(y_val)

(13140, 13140, 3286, 3286)

In [32]:
norm_scaler = MinMaxScaler()
X_train_norm = norm_scaler.fit_transform(X_train)
X_val_norm = norm_scaler.transform(X_val)

# Building SVM, RandomForest & XGBoost models

In [33]:
# Base models
classifiers = {}
svm = SVC()
rfc = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
lgbm = LGBMClassifier(objective="binary", random_state=42)

classifiers = {"SVM": svm,
               "Random_Forest": rfc,
               "XGBoost": xgb,
               "Gradient_Boosting": gbc,
               "Light_Gradient_Boosting": lgbm}

In [34]:
clf_scores = {}
for key, clf in classifiers.items():
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    val_score = clf.score(X_val, y_val)
    clf_scores[key] = {"Train score": train_score,
                       "Val score": val_score}

[LightGBM] [Info] Number of positive: 6564, number of negative: 6576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1277
[LightGBM] [Info] Number of data points in the train set: 13140, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499543 -> initscore=-0.001826
[LightGBM] [Info] Start training from score -0.001826


In [35]:
clf_scores

{'SVM': {'Train score': 0.89117199391172, 'Val score': 0.8950091296409007},
 'Random_Forest': {'Train score': 1.0, 'Val score': 0.9905660377358491},
 'XGBoost': {'Train score': 0.9981735159817352,
  'Val score': 0.9899573950091296},
 'Gradient_Boosting': {'Train score': 0.9850837138508372,
  'Val score': 0.984479610468655},
 'Light_Gradient_Boosting': {'Train score': 0.9993911719939117,
  'Val score': 0.9923919659160073}}

In [36]:
sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def perform_cross_validation(X, y, clf):
    cv = cross_validate(estimator=clf, 
                        X=X, 
                        y=y, 
                        scoring="accuracy", 
                        n_jobs=-1, 
                        return_train_score=True)

    cv_train_score = f"{np.mean(cv['train_score'])} +- {np.std(cv['train_score'])}" 
    cv_test_score = f"{np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}"
    return {"Mean train score": cv_train_score,
            "Mean test score": cv_test_score}

In [37]:
cv_scores = {}
for key, clf in classifiers.items():
    cv_scores[key] = perform_cross_validation(X_train, y_train, clf)

In [38]:
cv_scores

{'SVM': {'Mean train score': '0.8823249619482496 +- 0.00914529312142661',
  'Mean test score': '0.8814307458143075 +- 0.019315302379561992'},
 'Random_Forest': {'Mean train score': '1.0 +- 0.0',
  'Mean test score': '0.9866818873668188 +- 0.001361380808219056'},
 'XGBoost': {'Mean train score': '0.9989726027397261 +- 0.0003728294890081224',
  'Mean test score': '0.9898021308980212 +- 0.002184581445115267'},
 'Gradient_Boosting': {'Mean train score': '0.9871955859969559 +- 0.0004946727549467251',
  'Mean test score': '0.9818873668188737 +- 0.000618267762909913'},
 'Light_Gradient_Boosting': {'Mean train score': '0.999771689497717 +- 0.00017643870805737565',
  'Mean test score': '0.9923896499238966 +- 0.0011541667342544124'}}

In [39]:
rfc.feature_importances_

array([0.05270471, 0.18199181, 0.41380702, 0.06857169, 0.09817672,
       0.18474805])