# Credit Card Fraud Classification - Toby Liang

## I. Import Essential Libraries

In [16]:
# Arrays and dataframes
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

## II. Introducing the Data

In [17]:
# Import the data
path = "./data/credit_card.csv"
dataset = pd.read_csv(path)

# Renaming columns
columns = ["time"]
for i in range(1, 29):
    columns.append("v" + str(i))
for col in ["amount", "label"]:
    columns.append(col)
dataset.columns = columns

# Printing first 5 rows
dataset.head()

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,label
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [18]:
dataset.shape

(284807, 31)

We have 284,807 examples with 31 columns.

In [19]:
# Checking for NaN values
dataset.isnull().sum()

time      0
v1        0
v2        0
v3        0
v4        0
v5        0
v6        0
v7        0
v8        0
v9        0
v10       0
v11       0
v12       0
v13       0
v14       0
v15       0
v16       0
v17       0
v18       0
v19       0
v20       0
v21       0
v22       0
v23       0
v24       0
v25       0
v26       0
v27       0
v28       0
amount    0
label     0
dtype: int64

No examples have NaN values.

In [33]:
# Checking for 0 values
(dataset == 0).astype(int).sum()

time           2
v1             0
v2             0
v3             0
v4             0
v5             0
v6             0
v7             0
v8             0
v9             0
v10            0
v11            0
v12            0
v13            0
v14            0
v15            0
v16            0
v17            0
v18            0
v19            0
v20            0
v21            0
v22            0
v23            0
v24            0
v25            0
v26            0
v27            0
v28            0
amount      1808
label     283253
dtype: int64

The only features that have zero values are time and amount.  Time is the amount of time in seconds after the first transaction which can be zero if the transactions occur at the same time.  Amount is the amount of money involved in a transaction.

In [21]:
# Count entries that have a 0 value for amount and is fraudulent
((dataset.amount==0).astype(int) & (dataset.label==1)).sum()

27

27 entries have a 0 transaction amount and is fraudulent.

In [22]:
# Checking for duplicate rows
dataset.duplicated().value_counts()

False    283726
True       1081
dtype: int64

There are 1081 duplicate rows in the dataset.  These duplicates will have to be dropped in the data preprocessing process.

In [23]:
# Checking for number of unique labels
dataset["label"].unique()

array([0, 1])

There are two unique labels in the dataset, meaning that this is a binary classification problem.

In [24]:
# Counting labels
dataset["label"].value_counts()

0    284315
1       492
Name: label, dtype: int64

There are many more non-fraudulent transactions than fraudulent transactions.  Thus, this is an imbalanced dataset.

## III. Data Preprocessing

In [59]:
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

# Drop duplicates (Do not want duplicates in test set)
dataset.drop_duplicates(inplace=True)

# Duplicate drop sanity check
print("Duplicates:" + str((dataset.duplicated() == True).sum()))

# Remove entries that have 0 amount?
pass

# Split features and labels
features = dataset.drop("label", axis=1)
labels = dataset[["label"]]

# Standardize data
features[features.columns] = scale(features[features.columns])

# Split data into train and test sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.1)

Duplicates:0


In [68]:
from sklearn.utils import resample

def oversample(train_features, train_labels):
    # Concatenate features and labels for train set
    train_dataset = pd.concat([train_features, train_labels], axis = 1)
    
    # Get non_fraud and fraud data
    not_fraud_dataset = train_dataset[train_dataset.label == 0]
    fraud_dataset = train_dataset[train_dataset.label == 1]
    
    # Oversample fraud
    fraud_oversampled = resample(fraud_dataset, replace = True, n_samples = len(not_fraud_dataset))
    
    # Oversampled data
    train_dataset_oversampled = pd.concat([not_fraud_dataset, fraud_oversampled])
    
    # Split features and labels
    train_features_oversampled = train_dataset_oversampled.drop("label", axis=1)
    train_labels_oversampled = train_dataset_oversampled[["label"]]
    
    # Sanity check
    print(labels["label"].value_counts())
    
    return (train_features_oversampled, train_labels_oversampled)
oversample(train_features, train_labels)

0    283253
1       473
Name: label, dtype: int64


(            time         v1         v2         v3        v4         v5  \
 267691  1.433626  -0.004541  -0.365371   0.686368 -1.519851  -0.234126   
 53820  -1.024581   0.620644  -0.053318   0.223415 -0.160467  -0.305759   
 177193  0.595311  -0.014273   0.473474  -1.150959 -0.131520   0.404182   
 113527 -0.457280  -0.335202   0.726371   1.283372  0.405577   0.063191   
 128136 -0.341191   0.627215   0.438639  -0.057072  1.472987   0.586397   
 222897  1.018428  -0.800649  -0.238016  -0.623366 -0.221571   0.549546   
 276636  1.524946   0.949680  -1.171623  -0.820587 -1.317934   0.366838   
 12461  -1.536554  -0.374760   0.240223   1.853330 -1.181649  -0.289775   
 159016  0.364186  -0.718619   0.139712   0.983762 -0.662134   0.818117   
 126031 -0.356081   0.588598   0.066925  -0.227643  0.704031   0.114647   
 88929  -0.683644   0.581910  -1.106500   0.785327 -0.816525  -1.795504   
 216996  0.966933   0.918158  -0.207132  -0.621660  1.222141   0.175657   
 172405  0.553420   1.077