# Data Preprocessing - Toby Liang

## I. Import Essential Libraries

In [1]:
# Arrays and dataframes
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## II. Loading the Data

In [2]:
# Read the raw credit card transactions into a dataframe
data_path = "../data/raw/credit_card.csv"
dataset = pd.read_csv(data_path)

# Lower-case every column name, then call the final column "label"
columns = [name.lower() for name in dataset.columns]
columns[-1] = "label"
dataset.columns = columns

## III. Data Introduction

In [3]:
# Dataset dimensions (rows are examples, columns are variables)
n_examples, n_variables = dataset.shape
print("Number of examples: {}\nNumber of variables: {}".format(n_examples, n_variables))

# Distinct values taken by the label column
label_column = dataset["label"]
print("Unique labels: {}\n".format(label_column.unique()))

# How many examples fall under each label
print("Label counts: \n{}".format(label_column.value_counts()))

# Preview the first 5 rows
dataset.head()

Number of examples: 284807
Number of variables: 31
Unique labels: [0 1]

Label counts: 
0    284315
1       492
Name: label, dtype: int64


Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,label
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


We have 284,807 examples with 31 columns.  There are two unique labels in the dataset, meaning that this is a binary classification problem.

## IV. Missing Values Imputation

In [4]:
# Per-variable NaN counts; keep only the variables that have at least one NaN
nan_counts = dataset.isnull().sum()
has_nan_vars = [{var: count} for var, count in nan_counts.items() if count != 0]

# Per-variable counts of exact zeros; keep only the variables that have at least one
zero_counts = (dataset == 0).astype(int).sum()
has_zero_vars = [{var: count} for var, count in zero_counts.items() if count != 0]

print("Variables with NaN values: {}".format(has_nan_vars))
print("Variables with zero values: {}".format(has_zero_vars))

Variables with NaN values: []
Variables with zero values: [{'time': 2}, {'amount': 1825}, {'label': 284315}]


In [5]:
# Flag the transactions whose amount is exactly 0
zero_amount = dataset["amount"] == 0

# Zero-amount entries that are fraudulent (label == 1)
fraud_zero_count = (zero_amount & (dataset["label"] == 1)).sum()
print("Fraudulent zero amount count: {}".format(fraud_zero_count))

# Zero-amount entries that are not fraudulent (label == 0)
non_fraud_zero_count = (zero_amount & (dataset["label"] == 0)).sum()
print("Non-fraudulent zero amount count: {}".format(non_fraud_zero_count))

Fraudulent zero amount count: 27
Non-fraudulent zero amount count: 1798


No features have NaN values.  The only features that contain zeros are time and amount (the label column also contains zeros, but there a 0 simply marks a non-fraudulent transaction).  Time is the number of seconds elapsed since the first transaction, so it is zero for transactions that occur at the same moment as the first one.  Amount is the amount of money involved in a transaction.  Of the 1,825 entries with a transaction amount of 0, 27 are fraudulent and 1,798 are not.

In [6]:
# Impute (rather than drop) zero-amount transactions: every 0 amount is replaced
# with the mean of the non-zero amounts, so the number of rows is unchanged.
# NOTE(review): the mean is computed on the full dataset, before the
# train/val/test split — confirm that this is acceptable.
zero_amount_mask = dataset["amount"] == 0
copy_dataset = dataset.loc[~zero_amount_mask]
mean = copy_dataset["amount"].mean()

# Assign through .loc on the dataframe itself.  Calling
# dataset["amount"].mask(..., inplace = True) mutates a temporary column
# selection: it is flagged as chained assignment by pandas and leaves `dataset`
# untouched when Copy-on-Write is enabled.
dataset.loc[zero_amount_mask, "amount"] = mean

# Sanity check
print("Zero amount count: {}".format(((dataset.amount == 0).astype(int)).sum()))

# Keep a clean 0..n-1 index for the following steps
dataset.reset_index(drop = True, inplace = True)

Zero amount count: 0


## V. Datatype Checking

In [7]:
# Tally how many columns there are of each datatype (continuous columns show up as float64)
dtype_counts = dataset.dtypes.value_counts()
print("Feature Datatypes: \n{}".format(dtype_counts))

Feature Datatypes: 
float64    30
int64       1
dtype: int64


All 30 features are continuous numerical variables stored as float64.  The label is binary and is stored as int64.

## VI. Duplicate Removal

In [8]:
# Flag every row that duplicates another row, then count flagged vs. unflagged rows
duplicate_flags = dataset.duplicated()
duplicate_flags.value_counts()

False    283726
True       1081
dtype: int64

There are 1081 duplicate rows in the dataset.  These duplicates will have to be dropped so that there are no duplicates in the test set.

In [9]:
# Remove duplicated rows (duplicates must not end up in the test set)
dataset = dataset.drop_duplicates()

# Sanity check: no duplicated rows should remain
remaining_duplicates = dataset.duplicated().sum()
print("Duplicates: {}".format(remaining_duplicates))

# Renumber the surviving rows 0..n-1
dataset = dataset.reset_index(drop = True)

Duplicates: 0


## VII. Data Splitting

In [10]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column (labels stay a one-column dataframe)
all_features = dataset.drop(columns = "label")
all_labels = dataset[["label"]]

# Hold out 5% of all examples as the test set
# NOTE(review): the split is not stratified on the label — confirm this is intended
# given how rare the positive class is.
train_features, test_features, train_labels, test_labels = train_test_split(
    all_features, all_labels, test_size = 0.05, random_state = 42
)

# Carve 3% of the remaining training examples off as the validation set
train_features, val_features, train_labels, val_labels = train_test_split(
    train_features, train_labels, test_size = 0.03, random_state = 42
)

In [11]:
# Class distribution of the held-out test set
test_label_counts = test_labels["label"].value_counts()
test_label_counts

0    14162
1       25
Name: label, dtype: int64

## VIII. Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler and compute the per-feature mean and std on the training set only
scaler = StandardScaler()
scaler.fit(train_features)

# Normalize the training, validation and testing features using the training mean and std.
# The validation split must go through the same transform as the other two:
# otherwise it is written out on the raw scale while train and test are standardized.
train_features = pd.DataFrame(scaler.transform(train_features), index = train_features.index, columns = train_features.columns)
val_features = pd.DataFrame(scaler.transform(val_features), index = val_features.index, columns = val_features.columns)
test_features = pd.DataFrame(scaler.transform(test_features), index = test_features.index, columns = test_features.columns)

In [13]:
# Summary statistics of the training features, rounded to 2 decimals
train_feature_summary = train_features.describe().round(2)
train_feature_summary

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v20,v21,v22,v23,v24,v25,v26,v27,v28,amount
count,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,...,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0,261452.0
mean,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.0,-28.97,-44.15,-31.98,-4.02,-82.42,-19.63,-35.33,-62.01,-12.26,...,-70.64,-48.25,-15.09,-71.61,-4.69,-19.74,-5.4,-56.98,-47.27,-0.35
25%,-0.86,-0.47,-0.36,-0.59,-0.6,-0.5,-0.58,-0.45,-0.18,-0.59,...,-0.27,-0.32,-0.75,-0.26,-0.59,-0.61,-0.68,-0.18,-0.16,-0.33
50%,-0.21,0.01,0.04,0.12,-0.01,-0.04,-0.21,0.03,0.02,-0.05,...,-0.08,-0.04,0.01,-0.02,0.07,0.03,-0.11,-0.0,0.03,-0.26
75%,0.94,0.67,0.49,0.68,0.53,0.44,0.3,0.46,0.28,0.54,...,0.17,0.26,0.73,0.24,0.73,0.67,0.5,0.23,0.24,-0.04
max,1.64,1.26,13.4,6.21,11.93,25.22,55.0,97.8,16.59,14.23,...,51.09,31.33,14.5,36.0,7.57,14.42,7.3,79.82,103.7,101.52


## IX. Resampling: Undersample, Oversample, SMOTE

In [14]:
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

# Put the training labels back next to the training features (column-wise, aligned on the index)
train_dataset = pd.concat([train_features, train_labels], axis = 1)

# Partition the training rows by class: label 0 is not fraud, label 1 is fraud
train_label_column = train_dataset["label"]
not_fraud_dataset = train_dataset[train_label_column == 0]
fraud_dataset = train_dataset[train_label_column == 1]

def undersample(not_fraud_dataset, fraud_dataset):
    """Balance the classes by shrinking the not-fraud rows down to the fraud count.

    Parameters
    ----------
    not_fraud_dataset : pandas.DataFrame
        Rows of the majority (not-fraud) class.
    fraud_dataset : pandas.DataFrame
        Rows of the minority (fraud) class; all of them are kept.

    Returns
    -------
    pandas.DataFrame
        The sampled not-fraud rows followed by every fraud row, so both
        classes contribute ``len(fraud_dataset)`` rows.

    Raises
    ------
    ValueError
        If ``fraud_dataset`` has more rows than ``not_fraud_dataset``: rows are
        drawn without replacement, so there is nothing left to undersample.
    """
    # Draw WITHOUT replacement: an undersampled majority class must be a subset
    # of distinct original rows, never the same row picked more than once.
    not_fraud_undersampled = not_fraud_dataset.sample(n = len(fraud_dataset), replace = False, random_state = 42)

    # Concatenate undersampled not_fraud with fraud
    train_dataset_undersampled = pd.concat([not_fraud_undersampled, fraud_dataset])

    return train_dataset_undersampled

def oversample(not_fraud_dataset, fraud_dataset):
    """Balance the classes by resampling the fraud rows up to the not-fraud count.

    The fraud rows are drawn with replacement until there are as many of them
    as there are not-fraud rows; the result is those drawn fraud rows followed
    by all of the not-fraud rows.
    """
    target_size = len(not_fraud_dataset)

    # Draw fraud rows with replacement until they match the not-fraud count
    fraud_oversampled = resample(fraud_dataset, n_samples = target_size, replace = True, random_state = 42)

    # Oversampled fraud rows first, then every not-fraud row
    return pd.concat([fraud_oversampled, not_fraud_dataset])

def smote(train_features, train_labels):
    """Balance the training set with SMOTE and return it as a single dataframe.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Training features.
    train_labels : pandas.DataFrame
        Single-column dataframe holding the training labels.

    Returns
    -------
    pandas.DataFrame
        The resampled features with the resampled label column appended,
        using the column names of the two inputs.
    """
    # Apply smote
    sm = SMOTE(random_state = 42)

    # Hand SMOTE a flat 1-D target: passing the one-column label dataframe
    # as-is is what triggers the column_or_1d conversion warning.
    labels_1d = train_labels.values.ravel()

    # fit_resample is the supported entry point; fit_sample was only a
    # deprecated alias that newer imbalanced-learn releases no longer provide.
    train_features_smote, train_labels_smote = sm.fit_resample(train_features, labels_1d)

    # Concatenate features and labels
    train_features_smote_df = pd.DataFrame(train_features_smote, columns = train_features.columns)
    train_labels_smote_df = pd.DataFrame(train_labels_smote, columns = train_labels.columns)
    train_dataset_smote = pd.concat([train_features_smote_df, train_labels_smote_df], axis = 1)

    return train_dataset_smote

Using TensorFlow backend.


In [15]:
# Build one balanced training set per resampling strategy
train_dataset_undersampled = undersample(not_fraud_dataset, fraud_dataset)
train_dataset_oversampled = oversample(not_fraud_dataset, fraud_dataset)
train_dataset_smote = smote(train_features, train_labels)

# Map a display name to each training set (the original, unbalanced one included)
train_datasets_dict = {
    "Unsampled": train_dataset,
    "Undersampled": train_dataset_undersampled,
    "Oversampled": train_dataset_oversampled,
    "SMOTE": train_dataset_smote,
}


  y = column_or_1d(y, warn=True)


In [16]:
# Show the class balance of every training set
for name in train_datasets_dict:
    label_counts = train_datasets_dict[name]["label"].value_counts()
    print("{} Label Counts\n{}\n".format(name, label_counts))

Unsampled Label Counts
0    261018
1       434
Name: label, dtype: int64

Undersampled Label Counts
1    434
0    434
Name: label, dtype: int64

Oversampled Label Counts
1    261018
0    261018
Name: label, dtype: int64

SMOTE Label Counts
1    261018
0    261018
Name: label, dtype: int64



## X. Write to File

In [17]:
# Write one CSV per training dataset, named after its dictionary key
for name, train_data in train_datasets_dict.items():
    train_data.to_csv("../data/preprocessed/{}_train_dataset.csv".format(name), index = False)

# Write the validation and test features and labels (row index is not saved)
held_out_outputs = [
    (val_features, "../data/preprocessed/val_features.csv"),
    (val_labels, "../data/preprocessed/val_labels.csv"),
    (test_features, "../data/preprocessed/test_features.csv"),
    (test_labels, "../data/preprocessed/test_labels.csv"),
]
for frame, output_path in held_out_outputs:
    frame.to_csv(output_path, index = False)