# 1. Frame the Data

Before doing any further examination, processing or training on the dataset, it is important to frame the data. This process is concerned with determining what the outcome and goal of the model look like, defining what the output of the model should be, and understanding what a successful solution of the problem would look like (Google Developers, 2023). For this particular scenario, the ideal outcome is a model that can detect fraud, by determining whether a credit card transaction is genuine or fraudulent. In terms of outputs, the model will be dealing with a binary classification problem, genuine or fraudulent transaction represented by a 0 or a 1, and will therefore have 2 outputs. For the success metrics, we can consider accuracy, precision, recall and F1 score. We will prioritise recall over the other metrics because it is more important that all fraudulent transactions are classified as such than that flagged transactions are actually fraudulent: most people would rather need to authenticate some genuine transactions than have their money stolen. Obviously, however, if precision is too low then people are likely to assume that flagged transactions are incorrectly flagged and verify them without checking out of laziness, so we want a precision of at least 0.50.

# 2. Get the Data

In [None]:
# Importing necessary libraries for data models and training process
import sys
!{sys.executable} - m pip install numpy pandas matplotlib scikit-learn seaborn imblearn plotnine | grep - v 'already satisfied'

import random
import numpy as np
import imblearn
import warnings
import seaborn as sns
import matplotlib.gridspec as gs
import matplotlib.pyplot as plt
import pandas
import plotnine as p9

from sklearn.svm import SVC
from math import log, exp
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, StratifiedShuffleSplit, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score, precision_score, precision_recall_curve, confusion_matrix, roc_curve, roc_auc_score


from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Start by importing the dataset
# Read train csv, check the first ten lines of dataset
credit_card = pandas.read_csv("../data/train.csv")
credit_card.head(10)

# 3. Explore the Data

In [None]:
# View the relationship between any feature and the response variable
attributes = ["V17",
              "Class",]
scatter_matrix(credit_card[attributes],figsize=(5,5))

In [None]:
# Get the shape of the DataFrame
credit_card.shape

In [None]:
# View the information on the DataFrame
# The results show the data is in the dtyp float64(30), int64(2)
credit_card.info()

In [None]:
# Get some additional information from the description
credit_card.describe()

In [None]:
# Explore the correlations in the DataFrame
credit_card_correlations = credit_card.corr() 
credit_card_correlations

In [None]:
credit_card_correlations["Class"].sort_values(ascending=False)
       
# greater than 0.02: Positive correlation: V4,V18 
#                    Negative correlation: V3,V1,V14,V8,V7,V12
#
#         0.01-0.02: Positive correlation: V28,V20,V11,V9,V17,Amount,V2,V5
#                    Negative correlation: V10,V26,V23 
#
# might delete the other features less than 0.01 or 0.02

In [None]:
# Closer look at one particular attribute from the DataFrame
credit_card_v04 = credit_card[["V4"]] 
credit_card_v04 

In [None]:
# Examine the "Class" attribute
credit_card_class = credit_card[["Class"]] 
credit_card_class 

In [None]:
credit_card.groupby("Class").size()

# 469    cases of fraudulent
# 218660 cases of genuine

In [None]:
# Plot a figure to show the results, keeping 5 decimal points
# Description of procedure found in external resource (Kaggle, 2017)
fig, ax = plt.subplots(1, 1)
ax.pie(credit_card.Class.value_counts(),autopct='%0.5f%%', labels=['0','1'], colors=['b','r'])

ax.set_title("pie")

plt.show()

In [None]:
# Compare the distribution of the "Amount" attribute for fraudulent and genuine transactions
# plt.subplots creates one figure holding a 2x1 grid of axes (one per class)
fig, (ax_fraudulent, ax_genuine) = plt.subplots(2, 1, figsize=(12, 10))

ax_fraudulent.hist(credit_card.Amount[credit_card["Class"] == 1], color='r', bins=50, alpha=0.7)
ax_fraudulent.set_title('Fraudulent Transactions')

ax_genuine.hist(credit_card.Amount[credit_card["Class"] == 0], color='b', bins=50, alpha=0.7)
ax_genuine.set_title('Genuine Transactions')

# Label and log-scale BOTH subplots explicitly: the pyplot-level plt.xlabel / plt.ylabel /
# plt.yscale calls only affect the current axes, which left the fraud subplot unlabelled
# and on a linear scale
for ax in (ax_fraudulent, ax_genuine):
    ax.set_xlabel('Transaction Amount')
    ax.set_ylabel('Number of Transactions')
    ax.set_yscale('log')

plt.show()

In [None]:
# Compare the distribution of the "Time" attribute for fraudulent and genuine transactions
# This dataset presents transactions that occurred in two days

fig, (ax_fraudulent, ax_genuine) = plt.subplots(2, 1, figsize=(12, 10))

ax_fraudulent.hist(credit_card.Time[credit_card["Class"] == 1], color='r', bins=50, alpha=0.7)
ax_fraudulent.set_title('Fraudulent Transactions')

ax_genuine.hist(credit_card.Time[credit_card["Class"] == 0], color='b', bins=50, alpha=0.7)
ax_genuine.set_title('Genuine Transactions')

# Label and log-scale BOTH subplots explicitly: the pyplot-level plt.xlabel / plt.ylabel /
# plt.yscale calls only affect the current axes. The x axis shows the "Time" attribute
# (it was previously mislabelled 'Amount of Transactions').
for ax in (ax_fraudulent, ax_genuine):
    ax.set_xlabel('Time')
    ax.set_ylabel('Number of Transactions')
    ax.set_yscale('log')

plt.show()


In [None]:
# Sort by the amount of fraud cases
credit_card = pandas.read_csv("../data/train.csv")

credit_card_fraud = credit_card.loc[credit_card["Class"] == 1]  
credit_card_fraud

credit_card_sort = credit_card_fraud.sort_values(by = 'Amount')
credit_card_sort


#relatively large transactions in fraudulent cases:2096.00
#                                                  2500.00
#                                                  2727.18
#                                                  3000.00
#                                                  4471.96

In [None]:
# Sort by the amount of normal cases
credit_card = pandas.read_csv("../data/train.csv")

credit_card_fraud = credit_card.loc[credit_card["Class"] == 0]  
credit_card_fraud

credit_card_sort = credit_card_fraud.sort_values(by = 'Amount')
credit_card_sort

# relatively large transactions in genuine cases:4610.36
#                                                4632.00
#                                                2727.18
#                                                4669.77
#                                                6513.35
#                                                7475.00

We plot the distributions for each feature, split by class (positive is red, negative is blue). Features with similar distributions for positive and negative classes are unlikely to be useful.

In [None]:
# Distribution of each feature, split by class (fraudulent in red, genuine in blue)
# One might want to delete features that are not significantly different between the fraudulent and genuine distribution
# Description of procedure found in external resource (Kaggle, 2017)

# The first 31 columns, i.e. every column except the 32nd, which is the Class attribute
feature_columns = credit_card.columns[0:31]

# GridSpec: a more general subplot layout, one row per feature.
# The layout is bound to its own name; assigning it back to `gs` would overwrite the
# matplotlib.gridspec module alias and make this cell fail when it is run a second time.
feature_grid = gs.GridSpec(len(feature_columns), 1)

plt.figure(figsize=(10, len(feature_columns) * 5))

for i, col in enumerate(feature_columns):

    ax1 = plt.subplot(feature_grid[i])

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
    # sns.histplot / sns.kdeplot when upgrading
    sns.distplot(credit_card[col][credit_card.Class == 1], bins=50, color='r',)
    sns.distplot(credit_card[col][credit_card.Class == 0], bins=50, color='b',)

    ax1.set_xlabel('')
    ax1.set_title('feature ' + str(col))

plt.show()

In [None]:
# Plot a histogram for each feature
credit_card.hist(bins=50, figsize=(30, 50))
# plt.show must be called; the bare name only evaluates to the function object
plt.show()

# 4. Prepare the Data
The next step in the process for a machine learning problem like this would be to prepare the data. Important steps to consider here are what columns to drop from the dataset, how to deal with missing values and how to convert attributes into numerical values. Depending on the dataset, there might also be a need for processing raw data, including values, symbols or syntax a machine learning model would not be able to process (Brownlee, 2020a). Considering how all values of the dataset are numerical, with no categories, this data preparation will not be necessary to consider for the imputation methods. However, moving on to the next step, preparation work must be performed before various models can be explored and trained on the dataset.

Besides assessing some imputation methods, to determine the most effective way of replacing missing values in the dataset, it is necessary to drop some attributes from the dataset before using it to train machine learning models. We therefore drop the "Class" attribute, as this is the target value we are trying to predict, and it therefore should not be included in the training set. The "id" is also not indicative of any meaningful data we will be working with, as it is just an ascending number to identify the transaction for the specific row.

Additionally, it is important to consider feature scaling. Machine learning algorithms generally do not perform well on attributes that are considerably different in scale, which is the case for the credit card dataset. While the values for the attributes V1 to V28 have similar scale, the numerical values of the "Time" and "Amount" attributes vary drastically in scale from these. While the attributes V1 to V28 all have a mean around 0, and a standard deviation between 0 and 1, the "Time" attribute has a mean of 62377.42 and a standard deviation of 25620.35. Similarly, the "Amount" attribute has quite different values for these metrics, with a mean of 66.36 and a standard deviation of 150.80. As such, these attributes should be scaled.

Considering how several other attributes of the dataset deal with numerical values in the range of 0 to 1, normalisation would be an appropriate scaling method for these attributes. Using normalisation would also be the better choice considering how the data does not have a normal distribution, and it would ensure the prepared data is compatible with models that expect a value between 0 and 1, such as neural networks (Géron, 2019).


## 4.1 Splitting the Datasets
We know from our data exploration that the two classes are very unbalanced, and that we have a large set of features, which may not all be useful. We therefore consider alternate datasets that could be better for training models.

### 4.1.1 Original Dataset

In [None]:
# Take a look at the original dataset again
credit_card

### 4.1.2 Dropping non-correlated features


In [None]:
credit_card_correlations["Class"].sort_values(ascending=False)
       
# greater than 0.02: Positive correlation: V4,V18 
#                    Negative correlation: V3,V1,V14,V8,V7,V12
#
#         0.01-0.02: Positive correlation: V28,V20,V11,V9,V17,Amount,V2,V5
#                    Negative correlation: V10,V26,V23 
#
# might delete the other features less than 0.01 or 0.02

In [None]:
# Columns to drop when keeping only the features most correlated with "Class".
# The cut-off is an absolute correlation of roughly 0.02 (not 0.2): in the correlation
# listing of the previous cell the strongest group is only "greater than 0.02".
# The target ('Class') and the row identifier ('id') are dropped as well.
# NOTE(review): V27 and Amount do not appear below and are therefore kept, although the
# listing does not place them in the "greater than 0.02" group - confirm this is intended.

uncorr_features = [
    'Class',
    'id',
    'Time',

    'V28',
    'V20',
    'V11',
    'V9',
    'V17',
    'V2',
    'V5',
    'V21',
    'V15',
    'V13',
    'V22',
    'V25',
    'V6',
    'V24',
    'V16',
    'V19',
    'V23',
    'V26',
    'V10',]

### 4.1.3 Undersample
We can undersample from the negative class to get a more even distribution.


In [None]:
# Random undersampling to balance the "Class" distribution, using a 0.2 strategy

X = credit_card.drop(columns=["Class"])
y = credit_card[["Class"]]


# define undersample strategy
# there are 218,660 examples in the majority class and 469 examples in the minority class
# set sampling_strategy to 0.2 because 469/2345 = 0.2
# then the majority of classes in the transformed data set will have 2345 examples


#undersample = RandomUnderSampler(sampling_strategy='majority')
undersample_02 = RandomUnderSampler(sampling_strategy=0.2)

credit_card_X_under, credit_card_labels_under = undersample_02.fit_resample(X, y)

del X, y, undersample_02

In [None]:
credit_card_X_under

In [None]:
credit_card_labels_under

In [None]:
credit_card_labels_under.groupby("Class").size()

### 4.1.4 Oversample
We can alternatively oversample from the positive class to get a more even distribution.


In [None]:
# Random oversampling to balance the class distribution

X = credit_card.drop(columns=["Class"])
y = credit_card[["Class"]]

# Define the oversample strategy  
oversample_025 = RandomOverSampler(sampling_strategy=0.25)

# Fit the results to the oversampling strategy
credit_card_X_over, credit_card_labels_over = oversample_025 .fit_resample(X, y)

del X, y, oversample_025

In [None]:
credit_card_X_over

In [None]:
credit_card_labels_over.groupby("Class").size()

#### 4.1.5 Dropping non-important features (Dimension Reduction)

We have about 30 features. It should be possible to use Principal Component Analysis (PCA) to reduce the number of features, either by collapsing them together or even removing unimportant features completely. 

Before applying PCA, we need to scale our data. 

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def trial_PCA(n_components, verbose=True, plot_graph=False):
    """Fit PCA on the standardised credit_card features and report the variance lost.

    Reads the module-level ``credit_card`` DataFrame, drops its "id" and "Class"
    columns, standardises the remaining columns with StandardScaler and fits a PCA.
    The percentage of variance that the fitted components fail to capture is always
    printed.

    Parameters:
        n_components: forwarded unchanged to ``PCA(n_components=...)``.
        verbose: when True, also print the percentage of variance captured by each
            individual component.
        plot_graph: when True and ``n_components == 2``, return a scatter plot of the
            two components coloured by class instead of the lost-variance figure.

    Returns:
        A plotnine plot when ``plot_graph`` is True and ``n_components == 2``;
        otherwise the percentage (0-100) of variance not captured.
    """
    # split data
    features = credit_card.drop(columns=["id", "Class"])
    labels = credit_card["Class"]  # only needed for colouring the plot

    # scale data
    scaled_features = StandardScaler().fit_transform(features)

    # apply pca
    pca = PCA(n_components=n_components)
    pca.fit(scaled_features)

    # see how much variance we lost
    captured_ratios = pca.explained_variance_ratio_
    lost_variance = (1 - captured_ratios.sum()) * 100
    print(f"lost {lost_variance}% of the variance with {len(pca.components_)} components")

    # see how much variance was captured by each dimension
    if verbose:
        for i, ratio in enumerate(captured_ratios):
            print(f"PCA feature {i} captured {ratio * 100}% of the variance")

    if plot_graph and n_components == 2:
        # The projection is only computed here because nothing else uses it.
        # Columns get string names: plotnine resolves string aesthetics as column
        # names, whereas the integer labels 0 and 1 of an unnamed DataFrame would be
        # taken as constant values and every point would be drawn at the same place.
        projected = pandas.DataFrame(pca.transform(scaled_features), columns=["PC1", "PC2"])
        projected["Class"] = labels.astype(str).to_numpy()  # strings give a discrete colour scale
        return p9.ggplot(projected) + p9.geom_point(p9.aes(x="PC1", y="PC2", color="Class"))

    return lost_variance

trial_PCA(2, plot_graph=True)

Clearly, collapsing down to 2 features loses far too much of the variance. We will try with n_components=10, which still significantly reduces our dimensions.

In [None]:
trial_PCA(10)

This is still too much variance loss, so we will see what number of components is needed to capture 95% of variance.

In [None]:
trial_PCA(0.95, verbose=False)

26 components is not much of a feature reduction, so we will investigate how the loss varies with the number of principal components in the hopes of finding some number of components that meaningfully reduce the dimensions of our search space without losing too much variance (the elbow of the curve).

In [None]:
n_components = []
lost_variances = []
start = 2
stop = 30
for i in range(start, stop):
    n_components.append(i)
    lost_variances.append(trial_PCA(i, verbose=False))

In [None]:
plt.cla()
plt.scatter(x=n_components, y=lost_variances)
plt.xticks(range(start, stop))
plt.xlabel("No. of components")
plt.ylabel("Lost variance %")
plt.show()

The above curve does not have an obvious elbow: increasing the number of principal components (n) pretty consistently increases the captured variance by 2-3% for n >= 15, up to 5-6% for n <= 7. This halves the gradient, but the change is quite smooth, so picking a number to use is difficult. 

In [None]:
plt.cla()
del n_components, lost_variances, start, stop, i

Finally, we consider which features of credit_card explain the most variance for n = 26 across all principal components.  

In [None]:
# Split data
credit_card_x = credit_card.drop(columns=["id", "Class"])

# Scale data
scaler = StandardScaler()
scaler.fit(credit_card_x)
credit_cared_x_scaled = scaler.transform(credit_card_x)

# Apply pca
pca = PCA(n_components=0.95)
pca.fit(credit_cared_x_scaled)

feature_variance_sum = {}
for column in credit_card_x.columns:
    feature_variance_sum[column] = 0

for p_component in pca.components_:
    score_list = [abs(score) for score in list(p_component)]
    for i in range(len(score_list)):
        feature_variance_sum[credit_card_x.columns[i]] += score_list[i]

feature_variance_sum_list = [(k, v / len(pca.components_)) for k, v in feature_variance_sum.items()]
feature_variance_sum_list.sort(reverse=True, key=lambda tup: tup[1])
print("\n".join([str(item) for item in feature_variance_sum_list]))

del credit_card_x, scaler, credit_cared_x_scaled, pca, feature_variance_sum, feature_variance_sum_list, p_component, column, score_list

We can see that V4, V21, V8, V25, V2, V22 and V26 all explain at least 15% of the variance per component on average. Given that PCA loses a lot of the variance for low values of n, we could take the more drastic measure of simply dropping all of the other features. 

In [None]:
# NOTE(review): despite the name, these are the features reported above as having the
# HIGHEST average absolute PCA loadings, i.e. the ones to keep when every other feature
# is dropped - confirm how the list is consumed before renaming it.
low_variance_features = ["V4",
                         "V21", "V8", "V25", "V2", "V22", "V26"]


### 4.1.6 Feature Deletion
We can alternatively reduce our feature set by dropping features that had similar distributions in both classes, as explored earlier.

In [None]:
similar_distribution_features = [
    'Class',
    'id',
    'Time',

    'V5',
    'V6',
    'V7',
    'V8',
    'V12',
    'V13',
    'V15',
    'V16',
    'V20',
    'V21',
    'V22',
    'V23',
    'V25',
    'V27',
    'V28',

    'Amount',]

## 4.2 Data Cleaning

Aside from the different data sets, we should drop the columns that we do not think will be useful.

In [None]:
# Create custom transformer

# Source: https://stackoverflow.com/questions/68402691/adding-dropping-column-instance-into-a-pipeline

class ColumnRemover:
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    NOTE(review): the class does not derive from sklearn.base.BaseEstimator, so it has
    no get_params/set_params - confirm that is acceptable wherever the pipeline is
    cloned or tuned.
    """

    def __init__(self, columns):
        # Labels of the columns that transform() removes
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from the data and return this object unchanged."""
        return self

    def transform(self, X, y=None):
        """Return X without the configured columns; X itself is not modified."""
        return X.drop(columns=self.columns)

In [None]:
# Create a pipeline for cleaning the data
# Source: # Source: https://stackoverflow.com/questions/68402691/adding-dropping-column-instance-into-a-pipeline

# Dropping the id, time, and class columns, as described above
pipeline = Pipeline([
    ("columnDropper", ColumnRemover(['id', 'Class', 'Time'])),
    ("scaler", MinMaxScaler())
])

# Apply the pipeline to dataframe

credit_card_transformed = pipeline.fit_transform(credit_card)
credit_card_transformed

In [None]:
# check the correlation between features and the target which is the class attribute
credit_card_correlations = credit_card.corr() 
credit_card_correlations["Class"].sort_values(ascending=False)

## 4.3 Imputing Missing Data

### 4.3.1 Randomly Removing Data
The training data has no missing values, so before we can test imputation methods we need a way of deleting values. 

In [None]:
# Since there are no missing values in the training set, we need to create some
# The procedure for how this can be done was found in an external resource (Stack Overflow, 2020)
# This can be done by defining a value for the percentage of values that should be missing, and then removing these

def remove_random_values(df: pandas.DataFrame, missing_percentage=0.05, random_seed=42):
    """Return a copy of ``df`` in which randomly chosen cells are replaced by NaN.

    Every cell outside the first column is blanked independently with probability
    ``missing_percentage`` (a fraction, so 0.05 means 5%); the first column is never
    touched. NumPy's global random generator is re-seeded with ``random_seed`` on every
    call, so the same cells are removed each time for a given seed and shape.
    The approach follows external resources (Stack Overflow, 2020; GeeksforGeeks, 2022).
    """
    np.random.seed(random_seed)
    n_rows, n_cols = df.shape

    # One True/False flag per cell outside the first column; True marks a value to delete
    drawn_flags = np.random.choice(
        [True, False],
        size=(n_rows, n_cols - 1),
        p=[missing_percentage, 1-missing_percentage],
    )

    # Full-size mask whose first column stays False, so that column keeps all its values
    nan_mask = np.zeros((n_rows, n_cols), dtype=bool)
    nan_mask[:, 1:] = drawn_flags

    # Work on a copy so the caller's DataFrame is left intact
    result = df.copy()
    result[nan_mask] = np.nan
    return result

In [None]:
# Define a variable for dataset with missing values
credit_card_missing = remove_random_values(credit_card)

# See how many null values we are working with now
credit_card_missing.info()

del credit_card_missing

### 4.3.2 Imputing the Mean 
Our first imputation strategy is to use the mean value. The mean imputation method is a common strategy for replacing missing values, assigning the mean in their place.

In [None]:
# Having created the missing values, we can now use the mean imputation method
mean_imputer = SimpleImputer(strategy="mean")
credit_card_missing_values = remove_random_values(credit_card)
imputed_mean_arr = mean_imputer.fit_transform(credit_card_missing_values)

# Converting the array for the imputed mean into DataFrame from an array 
imputed_mean_df = pandas.DataFrame(imputed_mean_arr, columns=credit_card.columns)

imputed_mean_df.info()

del mean_imputer, credit_card_missing_values

Scikit-learn's SimpleImputer likely works as expected, but we can now evaluate the mean strategy. First we consider its impact on the mean value for each column. We expect that it will be preserved, but we are now able to check this in our code to confirm.

In [None]:
# Calculate the percentage difference of the mean using the mean imputation method.
# The difference is divided by the ABSOLUTE original mean: dividing by the signed mean
# produced negative percentages for columns with a negative mean, which then cancelled
# out positive ones in the average printed below.
percentage_diff_means = [
    round(abs(imputed_mean - original_mean) / abs(original_mean) * 100, 2)
    for (original_mean, imputed_mean)
    in zip(list(np.mean(credit_card, axis=0)), list(np.mean(imputed_mean_df, axis=0)))
]

# We exclude the first two columns because they are ID and time
# NOTE(review): remove_random_values only protects the first column from being nulled,
# so the time column does receive missing values - confirm this exclusion is intended
print("The mean impact on the mean for each column is {}%".format(np.mean(percentage_diff_means[2:])))

del percentage_diff_means

As expected, this is a very minor impact. As such, using the mean predictably preserves the value of the mean.

Next, we can consider the impact on the median value, when using the mean imputation method.

In [None]:
# Percentage difference between the original and the mean-imputed median of every
# column except "Class"
percentage_diff_medians = []
for (original_median, imputed_median) in zip(list(np.median(credit_card.drop(columns=["Class"]), axis=0)), list(np.median(imputed_mean_df.drop(columns=["Class"]), axis=0))):
    if imputed_median == original_median or original_median == 0:
        # An unchanged median, or a zero original median (no relative difference can be
        # formed), is recorded as 0. These are the cases where the former log-based
        # formula raised and fell back to 0 through a bare `except:`, which also hid
        # every other error.
        percentage_diff_medians.append(0)
    else:
        percentage_diff_medians.append(
            round(abs(imputed_median - original_median) / abs(original_median) * 100, 2))


# We exclude the first two columns because they are ID and time, which are not randomly nulled
print("The mean impact on the median for each column is {}%".format(
    np.mean(percentage_diff_medians[2:])))

del percentage_diff_medians

This is a much bigger impact than on the mean value. So far the mean imputation method, in calculating the mean impact on the median for each column, is not performing as well as one might have hoped for, considering how it has a 50.16% difference compared to the original dataset. Next, we can compare the variance between the original dataset and the mean imputed dataset.

In [None]:
# Calculate the percentage difference in variance between the original and imputed data
percentage_diff_variances = [(round(abs(imputed_variance - original_variance)/original_variance * 100, 2)) for (original_variance, imputed_variance)
 in zip(list(np.var(credit_card, axis=0)), list(np.var(imputed_mean_df, axis=0)))]

# We exclude the first two columns because they are ID and time, which are not randomly nulled
print("The mean impact on the variance for each column is {}%".format(np.mean(percentage_diff_variances[2:])))

del percentage_diff_variances

As we can see, the mean impact on the variance for each column is 5.04%, which is not a significant difference, especially considering the size of the dataset. Next, we can explore the distributions graphically for some columns, shown below. To do so, we create a plot where both datasets are displayed on the same axes, so that they can easily be compared.

In [None]:
# Take a closer look at the median and mean for the imputed mean
print(np.median(credit_card[["V1"]]))
print(np.median(imputed_mean_df[["V1"]]))

print(np.mean(credit_card[["V1"]]))
print(np.mean(imputed_mean_df[["V1"]]))

# Explore the distribution graphically for some given columns
fig, ax = plt.subplots()
# Plot a scatter plot for the credit card data
credit_card.plot.scatter(ax=ax, x="id", y="V1", color='red', label='Credit Card')
# Plot a scatter plot for the credit card data with the imputed mean
imputed_mean_df.plot.scatter(ax=ax, x="id", y="V1", color='orange', label='Imputed Mean')
# Show the plot
plt.show()

Having compared some key metrics from the original dataset to the corresponding values from the dataset imputed with the mean, it becomes evident that the mean might be an acceptable alternative in replacing missing values, though it is worth noting that it does not perform well for the mean impact on the median for each column. However, when printing out the difference for each column for the original dataset and the dataset using the mean imputation method, we can perceive that this is because of the one outlier dealing with larger numbers, as the other inputs are performing relatively well. Furthermore, the mean for a sample attribute "V1" of the original dataset is almost identical after mean imputation, with a value of 0.096008 for the original dataset and 0.09544 for the imputed mean dataset. This further confirms that replacing missing values with the mean will yield a result that closely resembles the original data.

### 4.3.3 Imputing the Median

Next, we decided to use the median as our second imputation strategy. 

In [None]:
# After creating the missing values, we applied the median imputation method
median_imputer = SimpleImputer(strategy="median")
credit_card_missing_values = remove_random_values(credit_card)
imputed_median_arr = median_imputer.fit_transform(credit_card_missing_values)

# Converting the array for the imputed median into DataFrame from an array 
imputed_median_df = pandas.DataFrame(imputed_median_arr, columns=credit_card.columns)

imputed_median_df.info()

del median_imputer, credit_card_missing_values

Next, the median imputation strategy was evaluated by comparing the means of the imputed median values to the actual data's value.

In [None]:
# Calculate the percentage difference of the mean using the median imputation method.
# The expression must use the comprehension's own loop variables; it previously referred
# to `imputed_median` / `original_median`, names left over from an earlier cell, so every
# column received the same value (or a NameError on a fresh kernel).
# The difference is divided by the absolute original mean so that columns with a negative
# mean do not yield negative percentages.
percentage_diff_means = [
    round(abs(imputed_mean - original_mean) / abs(original_mean) * 100, 2)
    for (original_mean, imputed_mean)
    in zip(list(np.mean(credit_card, axis=0)), list(np.mean(imputed_median_df, axis=0)))
]

# we exclude the first two columns because they are ID and time
# NOTE(review): remove_random_values only protects the first column from being nulled,
# so the time column does receive missing values - confirm this exclusion is intended
print("The mean impact on the mean for each column is {}%".format(np.mean(percentage_diff_means[2:])))

del percentage_diff_means

14.16% is a significant difference and indicates that the median may not be the most effective for predictably preserving the value of the means. It is also significantly higher than the results from the mean imputation. This could indicate that the median imputation biases the mean of the dataset. Next, the impact on the median value was evaluated.

In [None]:

# Percentage difference between the original and the median-imputed median of every
# column except "Class"
percentage_diff_medians = []
for (original_median, imputed_median) in zip(list(np.median(credit_card.drop(columns=["Class"]), axis=0)), list(np.median(imputed_median_df.drop(columns=["Class"]), axis=0))):
    if imputed_median == original_median or original_median == 0:
        # An unchanged median, or a zero original median (no relative difference can be
        # formed), is recorded as 0. These are the cases where the former log-based
        # formula raised and fell back to 0 through a bare `except:`, which also hid
        # every other error.
        percentage_diff_medians.append(0)
    else:
        percentage_diff_medians.append(
            round(abs(imputed_median - original_median) / abs(original_median) * 100, 2))


# we exclude the first two columns because they are ID and time, which are not randomly nulled
print("The mean impact on the median for each column is {}%".format(
    np.mean(percentage_diff_medians[2:])))

del percentage_diff_medians

It appears that the median imputation works significantly better when predicting the missing median values than the mean imputation method, given it only has a 0.596% difference between the median imputed values and the median actual values. Then, the variance between the original data and the median imputed data were compared.

In [None]:
# Calculate the percentage difference in variance between the original and imputed data
percentage_diff_variances = [(round(abs(imputed_variance - original_variance)/original_variance * 100, 2)) for (original_variance, imputed_variance)
 in zip(list(np.var(credit_card, axis=0)), list(np.var(imputed_median_df, axis=0)))]

# We exclude the first two columns because they are ID and time, which are not randomly nulled
print("The mean impact on the variance for each column is {}%".format(np.mean(percentage_diff_variances[2:])))

del percentage_diff_variances

Given that the mean impact on the variance is less than 5%, this is considered a small difference, especially when considering the size of this dataset.

In [None]:
# Take a closer look at the median and mean for the imputed median
print(np.median(credit_card[["V1"]]))
print(np.median(imputed_median_df[["V1"]]))

print(np.mean(credit_card[["V1"]]))
print(np.mean(imputed_median_df[["V1"]]))

# Explore the distribution graphically for some given columns
fig, ax = plt.subplots()
# Plot a scatter plot for the credit card data
credit_card.plot.scatter(ax=ax, x="id", y="V1", color='black', label='Credit Card')
# Plot a scatter plot for the credit card data with the imputed median
imputed_median_df.plot.scatter(ax=ax, x="id", y="V1", color='darkgrey', label='Imputed Median')
# Show the plot
plt.show()

Based on the findings above, it appears that the median imputation method is a relatively good alternative for replacing missing values in the dataset. So far, it performs better than the mean imputation method in all aspects except for the mean impact on the mean for each column.

### 4.3.4 Imputing from a Binned Frequency Distribution

Our third method is more (probably over-) complicated: instead of simply using the median or mean value, we build a frequency distribution of the known values and then replace missing values with (weighted) random values based on that distribution.


We first investigate if there are enough non-unique values to sensibly group in a frequency distribution.

In [None]:
column_header = "V7"
values = list(credit_card[column_header])
values.sort()
print("the range of values in data[{}] is {} to {}".format(
    column_header, values[0], values[-1]))
vals = list(set(values))
print("there are {} values in data[{}], of which {} are unique".format(
    len(values), column_header, len(vals)))

del column_header, values, vals

Most values are unique, so a basic frequency distribution will not be helpful. We can instead group values into a manageable number of bins, create a frequency distribution of those bins, and use the middle values of those bins for our imputation. 

Our next task is therefore to generate a binned frequency distribution. 

#### 4.3.4.1 Building a Binned Frequency Distribution

In [None]:
def get_fq(data: pandas.DataFrame, column_header: str, max_bins: int, debugging=False):
    """Creates a binned cumulative frequency distribution of a given DataFrame's column, using middle values for each bin.

    The non-missing values of data[column_header] are split into at most max_bins
    equal-width bins spanning the observed range. Neighbouring bins are merged until
    the merged bin's relative frequency exceeds MIN_FREQ.

    Args:
        data: DataFrame holding the column.
        column_header: name of the numeric column to summarise.
        max_bins: upper bound on the number of bins (capped at the number of unique values).
        debugging: when truthy, intermediate values are printed.

    Returns:
        A list of [cumulative frequency, middle value] pairs in ascending order.

    Raises:
        ValueError: if the column has no non-missing values or max_bins is below 1.
    """
    MIN_FREQ = float(1.0e-9)

    # keep only the known values, in ascending order
    values = sorted(v for v in data[column_header] if not np.isnan(v))
    if not values:
        raise ValueError(
            "get_fq(): column {!r} has no non-missing values".format(column_header))
    if max_bins < 1:
        raise ValueError("get_fq(): max_bins must be at least 1")

    # ensure that there are not more bins than unique values
    max_bins = min(max_bins, len(set(values)))
    if debugging:
        print("get_fq(): max_bins: {}".format(max_bins))

    # find range of values, and corresponding width of initial bins
    lowest = values[0]
    val_range = abs(values[0] - values[-1])
    bin_size = val_range / max_bins
    if debugging:
        print("get_fq(): val_range: {}".format(val_range))
        print("get_fq(): bin_size: {}".format(bin_size))

    # find the frequencies of each bin; bins are (edge, next_edge], the first one also includes the minimum.
    # The bin index is computed directly from the value instead of repeatedly adding bin_size to a running
    # edge, because the accumulated rounding error could push the largest value into an extra bin lying
    # beyond the observed range.
    counts = [0] * max_bins
    for value in values:
        if bin_size > 0:
            # -(-a // b) is ceil(a / b); clamp so rounding can never leave the valid bins
            index = int(-(-(value - lowest) // bin_size)) - 1
            index = min(max(index, 0), max_bins - 1)
        else:
            # all values are identical: everything falls in the single bin
            index = 0
        counts[index] += 1
    freqs = [count / len(values) for count in counts]
    if debugging:
        print("get_fq(): sum of freqs (should be ~1.0): {}".format(sum(freqs)))
        print("get_fq(): sum of freqs below {}: {}".format(
            MIN_FREQ, sum(1 for freq in freqs if freq < MIN_FREQ)))
        print("get_fq(): len(freqs) = {}".format(len(freqs)))

    # create a collection of the bins that have frequencies of at least MIN_FREQ, and their corresponding middle values
    binned_freq_dist = []
    cumulative = 0      # cumulative frequency of all bins emitted so far
    pending_freq = 0    # frequency gathered since the last emitted bin
    first_bin = 0       # index of the first initial bin in the bin being built
    for index, freq in enumerate(freqs):
        pending_freq += freq
        if pending_freq > MIN_FREQ:
            start = lowest + first_bin * bin_size
            stop = lowest + (index + 1) * bin_size
            cumulative += pending_freq
            if debugging:
                print("get_fq(): bin{index}{{range:{start},{stop} mid:{mid} curr_freq:{curr_freq} freq:{freq}}}".format(
                    index=len(binned_freq_dist) + 1, start=start, stop=stop, mid=(start + stop) / 2,
                    curr_freq=pending_freq, freq=cumulative
                ))
            binned_freq_dist.append([cumulative, (start + stop) / 2])
            pending_freq = 0
            first_bin = index + 1
    if debugging:
        print("get_fq(): len(binned_freq_dist) = {}".format(len(binned_freq_dist)))

    return binned_freq_dist


def get_val_from_fq(fq: list, rand_flt: float):
    """Maps a random float between 0 and 1 onto the middle value of the matching bin of a binned frequency distribution.

    fq is a list of [cumulative frequency, middle value] pairs in ascending order; the value of the
    first pair whose cumulative frequency is at least rand_flt is returned.
    Raises ValueError if rand_flt lies outside [0, 1].
    """
    if rand_flt > 1 or rand_flt < 0:
        raise ValueError("rand_flt must be between 0 and 1")
    matches = (middle for cumulative, middle in fq if rand_flt <= cumulative)
    # the fallback covers rand_flt == 1 when the final cumulative frequency is e.g. 0.999999999
    return next(matches, fq[-1][1])


Note that the final number of bins is less than the stated maximum. This is because bins that would have a probability below a certain threshold are merged together, based on the precision of the random values generated by random.uniform(), so that no bin is practically impossible to select. An investigation into this precision is below (it takes about 11 minutes to run on a lab client); in my experimentation, the smallest value returned was 4.416561560915966e-10, so MIN_FREQ >= 1e-10 should be sensible. Replace False with True in the code block below to test this for yourself.

We choose to set the lower probability bound to 1e-9 for the sake of efficiency, and because we know that there are not enough unique values in the dataset to warrant more bins than a minimum probability of 1e-9 supports.

In [None]:
# Disabled by default because of its run time; change False to True to run the experiment.
if False:
    # Track the smallest value random.uniform(0, 1) returns over 1e9 draws
    smallest = 1
    for i in range(int(1e9)):
        smallest = min(smallest, random.uniform(0, 1))
    # A bare expression nested inside an `if` block is not echoed by the notebook,
    # so the result has to be printed explicitly before it is deleted.
    print(smallest)
    del smallest, i

We then test that this gives sensible outputs for a toy example.

In [None]:
# Toy frame: column A is complete, columns B and C each miss two values
df = pandas.DataFrame({
    "A": [0, 1, 2, 3],
    "B": [np.nan, np.nan, 5, 5],
    "C": [3, 3, np.nan, np.nan],
})
print(df)
print("\n debug messages for generating A's frequency distribution:")
# at most three bins over A, with the intermediate values printed
fq = get_fq(df, "A", 3, debugging=True)
print()
print("fq, as [probability, middle value] pairs: {}".format(fq))
print()
# look up the bin value for a few probabilities, including both ends of [0, 1]
print("value for probability 0.0: {}".format(get_val_from_fq(fq, 0)))
print("value for probability 0.5: {}".format(get_val_from_fq(fq, 0.5)))
print("value for probability 0.666: {}".format(get_val_from_fq(fq, 0.666)))
print("value for probability 1.0: {}".format(get_val_from_fq(fq, 1)))

del df, fq

This appears to be sensible, though with such a small dataset the middle values are relatively further from the actual values than we hope they will be over large datasets.

We next see how many bins are collapsed together when creating a frequency distribution of V1.

In [None]:
expected_bins = 50
got_bins = len(get_fq(credit_card, "V1", expected_bins, debugging=False))

print("We expected {} but only got {} (bins with too low a probability are condensed together for efficiency).".format(expected_bins, got_bins))

del expected_bins, got_bins

#### 4.3.4.2 Imputing from the Frequency Distribution

Now that we can generate frequency distributions and get values from them, it is quite simple to create a new Imputer to fill in missing values in a given DataFrame. Note that the default max bin count of 50 is an arbitrary choice and warrants investigation.

In [None]:
class FreqDistImputer():
    """Imputer that replaces missing values with values drawn from a binned frequency distribution.

    fit() builds one distribution per column with get_fq(); transform() fills each missing
    cell with get_val_from_fq() applied to a fresh random.uniform(0, 1) draw.
    """

    def __init__(self, verbose=0, max_bins=50):
        """Stores the settings.

        verbose: when truthy, get_fq() debug output and the fitted distributions are printed.
        max_bins: maximum number of bins per column, passed on to get_fq().
        """
        self.verbose = verbose
        self.max_bins = max_bins
        # column name -> distribution returned by get_fq()
        self.freq_dists = {}

    def fit(self, df: pandas.DataFrame):
        """Builds and stores the frequency distribution of every column of df."""
        for column in df.columns:
            self.freq_dists[column] = get_fq(
                df, column, self.max_bins, debugging=self.verbose)
        if self.verbose:
            print(self.freq_dists)

    def transform(self, df: pandas.DataFrame):
        """Returns a copy of df in which every missing cell is replaced; df itself is not modified."""
        df_imputed = df.copy()
        # positions (row number, column number) of all missing cells
        na_rows, na_columns = np.where(pandas.isnull(df_imputed))
        for row, col in zip(na_rows, na_columns):
            column_header = df_imputed.columns[col]
            # positional assignment: only this cell is written, even if index labels repeat
            df_imputed.iat[row, col] = get_val_from_fq(
                self.freq_dists[column_header], random.uniform(0, 1))
        return df_imputed

    def fit_transform(self, df: pandas.DataFrame):
        """Fits on df and returns the imputed copy of df."""
        self.fit(df)
        return self.transform(df)

    # The method was originally spelt "fit_transfrom" (and returned nothing);
    # the old name is kept as an alias so existing calls keep working.
    fit_transfrom = fit_transform

Before using the binned frequency distribution function as an imputation method on the dataset, some testing is done to ensure it works as intended. Some elementary testing can therefore be found below. Further testing is encouraged in future, although it may be difficult.

In [None]:
# Small frame with one missing value in each column
df = pandas.DataFrame({
    "A": [1, np.nan, 3, 3, 4],
    "B": [np.nan, 4, 4, 4, 5],
    "C": [3, 3, 3, 3, np.nan],
})
print(df)
# fit the per-column distributions, then fill the gaps from them
imputer = FreqDistImputer(verbose=0)
imputer.fit(df)
df_imputed = imputer.transform(df)
print(df_imputed)

del df, df_imputed

#### 4.3.4.3 Imputing with the Binned Frequency Distribution

Next, we can use the binned frequency distribution as our third imputation strategy. 

In [None]:
# After creating the missing values, we can apply the binned frequency distribution imputation method
# This will look similar to the procedures performed for the mean and median, with the exception of converting the array to a DataFrame
binned_imputer = FreqDistImputer(verbose=0)
credit_card_missing_values = remove_random_values(credit_card)

binned_imputer.fit(credit_card_missing_values)
imputed_frequency_df = binned_imputer.transform(credit_card_missing_values)

imputed_frequency_df.info()

del binned_imputer, credit_card_missing_values

Having defined the DataFrame for a dataset in which the missing values are replaced using the binned frequency distribution imputation method, the strategy can be evaluated by comparing the means of the imputed data to the means of the actual data.

In [None]:
# Calculate the percentage difference of the mean using the binned frequency distribution imputation.
# The difference is taken relative to the magnitude of the original mean: dividing by the signed mean
# would give negative percentages for columns with a negative mean, which then cancel out in the average.
percentage_diff_means = [
    round(abs(imputed_mean - original_mean) / abs(original_mean) * 100, 2)
    for (original_mean, imputed_mean)
    in zip(list(np.mean(credit_card, axis=0)), list(np.mean(imputed_frequency_df, axis=0)))
]

# We exclude the first two columns because they are ID and time, which are not randomly nulled
print("The mean impact on the mean for each column is {}%".format(np.mean(percentage_diff_means[2:])))

del percentage_diff_means

As we can see, 21.32% is a substantial difference, indicating that the binned frequency distribution is not the best method for preserving the value of the means. It is also significantly higher than the results from both the mean and median imputation. Next, the impact on the median value was evaluated.

In [None]:
# Calculate the percentage difference of the median using the binned frequency distribution imputation
percentage_diff_medians = []
original_medians = np.median(credit_card.drop(columns=["Class"]), axis=0)
imputed_medians = np.median(imputed_frequency_df.drop(columns=["Class"]), axis=0)
for (original_median, imputed_median) in zip(original_medians, imputed_medians):
    # An unchanged median, or an original median of zero (for which a relative change is undefined),
    # counts as 0% impact. These are the cases the former catch-all `except` was silently covering.
    if imputed_median == original_median or original_median == 0:
        percentage_diff_medians.append(0)
        continue
    percentage_diff_medians.append(
        round(abs(imputed_median - original_median) / abs(original_median) * 100, 2))


# we exclude the first two columns because they are ID and time, which are not randomly nulled
print("The mean impact on the median for each column is {}%".format(
    np.mean(percentage_diff_medians[2:])))

del percentage_diff_medians, original_medians, imputed_medians

Still, the binned frequency distribution imputation method is not performing very well as a method of replacing missing values in the dataset. With a mean impact on the median for each column of 12.34%, this imputation method is so far performing worse than both the mean and median imputation methods. Next, we can compare the variance between the original dataset and the dataset imputed with the binned frequency distribution.

In [None]:
# Percentage change of each column's variance after the binned frequency distribution imputation
percentage_diff_variances = [
    round(abs(new_variance - old_variance) / old_variance * 100, 2)
    for old_variance, new_variance
    in zip(np.var(credit_card, axis=0), np.var(imputed_frequency_df, axis=0))
]

# ID and time (the first two columns) are not randomly nulled, so they are left out of the average
print("The mean impact on the variance for each column is {}%".format(np.mean(percentage_diff_variances[2:])))

del percentage_diff_variances

In comparing the variance, the binned frequency distribution performs more or less the same as the mean and median imputation methods, with a 5% difference. This is still considered a small difference considering the size of this dataset. Finally, we can compare some values from the original and the imputed dataset graphically.

In [None]:
# Take a closer look at the median and mean for the binned frequency distribution imputation
print(np.median(credit_card[["V1"]]))
print(np.median(imputed_frequency_df[["V1"]]))

print(np.mean(credit_card[["V1"]]))
print(np.mean(imputed_frequency_df[["V1"]]))

# Explore the distribution graphically for some given columns
fig, ax = plt.subplots()
# Plot a scatter plot for the credit card data
credit_card.plot.scatter(ax=ax, x="id", y="V1", color='blue', label='Credit Card')
# Plot a scatter plot for the data imputed from the binned frequency distribution
# (this previously plotted imputed_median_df, i.e. the result of the median section)
imputed_frequency_df.plot.scatter(ax=ax, x="id", y="V1", color='green', label='Imputed Frequency Distribution')
# Show the plot
plt.show()

### 4.2 Assessing Imputation Methods

In assessing imputation methods, three models were tested. Replacing hypothetically missing values in the dataset using the mean, median and binned frequency distribution have different implications. One apparent finding from the comparison is how the median imputation method performs better than both the mean imputation method and the binned frequency distribution method, with the exception of evaluating the mean impact on the mean for each column.

The **mean impact on the mean** is 14.16% for the median imputation, compared to -0.05% for the mean and 21.32% for the binned frequency distribution. While the **mean impact on the median** for each column is 0.60% for median imputation, in comparison, it is 50.16% for the mean imputation method and 12.34% for the binned frequency distribution method. Furthermore, the **mean impact on the variance** is 5.04% for the mean imputation method and 5.00% for the binned frequency distribution, while it is 4.99% percent for the median imputation method, which is slightly less.

As such, the median imputation method would be the preferred way of replacing missing values for the dataset, in the event that these exist.

# 5. Explore the Models 
The data has already been split into a training and test set, and for most of our models we can use K-fold cross validation, but it is too time-consuming to do so for some models, for which we will define a smaller training set and a validation set. 


In [None]:

# Seed passed as random_state to the splitter and classifiers below, and number of folds
# (also used as 1/K for the validation share in split_data)
RANDOM_SEED = 42
K = 5

# Run the under- and oversampled feature frames through the preprocessing pipeline.
# assign(Class=None) sets a Class column filled with None before the pipeline is applied
# (presumably because the pipeline expects that column - TODO confirm against its definition).
# NOTE: fit_transform re-fits the pipeline on each frame it is given.
credit_card_X_under = credit_card_X_under.assign(Class=None)
credit_card_X_under_clean = pandas.DataFrame(pipeline.fit_transform(credit_card_X_under), columns=[
    item for item in credit_card.columns if item not in ['id', 'Class', 'Time']])
credit_card_X_over = credit_card_X_over.assign(Class=None)
credit_card_X_over_clean = pandas.DataFrame(pipeline.fit_transform(credit_card_X_over), columns=[
    item for item in credit_card.columns if item not in ['id', 'Class', 'Time']])

# Create our X and Y for K-fold validation
credit_card_labels = credit_card["Class"].copy()
credit_card_X = pandas.DataFrame(credit_card_transformed, columns=[
                                 item for item in credit_card.columns if item not in ['id', 'Class', 'Time']])

# Feature-reduced variants of credit_card_X; 'id', 'Class' and 'Time' are filtered out of each
# drop list because credit_card_X does not contain those columns
credit_card_X_drop_uncorr = credit_card_X.drop(
    columns=[item for item in uncorr_features if item not in ['id', 'Class', 'Time']])
credit_card_X_drop_var = credit_card_X.drop(
    columns=[item for item in low_variance_features if item not in ['id', 'Class', 'Time']])
credit_card_X_distr = credit_card_X.drop(
    columns=[item for item in similar_distribution_features if item not in ['id', 'Class', 'Time']])

# name -> (features, labels); iterated by the *_datasets helper functions
datasets = {
    "original": (credit_card_X, credit_card_labels),
    "under": (credit_card_X_under_clean, credit_card_labels_under),
    "over": (credit_card_X_over_clean, credit_card_labels_over),
    "uncorr": (credit_card_X_drop_uncorr, credit_card_labels),
    "low_var": (credit_card_X_drop_var, credit_card_labels),
    "same_dist": (credit_card_X_distr, credit_card_labels),
}


def split_data(x, y):
    """Create our training and validation sets.

    Performs a single stratified shuffle split of x (features) and y (labels), with a share
    of 1/K of the rows going to the validation set and RANDOM_SEED as the random state.
    Returns x_train, y_train, x_validate, y_validate.
    """
    shuffled_data = StratifiedShuffleSplit(
        n_splits=1, test_size=1/K, random_state=RANDOM_SEED)
    [(train_index, validate_index)] = shuffled_data.split(x, y)
    # split() yields positional row numbers, so the rows are selected with iloc;
    # loc would look the numbers up as index labels and only works for a default 0..n-1 index
    x_train = x.iloc[train_index]
    y_train = y.iloc[train_index]
    x_validate = x.iloc[validate_index]
    y_validate = y.iloc[validate_index]
    return x_train, y_train, x_validate, y_validate


# Split the original dataset once into a training part and a validation part
(credit_card_X_train,
 credit_card_labels_train,
 credit_card_X_validate,
 credit_card_labels_validate) = split_data(credit_card_X, credit_card_labels)


# Report how many positive (Class == 1) labels ended up in each part
print(
    f"number of positives in training set: {sum(1 for label in credit_card_labels_train if label == 1)}")
print(
    f"number of positives in validation set: {sum(1 for label in credit_card_labels_validate if label == 1)}")

We next define some helper functions to see the performance of our models.

In [None]:
def calc_matrix(y_true, y_pred):
    """Returns the TP, TN, FP and FN, i.e. the quadrants of a classification matrix for a binary classification problem.

    The classes are expected to be encoded as 0 (negative) and 1 (positive).
    """
    # labels=[0, 1] fixes the matrix to 2x2 even when one class occurs in neither y_true nor y_pred;
    # without it the matrix shrinks and indexing the quadrants fails.
    # Row = true class, column = predicted class, so ravel() gives tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return (tp, tn, fp, fn)


def calc_metrics(tp, tn, fp, fn):
    """Returns the accuracy, precision, recall and F1 score from the TP, TN, FP and FN values.

    A metric whose denominator is zero (e.g. precision when nothing was predicted positive)
    is returned as NaN. This is the value NumPy integer inputs already produced (with a warning),
    and it avoids a ZeroDivisionError for plain Python ints.
    """
    nan = float("nan")

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else nan

    predicted_positives = tp + fp
    precision = tp / predicted_positives if predicted_positives else nan

    actual_positives = tp + fn
    recall = tp / actual_positives if actual_positives else nan

    # NaN is truthy, so a NaN precision or recall falls through to the division and yields NaN
    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator else nan
    return (accuracy, precision, recall, f1)


def stratified_K_fold(model, X: pandas.DataFrame, Y: pandas.Series, k=K, verbose=True, debugging=False):
    """Performs stratified K-fold validation for a given model. Returns the mean accuracy, precision, recall and F1 score across folds (NaN fold scores are left out of each mean)."""
    # one (accuracy, precision, recall, f1) tuple per fold
    fold_metrics = []
    for fold_number, (train_index, valid_index) in enumerate(StratifiedKFold(k).split(X, Y)):
        x_train, y_train = X.iloc[train_index], Y.iloc[train_index]
        x_valid, y_valid = X.iloc[valid_index], Y.iloc[valid_index]

        # fit on the training folds, score on the held-out fold
        model.fit(x_train, y_train)
        accuracy, precision, recall, f1 = calc_metrics(
            *calc_matrix(y_valid, model.predict(x_valid)))
        fold_metrics.append((accuracy, precision, recall, f1))

        # per-fold results
        if verbose:
            print(f"fold {fold_number}")
            print(f"  accuracy:\t{accuracy}")
            print(f"  precision:\t{precision}")
            print(f"  recall:\t{recall}")
            print(f"  F1 score:\t{f1}")

        # split details
        if debugging:
            print(f"train_index: {train_index}")
            print(f"test_index: {valid_index}")
            print(
                f"number of positives in train set: {sum(1 for label in y_train if label == 1)}")
            print(
                f"number of positives in validation set: {sum(1 for label in y_valid if label == 1)}")

    # average each metric over the folds in which it is not NaN
    return tuple(
        np.mean([scores[position] for scores in fold_metrics
                 if not np.isnan(scores[position])])
        for position in range(4)
    )


def stratified_K_fold_scores(model, X: pandas.DataFrame, Y: pandas.Series, k=K, verbose=True, debugging=False):
    """Runs stratified_K_fold() for a given model and prints the mean accuracy, precision, recall and F1 score it returns.
       \nN.B. warnings are ignored due to the high likelihood of division-by-zero warnings."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        means = stratified_K_fold(
            model, X, Y, k=k, verbose=verbose, debugging=debugging)
        accuracy, precision, recall, f1 = means
        # blank line to separate the summary from any per-fold output
        print()
        print(f"Mean accuracy:\t{accuracy}")
        print(f"Mean precision:\t{precision}")
        print(f"Mean recall:\t{recall}")
        print(f"Mean F1 score:\t{f1}")


def stratified_K_fold_scores_datasets(model, k=K, verbose=False, debugging=False):
    """Prints the stratified K-fold scores of a given model for every entry of the datasets dictionary."""
    for name, (features, labels) in datasets.items():
        # the dataset name is printed without a newline; the scores output follows it
        print(name, end='')
        stratified_K_fold_scores(
            model, features, labels, k, verbose, debugging)
        print()


def validation_set_scores(model, train_x, train_y, validate_x, validate_y, already_fit=False):
    """Prints the accuracy, precision, recall and F1 score of a model on a validation set; the model is first fit on the training set unless already_fit is set.
    \nN.B. warnings are ignored due to the high likelihood of division-by-zero warnings."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if not already_fit:
            model.fit(train_x, train_y)
        predictions = model.predict(validate_x)
        accuracy, precision, recall, f1 = calc_metrics(
            *calc_matrix(validate_y, predictions))
        print(f"Accuracy:\t{accuracy}")
        print(f"Precision:\t{precision}")
        print(f"Recall:\t\t{recall}")
        print(f"F1 score:\t{f1}")


def validation_set_scores_datasets(model, already_fit=False):
    """Prints the validation-set performance metrics of a model for every entry of the datasets dictionary, splitting each dataset with split_data()."""
    for name, (features, labels) in datasets.items():
        print(name)
        # fresh training/validation split of this dataset
        parts = split_data(features, labels)
        x_train, y_train, x_validate, y_validate = parts
        validation_set_scores(
            model, x_train, y_train, x_validate, y_validate, already_fit)
        print()


As a baseline to compare against, we will just predict class = 0 for all inputs. 

In [None]:
dummy_classifier = DummyClassifier(strategy='most_frequent')
stratified_K_fold_scores(dummy_classifier, credit_card_X, credit_card_labels)

del dummy_classifier

As one might expect, this has a very high accuracy, but it has a recall of zero and the precision cannot be calculated as there are neither true positives nor false positives. We can use the dummy to demonstrate that of our datasets, only under and oversampling have changed the ratio of positive and negative classes.

In [None]:
dummy_classifier = DummyClassifier(strategy='most_frequent')
stratified_K_fold_scores_datasets(dummy_classifier)

del dummy_classifier

## 5.1 Linear Classifier

In [None]:
# No loss is passed, so SGDClassifier's default loss is used (the linear model of this section)
linear_classifier = SGDClassifier(random_state=RANDOM_SEED)

# K-fold scores on the original dataset
stratified_K_fold_scores(linear_classifier, credit_card_X, credit_card_labels)

del linear_classifier

Our linear classifier only classifies any transactions as fraudulent for our first fold, so our precision and recall are NaN for the other folds - they are the same as the dummy classifier, only much more computationally expensive. Our first fold has a lower accuracy and its precision is zero because the instances that it classifies as fraudulent are not fraudulent, so the average accuracy of the linear classifier is actually worse than the dummy classifier.

In [None]:
# No loss is passed, so SGDClassifier's default loss is used (the linear model of this section)
linear_classifier = SGDClassifier(random_state=RANDOM_SEED)

# K-fold scores on every dataset variant
stratified_K_fold_scores_datasets(linear_classifier)

del linear_classifier

Under and oversampling massively improve the recall.

## 5.2 Logistic Classifier

In [None]:
logistic_classifier = SGDClassifier(loss='log_loss', random_state=RANDOM_SEED)

stratified_K_fold_scores(logistic_classifier, credit_card_X, credit_card_labels)

del logistic_classifier

The logistic classifier appears to suffer from the same problems as the linear model, although it has fewer false positives in the first fold. Its performance is still worse than the dummy classifier.

In [None]:
logistic_classifier = SGDClassifier(loss='log_loss', random_state=RANDOM_SEED)

stratified_K_fold_scores_datasets(logistic_classifier)

del logistic_classifier

Again, under and oversampling improve recall, but less than for the linear classifier.

## 5.3 Decision Tree Classifier

In [None]:
tree_classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)
stratified_K_fold_scores(tree_classifier, credit_card_X, credit_card_labels)

Our decision tree model is much less conservative with positive classifications, and has significantly increased our recall score (though it is still very low, missing the majority of fraudulent transactions). Despite the accuracy being lower than the dummy classifier, this is the most promising model so far. 


In [None]:
tree_classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)
stratified_K_fold_scores_datasets(tree_classifier)

Under sampling has obtained a recall of 1, and an F1 score of 0.97. This is likely to be our most promising model and dataset. 

## 5.4 Random Forest Classifier


Random forest classifiers are slow to train, so K-fold validation is not feasible in our timeframe. Instead, we use the training and validation sets we defined earlier.

In [None]:
forest_classifier = RandomForestClassifier(random_state = RANDOM_SEED)
validation_set_scores(forest_classifier, credit_card_X_train, credit_card_labels_train, credit_card_X_validate, credit_card_labels_validate)

This is worse than the dummy classifier was. 

In [None]:
forest_classifier = RandomForestClassifier(random_state = RANDOM_SEED)
validation_set_scores_datasets(forest_classifier)

### 5.4.1 Plotting ROC Curve

In [None]:
# Use cross_val_predict() function, perform K-folder cross-validation to return a prediction for each fold (Aurelien Geron, 2020)
y_probs_forest = cross_val_predict(forest_classifier, credit_card_X, credit_card_labels.values.ravel(), cv=3, method="predict_proba")

In [None]:
# Have a look at the data
y_probs_forest

In [None]:
y_scores_forest = y_probs_forest[:, 1]
y_scores_forest

In [None]:
# Calculate TPR and FPR of various thresholds (Aurelien Geron, 2020)
FPR_forest, TPR_forest, thresholds_forest = roc_curve(credit_card_labels, y_scores_forest)

In [None]:
# Search for the first point on the precision-recall curve with at least 90 percent precision (Aurelien Geron, 2020)
# NOTE: thresholds_forest is rebound here to the thresholds returned by precision_recall_curve
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(credit_card_labels, y_scores_forest)
# np.argmax on a boolean array gives the position of its first True entry
index_of_first_precision_at_least_90_percent_forest = np.argmax(precisions_forest >= 0.90)
# Recall at that point
recall_for_90_percent_precision_forest = recalls_forest[index_of_first_precision_at_least_90_percent_forest]
# FPR of the first ROC point whose TPR reaches that recall (FPR_forest and TPR_forest come from roc_curve)
FPR_for_90_percent_precision_forest = FPR_forest[np.argmax(TPR_forest >= recall_for_90_percent_precision_forest)]

In [None]:
# Plot the curve of FPR to TPR (Aurelien Geron, 2020)
def plot_roc_curve(FPR, TPR, label=None):
    """Plots TPR against FPR on the current matplotlib figure, together with a dashed diagonal reference line (Aurelien Geron, 2020)."""
    # the curve itself
    plt.plot(FPR, TPR, label=label, linewidth=2)
    # dashed black diagonal from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    # both axes limited to [0, 1]
    plt.axis([0, 1, 0, 1])
    plt.grid(True)
    plt.xlabel('FPR (1 - specificity)', fontsize=16)
    plt.ylabel('TPR (recall)', fontsize=16)

In [None]:
# Draw the roc curves
plt.figure(figsize=(8, 6)) 

plot_roc_curve(FPR_forest, TPR_forest, "Random Forest") 
plt.plot([FPR_for_90_percent_precision_forest],[recall_for_90_percent_precision_forest], "ro")
plt.grid(True) 
plt.legend(loc="lower right", fontsize=16) 
plt.show()

In [None]:
# Calculate roc_auc_score
roc_auc_score(credit_card_labels, y_scores_forest)

### 5.4.2 Dropping non-correlated features

In [None]:
credit_card_X_drop_uncorr

In [None]:
credit_card_labels

In [None]:
forest_classifier_2 = RandomForestClassifier(n_estimators = 100, random_state = 42, oob_score=True)

forest_classifier_2.fit(credit_card_X_drop_uncorr, credit_card_labels)

In [None]:
credit_card_predictions_2 = forest_classifier_2.predict(credit_card_X_drop_uncorr)  

In [None]:
matrix = confusion_matrix(credit_card_labels, credit_card_predictions_2)
matrix

# 1 fraudulent transaction was not detected

In [None]:
# Showing the confusion matrix
sns.heatmap(matrix, annot=True, fmt='d')
plt.show()

In [None]:
# Precision_score 
precision_score(credit_card_labels, credit_card_predictions_2)

In [None]:
# Recall_score
recall_score(credit_card_labels, credit_card_predictions_2)

In [None]:
f1_score(credit_card_labels, credit_card_predictions_2)

#Not much change from the original data set

In [None]:
y_probs_forest = cross_val_predict(forest_classifier_2, credit_card_X_drop_uncorr, credit_card_labels.values.ravel(), cv=3, method="predict_proba")

In [None]:
y_probs_forest

In [None]:
y_scores_forest = y_probs_forest[:, 1]
y_scores_forest

In [None]:
FPR_forest, TPR_forest, thresholds_forest = roc_curve(credit_card_labels, y_scores_forest)

In [None]:
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(credit_card_labels, y_scores_forest)
index_of_first_precision_at_least_90_percent_forest = np.argmax(precisions_forest >= 0.90)
recall_for_90_percent_precision_forest = recalls_forest[index_of_first_precision_at_least_90_percent_forest]
FPR_for_90_percent_precision_forest = FPR_forest[np.argmax(TPR_forest >= recall_for_90_percent_precision_forest)]

In [None]:
plt.figure(figsize=(8, 6)) 

plot_roc_curve(FPR_forest, TPR_forest, "Random Forest") 
plt.plot([FPR_for_90_percent_precision_forest],[recall_for_90_percent_precision_forest], "ro")
plt.grid(True) 
plt.legend(loc="lower right", fontsize=16) 
plt.show()

In [None]:
roc_auc_score(credit_card_labels, y_scores_forest)

### 5.4.3 Undersampled Dataset

In [None]:
print(imblearn.__version__)

In [None]:
# Random undersampling to balance the Class distribution

# X still contains the Class, id and Time columns at this point; they are dropped in a later cell
X = credit_card
y = credit_card[["Class"]]


# Define undersample strategy
# There are 218,660 examples in the majority class and 469 examples in the minority class
# After undersampling, both classes have 469 examples in the transformed training data set
# random_state is fixed so that the same rows are sampled on every run
undersample_02 = RandomUnderSampler(sampling_strategy='majority', random_state=RANDOM_SEED)
# Alternative: RandomUnderSampler(sampling_strategy=0.5)

credit_card_3_X, credit_card_3_y = undersample_02.fit_resample(X, y)

In [None]:
credit_card_3_X

In [None]:
credit_card_3_X = credit_card_3_X.drop(columns = [ 
                                            'Class',
                                            'id',
                                            'Time',
                                            ])

In [None]:
credit_card_3_X

In [None]:
credit_card_3_y

In [None]:
forest_classifier_3 = RandomForestClassifier(
    n_estimators=100, random_state=42, oob_score=True)

forest_classifier_3.fit(credit_card_3_X, credit_card_3_y)

In [None]:
credit_card_predictions_3 = forest_classifier_3.predict(credit_card_3_X)  

In [None]:
matrix = confusion_matrix(credit_card_3_y, credit_card_predictions_3)
matrix

# All detected perfectly

In [None]:
sns.heatmap(matrix, annot=True, fmt='d')
plt.show()

In [None]:
precision_score(credit_card_3_y, credit_card_predictions_3)

In [None]:
recall_score(credit_card_3_y, credit_card_predictions_3)

In [None]:
f1_score(credit_card_3_y, credit_card_predictions_3)

In [None]:
y_probs_forest = cross_val_predict(forest_classifier_3, credit_card_3_X, credit_card_3_y.values.ravel(), cv=3, method="predict_proba")

In [None]:
y_probs_forest

In [None]:
# Expanded array
y_scores_forest = y_probs_forest[:, 1]
y_scores_forest

In [None]:
FPR_forest, TPR_forest, thresholds_forest = roc_curve(credit_card_3_y, y_scores_forest)

In [None]:
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(credit_card_3_y, y_scores_forest)
index_of_first_precision_at_least_90_percent_forest = np.argmax(precisions_forest >= 0.90)
recall_for_90_percent_precision_forest = recalls_forest[index_of_first_precision_at_least_90_percent_forest]
FPR_for_90_percent_precision_forest = FPR_forest[np.argmax(TPR_forest >= recall_for_90_percent_precision_forest)]

In [None]:
plt.figure(figsize=(8, 6)) 

plot_roc_curve(FPR_forest, TPR_forest, "Random Forest") 
plt.plot([FPR_for_90_percent_precision_forest],[recall_for_90_percent_precision_forest], "ro")
plt.grid(True) 
plt.legend(loc="lower right", fontsize=16) 
plt.show()

In [None]:
# Roc_auc_score
roc_auc_score(credit_card_3_y, y_scores_forest)

### 5.4.4 Oversampled Dataset

In [None]:
# Random oversampling to balance the class distribution
X = credit_card
y = credit_card[["Class"]]

# Define oversample strategy
# sampling_strategy=0.25 oversamples the minority class until it has a quarter as many examples as
# the majority class; it does not bring the minority class up to the full majority count.
# random_state is fixed so that the same rows are sampled on every run.
oversample_minority = RandomOverSampler(sampling_strategy=0.25, random_state=RANDOM_SEED)
# Fit
credit_card_4_X, credit_card_4_y = oversample_minority.fit_resample(X, y)

# Remove the label and the identifier/time columns from the features, as is done for the undersampled
# data. Leaving Class in the features would hand the label itself to the classifier trained on them.
credit_card_4_X = credit_card_4_X.drop(columns=['Class', 'id', 'Time'])

In [None]:
credit_card_4_X

In [None]:
credit_card_4_y

In [None]:
forest_classifier_4 = RandomForestClassifier(n_estimators = 100, random_state = 42, oob_score=True)

forest_classifier_4.fit(credit_card_4_X, credit_card_4_y)

In [None]:
credit_card_predictions_4 = forest_classifier_4.predict(credit_card_4_X) 

In [None]:
# Confusion matrix of the training-set predictions for the oversampled data.
matrix = confusion_matrix(credit_card_4_y, credit_card_predictions_4)
matrix

# All detected perfectly
# NOTE(review): the predictions were made on the rows used for fitting, so a
# perfect matrix here is not evidence of how the model generalises.

In [None]:
# Draw the confusion matrix as a heatmap with integer counts in each cell.
sns.heatmap(matrix, annot=True, fmt='d')
plt.show()

In [None]:
# Precision of the training-set predictions on the oversampled data.
precision_score(credit_card_4_y, credit_card_predictions_4)

In [None]:
# Recall of the training-set predictions on the oversampled data.
recall_score(credit_card_4_y, credit_card_predictions_4)

In [None]:
# F1 score of the training-set predictions on the oversampled data.
f1_score(credit_card_4_y, credit_card_predictions_4)

In [None]:
# Out-of-fold class probabilities on the oversampled data (3 folds).
y_probs_forest = cross_val_predict(
    forest_classifier_4,
    credit_card_4_X,
    credit_card_4_y.to_numpy().ravel(),
    method="predict_proba",
    cv=3,
)

In [None]:
# Display the cross-validated probability array (one column per class).
y_probs_forest

In [None]:
# Second predict_proba column, used as the ranking score for the curves below.
y_scores_forest = y_probs_forest[:, 1]
y_scores_forest

In [None]:
# False/true positive rates over all score thresholds for the ROC curve.
FPR_forest, TPR_forest, thresholds_forest = roc_curve(credit_card_4_y, y_scores_forest)

In [None]:
# Precision and recall at every score threshold.
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(
    credit_card_4_y, y_scores_forest
)

# First curve point whose precision reaches 90%, and the recall at that point.
index_of_first_precision_at_least_90_percent_forest = (precisions_forest >= 0.90).argmax()
recall_for_90_percent_precision_forest = recalls_forest[
    index_of_first_precision_at_least_90_percent_forest
]

# False positive rate of the first ROC point whose TPR reaches that recall.
FPR_for_90_percent_precision_forest = FPR_forest[
    (TPR_forest >= recall_for_90_percent_precision_forest).argmax()
]

In [None]:
# ROC plot for the forest trained on the oversampled data.
plt.figure(figsize=(8, 6))

plot_roc_curve(FPR_forest, TPR_forest, "Random Forest")
# Red dot: the (FPR, recall) point that corresponds to 90% precision.
plt.plot(
    [FPR_for_90_percent_precision_forest],
    [recall_for_90_percent_precision_forest],
    "ro",
)
plt.grid(True)
plt.legend(fontsize=16, loc="lower right")
plt.show()

In [None]:
# Area under the ROC curve for the cross-validated scores.
roc_auc_score(credit_card_4_y, y_scores_forest)

### 5.4.5 Dropping the features that have the same distribution (between genuine and fraudulent)

In [None]:
# Feature frame for this experiment: remove the label, 'id' and 'Time', the
# V-features listed by number below, and 'Amount'.
credit_card_X_distr = credit_card.drop(columns=[
    'Class',
    'id',
    'Time',
    *(f'V{number}' for number in (5, 6, 7, 8, 12, 13, 15, 16, 20, 21, 22, 23, 25, 27, 28)),
    'Amount',
])


In [None]:
# Random forest trained on the reduced feature frame.
forest_classifier_6 = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)

forest_classifier_6.fit(X=credit_card_X_distr, y=credit_card_labels)


In [None]:
# NOTE(review): predictions are made on the same rows the forest was fitted
# on, so the metrics computed from them describe training fit only.
credit_card_predictions_6 = forest_classifier_6.predict(credit_card_X_distr)

In [None]:
# Confusion matrix of the training-set predictions for the reduced feature set.
matrix = confusion_matrix(credit_card_labels, credit_card_predictions_6)
matrix

In [None]:
# Draw the confusion matrix as a heatmap with integer counts in each cell.
sns.heatmap(matrix, annot=True, fmt='d')
plt.show()

In [None]:
# Precision of the training-set predictions for the reduced feature set.
precision_score(credit_card_labels, credit_card_predictions_6)

In [None]:
# Recall of the training-set predictions for the reduced feature set.
recall_score(credit_card_labels, credit_card_predictions_6)

In [None]:
# F1 score of the training-set predictions for the reduced feature set.
f1_score(credit_card_labels, credit_card_predictions_6)

In [None]:
# Out-of-fold class probabilities for the reduced feature set (3 folds).
y_probs_forest = cross_val_predict(
    forest_classifier_6,
    credit_card_X_distr,
    credit_card_labels.to_numpy().ravel(),
    method="predict_proba",
    cv=3,
)

y_probs_forest

In [None]:
# Second predict_proba column, used as the ranking score for the curves below.
y_scores_forest = y_probs_forest[:, 1]
y_scores_forest

In [None]:
# False/true positive rates over all score thresholds for the ROC curve.
FPR_forest, TPR_forest, thresholds_forest = roc_curve(credit_card_labels, y_scores_forest)

In [None]:
# Precision and recall at every score threshold.
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(
    credit_card_labels, y_scores_forest
)

# First curve point whose precision reaches 80%, and the recall at that point.
index_of_first_precision_at_least_80_percent_forest = (precisions_forest >= 0.80).argmax()
recall_for_80_percent_precision_forest = recalls_forest[
    index_of_first_precision_at_least_80_percent_forest
]

# False positive rate of the first ROC point whose TPR reaches that recall.
FPR_for_80_percent_precision_forest = FPR_forest[
    (TPR_forest >= recall_for_80_percent_precision_forest).argmax()
]

In [None]:
# ROC plot for the forest trained on the reduced feature set.
plt.figure(figsize=(8, 6))

plot_roc_curve(FPR_forest, TPR_forest, "Random Forest")
# Red dot: the 80%-precision operating point. Both coordinates must come from
# the 80% computation in the previous cell; the 90% FPR variable is a leftover
# from an earlier section and belongs to a different model.
plt.plot([FPR_for_80_percent_precision_forest], [recall_for_80_percent_precision_forest], "ro")
plt.grid(True)
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# Area under the ROC curve for the cross-validated scores.
roc_auc_score(credit_card_labels, y_scores_forest)

### 5.4.6 Cross validation

In [None]:
# Cross validation
# It takes a lot of time - 8 hours when the folds are fitted one after another.

K_ = 10
# Split the training set into 10 different subsets (Aurelien Geron, 2020)
# Reference: https://github.com/ageron/handson-ml2


# n_jobs=-1 fits the folds in parallel on all available CPU cores; each fold
# is still fitted and scored independently, so the scores are the same.
forest_classifier_scores = cross_val_score(forest_classifier,
                                           credit_card_X,
                                           credit_card_labels,
                                           scoring="neg_mean_squared_error",
                                           cv=K_,
                                           n_jobs=-1)

# The scorer returns the negated MSE (higher is better), hence the minus sign
# before the square root.
forest_classifier_rmse_scores = np.sqrt(-forest_classifier_scores)

In [None]:
def display_scores(scores, decimals=4):
    """Print cross-validation scores with their mean and standard deviation.

    Args:
        scores: NumPy array of per-fold scores.
        decimals: Number of decimal places kept when rounding. Error rates of
            a classifier lie between 0 and 1, so rounding to whole numbers
            would print every value as 0 and hide the result.
    """
    print("Scores:",             np.round(scores, decimals))
    print("Mean:",               np.round(scores.mean(), decimals))
    print("Standard deviation:", np.round(scores.std(), decimals))


# Print the per-fold RMSE values together with their mean and standard deviation.
display_scores(forest_classifier_rmse_scores) 

# All zero
# NOTE(review): any value below 0.5 prints as 0 when display_scores rounds to
# whole numbers, so an all-zero printout does not by itself show a zero error;
# confirm against the rounding precision display_scores uses.

## 5.5 Support Vector Classifier (SVC)

### 5.5.1 Poly Kernel

In [None]:
# Standardise the features, then fit a degree-3 polynomial-kernel SVC.
poly_kernel_svc = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(C=5, kernel="poly", degree=3, coef0=1)),
])
# Hand the pipeline and the train/validation splits to the notebook's scoring helper.
validation_set_scores(
    poly_kernel_svc,
    credit_card_X_train,
    credit_card_labels_train,
    credit_card_X_validate,
    credit_card_labels_validate,
)

# Drop the name so the pipeline can be garbage-collected.
del poly_kernel_svc


### 5.5.2 Gaussian Radial Basis Function Kernel

In [None]:
# Standardise the features, then fit an RBF-kernel SVC.
rbf_kernel_svc = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(C=0.001, kernel="rbf", gamma=5)),
])
# Hand the pipeline and the train/validation splits to the notebook's scoring helper.
validation_set_scores(
    rbf_kernel_svc,
    credit_card_X_train,
    credit_card_labels_train,
    credit_card_X_validate,
    credit_card_labels_validate,
)


# Drop the name so the pipeline can be garbage-collected.
del rbf_kernel_svc

We do not have time to try the SVCs on all datasets. They do not appear to outperform random forest or decision tree based on preliminary testing.

## 5.6 Gaussian Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, handed to the notebook's
# validation_set_scores_datasets helper.
gnb = GaussianNB()
validation_set_scores_datasets(gnb)

# Remove the name once the helper has run.
del gnb

# 6 Tuning the Model
## 6.1 Random Forest Grid Search

In [None]:
parameter_grid = [
    # Bootstrapped forests (the default): 3 x 4 combinations.
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # Forests without bootstrapping: 2 x 3 combinations. Out-of-bag scoring
    # needs bootstrap samples, so it is switched off for these; with the
    # estimator's oob_score=True every one of these fits raises a ValueError.
    {'bootstrap': [False], 'oob_score': [False],
     'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]


forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42, oob_score=True)

# The importances only exist on a fitted estimator, and the attribute is
# spelled feature_importances_ (plural).
forest_classifier.fit(credit_card_X, credit_card_labels)

print(forest_classifier.feature_importances_)

We can reduce our features based on their importance.

In [None]:
# Based on forest_classifier.feature_importances_
# Columns excluded from the grid search: the label, the identifier, 'Time',
# 'Amount' and the V-features judged unimportant. The names must match the
# DataFrame's columns exactly ('Time', not 'TIme'), otherwise drop() raises
# a KeyError.
unimportant = ['Class','id','Time','V1','V5','V6','V7','V11','V12','V13','V15','V16','V17','V18','V19','V20','V21','V22','V23','V24','V25','V27','V28','Amount']

# 5-fold grid search; scores are negated MSE, so larger (closer to 0) is better.
grid_search = GridSearchCV(forest_classifier, parameter_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(credit_card.drop(columns=unimportant), credit_card_labels)

# Cross validation

In [None]:
grid_search_results = grid_search.cv_results_
# mean_test_score holds the negated MSE, so negate it before the square root.
# Keep four decimals: for 0/1 labels these RMSE values lie between 0 and 1 and
# rounding to whole numbers would print almost every one of them as 0.
for mean_score, params in zip(grid_search_results["mean_test_score"],
                              grid_search_results["params"]):
    print(np.round(np.sqrt(-mean_score), 4), params)

    # Not working
    # Overfitting

In [None]:
credit_card_test = pandas.read_csv("../data/test.csv")

# GridSearchCV fits clones of forest_classifier, not the object itself, and it
# trains on a reduced column set. Predict with the refitted best estimator and
# give it exactly the columns it was trained on, in the same order; the raw
# test frame also carries columns (such as 'id') the model never saw.
tuned_forest = grid_search.best_estimator_
credit_card_y_predicted = tuned_forest.predict_proba(
    credit_card_test[list(tuned_forest.feature_names_in_)])[:, 1]
# Predict_proba: calculate the class probability of each row of credit card data (Giorgos Myrianthous, 2021)
# Reference: https://towardsdatascience.com/predict-vs-predict-proba-scikit-learn-bdc45daa5972

In [None]:
# Display the predicted probabilities for the test rows.
credit_card_y_predicted

In [None]:
# Load the submission template. Bind it to sample_submission only: also
# assigning it to credit_card would replace the training data that later
# cells read from that name.
sample_submission = pandas.read_csv("../data/sample_submission.csv")

In [None]:
# credit_card_y_predicted is already the 1-D vector taken from the second
# predict_proba column, so assign it directly; indexing each scalar with [1]
# raises an IndexError.
sample_submission['Class'] = credit_card_y_predicted
sample_submission.to_csv('Group17_forest.csv', index=False)
# (pandas.DataFrame.to_csv)
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

In [None]:
# Random Forest classifier stratified k-cross validation

# Features are every column except the identifier and the label; the label
# column is copied out separately.
credit_card_X = credit_card.drop(["id", "Class"], axis=1)
credit_card_labels = credit_card["Class"].copy()

stratified_K_fold_scores(forest_classifier, credit_card_X, credit_card_labels)

# The forest is not needed after this point.
del forest_classifier

## 6.2 Decision Tree Grid Search
We found the decision tree trained on the undersampled dataset to be our best model, and it is quicker to compute than an equivalent random forest, so we will investigate tuning it. First, we see what values were chosen by default.


In [None]:
# Decision tree with default hyperparameters (only the seed is set), fitted on
# the cleaned undersampled data.
tree_classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)
tree_classifier.fit(credit_card_X_under_clean, credit_card_labels_under)

# Print the estimator's full parameter dictionary to see the defaults in use.
print(tree_classifier.get_params())

del tree_classifier

In [None]:
# Hyperparameter grid for the decision tree.
param_grid = {
    'random_state': [12, 24, RANDOM_SEED, 5059],
    'max_depth': [5, 25, 40, 50],
    # An integer min_samples_split must be at least 2. A value of 1 makes
    # every fit that uses it fail, wasting a quarter of the search on
    # combinations that can only score NaN, so it is not searched.
    'min_samples_split': [2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'splitter': ['random', 'best'],
}

tree_classifier = DecisionTreeClassifier()

# Recall is the selection metric, matching the priority set when framing the problem.
grid_search = GridSearchCV(
    estimator=tree_classifier, param_grid=param_grid, cv=K, scoring='recall')
# Warnings raised during the search are silenced to keep the cell output readable.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(credit_card_X_under_clean, credit_card_labels_under)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best recall: {grid_search.best_score_}")

del param_grid, tree_classifier, grid_search


### 6.2.1 Testing with Kaggle
Now that we have tuned our model, we can test its predictions on Kaggle.

In [None]:
# Read test dataset
credit_card_test = pandas.read_csv("../data/test.csv")

# Add an empty "Class" column - presumably so the test frame has the same
# columns the pipeline receives for the training data (confirm against the
# pipeline definition).
credit_card_test = credit_card_test.assign(Class=None)

# NOTE(review): fit_transform re-fits the pipeline on the test rows instead of
# reusing what it learned from the training data. If the pipeline holds any
# fitted state (scaling, imputation), the test features are prepared
# differently from credit_card_X_under_clean - confirm whether transform()
# was intended.
# NOTE(review): the column names are taken from credit_card, so this relies on
# credit_card still being the training DataFrame at this point.
credit_card_test_clean = pandas.DataFrame(pipeline.fit_transform(credit_card_test), columns=[
    item for item in credit_card.columns if item not in ['id', 'Class', 'Time']])

# Decision tree with explicitly chosen hyperparameters, fitted on the cleaned
# undersampled data.
tree_classifier = DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                                         max_depth=5, max_features='sqrt', min_samples_split=10, random_state=12, splitter='random')
tree_classifier.fit(credit_card_X_under_clean, credit_card_labels_under)

# Create predictions on test data
predictions = tree_classifier.predict(credit_card_test_clean)

print("Decision Tree Original Predictions:", predictions)

# Test the probabilities for the test data using the decision tree classifier
# (one row per test sample, one column per class).
probability = tree_classifier.predict_proba(credit_card_test_clean)

print("Decision Tree Original Probabilities:", probability)

In [None]:
# Load the submission template; only its "id" column is reused.
sample_submission = pandas.read_csv("../data/sample_submission.csv")

# Submission filled with the second predict_proba column.
sample_submission_pos = sample_submission[["id"]].assign(Class=probability[:, 1])
sample_submission_pos.to_csv('Group_17_positive.csv', index=False)

# Same ids, filled with the first predict_proba column.
sample_submission_neg = sample_submission[["id"]].assign(Class=probability[:, 0])
sample_submission_neg.to_csv('Group_17_negative.csv', index=False)