In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime
import os
import tempfile

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn import metrics

In [5]:
import tensorflow as tf
from tensorflow import keras
import mlflow
import mlflow.keras

- include timestamp as feature - done
- shuffle dataset - shuffle so that each set has frauds (stratify) - done
- separate 'retraining set' for the cloud updates 
- Data processing: change names of columns, create time stamp, delete 'time' column, train test split, stratify, apply encoding pipeline

In [None]:
# Load the raw transaction data; the path is relative to the working directory.
data = pd.read_csv('ibm_4y.csv')
# Preview the first rows to check the columns were parsed as expected.
data.head()

### Data Cleaning

MCC codes are four-digit numbers that classify a business by the services it provides or products it sells. If a business has a variety of products or services, the MCC code is usually based on the product or service that makes up the bulk of the business’ sales. (https://www.heartland.us/resources/blog/merchants-guide-to-mcc-codes)

In [None]:
# Normalise the column names: lower-case everything, then give the multi-word /
# punctuated headers snake_case names.
data = data.rename(columns=str.lower).rename(columns={
    'use chip': 'use_chip',
    'merchant name': 'merchant_name',
    'merchant city': 'merchant_city',
    'merchant state': 'merchant_state',
    'errors?': 'errors',
    'is fraud?': 'is_fraud',
})

# Split 'time' on ':' into hour / minute features. The cast matters: str.split
# yields strings, and without it both columns would stay object-typed text all
# the way into the numeric scaler.
data[['hour', 'minute']] = data['time'].str.split(':', expand=True).astype('int64')

# Assemble a timestamp from the date parts and use it as a sorted time-series index.
data['date'] = pd.to_datetime(data[['year', 'month', 'day', 'hour', 'minute']])
data = data.set_index('date').sort_index()

# 'time' is now fully represented by hour / minute and the index.
data = data.drop(columns=['time'])

# Convert amount to float; the first character is dropped before parsing
# (presumably a currency symbol -- TODO confirm against the raw file).
data['amount'] = data['amount'].str[1:].astype('float64')

data.head()



### PCA - coding and normalizing 

Train/test split: since 2020 is not a full year, test data will be taken out of 2019. 

In [8]:
# Feature matrix: every column except the target.
X = data.drop(columns=['is_fraud'])
# Target, kept as a single-column DataFrame (double brackets) rather than a Series.
y = data[['is_fraud']]

In [9]:
# Hold out 20% of all rows as the final test set. Stratifying on the label keeps
# the fraud ratio the same on both sides of the split.
Xtrain, xtest, Ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [10]:
# Carve the validation set out of the remaining rows: 25% of the 80% left after
# the test split, i.e. 20% of the full data set (60/20/20 overall).
xtrain, xval, ytrain, yval = train_test_split(
    Xtrain,
    Ytrain,
    test_size=0.25,
    stratify=Ytrain,
    shuffle=True,
    random_state=42,
)

In [11]:

# Encoder for the target labels (fitted later, on the training labels).
label_enc = LabelEncoder()

# Text columns: replace missing values with the constant string 'online', encode the
# categories as integers (categories unseen at fit time become -1), then standardise.
state_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='online')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', StandardScaler())])

# Same three steps, but missing values are replaced with the constant 0 before encoding.
zero_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', StandardScaler())
])

# Columns are selected by position in X:
#   - number_scaler: plain standardisation of numerical features (no missing values)
#   - NAN_replace_text: text replacement, applied to state and errors
#   - NAN_replace_zero: zero replacement, applied to zip, city and chip
# NOTE(review): the index -> column mapping depends on the column order of X;
# confirm against X.columns before changing the cleaning step.
# remainder='drop' discards any column not listed here.
transformer= ColumnTransformer(
    transformers=[
        ('number_scaler', StandardScaler(), [0, 1, 2, 3, 4, 5, 7, 11, 13, 14]),
        ('NAN_replace_text', state_pipe, [9, 12]),
        ('NAN_replace_zero', zero_pipe, [6, 8, 10]),
        
    ], remainder='drop', verbose_feature_names_out=False)        # idea: Pipeline(memory=<cache dir>) to cache fitted steps


In [None]:
# Encode the labels. The class -> integer mapping is learned from the training
# labels only and then reused for validation; re-fitting on yval (as before)
# could assign different integers if the two label sets ever differed.
# np.ravel flattens the single-column label frames to the 1-D shape
# LabelEncoder expects.
ytrain = label_enc.fit_transform(np.ravel(ytrain))
yval = label_enc.transform(np.ravel(yval))

# Fit the feature transformer on the training split only, so no validation or
# test statistics leak into it, then apply it to all three splits.
xtrain = transformer.fit_transform(xtrain)
xval = transformer.transform(xval)
xtest = transformer.transform(xtest)

In [None]:
# Inspect the transformed training features.
xtrain

In [None]:
# The model requires 2-D label tensors, so turn each label vector into a
# single-column array; -1 lets NumPy infer the row count.
ytrain = ytrain.reshape(-1, 1)
yval = yval.reshape(-1, 1)
ytrain.ndim


Correlation matrix and distributions: 
(already at this phase it is presumed that the data is quite biased - 
it looks like a fraud group from Rome, Italy found its way into the data and made many fraudulent 
transactions)

### Model Creation

variables to play with 
- shuffle/ unshuffle
- number of layers
- size of layers
- dropout
- epochs
- use standard scaler
- use standardization instead of scaler


In [None]:
# Number of feature columns produced by the transformer.
input_dim = xtrain.shape[1]
# Display it next to the full training shape as a sanity check.
input_dim, xtrain.shape


In [16]:
# Metrics tracked during training and evaluation, keyed by the name each metric
# reports under (insertion order is the order they are passed to compile()).
METRICS = {
    metric.name: metric
    for metric in (
        keras.metrics.BinaryCrossentropy(name='binary_crossentropy'),  # same as model's loss
        keras.metrics.MeanSquaredError(name='Brier_score'),
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'),  # precision-recall curve
    )
}


In [None]:
# Sanity check: the three splits together should have as many rows as `data`;
# the last value is the number of transformed feature columns.
xtrain.shape[0] + xval.shape[0] + xtest.shape[0], data.shape[0], xtrain.shape[-1]

In [None]:
# Set the initial output bias from the class balance.
# Count negative / positive samples over the full (encoded) label column.
neg, pos = np.bincount(label_enc.transform(y))
# NOTE(review): this name shadows the builtin `sum` for the rest of the notebook.
sum = neg + pos

# Log of the positive/negative ratio, kept as a one-element array.
initial_bias = np.log([pos/neg])

In [54]:
# Model hyper-parameters, read by create_model().
params = {
    'learning_rate': 1e-3,
    'initial_bias': initial_bias,
    'dropout': 0.5,
    'layer1_size': 16,
    'activation1': 'relu'
    #'callbacks': [eraly_stopping, log_callback]
    }

# Training-loop settings. Note that 'batch_size' and 'epochs' are defined here,
# not in `params`.
train_params = {
    'patience': 10,
    'epochs' : 100,
    'batch_size': 2048,

}

In [20]:
def create_model(metrics=METRICS, output_bias=None, params=params):
    """Build and compile the binary fraud classifier.

    Architecture: input -> Dense(layer1_size, activation1) -> Dropout -> Dense(1, sigmoid).

    Args:
        metrics: mapping of name -> Keras metric; its values are passed to compile().
        output_bias: optional constant used to initialise the bias of the output
            unit. When None, the output bias is initialised to zeros.
        params: dict providing 'layer1_size', 'activation1', 'dropout' and
            'learning_rate'.

    Returns:
        The compiled keras.Sequential model.

    Note:
        The defaults for `metrics` and `params` are bound when this function is
        defined, and the input width is read from the global `xtrain`.
    """
    # Compare against None explicitly: a legitimate bias of 0.0 is falsy, and
    # truth-testing a NumPy array is only defined for a single element.
    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        # Use the usual Dense default instead of handing None to the layer.
        bias_initializer = 'zeros'

    model = keras.Sequential([
        keras.Input(shape=(xtrain.shape[-1],)),
        keras.layers.Dense(
            params['layer1_size'],
            activation=params['activation1'],
        ),
        keras.layers.Dropout(params['dropout']),
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=bias_initializer),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=params['learning_rate']),
        # The model output has shape (n, 1), which is why the label arrays are
        # reshaped to a single column before fitting.
        loss=keras.losses.BinaryCrossentropy(),
        metrics=list(metrics.values())
    )

    return model

In [21]:
# Early stopping on the validation precision-recall AUC (the 'prc' metric):
# stop once it has not improved for `patience` epochs and roll the model back
# to the best weights seen. mode='max' because a higher value is better.
eraly_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc',
    verbose=1,
    patience=train_params['patience'],
    mode='max',
    restore_best_weights=True
)



In [None]:
# Build the model with default settings (no explicit output bias) and show its layers.
model = create_model()
model.summary()

In [None]:
# Test run before training: predict on the first 10 rows and check the output rank.
model.predict(xtrain[:10]).ndim


In [None]:
# Loss of the untrained model without an explicit output bias.
# The batch size lives in `train_params`; `params` has no 'batch_size' key.
results = model.evaluate(xtrain, ytrain, batch_size=train_params['batch_size'], verbose=0)
print("Loss: {:0.4f}".format(results[0]))


Set an initial output bias in order to offset the class imbalance in the data.
The bias is the log of the positive/negative ratio, which decreases the inherent loss at the start of training.

In [None]:
# Rebuild the model with the output bias initialised from the class balance,
# then look at its predictions on the first 10 training rows.
model = create_model(output_bias=params['initial_bias'])
model.predict(xtrain[:10])

In [None]:
# Loss of the untrained model once the output bias has been initialised.
# The batch size lives in `train_params`; `params` has no 'batch_size' key.
results = model.evaluate(xtrain, ytrain, batch_size=train_params['batch_size'], verbose=0)
print("Loss: {:0.4f}".format(results[0]))


In [27]:
# Checkpoint the current model's weights into a fresh temporary directory so
# later experiments can all start from the same initial weights.
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial_weights.weights.h5')
model.save_weights(initial_weights)


In [None]:
# Create a model from the saved weights, then force the output bias to 0 to
# compare against the bias-initialised variant.
model = create_model()
model.load_weights(initial_weights)
model.layers[-1].bias.assign([0.0])
zero_bias_history = model.fit(
    xtrain,
    ytrain,
    # the batch size lives in `train_params`; `params` has no 'batch_size' key
    batch_size=train_params['batch_size'],
    epochs=5,
    validation_data=(xval, yval),
    verbose=1
)

In [29]:
# Names for this run: the log directory is logs/<experiment>/<timestamp>.
experiment = 'baseline'
run_name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
logdir = os.path.join("logs", experiment, run_name)
os.makedirs(logdir, exist_ok=True)

# TensorBoard callback writing the graph and per-epoch histograms to logdir.
log_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, 
                                    write_graph=True, 
                                    histogram_freq=1)

In [None]:
# Model restored from the saved weights, i.e. with the initial bias set.
model = create_model()
model.load_weights(initial_weights)
careful_bias_history = model.fit(
    xtrain,
    ytrain,
    # the batch size lives in `train_params`; `params` has no 'batch_size' key
    batch_size=train_params['batch_size'],
    epochs=5,
    validation_data=(xval, yval),
    verbose=1,
    callbacks=[eraly_stopping, log_callback]
)

In [None]:
# Only the last expression of a cell is displayed, so the keys() result below
# is computed but not shown.
careful_bias_history.history.keys()
# Recorded values of the 'accuracy' metric on the training data.
careful_bias_history.history['accuracy']


In [34]:
import mlflow.tensorflow

In [44]:
# set_tracking_uri() returns None, so read the effective URI back explicitly
# instead of displaying the (empty) return value.
mlflow.set_tracking_uri("./mlruns")
uri = mlflow.get_tracking_uri()
uri

### Training

In [None]:
# Names for the TensorBoard log directory and the MLflow run
experiment = 'tensorboard'
run_name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
logdir = os.path.join("logs", experiment, run_name)

# TensorBoard logging callback (kept available, but not passed to fit() below)
log_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir,
                                    write_graph=True,
                                    histogram_freq=1)

# Start from the checkpointed initial weights
model = create_model()
model.load_weights(initial_weights)

mlflow.set_experiment('baseline')
with mlflow.start_run(run_name=run_name) as run:

    # Record both the model hyper-parameters and the training-loop settings
    mlflow.log_params(params)
    mlflow.log_params(train_params)

    mlflow.set_experiment_tag('baseline', 'fraud_analysis')

    baseline_history = model.fit(
            xtrain,
            ytrain,
            # batch size and epoch count live in `train_params`, not `params`
            batch_size=train_params['batch_size'],
            epochs=train_params['epochs'],
            callbacks=[eraly_stopping, mlflow.tensorflow.MlflowCallback(run)],
            validation_data=(xval, yval)
        )

    # Log the model only after fitting, so the stored artifact holds the trained
    # weights rather than the initial ones.
    mlflow.tensorflow.log_model(model, "models")

    run_id = run.info.run_id

### Evaluation

In [None]:
# Baseline model predictions on the training and test features.
# The batch size lives in `train_params`; `params` has no 'batch_size' key.
train_pred_baseline = model.predict(xtrain, batch_size=train_params['batch_size'])
test_pred_baseline = model.predict(xtest, batch_size=train_params['batch_size'])

In [None]:
# Encode the test labels with the already-fitted encoder and give them the same
# single-column 2-D layout as the training and validation labels.
ytest = label_enc.transform(ytest).reshape(-1, 1)
ytest.ndim

In [63]:
def plot_cm(labels, predictions, threshold=0.5):
    """Draw the confusion matrix as a heatmap and print its four cells.

    Args:
        labels: true class labels.
        predictions: predicted scores; a score strictly greater than
            `threshold` is counted as the positive (fraud) class.
        threshold: decision threshold applied to `predictions`.
    """
    predicted_positive = predictions > threshold
    cm = confusion_matrix(labels, predicted_positive)

    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title(f'Confusion matrix @{threshold:.2f}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

    # (caption, row, column) for each cell; rows are actual, columns predicted.
    cells = (
        ('Legit transactions detected (True neg):', 0, 0),
        ('Legit transactions incorrect detected (False pos):', 0, 1),
        ('Fraud transactions missed (False neg):', 1, 0),
        ('Fraud transactions detected (True Pos):', 1, 1),
    )
    for caption, row, col in cells:
        print(caption, cm[row][col])
    # Row 1 holds every actual fraud case, detected or missed.
    print('Total fraud transactions:', np.sum(cm[1]))

In [None]:
# Evaluate the baseline model on the held-out test split. The test features are
# stored in `xtest` and the batch size in `train_params`.
baseline_results = model.evaluate(xtest, ytest, batch_size=train_params['batch_size'], verbose=1)
# Label the values from the compile-time metric list: 'loss' first, then the
# METRICS keys in order. On Keras 3, `model.metrics_names` can collapse all
# compiled metrics into a single entry, which would truncate this listing.
for name, value in zip(['loss', *METRICS], baseline_results):
    print(name, ': ', value)
print()
plot_cm(ytest, test_pred_baseline)

In [None]:
# Confusion matrices at lower decision thresholds (0.1 and 0.01), for
# comparison with the default threshold of 0.5.
plot_cm(ytest, test_pred_baseline, threshold=0.1)
plot_cm(ytest, test_pred_baseline, threshold=0.01)

In [111]:
def plot_roc(name, labels, preds, **kwargs):
    """Plot a ROC (Receiver Operating Characteristic) curve on the current axes.

    The curve shows the true positive rate (Y axis) against the false positive
    rate (X axis) over the decision thresholds, both in percent. The view is
    zoomed to FPR in [-0.5, 25] and TPR in [80, 100.5].

    Args:
        name: legend label for the curve.
        labels: true binary labels.
        preds: predicted scores.
        **kwargs: forwarded to plt.plot (e.g. color, linestyle).

    Returns:
        (fpr, tpr) arrays as returned by metrics.roc_curve.
    """
    fpr, tpr, _ = metrics.roc_curve(labels, preds)            # _ for thresholds

    plt.plot(100*fpr, 100*tpr, label=name, linewidth=2, **kwargs)
    plt.xlabel('False positives [%]')
    plt.ylabel('True positives [%]')
    plt.xlim([-0.5,25])
    plt.ylim([80,100.5])
    plt.grid(True)
    ax = plt.gca()
    ax.set_aspect('equal')

    return fpr, tpr

    


In [None]:
# Colours for the curves, taken from matplotlib's default colour cycle so that
# `colors[i]` is defined before it is used here and in the later plotting cells.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

train_fp, train_tp = plot_roc('Train Baseline', ytrain, train_pred_baseline, color=colors[0])
test_fp, test_tp = plot_roc('Test baseline', ytest, test_pred_baseline, color=colors[0], linestyle='--')
plt.legend(loc='lower right')

Plot the area under the precision-recall curve (AUPRC):

In [90]:
def plot_prc(name, labels, preds, **kwargs):
    """Plot a precision-recall curve on the current axes.

    Precision goes on the X axis and recall on the Y axis.

    Args:
        name: legend label for the curve.
        labels: true binary labels.
        preds: predicted scores.
        **kwargs: forwarded to the plot call (e.g. color, linestyle).
    """
    precision, recall, _thresholds = metrics.precision_recall_curve(labels, preds)

    ax = plt.gca()
    ax.plot(precision, recall, label=name, linewidth=2, **kwargs)
    ax.set_xlabel('Precision')
    ax.set_ylabel('Recall')
    ax.grid(True)
    ax.set_aspect('equal')

In [None]:
# Precision-recall curves of the baseline model: solid line for the training
# split, dashed line for the test split.
plot_prc('Train baseline', ytrain, train_pred_baseline, color=colors[0])
plot_prc('Test baseline', ytest, test_pred_baseline, color=colors[0], linestyle='--')
plt.legend(loc='lower right')

In [None]:
# Weight each class inversely to its frequency. Scaling by total / 2 keeps the
# summed weight over all samples equal to the sample count
# (neg * weight_0 + pos * weight_1 == total).
# The total is computed here rather than read from a global named `sum`, which
# would shadow the builtin and break this cell if that global were missing.
total = neg + pos
weight_0 = (1 / neg) * (total / 2.0)
weight_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_0, 1: weight_1}

print(f'weight class 0: {weight_0:.2f}')
print(f'weight class 1: {weight_1:.2f}')

Training with class weight: 

In [None]:
# Train a second model from the same initial weights, this time with
# per-class loss weights to counter the class imbalance.
weighted_model = create_model()
weighted_model.load_weights(initial_weights)

weighted_history = weighted_model.fit(
    xtrain,
    ytrain,
    # batch size and epoch count come from `train_params`
    batch_size=train_params['batch_size'],
    epochs=train_params['epochs'],
    callbacks=[eraly_stopping],
    validation_data=(xval, yval),
    class_weight=class_weight
)

In [None]:
def plot_metrics(history):
    """Plot training vs. validation curves for loss, PR-AUC, precision and recall.

    Args:
        history: the History object returned by model.fit(); it must contain
            each metric below together with its 'val_' counterpart.
    """
    tracked = ['loss', 'prc', 'precision', 'recall']
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for ax, metric in zip(axes.flat, tracked):
        ax.plot(history.epoch, history.history[metric], label='Train')
        ax.plot(history.epoch, history.history['val_' + metric],
                linestyle='--', label='Val')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric.replace('_', ' ').capitalize())
        ax.legend()
    fig.tight_layout()


plot_metrics(weighted_history)

In [None]:
# Weighted model predictions on the training and test features. The test
# features are stored in `xtest` and the batch size in `train_params`.
train_pred_weighted = weighted_model.predict(xtrain, batch_size=train_params['batch_size'])
test_pred_weighted = weighted_model.predict(xtest, batch_size=train_params['batch_size'])

In [None]:
# Both models were asked to predict on xtrain, so the two shapes should match.
train_pred_baseline.shape, train_pred_weighted.shape

In [126]:
# Evaluate the weighted model on the held-out test split. The test features are
# stored in `xtest` and the batch size in `train_params`.
weighted_results = weighted_model.evaluate(xtest, ytest, batch_size=train_params['batch_size'], verbose=0)

In [None]:
# Label the values from the compile-time metric list: 'loss' first, then the
# METRICS keys in order. On Keras 3, `metrics_names` can collapse all compiled
# metrics into a single entry, which would truncate this listing.
for name, value in zip(['loss', *METRICS], weighted_results):
    print(name, ': ', value)
print()
plot_cm(ytest, test_pred_weighted)

In [None]:
# ROC curves for both models: solid = training split, dashed = test split.
# Each legend label names the model and split that is actually plotted.
plot_roc('Train baseline', ytrain, train_pred_baseline, color=colors[0])
plot_roc('Test baseline', ytest, test_pred_baseline, color=colors[0], linestyle='--')

plot_roc('Train weighted', ytrain, train_pred_weighted, color=colors[1])
plot_roc('Test weighted', ytest, test_pred_weighted, color=colors[1], linestyle='--')

plt.legend(loc='lower right')

In [None]:
# Precision-recall curves for both models: solid = training split, dashed = test split.
# Each legend label names the model and split that is actually plotted.
plot_prc('Train baseline', ytrain, train_pred_baseline, color=colors[0])
plot_prc('Test baseline', ytest, test_pred_baseline, color=colors[0], linestyle='--')

plot_prc('Train weighted', ytrain, train_pred_weighted, color=colors[1])
plot_prc('Test weighted', ytest, test_pred_weighted, color=colors[1], linestyle='--')

plt.legend(loc='lower right')