In [None]:
# Load the third-party nb_black IPython extension
# (presumably reformats executed cells with Black — confirm it is installed).
%load_ext nb_black

In [None]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import FunctionTransformer, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed


from helpers.profiling import timing
from helpers.series_list import bag_of_words_series, get_n_elements
from modelling.utils.performance import (
    get_classification_report,
    plot_conf_matrix,
    plot_precision_recall_curve_over_thresholds,
    plot_precision_recall_curve,
)


from helpers.read_file import read_feather
from pipelines.preprocessing_pandas import (
    create_cos,
    create_sin,
    extract_hour_str,
    extract_weekday_timestamp,
    frequency_encoding,
)

In [None]:
# Apply matplotlib's "ggplot" style sheet to figures created after this cell.
plt.style.use("ggplot")

# Load data

In [None]:
# Read the training transactions from the Feather file and preview the first rows.
train_path = "../../data/train.fth"
df = read_feather(train_path)
df.head()

In [None]:
# Assemble the calendar date from the separate Year/Month/Day columns in one
# vectorised call. The previous row-wise `apply` built one Python datetime per
# row, which is very slow on a large frame; the resulting column is
# datetime64 in both cases.
df["date"] = pd.to_datetime(
    {"year": df["Year"], "month": df["Month"], "day": df["Day"]}
)

In [None]:
# Keep a handle on the raw "Errors?" column; the following cells derive the
# error count and the bag-of-words columns from it.
errors_series = df["Errors?"]

In [None]:
# Per-row element count of the "Errors?" entries, as reported by the project
# helper (presumably the number of errors per transaction — confirm).
n_errors_series = get_n_elements(errors_series)

# 1 when the count is positive, else 0. The flag is built on df's own index so
# the column assignment below pairs values with rows by position; a bare
# pd.Series(...) would get a fresh RangeIndex and misalign (yielding NaN)
# whenever df does not carry a default index.
errors_indicator_series = pd.Series(
    np.where(n_errors_series > 0, 1, 0), index=df.index
)

df["errors_indicator"] = errors_indicator_series

In [None]:
def _amount_to_float(raw_amount):
    """Drop the first character of ``raw_amount`` and parse the rest as a float."""
    # The first character is presumably a currency symbol — confirm against the data.
    return float(raw_amount[1:])


df["Amount"] = df["Amount"].apply(_amount_to_float)

In [None]:
# Replace missing merchant states with an empty string.
df["Merchant State"] = df["Merchant State"].fillna("")

In [None]:
# Expand the "Errors?" column through the project helper
# (presumably one column per distinct error type — confirm against helpers.series_list).
bag_of_words = bag_of_words_series(errors_series)

In [None]:
# Append the bag-of-words columns to the right of the transaction frame.
# NOTE: `df` is rebound here, so re-running this cell appends the columns again.
df = pd.concat(objs=[df, bag_of_words], axis="columns", sort=False)

In [None]:
# List the column names currently present on the frame.
df.columns

# Split train, test sets

In [None]:
# Hold out 20% of the rows as the test set, stratified on the error flag;
# the fixed random_state makes the split reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": errors_indicator_series,
}
X_train, X_test = train_test_split(df, **split_options)

In [None]:
# Train the autoencoder only on transactions without errors, i.e. rows whose
# "Errors?" value is missing. `isna()` states that intent directly; the former
# `col != col` trick relied on NaN being unequal to itself and yields <NA>
# (treated as False when filtering) on pandas' nullable dtypes.
X_train = X_train[X_train["Errors?"].isna()]
X_train

# Preprocess data / creating pipelines

In [None]:
# Shared transformer building blocks reused by the per-column pipelines below.

# Frequency encoding through the project helper, called with normalize=False.
frequency_encoding_fn = FunctionTransformer(
    func=frequency_encoding, kw_args=dict(normalize=False)
)

# One-hot encoding; categories not seen during fit are ignored at transform time.
ohe_fn = OneHotEncoder(handle_unknown="ignore")

# Standardisation with both centring and scaling switched on.
scaler_fn = StandardScaler(with_std=True, with_mean=True)

In [None]:
def create_cyclical_feature_fn(raw_feature: str, period: int) -> FeatureUnion:
    """Build a FeatureUnion that applies ``create_sin`` and ``create_cos`` side by side.

    Args:
        raw_feature: Prefix of the two step names, ``<raw_feature>_sin`` and
            ``<raw_feature>_cos``.
        period: Forwarded as the ``period`` keyword argument to both
            ``create_sin`` and ``create_cos``.

    Returns:
        A FeatureUnion holding the sine transformer first and the cosine
        transformer second.
    """
    trig_functions = (("sin", create_sin), ("cos", create_cos))
    steps = [
        (
            f"{raw_feature}_{suffix}",
            FunctionTransformer(trig_fn, kw_args={"period": period}),
        )
        for suffix, trig_fn in trig_functions
    ]
    return FeatureUnion(steps)

## Card

In [None]:
# Cards are one-hot encoded. This is an alias of the shared `ohe_fn` instance, not a copy.
card_pipeline = ohe_fn

## Month

In [None]:
# Sine/cosine encoding of the month with a period of 12.
month_pipeline = create_cyclical_feature_fn("Month", period=12)

## Day

In [None]:
# Sine/cosine encoding of the day with a period of 31.
day_pipeline = create_cyclical_feature_fn("Day", period=31)

In [None]:
# Extract the weekday through the project helper, then one-hot encode it.
weekday_extractor = FunctionTransformer(extract_weekday_timestamp)
weekday_pipeline = Pipeline(
    steps=[("extract_weekday", weekday_extractor), ("ohe", ohe_fn)]
)

## Time

In [None]:
# Sine/cosine encoding of the hour with a period of 24, built with the same
# helper as the month and day features instead of repeating its body here.
# The helper produces the same step names as before: "hour_sin" and "hour_cos".
hour_trig = create_cyclical_feature_fn("hour", period=24)

# Keep the individual transformers reachable under their original names.
hour_sin = hour_trig.transformer_list[0][1]
hour_cos = hour_trig.transformer_list[1][1]

In [None]:
# Extract the hour from "%H:%M" time strings via the project helper, then
# apply the sine/cosine union defined above.
hour_extractor = FunctionTransformer(
    extract_hour_str, kw_args={"time_format": "%H:%M"}
)
hour_pipeline = Pipeline(
    steps=[("extract_hour", hour_extractor), ("hour_trig", hour_trig)]
)

## Amount

In [None]:
# Standardise the amount with the shared StandardScaler.
amount_pipeline = Pipeline([("scale", scaler_fn)])

## Use Chip

In [None]:
# "Use Chip" is one-hot encoded. This is an alias of the shared `ohe_fn` instance, not a copy.
use_chip_pipeline = ohe_fn

## Merchant Name

In [None]:
# Frequency-encode the merchant name, then standardise the encoded values.
merchant_name_pipeline = Pipeline(
    steps=[("merchant_name", frequency_encoding_fn), ("scaler", scaler_fn)]
)


## Merchant City

In [None]:
# Frequency-encode the merchant city, then standardise the encoded values.
merchant_city_steps = [
    ("merchant_city", frequency_encoding_fn),
    ("scaler", scaler_fn),
]
merchant_city_pipeline = Pipeline(steps=merchant_city_steps)

## Merchant State

In [None]:
# Frequency-encode the merchant state, then standardise the encoded values.
merchant_state_pipeline = Pipeline(
    steps=[("merchant_state", frequency_encoding_fn), ("scaler", scaler_fn)]
)


## MCC

In [None]:
# Frequency-encode the MCC, then standardise the encoded values.
mcc_pipeline = Pipeline(
    steps=[("mcc", frequency_encoding_fn), ("scaler", scaler_fn)]
)

## Creating pipeline

In [None]:
# Column-wise preprocessing that feeds the autoencoder. Columns not listed
# here are dropped (ColumnTransformer's default `remainder`).
#
# Only amount, hour and weekday are active. The "Errors?_*" bag-of-words
# passthrough and the "Use Chip", "Merchant Name", "Merchant City",
# "Merchant State" and "MCC" pipelines defined above are currently disabled.
pipeline = ColumnTransformer(
    transformers=[
        ("Amount", amount_pipeline, ["Amount"]),
        ("Hour", hour_pipeline, ["Time"]),
        ("weekday_pipeline", weekday_pipeline, ["date"]),
    ],
    # Always return a dense array. The one-hot block is sparse, and with the
    # default threshold (0.3) the stacked output silently turns into a sparse
    # matrix as soon as the overall density drops below it — e.g. when more
    # one-hot features are enabled — which the pd.DataFrame(...) wrapping in
    # the next cells does not expect.
    sparse_threshold=0,
)

In [None]:
# Fit the preprocessing on the training rows and transform them in one go;
# any NaN left in the result is replaced by 0.
X_train_prepared = pd.DataFrame(pipeline.fit_transform(X_train)).fillna(0).values

In [None]:
# Transform the test rows with the already-fitted preprocessing (no refit);
# any NaN left in the result is replaced by 0.
X_test_prepared = pd.DataFrame(pipeline.transform(X_test)).fillna(0).values


# Create model

## Define extra steps during training

In [None]:
# Stop training once the validation loss has gone 10 epochs without improving.
# The weights of the last epoch are kept (restore_best_weights=False).
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="auto",
    patience=10,
    min_delta=0,
    baseline=None,
    restore_best_weights=False,
    verbose=1,
)

# Multiply the learning rate by 0.1 when the validation loss has not improved
# by more than 0.0001 for 10 epochs.
reduce_lr_on_plateau = ReduceLROnPlateau(
    monitor="val_loss",
    mode="auto",
    patience=10,
    min_delta=0.0001,
    factor=0.1,
    cooldown=0,
    min_lr=0,
    verbose=0,
)



In [None]:
# Number of columns produced by the preprocessing; the next cell uses it as the model's input width.
X_train_prepared.shape[1]

In [None]:
# Fix the random seeds so weight initialisation and dropout are reproducible.
set_random_seed(42)
n_features = X_train_prepared.shape[1]
latent_dim = 5  # bottleneck width: encoder output and decoder input must agree

# Encoder: n_features -> 10 -> latent_dim
encoder = Sequential(
    [
        Dense(
            units=10,
            kernel_initializer="he_normal",
            activation="relu",
            input_shape=(n_features,),
        ),
        Dropout(0.2),
        Dense(units=latent_dim, kernel_initializer="he_normal", activation="relu"),
    ]
)

# Decoder: latent_dim -> 10 -> n_features
decoder = Sequential(
    [
        Dense(
            units=10,
            kernel_initializer="he_normal",
            activation="relu",
            input_shape=(latent_dim,),
        ),
        Dropout(0.2),
        # Linear output: the targets contain the standardised amount and
        # sine/cosine features, which are negative or larger than 1. A sigmoid
        # output is confined to (0, 1) and could never reconstruct them, which
        # inflates the reconstruction error of perfectly normal rows.
        Dense(units=n_features, activation="linear"),
    ]
)

# Full model: encoder followed by decoder, trained to reproduce its input.
autoencoder = Sequential([encoder, decoder])

In [None]:
# Compile the autoencoder: Adam optimiser, mean squared error as both the
# loss and the reported metric.
autoencoder.compile(optimizer="adam", loss="mse", metrics=["mse"])

# Fit the autoencoder: input and target are the same matrix.
# NOTE(review): the test matrix doubles as validation data, so early stopping
# and the learning-rate schedule both see it — confirm this is intended.
history = autoencoder.fit(
    x=X_train_prepared,
    y=X_train_prepared,
    validation_data=(X_test_prepared, X_test_prepared),
    batch_size=512,
    epochs=100,
    shuffle=True,
    callbacks=[early_stopping, reduce_lr_on_plateau],
)

# Learning curves

In [None]:
 fig, ax = plt.subplots(1, 1, figsize=(8, 4))

metric = 'loss'
ax.plot(history.history[f"{metric}"], label="training")
ax.plot(history.history[f"val_{metric}"], label="validation")
ax.set_title(f"{metric}")
ax.legend()



# Anomaly Detection

## Get reconstruction errors during the training phase

In [None]:
# Reconstruct the training matrix with the fitted autoencoder.
y_pred_train = autoencoder.predict(X_train_prepared)

In [None]:
# Mean squared error between the training matrix and its reconstruction,
# obtained by invoking the loss object's `call` directly.
mse_loss = tf.keras.losses.MeanSquaredError()
reconstruction_errors_training = mse_loss.call(
    y_true=X_train_prepared, y_pred=y_pred_train
)

# Anomaly threshold: mean training error plus one standard deviation.
training_errors = reconstruction_errors_training.numpy()
threshold = training_errors.mean() + training_errors.std()

threshold

## Validation set

In [None]:
# Reconstruct the test matrix with the fitted autoencoder.
y_pred_test = autoencoder.predict(X_test_prepared)

In [None]:
# Mean squared error (not absolute error) between the test matrix and its
# reconstruction, obtained by invoking the loss object's `call` directly.
test_mse_loss = tf.keras.losses.MeanSquaredError()
reconstruction_errors_test = test_mse_loss.call(
    y_true=X_test_prepared, y_pred=y_pred_test
)
reconstruction_errors_test.numpy()

### Transactions in the test set with errors

In [None]:
# Store the reconstruction errors as a new column on the test frame.
X_test["reconstruction_error"] = reconstruction_errors_test

In [None]:
# Split the test rows by whether "Errors?" is present. `isna()` replaces the
# former `col == col` / `col != col` self-comparison, which depends on NaN
# being unequal to itself and yields <NA> on pandas' nullable dtypes.
mask_errorless = X_test["Errors?"].isna()
mask_error = ~mask_errorless

In [None]:
# All test rows with errors ...
transactions_with_errors_test = X_test.loc[mask_error]
# ... plus the first 1581 test rows without errors.
# NOTE(review): 1581 is hard-coded — presumably the number of error rows, to
# balance the two groups; confirm.
transactions_without_errors_test = X_test.loc[mask_errorless].iloc[:1581]

sample = pd.concat(
    objs=[transactions_with_errors_test, transactions_without_errors_test],
    axis="index",
)
sample.shape

In [None]:
# Summary statistics of the reconstruction error over the sampled rows.
sample["reconstruction_error"].describe()

In [None]:
# Keep only the sampled rows whose reconstruction error is below the threshold.
# NOTE: `sample` is rebound, so the rows at or above the threshold are no
# longer available from this cell onwards.
reconstruction_error_mask = sample["reconstruction_error"].lt(threshold)
sample = sample.loc[reconstruction_error_mask]
sample

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 8))

# Derive the split from `sample` itself. Indexing `sample` with the masks
# computed on the full test frame makes pandas reindex the boolean key (with a
# UserWarning), because `sample` only holds a subset of those rows.
sample_has_error = sample["Errors?"].notna()
errors_with = sample.loc[sample_has_error, "reconstruction_error"].to_numpy()
errors_without = sample.loc[~sample_has_error, "reconstruction_error"].to_numpy()

# Each group is plotted against its own running row position.
ax.plot(np.arange(len(errors_with)), errors_with, label="with errors")
ax.plot(np.arange(len(errors_without)), errors_without, label="without errors")

ax.axhline(threshold, color="black", linestyle="--", label="threshold")
ax.set_title("Reconstruction error of sampled test transactions")
ax.set_xlabel("row position within group")
ax.set_ylabel("reconstruction error (MSE)")
ax.legend();


# <font color='green'> TEST </font>