In [None]:
# -*- coding: utf-8 -*-
import os, sys, random, io, urllib
from datetime import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim

import pandas as pd
import random as rd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
from IPython.display import Image, display

# Use the GPU only when CUDA is actually usable. A cuDNN version can be reported
# by a CUDA-enabled build even on a machine without a usable GPU, in which case
# the later `.cuda()` calls would fail; `torch.cuda.is_available()` checks the
# device itself.
USE_CUDA = torch.cuda.is_available()

# Seed every random number generator this notebook draws from.
seed_value = 1234
rd.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
if USE_CUDA:
    torch.cuda.manual_seed(seed_value)

# Load the dataset CSV; the path is relative to the notebook's working directory.
ori_dataset = pd.read_csv('./data/fraud_dataset_v2.csv')

In [None]:
# Split off the target column. NOTE: `pop` removes 'label' from `ori_dataset`
# in place, so re-running this cell without reloading the CSV raises a KeyError.
label = ori_dataset.pop('label')

categorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT', 'BUKRS', 'WAERS']

# One-hot encode the categorical attributes. The explicit float dtype keeps the
# combined frame purely numeric: newer pandas versions emit bool dummies by
# default, and a mixed bool/float frame yields an object-dtype `.values` array
# that `torch.from_numpy` cannot convert.
ori_dataset_categ_transformed = pd.get_dummies(ori_dataset[categorical_attr_names], dtype=np.float64)

numeric_attr_names = ['DMBTR', 'WRBTR']

# Log-scale the numeric attributes; the small offset avoids log(0).
numeric_attr = np.log(ori_dataset[numeric_attr_names] + 1e-4)

# Min-max normalise each numeric column to [0, 1]. A constant column has a zero
# range; dividing by 1 instead maps it to 0 rather than NaN.
numeric_min = numeric_attr.min()
numeric_range = (numeric_attr.max() - numeric_min).replace(0.0, 1.0)
ori_dataset_numeric_attr = (numeric_attr - numeric_min) / numeric_range

# Categorical columns first, numeric columns last (later cells split on this order).
ori_subset_transformed = pd.concat([ori_dataset_categ_transformed, ori_dataset_numeric_attr], axis=1)

# Keep only rows labelled 'regular', capped at the first 320000 of them.
ori_subset_transformed = ori_subset_transformed.loc[label == 'regular'].iloc[:320000]

In [None]:
class Encoder(nn.Module):
    """Five-layer fully connected encoder with LeakyReLU activations.

    Args:
        input_size: Number of input features.
        hidden_size: Indexable with at least five layer widths; entry 4 is
            the width of the returned code.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()

        # Widths of consecutive layers: layer i maps sizes[i - 1] -> sizes[i].
        sizes = (input_size, hidden_size[0], hidden_size[1],
                 hidden_size[2], hidden_size[3], hidden_size[4])

        for idx in range(1, 6):
            linear = nn.Linear(sizes[idx - 1], sizes[idx], bias=True)
            # Xavier-uniform weights and zero biases replace the default init.
            nn.init.xavier_uniform_(linear.weight)
            nn.init.constant_(linear.bias, 0.0)

            # Attribute names map_L<i> / map_R<i> are the state_dict keys.
            setattr(self, 'map_L{}'.format(idx), linear)
            setattr(self, 'map_R{}'.format(idx), nn.LeakyReLU(negative_slope=0.4, inplace=True))

    def forward(self, x):
        """Apply the five Linear + LeakyReLU stages in order and return the result."""
        for idx in range(1, 6):
            linear = getattr(self, 'map_L{}'.format(idx))
            activation = getattr(self, 'map_R{}'.format(idx))
            x = activation(linear(x))

        return x

class Decoder(nn.Module):
    """Five-layer fully connected decoder ending in a Sigmoid.

    Args:
        output_size: Number of features produced by the last layer.
        hidden_size: Indexable with at least five layer widths; entry 0 is
            the width of the incoming code.
    """

    def __init__(self, output_size, hidden_size):
        super().__init__()

        # Widths of consecutive layers: layer i maps sizes[i - 1] -> sizes[i].
        sizes = (hidden_size[0], hidden_size[1], hidden_size[2],
                 hidden_size[3], hidden_size[4], output_size)

        for idx in range(1, 6):
            linear = nn.Linear(sizes[idx - 1], sizes[idx], bias=True)
            # Xavier-uniform weights and zero biases replace the default init.
            nn.init.xavier_uniform_(linear.weight)
            nn.init.constant_(linear.bias, 0.0)
            setattr(self, 'map_L{}'.format(idx), linear)

            # Only the first four layers are followed by a LeakyReLU.
            if idx < 5:
                setattr(self, 'map_R{}'.format(idx), nn.LeakyReLU(negative_slope=0.4, inplace=True))

        # The output layer is squashed into (0, 1).
        self.map_S5 = torch.nn.Sigmoid()

    def forward(self, x):
        """Run four Linear + LeakyReLU stages, then Linear + Sigmoid."""
        for idx in range(1, 5):
            linear = getattr(self, 'map_L{}'.format(idx))
            activation = getattr(self, 'map_R{}'.format(idx))
            x = activation(linear(x))

        return self.map_S5(self.map_L5(x))

# Build the autoencoder halves. The layer widths must agree with the checkpoints
# loaded below, because load_state_dict is strict about parameter shapes.
encoder_eval = Encoder(input_size=ori_subset_transformed.shape[1], hidden_size=[256, 64, 16, 4, 2])
decoder_eval = Decoder(output_size=ori_subset_transformed.shape[1], hidden_size=[2, 4, 16, 64, 256])

if USE_CUDA:
    encoder_eval = encoder_eval.cuda()
    decoder_eval = decoder_eval.cuda()

# map_location='cpu' lets checkpoints that were saved from GPU tensors be read on
# a CPU-only machine; load_state_dict then copies the values into the modules'
# existing parameters, wherever those live.
encoder_eval.load_state_dict(
    torch.load('./models/20190818-03_22_18_ep_401_encoder_model.pth', map_location='cpu'))
decoder_eval.load_state_dict(
    torch.load('./models/20190818-03_22_18_ep_401_decoder_model.pth', map_location='cpu'))

In [None]:
class Discriminator(nn.Module):
    """Four-layer fully connected network with an unbounded linear output.

    Args:
        input_size: Number of input features.
        hidden_size: Indexable with at least three hidden layer widths.
        output_size: Width of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Widths of consecutive layers: layer i maps sizes[i - 1] -> sizes[i].
        sizes = (input_size, hidden_size[0], hidden_size[1], hidden_size[2], output_size)

        for idx in range(1, 5):
            linear = nn.Linear(sizes[idx - 1], sizes[idx], bias=True)
            # Xavier-uniform weights and zero biases replace the default init.
            nn.init.xavier_uniform_(linear.weight)
            nn.init.constant_(linear.bias, 0.0)
            setattr(self, 'map_L{}'.format(idx), linear)

            # The last layer has no activation.
            if idx < 4:
                setattr(self, 'map_R{}'.format(idx), nn.LeakyReLU(negative_slope=0.4, inplace=True))

    def forward(self, x):
        """Run three Linear + LeakyReLU stages followed by a plain Linear layer."""
        for idx in range(1, 4):
            linear = getattr(self, 'map_L{}'.format(idx))
            activation = getattr(self, 'map_R{}'.format(idx))
            x = activation(linear(x))

        return self.map_L4(x)

# Dimensionality of the noise vector fed to the generator.
nz = 16


class Generator(nn.Module):
    """Four-layer fully connected network ending in a Sigmoid.

    Args:
        input_size: Width of the input (noise) vector.
        hidden_size: Indexable with at least three hidden layer widths.
        output_size: Number of features produced by the last layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Widths of consecutive layers: layer i maps sizes[i - 1] -> sizes[i].
        sizes = (input_size, hidden_size[0], hidden_size[1], hidden_size[2], output_size)

        for idx in range(1, 5):
            linear = nn.Linear(sizes[idx - 1], sizes[idx], bias=True)
            # Xavier-uniform weights and zero biases replace the default init.
            nn.init.xavier_uniform_(linear.weight)
            nn.init.constant_(linear.bias, 0.0)
            setattr(self, 'map_L{}'.format(idx), linear)

            # Hidden layers use LeakyReLU; the output layer gets a Sigmoid below.
            if idx < 4:
                setattr(self, 'map_R{}'.format(idx), nn.LeakyReLU(negative_slope=0.4, inplace=True))

        self.map_S4 = torch.nn.Sigmoid()

    def forward(self, x):
        """Run three Linear + LeakyReLU stages, then Linear + Sigmoid."""
        for idx in range(1, 4):
            linear = getattr(self, 'map_L{}'.format(idx))
            activation = getattr(self, 'map_R{}'.format(idx))
            x = activation(linear(x))

        return self.map_S4(self.map_L4(x))

# The discriminator scores feature vectors as wide as the training data; the
# generator maps `nz`-dimensional noise to vectors of that same width.
discriminator_train = Discriminator(
    input_size=ori_subset_transformed.shape[1],
    hidden_size=[256, 64, 16],
    output_size=1,
)
generator_train = Generator(
    input_size=nz,
    hidden_size=[64, 128, 256],
    output_size=ori_subset_transformed.shape[1],
)

# Move both networks to the GPU when one is in use.
if USE_CUDA:
    discriminator_train, generator_train = discriminator_train.cuda(), generator_train.cuda()

In [None]:
learning_rate = 1e-5

# Hinge embedding loss; the training loop uses it as the reconstruction term of
# the generator loss.
reconstruction_criterion = nn.HingeEmbeddingLoss()
if USE_CUDA:
    reconstruction_criterion = reconstruction_criterion.cuda()

# One RMSprop optimizer per network, both with the same learning rate.
discriminator_optimizer, generator_optimizer = (
    optim.RMSprop(network.parameters(), lr=learning_rate)
    for network in (discriminator_train, generator_train)
)

In [None]:
num_epochs, mini_batch_size = 10, 128

# Float32 tensor holding the whole training subset (kept on the CPU).
torch_dataset = torch.from_numpy(ori_subset_transformed.values).float()

# Shuffled mini-batches; with CUDA the loader iterates over a GPU copy of the data.
dataloader = DataLoader(
    torch_dataset.cuda() if USE_CUDA else torch_dataset,
    batch_size=mini_batch_size,
    shuffle=True,
    num_workers=0,
)

# Per-epoch loss histories filled by the training loop.
epoch_discriminator_losses, epoch_generator_losses = [], []

In [None]:
# Weight-clipping bound for the discriminator and number of discriminator
# updates per generator update.
clip_value = 0.015
n_critic = 5

# Weight of the autoencoder reconstruction term in the generator loss.
beta = 0.1

# The pretrained autoencoder is only evaluated; the GAN pair is trained.
encoder_eval.eval()
decoder_eval.eval()
discriminator_train.train()
generator_train.train()

train_device = torch.device('cuda' if USE_CUDA else 'cpu')

# Constant -1 target for the hinge embedding loss, built once instead of on
# every generator step.
hinge_target = torch.tensor([-1.0], device=train_device)

for epoch in range(num_epochs):
    mini_batch_count = 0
    generator_step_count = 0

    batch_discriminator_losses = 0.0
    batch_generator_losses = 0.0

    # Not read anywhere in this cell; kept in case other cells rely on it.
    start_time = datetime.now()

    for mini_batch_data in dataloader:
        mini_batch_count += 1

        # The loader already yields float tensors; this is a no-op when the
        # batch is on the training device and replaces the legacy
        # torch.FloatTensor(...) / torch.cuda.FloatTensor(...) constructors.
        mini_batch_torch = mini_batch_data.to(device=train_device, dtype=torch.float32)

        # =================== discriminator training ===================

        discriminator_train.zero_grad()

        # Noise is drawn on the CPU and then moved, so the seeded CPU generator
        # determines the samples.
        noise = torch.randn(mini_batch_size, nz).to(train_device)

        g_fake = generator_train(noise)

        d_real = discriminator_train(mini_batch_torch)
        # detach(): the discriminator update must not back-propagate into the generator.
        d_fake = discriminator_train(g_fake.detach())

        # Wasserstein-style objective: raise scores on real data, lower them on fakes.
        discriminator_loss = -torch.mean(d_real) + torch.mean(d_fake)

        discriminator_loss.backward()
        batch_discriminator_losses += discriminator_loss.item()
        discriminator_optimizer.step()

        # Clip every discriminator parameter to [-clip_value, clip_value].
        with torch.no_grad():
            for p in discriminator_train.parameters():
                p.clamp_(-clip_value, clip_value)

        # =================== generator training =======================

        # The generator is updated once every n_critic mini-batches.
        if mini_batch_count % n_critic == 0:
            generator_step_count += 1
            generator_train.zero_grad()

            # Element-wise difference between the autoencoder's reconstruction
            # of the fake batch and the fake batch itself.
            rec_input = decoder_eval(encoder_eval(g_fake)) - g_fake
            rec_loss = beta * reconstruction_criterion(rec_input, hinge_target)

            d_fake = discriminator_train(g_fake)
            generator_loss = -torch.mean(d_fake) + rec_loss

            generator_loss.backward()
            batch_generator_losses += generator_loss.item()
            generator_optimizer.step()

    epoch_discriminator_loss = batch_discriminator_losses / max(mini_batch_count, 1)
    epoch_discriminator_losses.append(epoch_discriminator_loss)

    # Average over the generator updates that actually happened. Dividing by the
    # number of mini-batches would understate the loss by roughly a factor of
    # n_critic, because only every n_critic-th batch contributes to the sum.
    epoch_generator_loss = batch_generator_losses / max(generator_step_count, 1)
    epoch_generator_losses.append(epoch_generator_loss)

    timestamp = datetime.utcnow()
    now = timestamp.strftime("%Y%m%d-%H:%M:%S")
    # File names use underscores in the time part: ':' is not allowed in Windows
    # file names, and the checkpoints loaded elsewhere in this notebook follow
    # the HH_MM_SS pattern.
    file_stamp = timestamp.strftime("%Y%m%d-%H_%M_%S")

    print('[LOG TRAIN {}] epoch: {:04}/{:04}, discriminator loss: {:.4f}'.format(now, epoch + 1, num_epochs, epoch_discriminator_loss))
    print('[LOG TRAIN {}] epoch: {:04}/{:04}, generator loss: {:.4f}'.format(now, epoch + 1, num_epochs, epoch_generator_loss))

    # Save both networks after every epoch.
    model_name = "{}_ep_{}_wgan_discriminator.pth".format(file_stamp, (epoch + 1))
    torch.save(discriminator_train.state_dict(), os.path.join("./models", model_name))

    model_name = "{}_ep_{}_wgan_generator.pth".format(file_stamp, (epoch + 1))
    torch.save(generator_train.state_dict(), os.path.join("./models", model_name))

In [None]:
# Per-epoch discriminator loss against the zero-based epoch index.
plt.plot(range(len(epoch_discriminator_losses)), epoch_discriminator_losses)
plt.title('Discriminator training performance')
plt.xlabel('training epochs')
plt.ylabel('discrimination loss')

In [None]:
# Per-epoch generator loss against the zero-based epoch index.
plt.plot(range(len(epoch_generator_losses)), epoch_generator_losses)
plt.title('Generator training performance')
plt.xlabel('training epochs')
plt.ylabel('generation loss')

In [None]:
# Fresh generator with the training architecture, restored from a saved checkpoint.
generator_eval = Generator(input_size=nz, hidden_size=[64, 128, 256], output_size=ori_subset_transformed.shape[1])

if USE_CUDA:
    generator_eval = generator_eval.cuda()

# map_location='cpu' keeps the checkpoint readable on CPU-only machines even if
# it was saved from GPU tensors; load_state_dict copies the values into the
# module's existing parameters.
generator_eval.load_state_dict(
    torch.load('./models/20200419-00_27_05_ep_500_wgan_generator.pth', map_location='cpu'))

generator_eval.eval()

torch_dataset = torch.from_numpy(ori_subset_transformed.values).float()

# Sample 128000 generated rows. no_grad avoids building an autograd graph for
# this inference-only pass.
noise = torch.randn(128000, nz)
if USE_CUDA:
    noise = noise.cuda()
with torch.no_grad():
    g_all = generator_eval(noise).cpu()

# Alternative source for the generated rows: the CSV written by the sampling
# loop at the end of this notebook.
# g_all = torch.from_numpy(pd.read_csv('./data/outlier_009_081.csv')[:128000].values).float()

# Evaluation set: all training rows first, generated rows appended after them.
# shuffle=False keeps that order, which later cells rely on to tell them apart.
torch_dataset = torch.cat((torch_dataset, g_all), dim=0)
dataloader_eval = DataLoader(
    torch_dataset.cuda() if USE_CUDA else torch_dataset,
    batch_size=mini_batch_size,
    shuffle=False,
    num_workers=0,
)

In [None]:
# Encode the whole evaluation set. Batches are collected in a list and
# concatenated once (re-concatenating inside the loop is quadratic), and
# no_grad keeps autograd from retaining a graph for every batch.
encoded_batches = []

with torch.no_grad():
    for enc_transactions_batch in dataloader_eval:
        z_enc_transactions_batch = encoder_eval(enc_transactions_batch)
        encoded_batches.append(z_enc_transactions_batch)

batch_count = len(encoded_batches)

# One row per evaluated sample, in dataloader order, as a NumPy array.
z_enc_transactions_all = torch.cat(encoded_batches, dim=0).cpu().numpy()

In [None]:
# Rows up to the size of the training subset are the regular samples; everything
# after them is plotted as outliers.
regular_data = z_enc_transactions_all[:ori_subset_transformed.shape[0]]
outliers = z_enc_transactions_all[ori_subset_transformed.shape[0]:]

fig, ax = plt.subplots(figsize=(12, 12))

# First two latent coordinates of each group.
ax.scatter(regular_data[:, 0], regular_data[:, 1],
           c='C0', marker="o", label='regular', edgecolors='w', linewidth=0.5)
ax.scatter(outliers[:, 0], outliers[:, 1],
           c='C1', marker="x", label='outlier', edgecolors='w', s=60)

ax.legend(loc='best')

In [None]:
# Number of mode centroids and the radius of the circle they are placed on.
tau = 5
radius = 0.8

# `tau` equally spaced angles covering the full circle (end point excluded).
centroid_angles = np.linspace(0, 2 * np.pi, tau, endpoint=False)

# Points on a circle of the given radius, shifted by 1 and halved.
x_centroid = (radius * np.sin(centroid_angles) + 1) / 2
y_centroid = (radius * np.cos(centroid_angles) + 1) / 2

# One centroid per row: column 0 is x, column 1 is y.
mu_gauss = np.column_stack((x_centroid, y_centroid))

def compute_euclid_distance(x, y):
    """Return the Euclidean distance between `x` and every row of `y`.

    `x - y` is evaluated with NumPy broadcasting and the squared differences
    are summed along axis 1, so the result holds one distance per row.
    """
    squared_differences = np.square(x - y)

    return np.sqrt(np.sum(squared_differences, axis=1))

# Distance from every latent point to every centroid, shape (n_points, n_centroids).
# This is the broadcast form of calling compute_euclid_distance once per row,
# without the Python-level loop of np.apply_along_axis.
latent_points = np.asarray(z_enc_transactions_all)
distances = np.sqrt(np.sum((latent_points[:, np.newaxis, :] - mu_gauss[np.newaxis, :, :]) ** 2, axis=2))

# Distance to, and index of, the closest centroid for every point.
mode_divergence = np.min(distances, axis=1)

cluster_ids = np.argmin(distances, axis=1)

# Separate output buffer: np.asarray(mode_divergence) would return the same array,
# so the in-place scaling below would overwrite the raw divergences as well.
mode_divergence_all_scaled = np.zeros_like(mode_divergence)

# Min-max scale the divergences within each cluster.
for cluster_id in np.unique(cluster_ids).tolist():
    mask = cluster_ids == cluster_id
    cluster_divergence = mode_divergence[mask]

    # np.ptp (function form) is used because the ndarray.ptp method is gone in NumPy 2.0.
    divergence_range = np.ptp(cluster_divergence)

    # A cluster whose values are all equal has zero range; it keeps the zeros
    # from the buffer instead of dividing by zero.
    if divergence_range > 0:
        mode_divergence_all_scaled[mask] = (cluster_divergence - cluster_divergence.min()) / divergence_range

In [None]:
# 0 for the training rows at the front of the evaluation set, 1 for the
# generated rows appended after them.
label = [0] * ori_subset_transformed.shape[0] + [1] * (z_enc_transactions_all.shape[0] - ori_subset_transformed.shape[0])
plot_data = pd.concat([pd.Series(mode_divergence_all_scaled, name='mode_divergence'),
                       pd.Series(label, name='label'),
                       pd.Series(cluster_ids, name='cluster_id')],
                       axis=1)

# Iterate over the cluster ids that actually occur, so an empty mode does not
# shift the panels onto the wrong ids.
present_cluster_ids = np.unique(cluster_ids).tolist()
num_clusters = len(present_cluster_ids)

# squeeze=False keeps `axes` indexable even when there is only one cluster.
fig, axes = plt.subplots(1, num_clusters, sharey=True, figsize=(14, 10), squeeze=False)
axes = axes.ravel()

for position, mode in enumerate(present_cluster_ids):
    # Shuffling changes only the row order (and with it the drawing order); the
    # original index, used as x-coordinate below, is preserved by `sample`.
    plot_data = plot_data.sample(frac=1.0)

    z_mode = plot_data[plot_data['cluster_id'] == mode]

    regular_data = z_mode[z_mode['label'] == 0]
    outliers = z_mode[z_mode['label'] == 1]

    axes[position].scatter(regular_data.index, regular_data['mode_divergence'],
                           c='C0', marker='o', s=30, linewidth=0.3, label='regular', edgecolors='w')

    axes[position].scatter(outliers.index, outliers['mode_divergence'],
                           c='C1', marker='x', s=120, linewidth=3, label='outlier', edgecolors='w')

    # The first panel is labelled "tau=<id>", the others just "<id>". The
    # placeholder is now filled in; previously the literal '{}' stayed in the
    # label and the number was appended outside the math expression.
    xlabel = '$\\tau={}$'.format(mode + 1) if position == 0 else str(mode + 1)
    axes[position].set_xlabel(xlabel, fontsize=24)

    axes[position].set_ylim([0.0, 0.7])

    # A single tick in the middle of the sample index range.
    axes[position].set_xticks([plot_data.shape[0] // 2])
    axes[position].set_xticklabels(['$x_{i}$'])

axes[0].set_ylabel('mode divergence $MD$', fontsize=20)

# Legend entries are taken from the third panel when it exists, otherwise from the last one.
handles, labels = axes[min(2, num_clusters - 1)].get_legend_handles_labels()
plt.legend(handles, labels, loc='center', fontsize=20, ncol=3, borderaxespad=0.,
           bbox_to_anchor=(-6.5, 1., 9., .1))

plt.grid(True)

In [None]:
# Element-wise losses (reduction='none') so they can be averaged per sample below.
# NOTE(review): the Decoder in this notebook ends in a Sigmoid, so
# BCEWithLogitsLoss applies a second sigmoid to its output. Left unchanged here
# because the error thresholds used when sampling outliers are presumably
# calibrated on this scale - confirm before switching to nn.BCELoss.
reconstruction_criterion_categorical_eval = nn.BCEWithLogitsLoss(reduction='none')
reconstruction_criterion_numeric_eval = nn.MSELoss(reduction='none')

if USE_CUDA:
    reconstruction_criterion_categorical_eval = reconstruction_criterion_categorical_eval.cuda()
    reconstruction_criterion_numeric_eval = reconstruction_criterion_numeric_eval.cuda()

encoder_eval.eval()
decoder_eval.eval()

# The one-hot (categorical) columns come first, the numeric columns after them.
num_categ_columns = ori_dataset_categ_transformed.shape[1]

# Per-batch errors are collected and concatenated once; no_grad avoids keeping
# an autograd graph alive for every batch.
rec_error_batches = []

with torch.no_grad():
    for enc_transactions_batch in dataloader_eval:
        z_enc_transactions_batch = encoder_eval(enc_transactions_batch)

        reconstruction_batch = decoder_eval(z_enc_transactions_batch)

        input_cat_all = enc_transactions_batch[:, :num_categ_columns]
        input_num_all = enc_transactions_batch[:, num_categ_columns:]

        rec_cat_all = reconstruction_batch[:, :num_categ_columns]
        rec_num_all = reconstruction_batch[:, num_categ_columns:]

        # Mean error per sample over the categorical and the numeric columns.
        rec_error_cat_all = reconstruction_criterion_categorical_eval(input=rec_cat_all, target=input_cat_all).mean(dim=1)
        rec_error_num_all = reconstruction_criterion_numeric_eval(input=rec_num_all, target=input_num_all).mean(dim=1)

        rec_error_all_batch = rec_error_cat_all + rec_error_num_all
        rec_error_batches.append(rec_error_all_batch)

batch_count = len(rec_error_batches)

rec_error_all = torch.cat(rec_error_batches, dim=0).cpu().numpy()

# Separate output buffer: np.asarray(rec_error_all) would return the same array,
# so the in-place scaling below would overwrite the raw errors as well.
rec_error_all_scaled = np.zeros_like(rec_error_all)

# Min-max scale the errors within each cluster.
for cluster_id in np.unique(cluster_ids).tolist():
    mask = cluster_ids == cluster_id
    cluster_errors = rec_error_all[mask]

    # np.ptp (function form) is used because the ndarray.ptp method is gone in NumPy 2.0.
    error_range = np.ptp(cluster_errors)

    # A cluster whose values are all equal has zero range; it keeps the zeros
    # from the buffer instead of dividing by zero.
    if error_range > 0:
        rec_error_all_scaled[mask] = (cluster_errors - cluster_errors.min()) / error_range

In [None]:
# One row per evaluated sample: scaled reconstruction error, regular/outlier
# label and the id of the closest centroid.
plot_data = pd.concat([pd.Series(rec_error_all_scaled, name='rec_error'),
                       pd.Series(label, name='label'),
                       pd.Series(cluster_ids, name='cluster_id')],
                       axis=1)

# Iterate over the cluster ids that actually occur, so an empty mode does not
# shift the panels onto the wrong ids.
present_cluster_ids = np.unique(cluster_ids).tolist()
num_clusters = len(present_cluster_ids)

# squeeze=False keeps `axes` indexable even when there is only one cluster.
fig, axes = plt.subplots(1, num_clusters, sharey=True, figsize=(14, 10), squeeze=False)
axes = axes.ravel()

for position, mode in enumerate(present_cluster_ids):
    # Shuffling changes only the row order (and with it the drawing order); the
    # original index, used as x-coordinate below, is preserved by `sample`.
    plot_data = plot_data.sample(frac=1.0)

    z_mode = plot_data[plot_data['cluster_id'] == mode]

    regular_data = z_mode[z_mode['label'] == 0]
    outliers = z_mode[z_mode['label'] == 1]

    axes[position].scatter(regular_data.index, regular_data['rec_error'],
                           c='C0', marker='o', s=30, linewidth=0.3, label='regular', edgecolors='w')

    axes[position].scatter(outliers.index, outliers['rec_error'],
                           c='C1', marker='x', s=120, linewidth=3, label='outlier', edgecolors='w')

    # The first panel is labelled "tau=<id>", the others just "<id>". The
    # placeholder is now filled in; previously the literal '{}' stayed in the
    # label and the number was appended outside the math expression.
    xlabel = '$\\tau={}$'.format(mode + 1) if position == 0 else str(mode + 1)
    axes[position].set_xlabel(xlabel, fontsize=24)

    axes[position].set_ylim([0.0, 0.8])

    # A single tick in the middle of the sample index range.
    axes[position].set_xticks([plot_data.shape[0] // 2])
    axes[position].set_xticklabels(['$x_{i}$'])

axes[0].set_ylabel('reconstruction error $RE$', fontsize=20)

# Legend entries are taken from the third panel when it exists, otherwise from the last one.
handles, labels = axes[min(2, num_clusters - 1)].get_legend_handles_labels()
plt.legend(handles, labels, loc='center', fontsize=20, ncol=3, borderaxespad=0.,
           bbox_to_anchor=(-6.5, 1., 9., .1))

plt.grid(True)

In [None]:
# Running count of generated rows that passed both thresholds and were written out.
generated = 0

# Column slices: one-hot (categorical) columns first, numeric columns after them.
categ_columns = slice(None, ori_dataset_categ_transformed.shape[1])
numeric_columns = slice(ori_dataset_categ_transformed.shape[1], None)

while generated < 267000:
    # Draw one mini-batch of noise and push it through the trained generator.
    noise = torch.randn(mini_batch_size, nz)
    if USE_CUDA:
        noise = noise.cuda()
    g_batch = generator_eval(noise).detach()

    # Encode and reconstruct the generated rows with the autoencoder.
    z_enc_transactions_batch = encoder_eval(g_batch).detach()
    reconstruction_batch = decoder_eval(z_enc_transactions_batch).detach()
    z_enc_transactions_batch = z_enc_transactions_batch.cpu().numpy()

    # Mode divergence: distance of each latent code to its closest centroid.
    distances_batch = np.apply_along_axis(func1d=compute_euclid_distance, axis=1,
                                          arr=z_enc_transactions_batch, y=mu_gauss)
    mode_divergence_batch = np.min(distances_batch, axis=1)

    input_cat_all, input_num_all = g_batch[:, categ_columns], g_batch[:, numeric_columns]
    rec_cat_all, rec_num_all = reconstruction_batch[:, categ_columns], reconstruction_batch[:, numeric_columns]

    # Per-sample reconstruction error: mean categorical loss plus mean numeric loss.
    rec_error_cat_all = reconstruction_criterion_categorical_eval(input=rec_cat_all, target=input_cat_all).mean(dim=1)
    rec_error_num_all = reconstruction_criterion_numeric_eval(input=rec_num_all, target=input_num_all).mean(dim=1)
    rec_error_all_batch = (rec_error_cat_all + rec_error_num_all).cpu().numpy()

    # Keep only rows that exceed both the divergence and the error threshold.
    keep_mask = (mode_divergence_batch > 0.09) & (rec_error_all_batch > 0.81)
    g_batch = g_batch.cpu().numpy()[keep_mask]

    # Append the kept rows to the CSV, without header row or index column.
    pd.DataFrame(g_batch).to_csv('./data/outlier_009_081.csv', header=False, index=False, mode='a')
    generated += g_batch.shape[0]