# AnoGAN

In [280]:
import warnings
warnings.filterwarnings('ignore')

## Libraries import

In [281]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder
import time

import torchvision.utils as vutils
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import Dataset

## Train dataset loader for AnoGAN

In [282]:
class MyTrainDataset(Dataset):
 
      def __init__(self,file):
            price_df=file
 
            x=file.values
            y=torch.ones(len(x),1)
 
            self.x_train=torch.tensor(x,dtype=torch.float32)
            self.y_train=y
 
      def __len__(self):
            return len(self.y_train)
   
      def __getitem__(self,idx):
            return self.x_train[idx],self.y_train[idx]

## Test dataset loader for AnoGAN

In [283]:
class MyTestDataset(Dataset):
 
      def __init__(self,file):
            price_df=file
 
            x=file[file.columns[:-1]].values
            y=file[file.columns[-1:]].values
 
            self.x_train=torch.tensor(x,dtype=torch.float32)
            self.y_train=torch.tensor(y,dtype=torch.float32)
 
 
      def __len__(self):
            return len(self.y_train)
   
      def __getitem__(self,idx):
            return self.x_train[idx],self.y_train[idx]

## AnoGAN Generator

In [284]:
# Generator Code

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.layer1 = nn.Sequential(
                        nn.Linear(in_features = input_dim, out_features = 512), 
                        nn.modules.BatchNorm1d(512),
                        nn.LeakyReLU()
        )
        self.layer2 = nn.Sequential(
                        nn.Linear(in_features = 512,out_features = 256),  
                        nn.modules.BatchNorm1d(256),
                        nn.LeakyReLU(),
        )
        self.layer3 = nn.Sequential(
                        nn.Linear(in_features = 256,out_features = 128), 
                        nn.modules.BatchNorm1d(128),
                        nn.LeakyReLU(),
        )
        self.layer4 = nn.Sequential(
                        nn.Linear(in_features = 128,out_features = input_dim),
                        nn.Tanh()
        )
        
       
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()


    def forward(self, input):
        h0 = self.layer1(input)
        h1 = self.layer2(h0)
        h2 = self.layer3(h1)
        h3 = self.layer4(h2)
        out = h3
        return out

## AnoGAN Discriminator

In [285]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.layer1 = nn.Sequential(
                    nn.Linear(in_features = input_dim, out_features = 512),  
                    nn.modules.BatchNorm1d(512),
                    nn.LeakyReLU()
        )
        self.layer2 = nn.Sequential(
                    nn.Linear(in_features = 512,out_features = 256),  
                    nn.modules.BatchNorm1d(256),
                    nn.LeakyReLU()
        )
        self.layer3 = nn.Sequential(
                    nn.Linear(in_features = 256,out_features = 128),
                    nn.modules.BatchNorm1d(128),
                    nn.LeakyReLU()
        )
        self.layer4 = nn.Sequential(
                    nn.Linear(in_features = 128,out_features = 1),
                    nn.Sigmoid()
        )
        
        
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=1.0)
            if module.bias is not None:
                module.bias.data.zero_()
    
        

    def forward(self, input):
        h0 = self.layer1(input)
        h1 = self.layer2(h0)
        h2 = self.layer3(h1)
        feature = h2
        h3 = self.layer4(h2)
        out = h3
        return out, feature

## Anomaly score for AnoGAN

In [286]:
def Anomaly_score(x,G_z,Lambda=0.1):
    discriminator.eval()
    _,x_feature = discriminator(x)
    x_feature = x_feature.view(-1)
    _,G_z_feature = discriminator(G_z)
    
    residual_loss = torch.sum(torch.abs(x-G_z))
    discrimination_loss = torch.sum(torch.abs(x_feature-G_z_feature))
    
    total_loss = (1-Lambda)*residual_loss + Lambda*discrimination_loss
    return total_loss

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.“, IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm.“HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [287]:
data = pd.read_csv('./arrhythmia.csv', sep = ',')

In [288]:
# dropping columns that consist only of 0's
data = data.drop(columns = ['Col15', 'Col63', 'Col65', 'Col79', 'Col127', 'Col128','Col135', 'Col137', 'Col139','Col141',
'Col147', 'Col152', 'Col153','Col160','Col200', 'Col260', 'Col270'])

In [289]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col265,Col266,Col267,Col268,Col269,Col271,Col272,Col273,Col274,y
0,75.0,0.0,190.0,80.0,91.0,193.0,371.0,174.0,121.0,-16.0,...,-0.3,0.0,9.0,-0.9,0.0,0.9,2.9,23.3,49.4,1
1,56.0,1.0,165.0,64.0,81.0,174.0,401.0,149.0,39.0,25.0,...,-0.5,0.0,8.5,0.0,0.0,0.2,2.1,20.4,38.8,0
2,54.0,0.0,172.0,95.0,138.0,163.0,386.0,185.0,102.0,96.0,...,0.9,0.0,9.5,-2.4,0.0,0.3,3.4,12.3,49.0,0
3,55.0,0.0,175.0,94.0,100.0,202.0,380.0,179.0,143.0,28.0,...,0.1,0.0,12.2,-2.2,0.0,0.4,2.6,34.6,61.6,0
4,75.0,0.0,190.0,80.0,88.0,181.0,360.0,177.0,103.0,-16.0,...,-0.4,0.0,13.1,-3.6,0.0,-0.1,3.9,25.4,62.8,1


In [290]:
data.shape

(452, 258)

In [291]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


## AnoGAN

In [292]:
num_epochs = 20
batch_size = 64
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [293]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]].copy(), data['y'].copy(), data.copy().index, test_size=0.3,stratify=data[['y']])

In [294]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [295]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [296]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [297]:
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        iters += 1

end = time.process_time()
arrhythmia_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/4]	Loss_D: 20.5923	Loss_G: 0.0117	D(x): 0.9631	D(G(z)): 0.9898 / 0.9890
1
[1/20][0/4]	Loss_D: 23.5759	Loss_G: 0.0861	D(x): 0.9639	D(G(z)): 0.9657 / 0.9647
2
[2/20][0/4]	Loss_D: 23.2540	Loss_G: 0.0167	D(x): 0.9641	D(G(z)): 0.9862 / 0.9855
3
[3/20][0/4]	Loss_D: 24.5077	Loss_G: 0.0177	D(x): 0.9645	D(G(z)): 0.9866 / 0.9863
4
[4/20][0/4]	Loss_D: 21.5883	Loss_G: 0.0899	D(x): 0.9647	D(G(z)): 0.9558 / 0.9556
5
[5/20][0/4]	Loss_D: 15.4044	Loss_G: 0.0713	D(x): 0.9651	D(G(z)): 0.9542 / 0.9545
6
[6/20][0/4]	Loss_D: 22.8782	Loss_G: 0.0523	D(x): 0.9655	D(G(z)): 0.9722 / 0.9716
7
[7/20][0/4]	Loss_D: 22.6653	Loss_G: 0.0395	D(x): 0.9658	D(G(z)): 0.9723 / 0.9718
8
[8/20][0/4]	Loss_D: 17.3109	Loss_G: 0.0225	D(x): 0.9660	D(G(z)): 0.9821 / 0.9816
9
[9/20][0/4]	Loss_D: 18.2154	Loss_G: 0.0450	D(x): 0.9664	D(G(z)): 0.9678 / 0.9670
10
[10/20][0/4]	Loss_D: 16.9895	Loss_G: 0.0460	D(x): 0.9664	D(G(z)): 0.9664 / 0.9656
11
[11/20][0/4]	Loss_D: 17.3723	Loss_G: 0.1419	D(x): 0.9665

In [298]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False,drop_last = True)

In [299]:
generator = netG
discriminator = netD

In [300]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)
    
    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
arrhythmia_gan_test_time = end - start
print(end - start)

2.484375


In [301]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[106,  10],
       [ 16,   4]], dtype=int64)

In [302]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
arrhythmia_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.593103448275862

In [303]:
arrhythmia_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       116
           1       0.29      0.20      0.24        20

    accuracy                           0.81       136
   macro avg       0.58      0.56      0.56       136
weighted avg       0.78      0.81      0.79       136



In [304]:
print(arrhythmia_gan_report['1']['precision'])
print(arrhythmia_gan_report['1']['recall'])
print(arrhythmia_gan_report['1']['f1-score'])

0.2857142857142857
0.2
0.23529411764705882


In [305]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
arrhythmia_gan_auc_precision_recall = metrics.auc(recall, precision)
print(arrhythmia_gan_auc_precision_recall)

0.32238383264947656


## Cardiocotography

**Dataset source**: http://odds.cs.stonybrook.edu/cardiotocogrpahy-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [306]:
data = pd.read_csv('./Cardiotocography.csv')

In [307]:
data.shape

(1831, 22)

In [308]:
data['y'] = data['y'].astype(int)

In [309]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,1655
1,176


In [310]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col13,Col14,Col15,Col16,Col17,Col18,Col19,Col20,Col21,y
0,0.004912,0.693191,-0.20364,0.595322,0.35319,-0.061401,-0.278295,-1.650444,0.759072,-0.420487,...,-0.798376,1.854728,0.622631,0.963083,0.301464,0.193113,0.231498,-0.289786,-0.493294,0
1,0.110729,-0.079903,-0.20364,1.268942,0.396246,-0.061401,-0.278295,-1.71027,0.759072,-0.420487,...,-0.798376,1.854728,0.278625,0.963083,0.301464,0.129265,0.093563,-0.256385,-0.493294,0
2,0.216546,-0.272445,-0.20364,1.050988,0.148753,-0.061401,-0.278295,-1.71027,1.106509,-0.420487,...,-1.332931,0.314688,2.342663,-0.488279,0.061002,0.065417,0.024596,-0.256385,1.140018,0
3,0.004912,0.727346,-0.20364,1.212171,-0.683598,-0.061401,-0.278295,-1.71027,1.106509,-0.420487,...,-1.332931,0.314688,1.65465,-0.488279,0.061002,0.193113,0.093563,-0.323186,1.140018,0
4,-0.100905,0.363595,1.321366,1.02712,0.141359,-0.061401,-0.278295,-0.992364,-0.051613,-0.420487,...,-0.085638,-0.565334,0.278625,-0.488279,-0.059229,0.065417,0.024596,-0.456787,1.140018,0


## AnoGAN

In [311]:
num_epochs = 20
batch_size = 64
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [312]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [313]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [314]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [315]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [316]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        iters += 1
        
end = time.process_time()
cardio_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/18]	Loss_D: 5.1298	Loss_G: 3.0200	D(x): 0.4250	D(G(z)): 0.4925 / 0.4916
1
[1/20][0/18]	Loss_D: 4.1783	Loss_G: 2.3902	D(x): 0.4536	D(G(z)): 0.4706 / 0.4707
2
[2/20][0/18]	Loss_D: 3.4773	Loss_G: 1.9850	D(x): 0.4799	D(G(z)): 0.4653 / 0.4650
3
[3/20][0/18]	Loss_D: 3.3718	Loss_G: 1.7044	D(x): 0.5003	D(G(z)): 0.5074 / 0.5057
4
[4/20][0/18]	Loss_D: 3.1828	Loss_G: 1.7382	D(x): 0.5101	D(G(z)): 0.4698 / 0.4693
5
[5/20][0/18]	Loss_D: 2.8949	Loss_G: 1.4979	D(x): 0.5144	D(G(z)): 0.4980 / 0.4952
6
[6/20][0/18]	Loss_D: 2.9216	Loss_G: 1.4844	D(x): 0.5180	D(G(z)): 0.5036 / 0.5017
7
[7/20][0/18]	Loss_D: 3.0322	Loss_G: 1.4285	D(x): 0.5161	D(G(z)): 0.5262 / 0.5242
8
[8/20][0/18]	Loss_D: 2.5539	Loss_G: 1.3055	D(x): 0.5202	D(G(z)): 0.4658 / 0.4646
9
[9/20][0/18]	Loss_D: 2.5104	Loss_G: 1.1440	D(x): 0.5195	D(G(z)): 0.4919 / 0.4906
10
[10/20][0/18]	Loss_D: 2.6298	Loss_G: 1.0531	D(x): 0.5164	D(G(z)): 0.5308 / 0.5284
11
[11/20][0/18]	Loss_D: 2.5239	Loss_G: 1.2785	D(x): 0.5169

In [317]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [318]:
generator = netG
discriminator = netD

In [319]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)
    
    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
cardio_gan_test_time = end - start
print(end - start)

10.59375


In [320]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[463,  34],
       [ 14,  39]], dtype=int64)

In [321]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
cardio_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.9532288068030828

In [322]:
cardio_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       497
           1       0.53      0.74      0.62        53

    accuracy                           0.91       550
   macro avg       0.75      0.83      0.78       550
weighted avg       0.93      0.91      0.92       550



In [323]:
print(cardio_gan_report['1']['precision'])
print(cardio_gan_report['1']['recall'])
print(cardio_gan_report['1']['f1-score'])

0.5342465753424658
0.7358490566037735
0.6190476190476191


In [324]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
cardio_gan_auc_precision_recall = metrics.auc(recall, precision)
print(cardio_gan_auc_precision_recall)

0.6824427867704227


## ForestCover

**Dataset source**: http://odds.cs.stonybrook.edu/forestcovercovertype-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.“, IEEE Transactions on Knowledge and Data Engineering, 2009.

Kai Ming Ting, Guang-Tong Zhou, Fei Tony Liu & Tan Swee Chuan. (2010). Mass Estimation and Its Applications. Proceedings of The 16th ACM SIGKDD Conference on Knowledge Discovery and Data Mining 2010. pp. 989-998.

Swee Chuan Tan, Kai Ming Ting & Fei Tony Liu. (2011). Fast Anomaly Detection for Streaming Data. Proceedings of the International Joint Conference on Artificial Intelligence 2011. pp.1151-1156.

In [325]:
data = pd.read_csv('./ForestCover.csv')

In [326]:
data.shape

(286048, 11)

In [327]:
pd.pivot_table(data,
             values = 'Col2',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col2
y,Unnamed: 1_level_1
0,283301
1,2747


In [328]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,y
0,2804,139,9,268,65,3180,234,238,135,6121,0
1,2785,155,18,242,118,3090,238,238,122,6211,0
2,2579,132,6,300,-15,67,230,237,140,6031,0
3,2886,151,11,371,26,5253,234,240,136,4051,0
4,2742,134,22,150,69,3215,248,224,92,6091,0


## AnoGAN

In [329]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [330]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [331]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [332]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [333]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [334]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())
    
        iters += 1
        
end = time.process_time()
forestcover_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/387]	Loss_D: 8.6557	Loss_G: 3.2158	D(x): 0.4045	D(G(z)): 0.4766 / 0.4772
[0/20][50/387]	Loss_D: 2.5428	Loss_G: 1.0143	D(x): 0.5085	D(G(z)): 0.5623 / 0.5586
[0/20][100/387]	Loss_D: 1.8631	Loss_G: 1.1452	D(x): 0.5494	D(G(z)): 0.4710 / 0.4675
[0/20][150/387]	Loss_D: 1.4539	Loss_G: 1.1660	D(x): 0.6280	D(G(z)): 0.4501 / 0.4430
[0/20][200/387]	Loss_D: 1.2721	Loss_G: 1.3452	D(x): 0.6464	D(G(z)): 0.3920 / 0.3875
[0/20][250/387]	Loss_D: 1.1415	Loss_G: 1.5149	D(x): 0.6810	D(G(z)): 0.3577 / 0.3524
[0/20][300/387]	Loss_D: 0.9949	Loss_G: 1.5678	D(x): 0.7028	D(G(z)): 0.3312 / 0.3245
[0/20][350/387]	Loss_D: 0.9296	Loss_G: 1.7109	D(x): 0.7034	D(G(z)): 0.3050 / 0.2999
1
[1/20][0/387]	Loss_D: 0.9975	Loss_G: 1.6129	D(x): 0.6858	D(G(z)): 0.3211 / 0.3144
[1/20][50/387]	Loss_D: 0.8951	Loss_G: 1.6448	D(x): 0.7136	D(G(z)): 0.3052 / 0.2969
[1/20][100/387]	Loss_D: 0.8786	Loss_G: 1.9138	D(x): 0.7157	D(G(z)): 0.2821 / 0.2748
[1/20][150/387]	Loss_D: 0.6989	Loss_G: 1.9402	D(x): 

In [335]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [336]:
generator = netG
discriminator = netD

In [337]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)
    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
forestcover_gan_test_time = end - start
print(end - start)

1458.296875


In [338]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[70773, 14218],
       [  824,     0]], dtype=int64)

In [339]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
forestcover_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.04266760312599632

In [340]:
forestcover_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90     84991
           1       0.00      0.00      0.00       824

    accuracy                           0.82     85815
   macro avg       0.49      0.42      0.45     85815
weighted avg       0.98      0.82      0.90     85815



In [341]:
print(forestcover_gan_report['1']['precision'])
print(forestcover_gan_report['1']['recall'])
print(forestcover_gan_report['1']['f1-score'])

0.0
0.0
0.0


In [342]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
forestcover_gan_auc_precision_recall = metrics.auc(recall, precision)
print(forestcover_gan_auc_precision_recall)

0.0049248904148862125


## Annthyroid

**Dataset source**: http://odds.cs.stonybrook.edu/annthyroid-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Aditional sources:**

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.“, IEEE Transactions on Knowledge and Data Engineering, 2009.

In [343]:
data = pd.read_csv('./annthyroid.csv')

In [344]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,y
0,0.73,0.0006,0.015,0.12,0.082,0.146,0
1,0.24,0.00025,0.03,0.143,0.133,0.108,0
2,0.47,0.0019,0.024,0.102,0.131,0.078,0
3,0.64,0.0009,0.017,0.077,0.09,0.085,0
4,0.23,0.00025,0.026,0.139,0.09,0.153,0


In [345]:
data.shape

(7200, 7)

In [346]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,6666
1,534


## AnoGAN

In [347]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1
num_epochs = 20

In [348]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [349]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [350]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [351]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()


# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [352]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        iters += 1
        
end = time.process_time()
annthyroid_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/9]	Loss_D: 6.5559	Loss_G: 5.4583	D(x): 0.1997	D(G(z)): 0.2840 / 0.2830
1
[1/20][0/9]	Loss_D: 4.5263	Loss_G: 3.4387	D(x): 0.1788	D(G(z)): 0.2131 / 0.2137
2
[2/20][0/9]	Loss_D: 3.9052	Loss_G: 2.6849	D(x): 0.1883	D(G(z)): 0.2512 / 0.2520
3
[3/20][0/9]	Loss_D: 3.5160	Loss_G: 2.0896	D(x): 0.2213	D(G(z)): 0.3107 / 0.3108
4
[4/20][0/9]	Loss_D: 3.1858	Loss_G: 1.7220	D(x): 0.2624	D(G(z)): 0.3736 / 0.3725
5
[5/20][0/9]	Loss_D: 2.8148	Loss_G: 1.6934	D(x): 0.3066	D(G(z)): 0.3709 / 0.3689
6
[6/20][0/9]	Loss_D: 2.5353	Loss_G: 1.4784	D(x): 0.3520	D(G(z)): 0.3978 / 0.3934
7
[7/20][0/9]	Loss_D: 2.2631	Loss_G: 1.3035	D(x): 0.3958	D(G(z)): 0.4057 / 0.4029
8
[8/20][0/9]	Loss_D: 2.1711	Loss_G: 1.3343	D(x): 0.4374	D(G(z)): 0.4354 / 0.4310
9
[9/20][0/9]	Loss_D: 1.9993	Loss_G: 1.3002	D(x): 0.4759	D(G(z)): 0.4343 / 0.4298
10
[10/20][0/9]	Loss_D: 1.7478	Loss_G: 1.3309	D(x): 0.5101	D(G(z)): 0.3936 / 0.3889
11
[11/20][0/9]	Loss_D: 1.6563	Loss_G: 1.2532	D(x): 0.5433	D(G(z)): 0.

In [353]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [354]:
generator = netG
discriminator = netD

In [355]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)
    
    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
annthyroid_gan_test_time = end - start
print(end - start)

42.453125


In [356]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[1621,  379],
       [ 123,   37]], dtype=int64)

In [357]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
annthyroid_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.51930625

In [358]:
annthyroid_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.93      0.81      0.87      2000
           1       0.09      0.23      0.13       160

    accuracy                           0.77      2160
   macro avg       0.51      0.52      0.50      2160
weighted avg       0.87      0.77      0.81      2160



In [359]:
print(annthyroid_gan_report['1']['precision'])
print(annthyroid_gan_report['1']['recall'])
print(annthyroid_gan_report['1']['f1-score'])

0.0889423076923077
0.23125
0.12847222222222224


In [360]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
annthyroid_gan_auc_precision_recall = metrics.auc(recall, precision)
print(annthyroid_gan_auc_precision_recall)

0.08312800128402117


## Credit card

**Dataset source**: https://www.kaggle.com/mlg-ulb/creditcardfraud

**Additional sources:**

Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon

Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE

Dal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)

Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier

Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing

Bertrand Lebichot, Yann-Aël Le Borgne, Liyun He, Frederic Oblé, Gianluca Bontempi Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019

Fabrizio Carcillo, Yann-Aël Le Borgne, Olivier Caelen, Frederic Oblé, Gianluca Bontempi Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection Information Sciences, 2019

Yann-Aël Le Borgne, Gianluca Bontempi Machine Learning for Credit Card Fraud Detection - Practical Handbook

In [361]:
data = pd.read_csv('./creditcard.csv')

In [362]:
data = data.drop(columns = ['Time'])

In [363]:
data.shape

(284807, 30)

In [364]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [365]:
pd.pivot_table(data,
             values = 'V1',
               index = 'Class', 
              aggfunc = 'count')

Unnamed: 0_level_0,V1
Class,Unnamed: 1_level_1
0,284315
1,492


## AnoGAN

In [366]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1
num_epochs = 20

In [367]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['Class'], data.index, test_size=0.3,stratify=data[['Class']])

In [368]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.Class == 0].index]

In [369]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [370]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [371]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
creditcard_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/388]	Loss_D: 10.7736	Loss_G: 0.5119	D(x): 0.5789	D(G(z)): 0.8313 / 0.8304
[0/20][50/388]	Loss_D: 5.0215	Loss_G: 0.2715	D(x): 0.8593	D(G(z)): 0.8609 / 0.8588
[0/20][100/388]	Loss_D: 3.4959	Loss_G: 0.2883	D(x): 0.8702	D(G(z)): 0.8266 / 0.8235
[0/20][150/388]	Loss_D: 2.4650	Loss_G: 0.3741	D(x): 0.8472	D(G(z)): 0.7630 / 0.7579
[0/20][200/388]	Loss_D: 1.7207	Loss_G: 0.5730	D(x): 0.8646	D(G(z)): 0.6564 / 0.6494
[0/20][250/388]	Loss_D: 1.2850	Loss_G: 0.8452	D(x): 0.8346	D(G(z)): 0.5365 / 0.5292
[0/20][300/388]	Loss_D: 1.0199	Loss_G: 0.9314	D(x): 0.8952	D(G(z)): 0.4987 / 0.4868
[0/20][350/388]	Loss_D: 0.7815	Loss_G: 1.1666	D(x): 0.9440	D(G(z)): 0.4231 / 0.4123
1
[1/20][0/388]	Loss_D: 0.9238	Loss_G: 1.2149	D(x): 0.8017	D(G(z)): 0.3999 / 0.3884
[1/20][50/388]	Loss_D: 0.7857	Loss_G: 1.3266	D(x): 0.8498	D(G(z)): 0.3884 / 0.3730
[1/20][100/388]	Loss_D: 0.5978	Loss_G: 1.5549	D(x): 0.8773	D(G(z)): 0.3111 / 0.2994
[1/20][150/388]	Loss_D: 0.7832	Loss_G: 1.6711	D(x):

In [372]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [373]:
generator = netG
discriminator = netD

In [374]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)
    
    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
creditcard_gan_test_time = end - start
print(end - start)

1392.734375


In [375]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[80774,  4521],
       [  127,    21]], dtype=int64)

In [376]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
creditcard_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.7769046377991803

In [377]:
creditcard_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     85295
           1       0.00      0.14      0.01       148

    accuracy                           0.95     85443
   macro avg       0.50      0.54      0.49     85443
weighted avg       1.00      0.95      0.97     85443



In [378]:
print(creditcard_gan_report['1']['precision'])
print(creditcard_gan_report['1']['recall'])
print(creditcard_gan_report['1']['f1-score'])

0.004623513870541612
0.14189189189189189
0.008955223880597017


In [379]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
creditcard_gan_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_gan_auc_precision_recall)

0.0046865041389160655


## Mammography

**Dataset source**: http://odds.cs.stonybrook.edu/mammography-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.“, IEEE Transactions on Knowledge and Data Engineering, 2009.

In [380]:
data = pd.read_csv('./mammography.csv')

In [381]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,y
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,0
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,0
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0


In [382]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,10923
1,260


In [383]:
data.shape

(11183, 7)

## AnoGAN

In [384]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [385]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [386]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [387]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [388]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [389]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
mammography_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/14]	Loss_D: 4.5676	Loss_G: 4.1149	D(x): 0.1869	D(G(z)): 0.2721 / 0.2719
1
[1/20][0/14]	Loss_D: 3.2955	Loss_G: 2.2728	D(x): 0.3557	D(G(z)): 0.3079 / 0.3072
2
[2/20][0/14]	Loss_D: 2.7184	Loss_G: 1.5456	D(x): 0.4429	D(G(z)): 0.3937 / 0.3916
3
[3/20][0/14]	Loss_D: 2.2512	Loss_G: 1.3203	D(x): 0.5083	D(G(z)): 0.4187 / 0.4142
4
[4/20][0/14]	Loss_D: 1.8655	Loss_G: 1.3071	D(x): 0.5638	D(G(z)): 0.3924 / 0.3891
5
[5/20][0/14]	Loss_D: 1.6137	Loss_G: 1.2628	D(x): 0.6107	D(G(z)): 0.4075 / 0.4021
6
[6/20][0/14]	Loss_D: 1.4453	Loss_G: 1.2475	D(x): 0.6439	D(G(z)): 0.4098 / 0.4043
7
[7/20][0/14]	Loss_D: 1.3956	Loss_G: 1.1668	D(x): 0.6662	D(G(z)): 0.4364 / 0.4273
8
[8/20][0/14]	Loss_D: 1.2551	Loss_G: 1.2207	D(x): 0.6878	D(G(z)): 0.4174 / 0.4096
9
[9/20][0/14]	Loss_D: 1.2053	Loss_G: 1.2156	D(x): 0.6949	D(G(z)): 0.4110 / 0.4023
10
[10/20][0/14]	Loss_D: 1.1997	Loss_G: 1.2575	D(x): 0.6941	D(G(z)): 0.4011 / 0.3920
11
[11/20][0/14]	Loss_D: 1.1174	Loss_G: 1.3472	D(x): 0.7042

In [390]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [391]:
generator = netG
discriminator = netD

In [392]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
mammography_gan_test_time = end - start
print(end - start)

54.9375


In [393]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[3005,  272],
       [  36,   42]], dtype=int64)

In [394]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
mammography_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.861313897169863

In [395]:
mammography_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      3277
           1       0.13      0.54      0.21        78

    accuracy                           0.91      3355
   macro avg       0.56      0.73      0.58      3355
weighted avg       0.97      0.91      0.93      3355



In [396]:
print(mammography_gan_report['1']['precision'])
print(mammography_gan_report['1']['recall'])
print(mammography_gan_report['1']['f1-score'])

0.1337579617834395
0.5384615384615384
0.2142857142857143


In [397]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
mammography_gan_auc_precision_recall = metrics.auc(recall, precision)
print(mammography_gan_auc_precision_recall)

0.1424669105185536


## Shuttle

**Dataset source**: http://odds.cs.stonybrook.edu/shuttle-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Abe, Naoki, Bianca Zadrozny, and John Langford. “Outlier detection by active learning.” Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.“, IEEE Transactions on Knowledge and Data Engineering, 2009.

Kai Ming Ting, Guang-Tong Zhou, Fei Tony Liu & Tan Swee Chuan. (2010). Mass Estimation and Its Applications. Proceedings of The 16th ACM SIGKDD Conference on Knowledge Discovery and Data Mining 2010. pp. 989-998.

Swee Chuan Tan, Kai Ming Ting & Fei Tony Liu. (2011). Fast Anomaly Detection for Streaming Data. Proceedings of the International Joint Conference on Artificial Intelligence 2011. pp.1151-1156.

In [398]:
data = pd.read_csv('./shuttle.csv', sep = ',')

In [399]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,y
0,50,21,77,0,28,0,27,48,22,1
1,53,0,82,0,52,-5,29,30,2,0
2,37,0,76,0,28,18,40,48,8,0
3,37,0,79,0,34,-26,43,46,2,0
4,85,0,88,-4,6,1,3,83,80,1


In [400]:
data.shape

(49097, 10)

In [401]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,45586
1,3511


## AnoGAN

In [402]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [403]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [404]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [405]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [406]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [407]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
shuttle_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/62]	Loss_D: 2.9971	Loss_G: 5.0737	D(x): 0.2576	D(G(z)): 0.1969 / 0.1956
[0/20][50/62]	Loss_D: 3.6019	Loss_G: 1.8457	D(x): 0.2313	D(G(z)): 0.3437 / 0.3437
1
[1/20][0/62]	Loss_D: 1.5418	Loss_G: 1.5460	D(x): 0.6006	D(G(z)): 0.3688 / 0.3662
[1/20][50/62]	Loss_D: 2.7629	Loss_G: 1.1340	D(x): 0.3481	D(G(z)): 0.4434 / 0.4400
2
[2/20][0/62]	Loss_D: 1.3073	Loss_G: 1.1106	D(x): 0.7694	D(G(z)): 0.4338 / 0.4268
[2/20][50/62]	Loss_D: 2.0825	Loss_G: 1.2750	D(x): 0.4375	D(G(z)): 0.3832 / 0.3778
3
[3/20][0/62]	Loss_D: 1.0625	Loss_G: 1.2385	D(x): 0.8386	D(G(z)): 0.4005 / 0.3924
[3/20][50/62]	Loss_D: 1.7826	Loss_G: 1.3059	D(x): 0.5093	D(G(z)): 0.3843 / 0.3794
4
[4/20][0/62]	Loss_D: 1.0091	Loss_G: 1.2673	D(x): 0.8797	D(G(z)): 0.3972 / 0.3868
[4/20][50/62]	Loss_D: 1.7798	Loss_G: 1.1814	D(x): 0.5264	D(G(z)): 0.4372 / 0.4294
5
[5/20][0/62]	Loss_D: 1.0904	Loss_G: 1.1771	D(x): 0.8978	D(G(z)): 0.4311 / 0.4183
[5/20][50/62]	Loss_D: 1.5991	Loss_G: 1.3866	D(x): 0.5358	D(G(z)): 

In [408]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [409]:
generator = netG
discriminator = netD

In [410]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
shuttle_gan_test_time = end - start
print(end - start)

223.640625


In [411]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[13643,    34],
       [ 1030,    23]], dtype=int64)

In [412]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
shuttle_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.9895958034926132

In [413]:
shuttle_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     13677
           1       0.40      0.02      0.04      1053

    accuracy                           0.93     14730
   macro avg       0.67      0.51      0.50     14730
weighted avg       0.89      0.93      0.90     14730



In [414]:
print(shuttle_gan_report['1']['precision'])
print(shuttle_gan_report['1']['recall'])
print(shuttle_gan_report['1']['f1-score'])

0.40350877192982454
0.02184235517568851
0.04144144144144145


In [415]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
shuttle_gan_auc_precision_recall = metrics.auc(recall, precision)
print(shuttle_gan_auc_precision_recall)

0.8330589350818651


## mnist

**Dataset source**: http://odds.cs.stonybrook.edu/mnist-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Bandaragoda, Tharindu R., et al. “Efficient Anomaly Detection by Isolation Using Nearest Neighbour Ensemble.” 2014 IEEE International Conference on Data Mining Workshop. IEEE, 2014.

In [416]:
data = pd.read_csv('./mnist.csv')

In [417]:
data = data.drop(columns = ['Col1','Col4', 'Col7', 'Col22', 'Col27', 'Col29', 'Col38', 'Col41', 'Col51', 'Col53', 'Col54', 'Col61', 'Col62', 'Col71', 'Col73', 'Col79', 'Col87', 'Col88', 'Col89', 'Col90',
'Col92', 'Col100'])

In [418]:
pd.pivot_table(data,
             values = 'Col2',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col2
y,Unnamed: 1_level_1
0,6903
1,700


In [419]:
data.shape

(7603, 79)

In [420]:
data.head()

Unnamed: 0,Col2,Col3,Col5,Col6,Col8,Col9,Col10,Col11,Col12,Col13,...,Col86,Col91,Col93,Col94,Col95,Col96,Col97,Col98,Col99,y
0,-73.804153,198.205963,-13.124617,-1.1501,-0.141633,179.24939,114.661163,-80.736702,130.659348,162.649841,...,-15.392716,188.055649,-4.469967,158.381409,-137.100632,27.131416,-2.274633,-0.00065,-12.351267,0
1,-73.804153,197.205963,-13.124617,-1.1501,-0.141633,179.24939,-44.338833,-80.736702,128.659348,190.649841,...,-15.392716,186.055649,-4.469967,123.381416,-137.100632,157.131409,-2.274633,-0.00065,-12.351267,0
2,-73.804153,-53.794033,-13.124617,-1.1501,-0.141633,-73.750618,-44.338833,170.263306,130.659348,46.649849,...,-15.392716,188.055649,-4.469967,157.381409,-137.100632,-93.868584,-2.274633,-0.00065,-12.351267,0
3,-73.804153,86.205963,-13.124617,-1.1501,-0.141633,76.249382,208.661163,107.263298,130.659348,190.649841,...,-15.392716,188.055649,-4.469967,157.381409,-137.100632,74.131416,-2.274633,-0.00065,-12.351267,0
4,-27.804153,199.205963,-13.124617,-1.1501,-0.141633,179.24939,-44.338833,-80.736702,130.659348,91.649849,...,-15.392716,188.055649,-4.469967,22.381416,-137.100632,159.131409,-2.274633,-0.00065,-12.351267,0


## AnoGAN

In [421]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [422]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [423]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [424]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [425]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [426]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
mnist_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/9]	Loss_D: 17.0741	Loss_G: 0.5482	D(x): 0.8133	D(G(z)): 0.8519 / 0.8512
1
[1/20][0/9]	Loss_D: 14.2361	Loss_G: 0.4072	D(x): 0.8238	D(G(z)): 0.8751 / 0.8743
2
[2/20][0/9]	Loss_D: 11.9060	Loss_G: 0.2821	D(x): 0.8315	D(G(z)): 0.8942 / 0.8931
3
[3/20][0/9]	Loss_D: 11.3974	Loss_G: 0.3030	D(x): 0.8363	D(G(z)): 0.8754 / 0.8743
4
[4/20][0/9]	Loss_D: 9.8349	Loss_G: 0.3401	D(x): 0.8389	D(G(z)): 0.8719 / 0.8707
5
[5/20][0/9]	Loss_D: 8.4643	Loss_G: 0.2456	D(x): 0.8395	D(G(z)): 0.8947 / 0.8935
6
[6/20][0/9]	Loss_D: 8.5878	Loss_G: 0.2350	D(x): 0.8386	D(G(z)): 0.8912 / 0.8899
7
[7/20][0/9]	Loss_D: 7.3728	Loss_G: 0.2212	D(x): 0.8359	D(G(z)): 0.9006 / 0.8993
8
[8/20][0/9]	Loss_D: 6.2379	Loss_G: 0.2612	D(x): 0.8319	D(G(z)): 0.8815 / 0.8801
9
[9/20][0/9]	Loss_D: 6.6016	Loss_G: 0.3695	D(x): 0.8263	D(G(z)): 0.8472 / 0.8456
10
[10/20][0/9]	Loss_D: 5.7664	Loss_G: 0.3056	D(x): 0.8197	D(G(z)): 0.8618 / 0.8604
11
[11/20][0/9]	Loss_D: 5.6583	Loss_G: 0.3167	D(x): 0.8125	D(G(z))

In [427]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [428]:
generator = netG
discriminator = netD

In [429]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
mnist_gan_test_time = end - start
print(end - start)

40.890625


In [430]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[1703,  368],
       [ 199,   11]], dtype=int64)

In [431]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
mnist_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.2474075095996873

In [432]:
mnist_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86      2071
           1       0.03      0.05      0.04       210

    accuracy                           0.75      2281
   macro avg       0.46      0.44      0.45      2281
weighted avg       0.82      0.75      0.78      2281



In [433]:
print(mnist_gan_report['1']['precision'])
print(mnist_gan_report['1']['recall'])
print(mnist_gan_report['1']['f1-score'])

0.029023746701846966
0.05238095238095238
0.03735144312393888


In [434]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
mnist_gan_auc_precision_recall = metrics.auc(recall, precision)
print(mnist_gan_auc_precision_recall)

0.056654942258299375


## vowels

**Dataset source**: http://odds.cs.stonybrook.edu/japanese-vowels-data/

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [435]:
data = pd.read_csv('./vowels.csv')

In [436]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0.0,1406
1.0,50


In [437]:
data.shape

(1456, 13)

In [438]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,y
0,0.580469,-0.902534,0.617899,-0.997942,-2.463799,-0.846455,2.349849,0.3754,-0.649334,1.604637,-0.62306,-0.383125,0.0
1,0.784375,-1.077366,0.615781,-0.921911,-2.388553,-0.638047,2.106684,0.361018,-0.714317,1.260236,-0.423339,-0.287791,0.0
2,0.791292,-1.086242,0.669773,-0.806112,-2.260781,-0.538491,2.053282,0.266492,-0.842815,1.081797,-0.267201,-0.172203,0.0
3,1.217306,-1.083425,0.855483,-0.724879,-2.155552,-0.101879,1.768597,0.303151,-1.04471,0.65529,0.214298,-0.34184,0.0
4,1.065352,-1.030178,0.773297,-0.452289,-1.955907,0.248205,1.530474,0.25374,-0.968961,-0.208287,0.331578,0.007288,0.0


## AnoGAN

In [439]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [440]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [441]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [442]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [443]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [444]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
vowels_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/1]	Loss_D: 5.3591	Loss_G: 4.6144	D(x): 0.2015	D(G(z)): 0.2483 / 0.2473
1
[1/20][0/1]	Loss_D: 5.2632	Loss_G: 4.4410	D(x): 0.2010	D(G(z)): 0.2486 / 0.2485
2
[2/20][0/1]	Loss_D: 5.0895	Loss_G: 4.2689	D(x): 0.2007	D(G(z)): 0.2352 / 0.2351
3
[3/20][0/1]	Loss_D: 4.9926	Loss_G: 4.1917	D(x): 0.2006	D(G(z)): 0.2154 / 0.2154
4
[4/20][0/1]	Loss_D: 4.9247	Loss_G: 4.1313	D(x): 0.2009	D(G(z)): 0.2191 / 0.2191
5
[5/20][0/1]	Loss_D: 4.8285	Loss_G: 4.0538	D(x): 0.2013	D(G(z)): 0.1941 / 0.1946
6
[6/20][0/1]	Loss_D: 4.7875	Loss_G: 3.9843	D(x): 0.2021	D(G(z)): 0.2151 / 0.2153
7
[7/20][0/1]	Loss_D: 4.7218	Loss_G: 3.8036	D(x): 0.2027	D(G(z)): 0.2084 / 0.2091
8
[8/20][0/1]	Loss_D: 4.6836	Loss_G: 3.7089	D(x): 0.2035	D(G(z)): 0.2271 / 0.2276
9
[9/20][0/1]	Loss_D: 4.5224	Loss_G: 3.6834	D(x): 0.2045	D(G(z)): 0.1958 / 0.1962
10
[10/20][0/1]	Loss_D: 4.5214	Loss_G: 3.6101	D(x): 0.2056	D(G(z)): 0.2151 / 0.2153
11
[11/20][0/1]	Loss_D: 4.4345	Loss_G: 3.4913	D(x): 0.2064	D(G(z)): 0.

In [445]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [446]:
generator = netG
discriminator = netD

In [447]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
vowels_gan_test_time = end - start
print(end - start)

7.3125


In [448]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[351,  71],
       [ 12,   3]], dtype=int64)

In [449]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
vowels_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.5744075829383886

In [450]:
vowels_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.83      0.89       422
           1       0.04      0.20      0.07        15

    accuracy                           0.81       437
   macro avg       0.50      0.52      0.48       437
weighted avg       0.94      0.81      0.87       437



In [451]:
print(vowels_gan_report['1']['precision'])
print(vowels_gan_report['1']['recall'])
print(vowels_gan_report['1']['f1-score'])

0.04054054054054054
0.2
0.06741573033707865


In [452]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
vowels_gan_auc_precision_recall = metrics.auc(recall, precision)
print(vowels_gan_auc_precision_recall)

0.05141399632850409


## Seismic

**Dataset source**: http://odds.cs.stonybrook.edu/seismic-dataset/ (data is transformed from .arff to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

Saket Sathe and Charu C. Aggarwal. LODES: Local Density meets Spectral Outlier Detection. SIAM Conference on Data Mining, 2016.

In [453]:
data = pd.read_csv('./seismic.csv', sep = ',')

In [454]:
data = data.drop(columns = ['nbumps6','nbumps7','nbumps89'])

In [455]:
data.shape

(2584, 16)

In [456]:
drop_enc = OneHotEncoder(drop='first').fit_transform(data[['seismic','seismoacoustic','shift','ghazard']])

In [457]:
cat_var = pd.DataFrame(drop_enc.toarray())
cat_var.columns = ['seismic: b', 'seismoacoustic: b','seismoacoustic: c','shift: W','ghazard: b','ghazard: c']

In [458]:
data = pd.concat([data, cat_var], axis = 1)

In [459]:
data = data.drop(columns = ['seismic','seismoacoustic','shift','ghazard'])

In [460]:
data.head()

Unnamed: 0,genergy,gpuls,gdenergy,gdpuls,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,energy,maxenergy,class,seismic: b,seismoacoustic: b,seismoacoustic: c,shift: W,ghazard: b,ghazard: c
0,15180,48,-72,-72,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,14720,33,-70,-79,1,0,1,0,0,2000,2000,0,0.0,0.0,0.0,0.0,0.0,0.0
2,8050,30,-81,-78,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,28820,171,-23,40,1,0,1,0,0,3000,3000,0,0.0,0.0,0.0,0.0,0.0,0.0
4,12640,57,-63,-52,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [461]:
pd.pivot_table(data,
             values = 'genergy',
               index = 'class', 
              aggfunc = 'count')

Unnamed: 0_level_0,genergy
class,Unnamed: 1_level_1
0,2414
1,170


## AnoGAN

In [462]:
num_epochs = 20
batch_size = 64
learning_rate = 0.0002
input_dim = data.shape[1]-1

input_dim_num = 11
input_dim_cat = 6

In [463]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['class'], data.index, test_size=0.3,stratify=data[['class']])

In [464]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train['class'] == 0].index]

In [465]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [466]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [467]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        #noise = torch.randn(b_size, input_dim)
        noise = torch.cat((torch.randn(b_size, input_dim_num),torch.randint(low=0, high = 1, size = (b_size, input_dim_cat))),-1)
        
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
seismic_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/26]	Loss_D: 6.4474	Loss_G: 7.6650	D(x): 0.1302	D(G(z)): 0.1424 / 0.1410
1
[1/20][0/26]	Loss_D: 5.1347	Loss_G: 6.8697	D(x): 0.4192	D(G(z)): 0.1334 / 0.1322
2
[2/20][0/26]	Loss_D: 4.5433	Loss_G: 6.3320	D(x): 0.6182	D(G(z)): 0.1418 / 0.1382
3
[3/20][0/26]	Loss_D: 4.4931	Loss_G: 5.5369	D(x): 0.6946	D(G(z)): 0.2046 / 0.1990
4
[4/20][0/26]	Loss_D: 4.2503	Loss_G: 5.0859	D(x): 0.7240	D(G(z)): 0.1981 / 0.1923
5
[5/20][0/26]	Loss_D: 4.2916	Loss_G: 5.2521	D(x): 0.7416	D(G(z)): 0.2500 / 0.2421
6
[6/20][0/26]	Loss_D: 4.1722	Loss_G: 5.5023	D(x): 0.7548	D(G(z)): 0.1929 / 0.1885
7
[7/20][0/26]	Loss_D: 4.0035	Loss_G: 4.6586	D(x): 0.7626	D(G(z)): 0.1982 / 0.1911
8
[8/20][0/26]	Loss_D: 3.8790	Loss_G: 5.3794	D(x): 0.7620	D(G(z)): 0.1698 / 0.1641
9
[9/20][0/26]	Loss_D: 3.7364	Loss_G: 5.1522	D(x): 0.7660	D(G(z)): 0.1070 / 0.1040
10
[10/20][0/26]	Loss_D: 3.7297	Loss_G: 4.7930	D(x): 0.7637	D(G(z)): 0.1193 / 0.1156
11
[11/20][0/26]	Loss_D: 3.5909	Loss_G: 4.9294	D(x): 0.7649

In [468]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [469]:
generator = netG
discriminator = netD

In [470]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = torch.cat((torch.randn(b_size, input_dim_num),torch.randint(low=0, high = 1, size = (b_size, input_dim_cat))),-1)
    #z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
seismic_gan_test_time = end - start
print(end - start)

24.734375


In [471]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[682,  43],
       [ 38,  13]], dtype=int64)

In [472]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
seismic_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.7625693035835023

In [473]:
seismic_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94       725
           1       0.23      0.25      0.24        51

    accuracy                           0.90       776
   macro avg       0.59      0.60      0.59       776
weighted avg       0.90      0.90      0.90       776



In [474]:
print(seismic_gan_report['1']['precision'])
print(seismic_gan_report['1']['recall'])
print(seismic_gan_report['1']['f1-score'])

0.23214285714285715
0.2549019607843137
0.24299065420560748


In [475]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
seismic_gan_auc_precision_recall = metrics.auc(recall, precision)
print(seismic_gan_auc_precision_recall)

0.15982246343679407


## Musk

**Dataset source**: http://odds.cs.stonybrook.edu/musk-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources:**

C. C. Aggarwal and S. Sathe, “Theoretical foundations and algorithms for outlier ensembles.” ACM SIGKDD Explorations Newsletter, vol. 17, no. 1, pp. 24–47, 2015.

In [476]:
data = pd.read_csv('./musk.csv', sep = ',')

In [477]:
data.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col158,Col159,Col160,Col161,Col162,Col163,Col164,Col165,Col166,y
0,46.0,-108.0,-60.0,-69.0,-117.0,49.0,38.0,-161.0,-8.0,5.0,...,-308.0,52.0,-7.0,39.0,126.0,156.0,-50.0,-112.0,96.0,1.0
1,41.0,-188.0,-145.0,22.0,-117.0,-6.0,57.0,-171.0,-39.0,-100.0,...,-59.0,-2.0,52.0,103.0,136.0,169.0,-61.0,-136.0,79.0,1.0
2,46.0,-194.0,-145.0,28.0,-117.0,73.0,57.0,-168.0,-39.0,-22.0,...,-134.0,-154.0,57.0,143.0,142.0,165.0,-67.0,-145.0,39.0,1.0
3,41.0,-188.0,-145.0,22.0,-117.0,-7.0,57.0,-170.0,-39.0,-99.0,...,-60.0,-4.0,52.0,104.0,136.0,168.0,-60.0,-135.0,80.0,1.0
4,41.0,-188.0,-145.0,22.0,-117.0,-7.0,57.0,-170.0,-39.0,-99.0,...,-60.0,-4.0,52.0,104.0,137.0,168.0,-60.0,-135.0,80.0,1.0


In [478]:
data['y'] = data['y'].astype(int)

In [479]:
data.shape

(3062, 167)

In [480]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,2965
1,97


## AnoGAN

In [481]:
num_epochs = 20
batch_size = 64
learning_rate = 0.0002
input_dim = data.shape[1]-1

In [482]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['y'], data.index, test_size=0.3,stratify=data[['y']])

In [483]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train.y == 0].index]

In [484]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [485]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [486]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, input_dim)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
musk_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/32]	Loss_D: 6.3082	Loss_G: 1.7306	D(x): 0.6849	D(G(z)): 0.6572 / 0.6555
1
[1/20][0/32]	Loss_D: 3.9791	Loss_G: 1.4699	D(x): 0.6452	D(G(z)): 0.6590 / 0.6560
2
[2/20][0/32]	Loss_D: 3.5313	Loss_G: 1.4424	D(x): 0.6107	D(G(z)): 0.6064 / 0.6040
3
[3/20][0/32]	Loss_D: 3.2272	Loss_G: 1.7251	D(x): 0.5848	D(G(z)): 0.5513 / 0.5492
4
[4/20][0/32]	Loss_D: 3.0356	Loss_G: 1.6170	D(x): 0.5733	D(G(z)): 0.5503 / 0.5485
5
[5/20][0/32]	Loss_D: 2.7556	Loss_G: 1.5533	D(x): 0.5665	D(G(z)): 0.5336 / 0.5315
6
[6/20][0/32]	Loss_D: 2.7218	Loss_G: 1.7188	D(x): 0.5667	D(G(z)): 0.5143 / 0.5124
7
[7/20][0/32]	Loss_D: 2.6000	Loss_G: 1.6838	D(x): 0.5649	D(G(z)): 0.5058 / 0.5037
8
[8/20][0/32]	Loss_D: 2.2831	Loss_G: 1.5770	D(x): 0.5715	D(G(z)): 0.4668 / 0.4651
9
[9/20][0/32]	Loss_D: 1.9546	Loss_G: 1.5185	D(x): 0.5722	D(G(z)): 0.4283 / 0.4265
10
[10/20][0/32]	Loss_D: 2.0034	Loss_G: 1.6714	D(x): 0.5725	D(G(z)): 0.4349 / 0.4330
11
[11/20][0/32]	Loss_D: 2.0786	Loss_G: 1.4402	D(x): 0.5765

In [487]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [488]:
generator = netG
discriminator = netD

In [489]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
musk_gan_test_time = end - start
print(end - start)

12.515625


In [490]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[744, 146],
       [ 20,   9]], dtype=int64)

In [491]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
musk_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.803525765207284

In [492]:
musk_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       890
           1       0.06      0.31      0.10        29

    accuracy                           0.82       919
   macro avg       0.52      0.57      0.50       919
weighted avg       0.94      0.82      0.87       919



In [493]:
print(musk_gan_report['1']['precision'])
print(musk_gan_report['1']['recall'])
print(musk_gan_report['1']['f1-score'])

0.05806451612903226
0.3103448275862069
0.09782608695652176


In [494]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
musk_gan_auc_precision_recall = metrics.auc(recall, precision)
print(musk_gan_auc_precision_recall)

0.06933432232095288


## bank

**Dataset source**: https://github.com/GuansongPang/ADRepository-Anomaly-detection-datasets/tree/main/categorical%20data

Pang, G., Shen, C., Cao, L., & Hengel, A. V. D. (2021). Deep learning for anomaly detection: A review. ACM Computing Surveys (CSUR), 54(2), 1-38.

In [495]:
data = pd.read_csv('./bank.csv')

In [496]:
data.head()

Unnamed: 0,age,job=housemaid,job=services,job=admin.,job=blue-collar,job=technician,job=retired,job=management,job=unemployed,job=self-employed,...,previous,poutcome=nonexistent,poutcome=failure,poutcome=success,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,class
0,0.209877,0,0,0,0,0,0,0,0,0,...,0.0,1,0,0,1.0,0.882307,0.376569,0.98073,1.0,0
1,0.296296,0,0,1,0,0,0,0,0,0,...,0.0,1,0,0,1.0,0.484412,0.615063,0.981183,1.0,0
2,0.246914,1,0,0,0,0,0,0,0,0,...,0.0,1,0,0,0.9375,0.698753,0.60251,0.957379,0.859735,0
3,0.160494,0,1,0,0,0,0,0,0,0,...,0.142857,0,1,0,0.333333,0.26968,0.192469,0.150759,0.512287,0
4,0.530864,0,0,0,1,0,0,0,0,0,...,0.0,1,0,0,0.333333,0.340608,0.154812,0.17479,0.512287,1


In [497]:
pd.pivot_table(data,
             values = 'age',
               index = 'class', 
              aggfunc = 'count')

Unnamed: 0_level_0,age
class,Unnamed: 1_level_1
0,36548
1,4640


## AnoGAN

In [498]:
num_epochs = 20
batch_size = 512
learning_rate = 0.0002
input_dim = data.shape[1]-1

input_dim_cat = 52
input_dim_num = 10

In [499]:
(
    x_train,
    x_test,
    y_train,
    y_test,
    indices_train,
    indices_test,
) = train_test_split(data[data.columns[:-1]], data['class'], data.index, test_size=0.3,stratify=data[['class']])

In [500]:
y_train = pd.DataFrame(y_train)
x_train = x_train.loc[y_train[y_train['class'] == 0].index]

In [501]:
myDs=MyTrainDataset(x_train)
train_loader=torch.utils.data.DataLoader(myDs,batch_size=batch_size,shuffle=False, drop_last = True)

In [502]:
netG = Generator()
netD = Discriminator()

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(batch_size, input_dim)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = torch.optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizerG = torch.optim.Adam(netG.parameters(), lr=learning_rate, betas=(0.5, 0.999))

In [503]:
img_list = []
G_losses = []
D_losses = []
iters = 0

start = time.process_time()
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    print(epoch)
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        
        real_cpu = data[0]
        #print(real_cpu)
        b_size = real_cpu.size(0)
        # Format batch
        label = torch.full((b_size,), real_label, dtype=torch.float)
        # Forward pass real batch through D
        output, _ = netD(real_cpu)
        output = output.view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        #noise = torch.randn(b_size, input_dim)
        noise = torch.cat((torch.randn(b_size, input_dim_num),torch.randint(low=0, high = 1, size = (b_size, input_dim_cat))),-1)
        
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output, _ = netD(fake.detach())
        output = output.view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        #print(errD_fake)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output, _ = netD(fake)
        output = output.view(-1)
        
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1
        
end = time.process_time()
bank_gan_train_time = end - start
print(end - start)

Starting Training Loop...
0
[0/20][0/49]	Loss_D: 9.0326	Loss_G: 0.6769	D(x): 0.7988	D(G(z)): 0.7860 / 0.7850
1
[1/20][0/49]	Loss_D: 4.7214	Loss_G: 0.1594	D(x): 0.7750	D(G(z)): 0.8992 / 0.8973
2
[2/20][0/49]	Loss_D: 4.0397	Loss_G: 0.1473	D(x): 0.7067	D(G(z)): 0.8900 / 0.8876
3
[3/20][0/49]	Loss_D: 3.7829	Loss_G: 0.1846	D(x): 0.6181	D(G(z)): 0.8613 / 0.8576
4
[4/20][0/49]	Loss_D: 3.6480	Loss_G: 0.2172	D(x): 0.5285	D(G(z)): 0.8340 / 0.8301
5
[5/20][0/49]	Loss_D: 3.6018	Loss_G: 0.2786	D(x): 0.4572	D(G(z)): 0.7975 / 0.7933
6
[6/20][0/49]	Loss_D: 3.6678	Loss_G: 0.2961	D(x): 0.4097	D(G(z)): 0.7861 / 0.7807
7
[7/20][0/49]	Loss_D: 3.7405	Loss_G: 0.2979	D(x): 0.3747	D(G(z)): 0.7775 / 0.7721
8
[8/20][0/49]	Loss_D: 3.7918	Loss_G: 0.3183	D(x): 0.3515	D(G(z)): 0.7667 / 0.7606
9
[9/20][0/49]	Loss_D: 3.8693	Loss_G: 0.2987	D(x): 0.3344	D(G(z)): 0.7777 / 0.7711
10
[10/20][0/49]	Loss_D: 3.7292	Loss_G: 0.3631	D(x): 0.3255	D(G(z)): 0.7409 / 0.7337
11
[11/20][0/49]	Loss_D: 3.7841	Loss_G: 0.3082	D(x): 0.3224

In [504]:
x_test['y'] = y_test
myDs=MyTestDataset(x_test)
test_loader=torch.utils.data.DataLoader(myDs,batch_size=1,shuffle=False)

In [505]:
generator = netG
discriminator = netD

In [506]:
start = time.process_time()
losses = []
for j,(image,label) in enumerate(test_loader):
    z = torch.cat((torch.randn(b_size, input_dim_num),torch.randint(low=0, high = 1, size = (b_size, input_dim_cat))),-1)
    #z = init.normal_(torch.zeros(1,input_dim),mean=0,std=0.1)
    z_optimizer = torch.optim.Adam([z],lr=1e-4)

    generator.eval()
    gen_fake = generator(z)
    loss = Anomaly_score(image,gen_fake)
    losses.append(loss.detach().numpy().tolist())
    
threshold = np.mean(losses) + np.std(losses)
anomaly_mask = pd.Series(losses) > threshold
preds = anomaly_mask.map(lambda x: 1.0 if x == True else 0.0)
preds = preds.tolist()

x_test['y'] = y_test
x_test['anomaly score'] = losses
x_test['prediction'] = preds

end = time.process_time()
bank_gan_test_time = end - start
print(end - start)

1221.09375


In [507]:
confusion_matrix(x_test['y'], x_test['prediction'])

array([[8999, 1966],
       [1276,  116]], dtype=int64)

In [508]:
fpr, tpr, _ = metrics.roc_curve(x_test['y'], x_test['anomaly score'])
bank_gan_auc = metrics.auc(fpr, tpr)
metrics.auc(fpr, tpr)

0.34309283456766826

In [509]:
bank_gan_report = classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1'], output_dict = True)
print(classification_report(x_test['y'], x_test['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85     10965
           1       0.06      0.08      0.07      1392

    accuracy                           0.74     12357
   macro avg       0.47      0.45      0.46     12357
weighted avg       0.78      0.74      0.76     12357



In [510]:
print(bank_gan_report['1']['precision'])
print(bank_gan_report['1']['recall'])
print(bank_gan_report['1']['f1-score'])

0.05571565802113353
0.08333333333333333
0.06678180771445019


In [511]:
precision, recall, thresholds = precision_recall_curve(x_test['y'], x_test['anomaly score'])
bank_gan_auc_precision_recall = metrics.auc(recall, precision)
print(bank_gan_auc_precision_recall)

0.08204586983883812


## Performance

In [512]:
performance = pd.DataFrame(columns = ['F1 score', 'recall', 'precision', 'AUC', 'AUPRC', 
                                      'Training time','Inference time','Total time'])

In [513]:
f1_score_gan = {'arrhythmia':arrhythmia_gan_report['1']['f1-score'],
                       'cardio':cardio_gan_report['1']['f1-score'], 
                        'forestcover':forestcover_gan_report['1']['f1-score'], 
                       'annthyroid':annthyroid_gan_report['1']['f1-score'],       
                        'creditcard':creditcard_gan_report['1']['f1-score'], 
                       'mammography':mammography_gan_report['1']['f1-score'], 
                        'shuttle':shuttle_gan_report['1']['f1-score'], 
                      'mnist':mnist_gan_report['1']['f1-score'], 
                  'vowels':vowels_gan_report['1']['f1-score'], 
                  'seismic':seismic_gan_report['1']['f1-score'], 
                  'musk':musk_gan_report['1']['f1-score'], 
                  'bank':bank_gan_report['1']['f1-score']}
f1_score_gan_df = pd.DataFrame.from_dict(f1_score_gan, orient='index', columns = ['F1 score']).reset_index()

In [514]:
recall_gan = {'arrhythmia':arrhythmia_gan_report['1']['recall'],
                       'cardio':cardio_gan_report['1']['recall'], 
                        'forestcover':forestcover_gan_report['1']['recall'], 
                       'annthyroid':annthyroid_gan_report['1']['recall'],       
                        'creditcard':creditcard_gan_report['1']['recall'], 
                       'mammography':mammography_gan_report['1']['recall'], 
                        'shuttle':shuttle_gan_report['1']['recall'], 
                      'mnist':mnist_gan_report['1']['recall'], 
                  'vowels':vowels_gan_report['1']['recall'], 
                  'seismic':seismic_gan_report['1']['recall'], 
                  'musk':musk_gan_report['1']['recall'], 
                  'bank':bank_gan_report['1']['recall'], }
recall_gan_df = pd.DataFrame.from_dict(recall_gan, orient='index', columns = ['Recall']).reset_index()

In [515]:
precision_gan = {'arrhythmia':arrhythmia_gan_report['1']['precision'],
                       'cardio':cardio_gan_report['1']['precision'], 
                        'forestcover':forestcover_gan_report['1']['precision'], 
                       'annthyroid':annthyroid_gan_report['1']['precision'],       
                        'creditcard':creditcard_gan_report['1']['precision'], 
                       'mammography':mammography_gan_report['1']['precision'], 
                        'shuttle':shuttle_gan_report['1']['precision'], 
                      'mnist':mnist_gan_report['1']['precision'], 
                  'vowels':vowels_gan_report['1']['precision'], 
                  'seismic':seismic_gan_report['1']['precision'], 
                  'musk':musk_gan_report['1']['precision'], 
                  'bank':bank_gan_report['1']['precision'], }
precision_gan_df = pd.DataFrame.from_dict(precision_gan, orient='index', columns = ['Precision']).reset_index()

In [516]:
auc_gan = {'arrhythmia':arrhythmia_gan_auc,
                       'cardio':cardio_gan_auc, 
                        'forestcover':forestcover_gan_auc, 
                       'annthyroid':annthyroid_gan_auc,       
                        'creditcard':creditcard_gan_auc, 
                       'mammography':mammography_gan_auc, 
                        'shuttle':shuttle_gan_auc, 
                      'mnist':mnist_gan_auc, 
                  'vowels':vowels_gan_auc, 
                  'seismic':seismic_gan_auc, 
                  'musk':musk_gan_auc, 
                  'bank':bank_gan_auc}
auc_gan_df = pd.DataFrame.from_dict(auc_gan, orient='index', columns = ['AUC']).reset_index()

In [517]:
auprc_gan = {'arrhythmia':arrhythmia_gan_auc_precision_recall,
                       'cardio':cardio_gan_auc_precision_recall, 
                        'forestcover':forestcover_gan_auc_precision_recall, 
                       'annthyroid':annthyroid_gan_auc_precision_recall,       
                        'creditcard':creditcard_gan_auc_precision_recall, 
                       'mammography':mammography_gan_auc_precision_recall, 
                        'shuttle':shuttle_gan_auc_precision_recall, 
                      'mnist':mnist_gan_auc_precision_recall, 
                  'vowels':vowels_gan_auc_precision_recall, 
                  'seismic':seismic_gan_auc_precision_recall, 
                  'musk':musk_gan_auc_precision_recall, 
                  'bank':bank_gan_auc_precision_recall}
auprc_gan_df = pd.DataFrame.from_dict(auprc_gan, orient='index', columns = ['AUPRC']).reset_index()

In [518]:
training_time_gan = {'arrhythmia':arrhythmia_gan_train_time,
                       'cardio':cardio_gan_train_time, 
                        'forestcover':forestcover_gan_train_time, 
                       'annthyroid':annthyroid_gan_train_time,       
                        'creditcard': creditcard_gan_train_time, 
                       'mammography':mammography_gan_train_time, 
                        'shuttle':shuttle_gan_train_time, 
                      'mnist':mnist_gan_train_time, 
                  'vowels':vowels_gan_train_time, 
                  'seismic':seismic_gan_train_time, 
                  'musk':musk_gan_train_time, 
                  'bank':bank_gan_train_time}
training_time_gan_df = pd.DataFrame.from_dict(training_time_gan, orient='index', columns = ['Training time']).reset_index()

In [519]:
test_time_gan = {'arrhythmia':arrhythmia_gan_test_time,
                       'cardio':cardio_gan_test_time, 
                        'forestcover':forestcover_gan_test_time, 
                       'annthyroid':annthyroid_gan_test_time,       
                        'creditcard':creditcard_gan_test_time, 
                       'mammography':mammography_gan_test_time, 
                        'shuttle':shuttle_gan_test_time, 
                      'mnist':mnist_gan_test_time, 
                  'vowels':vowels_gan_test_time, 
                  'seismic':seismic_gan_test_time, 
                  'musk':musk_gan_test_time, 
                  'bank':bank_gan_test_time}
test_time_gan_df = pd.DataFrame.from_dict(test_time_gan, orient='index', columns = ['Testing time']).reset_index()

In [520]:
total_time_gan = {'arrhythmia':arrhythmia_gan_train_time + arrhythmia_gan_test_time,
                       'cardio':cardio_gan_train_time + cardio_gan_test_time, 
                        'forestcover':forestcover_gan_train_time + forestcover_gan_test_time, 
                       'annthyroid':annthyroid_gan_train_time + annthyroid_gan_test_time,       
                        'creditcard': creditcard_gan_train_time + creditcard_gan_test_time, 
                       'mammography':mammography_gan_train_time + mammography_gan_test_time, 
                        'shuttle':shuttle_gan_train_time + shuttle_gan_test_time, 
                      'mnist':mnist_gan_train_time + mnist_gan_test_time, 
                  'vowels':vowels_gan_train_time + vowels_gan_test_time, 
                  'seismic':seismic_gan_train_time + seismic_gan_test_time, 
                  'musk':musk_gan_train_time + musk_gan_test_time, 
                  'bank':bank_gan_train_time + bank_gan_test_time}
total_time_gan_df = pd.DataFrame.from_dict(total_time_gan, orient='index', columns = ['Total time']).reset_index()

In [521]:
pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(f1_score_gan_df, recall_gan_df, how = 'inner'), 
                                    precision_gan_df, how ='inner'),
         auc_gan_df, how = 'inner'), auprc_gan_df, how = 'inner'), training_time_gan_df, how = 'inner'), 
         test_time_gan_df, how = 'inner'),total_time_gan_df, how = 'inner')

Unnamed: 0,index,F1 score,Recall,Precision,AUC,AUPRC,Training time,Testing time,Total time
0,arrhythmia,0.235294,0.2,0.285714,0.593103,0.322384,18.5,2.484375,20.984375
1,cardio,0.619048,0.735849,0.534247,0.953229,0.682443,70.203125,10.59375,80.796875
2,forestcover,0.0,0.0,0.0,0.042668,0.004925,3768.421875,1458.296875,5226.71875
3,annthyroid,0.128472,0.23125,0.088942,0.519306,0.083128,95.546875,42.453125,138.0
4,creditcard,0.008955,0.141892,0.004624,0.776905,0.004687,3683.5,1392.734375,5076.234375
5,mammography,0.214286,0.538462,0.133758,0.861314,0.142467,134.578125,54.9375,189.515625
6,shuttle,0.041441,0.021842,0.403509,0.989596,0.833059,560.96875,223.640625,784.609375
7,mnist,0.037351,0.052381,0.029024,0.247408,0.056655,75.578125,40.890625,116.46875
8,vowels,0.067416,0.2,0.040541,0.574408,0.051414,9.015625,7.3125,16.328125
9,seismic,0.242991,0.254902,0.232143,0.762569,0.159822,90.390625,24.734375,115.125
