<a href="https://colab.research.google.com/github/yash-clear/Anomaly_Detection/blob/main/CNNpytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Time Series Anomaly Detection

## NUMENTA ANOMALY BENCHMARK

## CONVOLUTIONAL NEURAL NETWORK (CNN) with PyTorch

### CLONING THE REPOSITORY

In [1]:

%%bash
if [ ! -d "NAB" ]; then
    git clone https://github.com/numenta/NAB
fi

Cloning into 'NAB'...
Checking out files:  58% (656/1119)   Checking out files:  59% (661/1119)   Checking out files:  60% (672/1119)   Checking out files:  61% (683/1119)   Checking out files:  62% (694/1119)   Checking out files:  63% (705/1119)   Checking out files:  64% (717/1119)   Checking out files:  65% (728/1119)   Checking out files:  66% (739/1119)   Checking out files:  67% (750/1119)   Checking out files:  68% (761/1119)   Checking out files:  69% (773/1119)   Checking out files:  70% (784/1119)   Checking out files:  71% (795/1119)   Checking out files:  72% (806/1119)   Checking out files:  73% (817/1119)   Checking out files:  74% (829/1119)   Checking out files:  75% (840/1119)   Checking out files:  76% (851/1119)   Checking out files:  77% (862/1119)   Checking out files:  78% (873/1119)   Checking out files:  79% (885/1119)   Checking out files:  80% (896/1119)   Checking out files:  81% (907/1119)   Checking out files:  82% (918/1119)   Che

### IMPORTING THE PACKAGES

In [2]:

from pathlib import Path # convenient way to deal w/ paths
import plotly.graph_objects as go # creates plots
import numpy as np # standard for data processing
import pandas as pd # standard for data processing
import json # we have anomalies' timestamps in json format

In [3]:

# Path to the whole data from NAB git repository
nab = Path.cwd()/'NAB'

# This folder contains all files w/ metrics
data_path = nab/'data'

# There is also separate json file 
# w/ timestamps of anomalies in files w/ metrics
labels_filepath = '/content/NAB/labels/combined_labels.json'

# Path from data folder to the training file
training_filename = 'realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv'

# Path from data folder to the validation file
valid_filename = 'realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv'

In [4]:

with open(labels_filepath, 'r') as f:
    anomalies_timestamps = json.load(f)  

TRAINING SET

In [5]:

train = pd.read_csv('/content/NAB/data/realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv')
valid = pd.read_csv('/content/NAB/data/realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv')
train.head()

Unnamed: 0,timestamp,value
0,2014-02-14 14:30:00,6.456
1,2014-02-14 14:35:00,5.816
2,2014-02-14 14:40:00,6.268
3,2014-02-14 14:45:00,5.816
4,2014-02-14 14:50:00,5.862


VALIDATION SET

In [6]:
valid.head()

Unnamed: 0,timestamp,value
0,2014-04-10 00:02:00,14.012
1,2014-04-10 00:07:00,13.334
2,2014-04-10 00:12:00,15.0
3,2014-04-10 00:17:00,13.998
4,2014-04-10 00:22:00,14.332


In [7]:
p=list(range(0, 4032))


In [8]:

from sklearn.preprocessing import StandardScaler

# Let's make it function for further usage
def parse_and_standardize(df: pd.DataFrame, scaler: StandardScaler = None):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['stand_value'] = df['value']
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(df['stand_value'].values.reshape(-1, 1))
    df['stand_value'] = scaler.transform(df['stand_value'].values.reshape(-1, 1))
    return scaler

data_scaler1 = parse_and_standardize(train)
data_scaler2=parse_and_standardize(valid)

In [9]:

train_anomalies = train[train['timestamp'].isin(anomalies_timestamps['realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv'])]
valid_anomalies = valid[valid['timestamp'].isin(anomalies_timestamps['realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv'])]
train_anomalies

Unnamed: 0,timestamp,value,stand_value
3080,2014-02-25 07:15:00,25.1033,4.652449
3579,2014-02-27 00:50:00,19.165,3.026441


In [10]:
# Prepare layout w/ titles

import plotly.graph_objects as go
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 

# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

# Add non-anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))

# Add anomaly data to the figure
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'],
                         y=train_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))
fig.update_layout(
    title="Training set"
)
fig.show()

In [11]:

fig = go.Figure()
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'],
                         y=valid_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))
fig.update_layout(
    title="Validation set"
    )

fig.show()

In [12]:

# PyTorch itself
import torch 

# Dataset - the base class to be inherited
from torch.utils.data import Dataset, DataLoader

In [13]:

class CPUDataset(Dataset):
    def __init__(self, data: pd.DataFrame, size: int, 
                 step: int = 1):
        self.chunks = torch.FloatTensor(data['stand_value']).unfold(0, size+1, step)
        self.chunks = self.chunks.view(-1, 1, size+1)
    def __len__(self):
        return self.chunks.size(0)
    
    def __getitem__(self, i):
        x = self.chunks[i, :, :-1]
        y = self.chunks[i, :, -1:].squeeze(1)
        return x, y

In [14]:

n_factors = 10
train_ds = CPUDataset(train, n_factors)
valid_ds = CPUDataset(valid, n_factors)

In [15]:
# here PyTorch has all neural net functions and activations
import torch.nn as nn

def conv_layer(in_feat, out_feat, kernel_size=3, stride=1,
               padding=1, relu=True):
    res = [
        nn.Conv1d(in_feat, out_feat, kernel_size=kernel_size,
                  stride=stride, padding=padding, bias=False),
        nn.BatchNorm1d(out_feat),
    ]
    if relu:
        res.append(nn.ReLU())
    return nn.Sequential(*res)

### RESNET PRETRAINED MODEL

In [16]:

class ResBlock(nn.Module):
    def __init__(self, in_feat, out_feat):
        super().__init__()
        self.in_feat, self.out_feat = in_feat, out_feat
        self.conv1 = conv_layer(in_feat, out_feat)
        self.conv2 = conv_layer(out_feat, out_feat, relu=False)
        if self.apply_shortcut:
            self.shortcut = conv_layer(in_feat, out_feat,
                                       kernel_size=1, padding=0,
                                       relu=False)
    
    def forward(self, x):
        out = self.conv1(x)
        if self.apply_shortcut:
            x = self.shortcut(x)
        return x + self.conv2(out)
    
    @property
    def apply_shortcut(self):
        return self.in_feat != self.out_feat

In [17]:

class AdaptiveConcatPool1d(nn.Module):
    def __init__(self):
        super().__init__()
        self.ap = nn.AdaptiveAvgPool1d(1)
        self.mp = nn.AdaptiveMaxPool1d(1)
    
    def forward(self, x): 
        return torch.cat([self.mp(x), self.ap(x)], 1)

In [18]:

class CNN(nn.Module):
    def __init__(self, out_size):
        super().__init__()
        self.base = nn.Sequential(
            ResBlock(1, 8), #shape = batch, 8, n_factors
            ResBlock(8, 8), 
            ResBlock(8, 16), #shape = batch, 16, n_factors
            ResBlock(16, 16),
            ResBlock(16, 32), #shape = batch, 32, n_factors
            ResBlock(32, 32),
            ResBlock(32, 64), #shape = batch, 64, n_factors
            ResBlock(64, 64),
        )
        self.head = nn.Sequential(
            AdaptiveConcatPool1d(), #shape = batch, 128, 1
            nn.Flatten(),
            nn.Linear(128, out_size)
        )
        
    def forward(self, x):
        out = self.base(x)
        out = self.head(out)
        return out

## TRAIN THE MODEL

In [19]:
from torch.optim import Adam
def train_model(model: CNN, dataloaders: dict, optimizer: torch.optim.Optimizer, 
                scheduler, criterion, device: torch.device, epochs: int):
    losses_data = {'train': [], 'valid': []}
    model.to(device)
    
    # Loop over epochs
    for epoch in tqdm(range(epochs)):
        print(f'Epoch {epoch}/{epochs-1}')
        
        # Training and validation phases
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.
            running_total = 0.
            
            # Loop over batches of data
            for idx, batch in tqdm(enumerate(dataloaders[phase]), 
                                   total=len(dataloaders[phase]), 
                                   leave=False
                                   ):
                x, y = batch
                x = x.to(device)
                y = y.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    out = model(x)
                    loss = criterion(out, y)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                running_loss += loss.item() * y.size(0)
                running_total += y.size(0)

            epoch_loss = running_loss / running_total
            print(f'{phase.capitalize()} Loss: {epoch_loss}')
            losses_data[phase].append(epoch_loss)
    return losses_data

In [20]:

epochs = 50
cnn_model = CNN(out_size=1)
dataloaders = {
    'train': DataLoader(train_ds, batch_size=128, shuffle=True),
    'valid': DataLoader(valid_ds, batch_size=128)
}
optim = torch.optim.Adam(cnn_model.parameters(), lr=1e-1, weight_decay=1e-3)
sched = torch.optim.lr_scheduler.OneCycleLR(optim, max_lr=1e-3, steps_per_epoch=len(dataloaders['train']), epochs=epochs)
criterion = nn.MSELoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:

from tqdm.notebook import tqdm_notebook as tqdm
losses = train_model(cnn_model, dataloaders, optim, sched, criterion, device, epochs)


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch 0/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.8395746312757619


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.597457299749022
Epoch 1/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.13984973049175675


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.25008954248177595
Epoch 2/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.0700171522199663


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.39525812652197306
Epoch 3/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.050854865078129384


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.2541169440887835
Epoch 4/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.04033115666409679


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.24182847600517315
Epoch 5/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.035473731735907406


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.18391959706885433
Epoch 6/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.03672821951306678


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.13246479411880394
Epoch 7/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.0617447726887918


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.14451283714324162
Epoch 8/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.059548255734141936


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.13565572067235668
Epoch 9/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.04331855297881475


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.2172943177715885
Epoch 10/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.04522786063639042


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.16912407758967576
Epoch 11/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.037212238345556384


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.16654818727436171
Epoch 12/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.03650879890675097


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.13578643253853906
Epoch 13/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.03325484128606847


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.11558515633711584
Epoch 14/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.027563050966330098


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.1254836873094399
Epoch 15/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.023519235050415945


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09673914797826379
Epoch 16/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.02717831633403371


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.12564728448601845
Epoch 17/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.0278233971533818


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.10495410983514157
Epoch 18/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.025862720424021317


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.11008438678048015
Epoch 19/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.026176951534319936


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09736106575020713
Epoch 20/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.023418589314971025


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.10343386082062965
Epoch 21/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.02254880039870724


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.10112051391989837
Epoch 22/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.0211108555819993


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08422667718960657
Epoch 23/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.020252210694753758


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.0982880428107386
Epoch 24/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.019544349806424933


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08918281600031167
Epoch 25/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.020369044465695017


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.10502920263923413
Epoch 26/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.021069681680377355


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09542331027542253
Epoch 27/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.020364822148313456


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08341186800772488
Epoch 28/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.020173791447707163


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.10725364304915967
Epoch 29/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.021041249477525423


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08752584523344917
Epoch 30/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.018501103412469248


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08920818923596036
Epoch 31/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.017959624562229107


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09354522832067733
Epoch 32/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.017400682558343074


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08254531715648765
Epoch 33/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.0172823009945819


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08448846161195082
Epoch 34/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.019019468543446223


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09466240352759724
Epoch 35/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.018114136263805945


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.0902422866596475
Epoch 36/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.016700123605785648


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.0848249746565274
Epoch 37/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.016457479469355867


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08574065732784025
Epoch 38/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.016396143429688943


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09015825589954823
Epoch 39/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.016002038243670833


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09468426632086831
Epoch 40/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.01681257002120481


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09174371615778386
Epoch 41/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015903481062777686


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08832567478592629
Epoch 42/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.01577641614603943


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08779355443084139
Epoch 43/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015586310171178105


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08777840955489491
Epoch 44/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015476325711616085


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08773765133435396
Epoch 45/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015512810016911401


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.09186311935693811
Epoch 46/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015222484338982285


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08982662034458572
Epoch 47/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015418137333849986


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08771461736890643
Epoch 48/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.015480401870632042


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.08737080287760701
Epoch 49/49


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Train Loss: 0.01524284588587982


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Valid Loss: 0.0876173941323127



### LOSSES

In [22]:

layout = dict(xaxis=dict(title='Epoch'), yaxis=dict(title='Loss'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(y=losses['train'], mode='lines', name='Train Loss',))
fig.add_trace(go.Scatter(y=losses['valid'], mode='lines', name='Valid Loss'))

In [23]:

# Switching model into evaluation mode
cnn_model = cnn_model.eval()

# Calculation of the predictions for training data
with torch.no_grad():
    res_train = cnn_model(train_ds[:][0].to(device))
res_train = res_train.cpu()

# Calculation of the predictions for validation data
with torch.no_grad():
    res_valid = cnn_model(valid_ds[:][0].to(device))
res_valid = res_valid.cpu()

### PREDICTIONS

In [24]:
res_valid1=res_valid.tolist()
l=[]
for i in res_valid1:
  l.append(i[0])
  k=i[0]
print(len(l))
for i in range(10): 
  l.append(k)
print(len(l))  
valid['predicted']=l
res_valid=torch.Tensor(l)
valid.head()

4022
4032


Unnamed: 0,timestamp,value,stand_value,predicted
0,2014-04-10 00:02:00,14.012,-0.878378,-0.799385
1,2014-04-10 00:07:00,13.334,-0.999353,-0.681162
2,2014-04-10 00:12:00,15.0,-0.702091,-0.685296
3,2014-04-10 00:17:00,13.998,-0.880876,-0.57328
4,2014-04-10 00:22:00,14.332,-0.821281,-0.699519


In [25]:

import plotly.graph_objects as go
fig = go.Figure()

layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 
# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

fig.add_trace(go.Scatter(x=valid['timestamp'],
                        y=valid['stand_value'], 
                        mode='markers', name='Ground Truth',
                        marker=dict(color='blue', size=5)))

fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['predicted'], 
                        mode='markers', name='Predicted Value',
                        marker=dict(color='orange')))
fig.update_layout(
    title="Validation set"
    )

fig.show()

In [26]:
res_train1=res_train.tolist()
l1=[]
for i in res_train1:
  l1.append(i[0])
  k=i[0]
print(len(l))
for i in range(10): 
  l1.append(k)
print(len(l1)) 
train['predicted']=l1
res_train=torch.Tensor(l)
train.head()

4032
4032


Unnamed: 0,timestamp,value,stand_value,predicted
0,2014-02-14 14:30:00,6.456,-0.453498,-0.542821
1,2014-02-14 14:35:00,5.816,-0.628741,-0.560242
2,2014-02-14 14:40:00,6.268,-0.504976,-0.591162
3,2014-02-14 14:45:00,5.816,-0.628741,-0.535936
4,2014-02-14 14:50:00,5.862,-0.616145,-0.542114


In [27]:

import plotly.graph_objects as go
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 

# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

# Add non-anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['stand_value'], 
                         mode='markers', name='Ground Truth',
                         marker=dict(color='blue')))

# Add anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'],
                         y=train['predicted'], 
                         mode='markers', name='Predicted Value',
                         marker=dict(color='orange')))
fig.update_layout(
    title="Training set"
)
fig.show()

In [28]:

def calculate_prediction_errors(
    model: CNN, dataset: CPUDataset, criterion, 
    device: torch.device
    ):
    with torch.no_grad():
        errors = []
        for x, y in tqdm(dataset):
            x = x.to(device)[None]
            y = y.to(device)[None]
            predicted = model(x)
            prediction_error = criterion(predicted, y)
            errors.append(prediction_error.cpu())
        return errors

train_pred_errors = calculate_prediction_errors(cnn_model, train_ds, criterion, device)
valid_pred_errors = calculate_prediction_errors(cnn_model, valid_ds, criterion, device)

HBox(children=(FloatProgress(value=0.0, max=4022.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4022.0), HTML(value='')))




STATIC THRESHOLD

In [29]:
pred_error_threshold_train = np.mean(train_pred_errors) + 3 * np.std(train_pred_errors)
pred_error_threshold_valid = np.mean(valid_pred_errors) + 3 * np.std(valid_pred_errors)
pred_error_threshold_valid

6.091458328068256

In [30]:
# We use Series from pandas to calculate windowed errors
window=40
std_coef=6
train_pred_errors_windowed = pd.Series(train_pred_errors).rolling(window=window, min_periods=1)
# Dynamic threshold for the training data
train_dynamic_threshold = train_pred_errors_windowed.mean() + std_coef * train_pred_errors_windowed.std()

valid_pred_errors_windowed = pd.Series(valid_pred_errors).rolling(window=window, min_periods=1)
# Dynamic threshold for the validation data
valid_dynamic_threshold = valid_pred_errors_windowed.mean() + std_coef * valid_pred_errors_windowed.std()


In [31]:
li=[0]*4032
k=valid_anomalies.index.tolist()
for i in k:
  li[i]=1
valid['anomaly_found']=li

In [32]:
li=[0]*4032
k=train_anomalies.index.tolist()
for i in k:
  li[i]=1
train['anomaly_found']=li

In [33]:
from typing import Union

def detect_anomalies(
    result: torch.Tensor, dataset: CPUDataset, 
    threshold: Union[float, pd.Series], n_factors: int = 0):
    anomalies_idxs = []
    # We filter each item
    for i in range(len(dataset)):
        # The case of dynamic threshold
        if type(threshold) == pd.Series:
            is_anomaly = (criterion(result[i], dataset[i][1]) > threshold[i])
        # The case of static threshold
        else:
            is_anomaly = (criterion(result[i], dataset[i][1]) > threshold)
        if is_anomaly:
            # Since the index of the prediction is next after 
            # the index of the last factor we should add the amount
            # of the factors
            anomalies_idxs.append(i + n_factors)
            
    return anomalies_idxs

In [34]:
train_anomalies_idxs = detect_anomalies(
    res_train, train_ds, pred_error_threshold_train, n_factors
)
valid_anomalies_idxs = detect_anomalies(
    res_valid, valid_ds, pred_error_threshold_valid, n_factors
)



Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.



In [35]:
anomaly_label_train=[0]*4032
for i in train_anomalies_idxs:
  anomaly_label_train[i]=1
train['anomaly_label']=anomaly_label_train  
  
anomaly_label_valid=[0]*4032
for i in valid_anomalies_idxs:
  anomaly_label_valid[i]=1
valid['anomaly_label']=anomaly_label_valid    

In [36]:
valid.head()

Unnamed: 0,timestamp,value,stand_value,predicted,anomaly_found,anomaly_label
0,2014-04-10 00:02:00,14.012,-0.878378,-0.799385,0,0
1,2014-04-10 00:07:00,13.334,-0.999353,-0.681162,0,0
2,2014-04-10 00:12:00,15.0,-0.702091,-0.685296,0,0
3,2014-04-10 00:17:00,13.998,-0.880876,-0.57328,0,0
4,2014-04-10 00:22:00,14.332,-0.821281,-0.699519,0,0


In [37]:
valid_detected=valid.iloc[valid_anomalies_idxs]
valid_detected.head()

Unnamed: 0,timestamp,value,stand_value,predicted,anomaly_found,anomaly_label
946,2014-04-13 06:52:00,76.23,10.223065,1.376252,1,1
947,2014-04-13 06:57:00,65.835,8.368305,-1.630959,0,1
950,2014-04-13 07:12:00,16.668,-0.404473,-0.516616,0,1
951,2014-04-13 07:17:00,16.666,-0.40483,-0.526648,0,1
952,2014-04-13 07:22:00,15.668,-0.582901,-0.576486,0,1


In [38]:

fig = go.Figure()
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'],
                         y=valid_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))

fig.add_trace(go.Scatter(x=valid_detected['timestamp'], y=valid_detected['value'], 
                         mode='markers', name='predicted Anomaly',
                         marker=dict(color='red')))
fig.update_layout(
    title="Validation set"
    )

fig.show()

In [39]:
train_detected=train.iloc[train_anomalies_idxs]



In [40]:
import plotly.graph_objects as go
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 

# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

# Add non-anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))

# Add anomaly data to the figure
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'],
                         y=train_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))

fig.add_trace(go.Scatter(x=train_detected['timestamp'], y=train_detected['value'], 
                         mode='markers', name='Predicted Anomaly',
                         marker=dict(color='red')))
fig.update_layout(
    title="Training set"
)
fig.show()

### METRICS

In [41]:
from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics(
    ground_truth: pd.DataFrame, anomalies_idxs: list
    ):
    predictions = pd.DataFrame(
        index=range(len(ground_truth)), 
        columns=['anomaly_found']
    )
    predictions['anomaly_found'] = 0
    predictions.iloc[anomalies_idxs] = 1
    anomalies_given=(ground_truth['anomaly_found'].tolist()).count(1)
    # Calculation of the confusion matrix can be done using pandas
    confusion_matrix = pd.crosstab(
        ground_truth.loc[:, 'anomaly_label'],
        predictions['anomaly_found'], 
        margins=True
    )
   # print(confusion_matrix)
    precision, recall, f1, _ = precision_recall_fscore_support(
        ground_truth.loc[:, 'anomaly_label'],
        predictions['anomaly_found'], 
        beta=2., 
        average='binary'
    )
    precision=anomalies_given/len(anomalies_idxs)
    
    #recall=anomalies_given/(len(anomalies_idxs)+anomalies_given)
    f1=2*(precision*recall)/(precision+recall)
    return confusion_matrix, precision, recall, f1

In [42]:
train_conf_matrix, *train_metrics = calculate_metrics(
    train, train_anomalies_idxs
)

print(f'Train:\n Precision: {train_metrics[0]:.3f}\n' 
      f'Recall: {train_metrics[1]:.3f}\n' 
      f'F1 score: {train_metrics[2]:.3f}')

Train:
 Precision: 0.002
Recall: 1.000
F1 score: 0.004


In [43]:

valid_conf_matrix, *valid_metrics = calculate_metrics(
    valid, valid_anomalies_idxs
)

print(f'Valid:\n Precision: {valid_metrics[0]:.3f}\n' 
      f'Recall: {valid_metrics[1]:.3f}\n' 
      f'F1 score: {valid_metrics[2]:.3f}')

Valid:
 Precision: 0.250
Recall: 1.000
F1 score: 0.400
