<a href="https://colab.research.google.com/github/yash-clear/Anomaly_Detection/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Time Series Anomaly Detection

## The Numenta Anomaly Benchmark

## LONG SHORT TERM MEMORY (LSTM)

### Cloning the repository

In [1]:

%%bash
if [ ! -d "NAB" ]; then
    git clone https://github.com/numenta/NAB
fi

Cloning into 'NAB'...
Checking out files:  58% (656/1119)   Checking out files:  59% (661/1119)   Checking out files:  60% (672/1119)   Checking out files:  61% (683/1119)   Checking out files:  62% (694/1119)   Checking out files:  63% (705/1119)   Checking out files:  64% (717/1119)   Checking out files:  65% (728/1119)   Checking out files:  66% (739/1119)   Checking out files:  67% (750/1119)   Checking out files:  68% (761/1119)   Checking out files:  69% (773/1119)   Checking out files:  70% (784/1119)   Checking out files:  71% (795/1119)   Checking out files:  72% (806/1119)   Checking out files:  73% (817/1119)   Checking out files:  74% (829/1119)   Checking out files:  75% (840/1119)   Checking out files:  76% (851/1119)   Checking out files:  77% (862/1119)   Checking out files:  78% (873/1119)   Checking out files:  79% (885/1119)   Checking out files:  80% (896/1119)   Checking out files:  81% (907/1119)   Checking out files:  82% (918/1119)   Che

### IMPORTING PACKAGES

In [2]:

from pathlib import Path # convenient way to deal w/ paths
import plotly.graph_objects as go # creates plots
import numpy as np # standard for data processing
import pandas as pd # standard for data processing
import json # we have anomalies' timestamps in json format

### LOADING THE DATASET

In [3]:

# Path to the whole data from NAB git repository
nab = Path.cwd()/'NAB'

# This folder contains all files w/ metrics
data_path = nab/'data'

# There is also separate json file 
# w/ timestamps of anomalies in files w/ metrics
labels_filepath = '/content/NAB/labels/combined_labels.json'

# Path from data folder to the training file
training_filename = 'realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv'

# Path from data folder to the validation file
valid_filename = 'realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv'

In [4]:

with open(labels_filepath, 'r') as f:
    anomalies_timestamps = json.load(f)

TRAINING SET

In [5]:

train = pd.read_csv('/content/NAB/data/realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv')
valid = pd.read_csv('/content/NAB/data/realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv')
train.head()

Unnamed: 0,timestamp,value
0,2014-02-14 14:30:00,6.456
1,2014-02-14 14:35:00,5.816
2,2014-02-14 14:40:00,6.268
3,2014-02-14 14:45:00,5.816
4,2014-02-14 14:50:00,5.862


VALIDATION SET

In [6]:
valid.head()

Unnamed: 0,timestamp,value
0,2014-04-10 00:02:00,14.012
1,2014-04-10 00:07:00,13.334
2,2014-04-10 00:12:00,15.0
3,2014-04-10 00:17:00,13.998
4,2014-04-10 00:22:00,14.332


In [7]:

from sklearn.preprocessing import StandardScaler

# Let's make it function for further usage
def parse_and_standardize(df: pd.DataFrame, scaler: StandardScaler = None):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['stand_value'] = df['value']
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(df['stand_value'].values.reshape(-1, 1))
    df['stand_value'] = scaler.transform(df['stand_value'].values.reshape(-1, 1))
    return scaler

data_scaler = parse_and_standardize(train)
parse_and_standardize(valid, data_scaler)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:

train_anomalies = train[train['timestamp'].isin(anomalies_timestamps['realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv'])]
valid_anomalies = valid[valid['timestamp'].isin(anomalies_timestamps['realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv'])]
train_anomalies

Unnamed: 0,timestamp,value,stand_value
3080,2014-02-25 07:15:00,25.1033,4.652449
3579,2014-02-27 00:50:00,19.165,3.026441


In [9]:
# Prepare layout w/ titles

import plotly.graph_objects as go
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 

# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

# Add non-anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))

# Add anomaly data to the figure
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'],
                         y=train_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))
fig.update_layout(
    title="Training set"
)
fig.show()

In [10]:

fig = go.Figure()
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'],
                         y=valid_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))
fig.update_layout(
    title="Validation set"
    )

fig.show()

In [11]:

# PyTorch itself
import torch 

# Dataset - the base class to be inherited
from torch.utils.data import Dataset, DataLoader

In [12]:

class CPUDataset(Dataset):
    def __init__(self, data: pd.DataFrame, size: int):
        self.chunks = torch.FloatTensor(data['stand_value']).unfold(0, size, size)
        
    def __len__(self):
        return self.chunks.size(0)
    
    def __getitem__(self, i):
        x = self.chunks[i]
        return x

train_ds = CPUDataset(train, 64)
valid_ds = CPUDataset(valid, 64)

In [13]:
import torch.nn as nn
class LSTMModel(nn.Module):
    def __init__(self, in_size, hidden_size, out_size, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(in_size, hidden_size)
        self.linear = nn.Linear(hidden_size, out_size)
        self.device = device
        self.init_hidden()
        
    def forward(self, x):
        out, self.hidden_state = self.lstm(
            x.view(len(x), 1, -1), self.hidden_state
        )
        self.hidden_state = tuple(
            [h.detach() for h in self.hidden_state]
        )
        out = out.view(len(x), -1)
        out = self.linear(out)
        return out
    
    def init_hidden(self):
        self.hidden_state = (
            torch.zeros((1, 1, self.hidden_size)).to(self.device),
            torch.zeros((1, 1, self.hidden_size)).to(self.device))

In [14]:
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
#import torch.optim as opt
#from scipy.optimize import Adam
from tqdm.notebook import tqdm_notebook as tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(1, 128, 1, device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 50
model = LSTMModel(1, 128, 1, device)
dataloaders = {
    'train': DataLoader(train_ds, batch_size=1),
    'valid': DataLoader(valid_ds, batch_size=1)
}
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, steps_per_epoch=len(dataloaders['train']), epochs=50)
criterion = nn.MSELoss()

## TRAIN THE MODEL

In [15]:

def train_model(model: LSTMModel, dataloaders: dict, optimizer: torch.optim.Optimizer, 
                scheduler, criterion, device: torch.device, epochs: int):
    losses_data = {'train': [], 'valid': []}
    model.to(device)
    for epoch in tqdm(range(epochs)):
        print(f'Epoch {epoch}/{epochs-1}')
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.
            running_total = 0.
            
        # Here changes start
            for idx, sequence in enumerate(dataloaders[phase]):
                value = sequence
                value = value.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    out = model(value.view(-1, 1))
                    loss = criterion(out.view(-1), value.view(-1))
        # Here changes end

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                running_loss += loss.item() * out.size(0)
                running_total += out.size(0)

            epoch_loss = running_loss / running_total
            print(f'{phase.capitalize()} Loss: {epoch_loss}')
            losses_data[phase].append(epoch_loss)
    return losses_data
losses = train_model(model, dataloaders, optimizer, sched, criterion, device, epochs)


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch 0/49
Train Loss: 1.0116872555679746
Valid Loss: 11.13111028217134
Epoch 1/49
Train Loss: 0.9256312170672039
Valid Loss: 10.28365023173983
Epoch 2/49
Train Loss: 0.779119601798436
Valid Loss: 8.474772419248309
Epoch 3/49
Train Loss: 0.2320513157913136
Valid Loss: 2.6865566919838626
Epoch 4/49
Train Loss: 0.18923211704555248
Valid Loss: 3.845829281468122
Epoch 5/49
Train Loss: 0.18407524912987674
Valid Loss: 3.998662403695995
Epoch 6/49
Train Loss: 0.05292866356109106
Valid Loss: 3.336146886061345
Epoch 7/49
Train Loss: 0.05550551684647207
Valid Loss: 2.960107738773028
Epoch 8/49
Train Loss: 0.04839806636011908
Valid Loss: 2.84514809317059
Epoch 9/49
Train Loss: 0.03840273018142888
Valid Loss: 2.8314357444289184
Epoch 10/49
Train Loss: 0.03842999140495464
Valid Loss: 2.8759689663669894
Epoch 11/49
Train Loss: 0.03955144639540878
Valid Loss: 2.94325416606097
Epoch 12/49
Train Loss: 0.040480855623230574
Valid Loss: 2.961675979551815
Epoch 13/49
Train Loss: 0.040403725601555335
Valid 

In [16]:
losses.values()

dict_values([[1.0116872555679746, 0.9256312170672039, 0.779119601798436, 0.2320513157913136, 0.18923211704555248, 0.18407524912987674, 0.05292866356109106, 0.05550551684647207, 0.04839806636011908, 0.03840273018142888, 0.03842999140495464, 0.03955144639540878, 0.040480855623230574, 0.040403725601555335, 0.038814787756622074, 0.036402768632840546, 0.03404669436655702, 0.03214374375677416, 0.03068083961908188, 0.029529612932709, 0.028582086346836552, 0.02776836111978997, 0.027046345004309264, 0.026388867595602595, 0.0257717337281931, 0.02516336733519676, 0.024520105826446696, 0.023796026772331624, 0.022969674041849516, 0.02206504290982608, 0.02113578404994711, 0.02022523036418808, 0.019347685784663237, 0.018499478852997225, 0.017669711875477955, 0.01682888313255731, 0.0159394329650298, 0.015077585034767195, 0.01425702786738319, 0.013494770295755376, 0.01278500432991201, 0.01217879113807742, 0.011718408311791127, 0.011621103901046492, 0.011279138110001526, 0.010772592606141216, 0.01064726

In [17]:
norm1= [1*float(i)/sum(losses['train']) for i in losses['train']]
norm2= [1*float(i)/sum(losses['valid']) for i in losses['valid']]

In [18]:
p=range(1,len(losses['valid'])+1)
vals=list(p)

### LOSSES

In [19]:
import plotly.graph_objects as go

# your data
loss = {'train':norm1,"valid":norm2}
epochs = {'train':vals,'valid':vals}

# generate a plot for each dictionary key
data = []
for k in loss.keys():
    plot = go.Scatter(y=loss[k],
                      x=epochs[k],
                      mode="lines",
                      name=k
                     ) 
    data.append(plot)

# create a figure with all plots and display it
fig = go.Figure(data=data)
fig.show()

In [20]:

train_values = train['stand_value'].values.astype(np.float32).flatten()
valid_values = valid['stand_value'].values.astype(np.float32).flatten()

In [21]:
model.eval()

# Calculation of the predictions for training data
with torch.no_grad():
    res_train = model(torch.tensor(train_values).to(device))
res_train = res_train.cpu()

# Calculation of the predictions for validation data
with torch.no_grad():
    res_valid = model(torch.tensor(valid_values).to(device))
res_valid = res_valid.cpu()

In [22]:
res_valid1=res_valid.tolist()
l=[]
for i in res_valid1:
  l.append(i[0])
#print(l) 
valid['predicted']=l
res_valid=torch.Tensor(l)
valid.head()

Unnamed: 0,timestamp,value,stand_value,predicted
0,2014-04-10 00:02:00,14.012,1.615463,1.720773
1,2014-04-10 00:07:00,13.334,1.429815,1.624861
2,2014-04-10 00:12:00,15.0,1.885994,1.787065
3,2014-04-10 00:17:00,13.998,1.611629,1.690325
4,2014-04-10 00:22:00,14.332,1.703084,1.725825


In [23]:

import plotly.graph_objects as go
fig = go.Figure()

layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 
# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

fig.add_trace(go.Scatter(x=valid['timestamp'],
                        y=valid['stand_value'], 
                        mode='markers', name='Ground Truth',
                        marker=dict(color='blue', size=5)))

fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['predicted'], 
                        mode='markers', name='Predicted Value',
                        marker=dict(color='orange')))
fig.update_layout(
    title="Validation set"
    )

fig.show()

In [24]:
res_train1=res_train.tolist()
l=[]
for i in res_train1:
  l.append(i[0])
#print(l) 
train['predicted']=l
res_train=torch.Tensor(l)
train.head()

Unnamed: 0,timestamp,value,stand_value,predicted
0,2014-02-14 14:30:00,6.456,-0.453498,0.984477
1,2014-02-14 14:35:00,5.816,-0.628741,0.540265
2,2014-02-14 14:40:00,6.268,-0.504976,0.136284
3,2014-02-14 14:45:00,5.816,-0.628741,-0.270372
4,2014-02-14 14:50:00,5.862,-0.616145,-0.553552


In [25]:

import plotly.graph_objects as go
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 

# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

# Add non-anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['stand_value'], 
                         mode='markers', name='Ground Truth',
                         marker=dict(color='blue')))

# Add anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'],
                         y=train['predicted'], 
                         mode='markers', name='Predicted Value',
                         marker=dict(color='orange')))
fig.update_layout(
    title="Training set"
)
fig.show()

In [26]:

def calculate_prediction_errors(target, predicted, criterion):
    reconstruction_errors = []
    for t, p in zip(target, predicted):
        reconstruction_errors = np.append(
            reconstruction_errors, 
            criterion(p, t).cpu().numpy().flatten()
        )
    return reconstruction_errors

train_pred_errors = calculate_prediction_errors(
    res_train.view(-1), torch.tensor(train_values).view(-1), criterion
)
valid_pred_errors = calculate_prediction_errors(
    res_valid.view(-1), torch.tensor(valid_values).view(-1), criterion
)

STATIC THRESHOLD

In [27]:
pred_error_threshold_train = np.mean(train_pred_errors) + 3 * np.std(train_pred_errors)
pred_error_threshold_valid = np.mean(valid_pred_errors) + 3 * np.std(valid_pred_errors)

In [28]:
window=40
std_coef=6
# We use Series from pandas to calculate windowed errors
train_pred_errors_windowed = pd.Series(train_pred_errors).rolling(window=window, min_periods=1)
# Dynamic threshold for the training data
train_dynamic_threshold = train_pred_errors_windowed.mean() + std_coef * train_pred_errors_windowed.std()

valid_pred_errors_windowed = pd.Series(valid_pred_errors).rolling(window=window, min_periods=1)
# Dynamic threshold for the validation data
valid_dynamic_threshold = valid_pred_errors_windowed.mean() + std_coef * valid_pred_errors_windowed.std()


In [29]:

def detect_anomalies(df, errors, pred_error_thresholds):
    df['error'] = errors
    df['upper_bound'] = pred_error_thresholds
    # Here we have for each value its own threshold
    indices = df.index[df['error'] >= df['upper_bound']].values.tolist()
    indices = [i for i in indices]
    #print(df.head(15))
    return indices

In [30]:
px1 = pd.DataFrame(res_train)
px2 = pd.DataFrame(res_valid)

In [31]:
pred_error_threshold_valid

14.79520478932595

In [32]:
train_anomalies_idxs = detect_anomalies(
    px1, torch.tensor(train_values), pred_error_threshold_train
)
valid_anomalies_idxs = detect_anomalies(
    px2, torch.tensor(valid_values), pred_error_threshold_valid
)


In [33]:
valid_anomalies_idxs

[946, 947]

In [34]:
valid_detected=valid.iloc[valid_anomalies_idxs]
valid_detected

Unnamed: 0,timestamp,value,stand_value,predicted
946,2014-04-13 06:52:00,76.23,18.651805,4.615803
947,2014-04-13 06:57:00,65.835,15.805478,4.486662


In [35]:

fig = go.Figure()
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'],
                         y=valid_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))

fig.add_trace(go.Scatter(x=valid_detected['timestamp'], y=valid_detected['value'], 
                         mode='markers', name='predicted Anomaly',
                         marker=dict(color='red')))
fig.update_layout(
    title="Validation set"
    )

fig.show()

In [36]:
train_detected=train.iloc[train_anomalies_idxs]
train_detected.head()

Unnamed: 0,timestamp,value,stand_value,predicted
3080,2014-02-25 07:15:00,25.1033,4.652449,1.745818
3081,2014-02-25 07:20:00,17.186,2.484558,1.880048
3082,2014-02-25 07:25:00,14.452,1.735942,1.747131
3083,2014-02-25 07:30:00,13.968,1.603415,1.68287
3084,2014-02-25 07:35:00,13.352,1.434744,1.582635


In [37]:
import plotly.graph_objects as go
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization')) 

# Create the figure for plotting the data
fig = go.Figure(layout=layout) 

# Add non-anomaly data to the figure
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))

# Add anomaly data to the figure
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'],
                         y=train_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))

fig.add_trace(go.Scatter(x=train_detected['timestamp'], y=train_detected['value'], 
                         mode='markers', name='Predicted Anomaly',
                         marker=dict(color='red')))
fig.update_layout(
    title="Training set"
)
fig.show()

### METRICS

In [38]:
li=[0]*4032
k=valid_anomalies.index.tolist()
for i in k:
  li[i]=1
valid['anomaly_found']=li

In [39]:

anomaly_label_valid=[0]*4032
for i in valid_anomalies_idxs:
  anomaly_label_valid[i]=1
valid['anomaly_label']=anomaly_label_valid    

In [40]:
anomaly_label_train=[0]*4032
for i in train_anomalies_idxs:
  anomaly_label_train[i]=1
train['anomaly_label']=anomaly_label_train  

In [41]:
li=[0]*4032
k=train_anomalies.index.tolist()
for i in k:
  li[i]=1
train['anomaly_found']=li

In [42]:
from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics(
    ground_truth: pd.DataFrame, anomalies_idxs: list
    ):
    predictions = pd.DataFrame(
        index=range(len(ground_truth)), 
        columns=['anomaly_found']
    )
    predictions['anomaly_found'] = 0
    predictions.iloc[anomalies_idxs] = 1
    anomalies_given=(ground_truth['anomaly_found'].tolist()).count(1)
    # Calculation of the confusion matrix can be done using pandas
    confusion_matrix = pd.crosstab(
        ground_truth.loc[:, 'anomaly_label'],
        predictions['anomaly_found'], 
        margins=True
    )
   # print(confusion_matrix)
    precision, recall, f1, _ = precision_recall_fscore_support(
        ground_truth.loc[:, 'anomaly_label'],
        predictions['anomaly_found'], 
        beta=2., 
        average='binary'
    )
    precision=anomalies_given/len(anomalies_idxs)
    recall=anomalies_given/(len(anomalies_idxs)+anomalies_given)
    f1=2*(precision*recall)/(precision+recall)
    return confusion_matrix, precision, recall, f1

In [43]:

valid_conf_matrix, *valid_metrics = calculate_metrics(
    valid, valid_anomalies_idxs
)

print(f'Valid:\n Precision: {valid_metrics[0]:.3f}\n' 
      f'Recall: {valid_metrics[1]:.3f}\n' 
      f'F1 score: {valid_metrics[2]:.3f}')


Valid:
 Precision: 1.000
Recall: 0.500
F1 score: 0.667


In [44]:
train_conf_matrix, *train_metrics = calculate_metrics(
    train, train_anomalies_idxs
)

print(f'Train:\n Precision: {train_metrics[0]:.3f}\n' 
      f'Recall: {train_metrics[1]:.3f}\n' 
      f'F1 score: {train_metrics[2]:.3f}')

Train:
 Precision: 0.002
Recall: 0.002
F1 score: 0.002
