## BGL data preprocessing

In [1]:
import os
import pandas as pd

BGL_log_path = '../data/BGL/BGL.log'
assert(os.path.isfile(BGL_log_path))

BGL_parsed_path = '../data/BGL/BGL_parsed_result_level_03.csv'
assert(os.path.isfile(BGL_parsed_path))

In [2]:
components, timestamps = [], []

with open(BGL_log_path) as f:
    for log_line in f.readlines():
        # remove header & \n at the end of each line
        log_tokens = log_line.split()
        
        components.append(log_tokens[7])
        timestamps.append(int(log_tokens[1]))
        
time_elapsed = [0]
time_elapsed.extend([timestamps[ind] - timestamps[ind-1] for ind in range(1, len(timestamps))])

parsed_log_df = pd.read_csv(BGL_parsed_path)
parsed_log_df['Component'] = components
parsed_log_df['Timestamp'] = timestamps
parsed_log_df['TimeElapsed'] = time_elapsed

KeyboardInterrupt: 

In [4]:
parsed_log_df.head()

Unnamed: 0,EventId,Templates,Parameters,LogMessage,Anomaly,Timestamp,Session,Level,Component,TimeElapsed
0,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11,INFO,KERNEL,0
1,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11,INFO,KERNEL,0
2,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11,INFO,KERNEL,0
3,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11,INFO,KERNEL,0
4,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11,INFO,KERNEL,0


In [11]:
parsed_log_df.to_csv('../data/BGL/BGL_parsed_result_full_03.csv', index=False)

## Dataset Preparation

In [1]:
import pandas as pd
dataset = 'BGL'
parsed_path = '../data/{}/{}.log_structured.csv'.format(dataset, dataset)
parsed_log_df = pd.read_csv(parsed_path)

In [2]:
import time
import torch

from dataset import LogDataset
from partition import partition
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
session_train, session_test, num_components, num_classes, num_levels, level2ind = partition(parsed_log_df, 
                                                                                            'timestamp', 
                                                                                             0.5, 
                                                                                             True,
                                                                                             200,
                                                                                             False)

print(f'Number of componens: {num_components}, number of training events: {num_classes}, number of levels: {num_levels}.')

2023-03-20 20:33:55,758 - INFO - filter_abnormal enabled when creating training data.
2023-03-20 20:34:07,698 - INFO - Training sessions generated, mean = 1.1978604769583299, stddev = 81.32467272087531.
2023-03-20 20:34:14,384 - INFO - partitionByOrder done, 22066 sessions are generated.
2023-03-20 20:34:14,385 - INFO - Sequential Partitioning done. 1653 event ids are identified.
2023-03-20 20:34:14,386 - INFO - Number of training and testing sessions are 11783 and 10283.


Number of componens: 9, number of training events: 183, number of levels: 6.


In [7]:
batch_size = 1024
eval_batch_size = 1024
step_size = 1
window_size = 10

dataset_train = LogDataset(session_train, 
                           window_size, 
                           step_size, 
                           num_classes)

dataset_test = LogDataset(session_test, 
                          window_size, 
                          step_size, 
                          num_classes)

In [8]:
def collate_fn(batch_input_dict):
    keys = [input_dict['session_key'] for input_dict in batch_input_dict]
    templates = [input_dict['templates'] for input_dict in batch_input_dict]
    event_ids = [input_dict['eventids'] for input_dict in batch_input_dict]
    time_elapsed = [input_dict['time_elapsed'] for input_dict in batch_input_dict]
    components = [input_dict['components'] for input_dict in batch_input_dict]
    levels = [input_dict['levels'] for input_dict in batch_input_dict]
    
    next_logs = [input_dict['next'] for input_dict in batch_input_dict]
    anomaly = [input_dict['anomaly'] for input_dict in batch_input_dict]

    return {'session_key': keys,
            'templates': templates,
            'eventids': event_ids,
            'time_elapsed': time_elapsed,
            'components': components,
            'levels': levels,
            'next': next_logs,
            'anomaly': anomaly}

dataloader_train = DataLoader(dataset_train, 
                              collate_fn=collate_fn, 
                              batch_size=batch_size, 
                              shuffle=False, 
                              pin_memory=False)

dataloader_test = DataLoader(dataset_test, 
                             collate_fn=collate_fn, 
                             batch_size=eval_batch_size, 
                             shuffle=False, 
                             pin_memory=False)

## Training autoencoder

In [9]:
class AutoEncoderEmbedding(torch.nn.Module):
    def __init__(self, num_components, num_levels):
        super(AutoEncoderEmbedding, self).__init__()
        self.num_components = num_components
        self.num_levels = num_levels
        
        components_embedding = torch.vstack([torch.eye(num_components), torch.zeros(1, num_components)])
        levels_embedding = torch.vstack([torch.eye(num_levels), torch.zeros(1, num_levels)])
        self.component_embedder = torch.nn.Embedding.from_pretrained(components_embedding, freeze=True)
        self.level_embedder = torch.nn.Embedding.from_pretrained(levels_embedding, freeze=True)
        
    def forward(self, input_dict):
        components = torch.tensor(input_dict['components'])
        levels = torch.tensor(input_dict['levels'])
        time_elapsed = torch.tensor(input_dict['time_elapsed']).unsqueeze(-1).cuda()
        
        components[self.num_components < components] = self.num_components
        levels[self.num_levels < levels] = self.num_levels
        
        components_embedding = self.component_embedder(components.cuda())
        levels_embedding = self.level_embedder(levels.cuda())
        return torch.cat([time_elapsed, components_embedding, levels_embedding], dim=2)

class AutoEncoder(torch.nn.Module):
    def __init__(self,
                 num_components,
                 num_levels,
                 window_size):
        super(AutoEncoder, self).__init__()
        self.EmbeddingLayer = AutoEncoderEmbedding(num_components, num_levels)
        
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(window_size * (num_components+num_levels+1), 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 12),
            torch.nn.ReLU(),
            torch.nn.Linear(12, 3)
        )
        
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(3, 12),
            torch.nn.ReLU(),
            torch.nn.Linear(12, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, window_size * (num_components+num_levels+1)),
            torch.nn.Tanh()
        )
        
    def forward(self, input_dict):
        embedding_matrix = self.EmbeddingLayer(input_dict)
        embedding = embedding_matrix.view(embedding_matrix.size(0), -1)
        encoding = self.encoder(embedding)
        return embedding, self.decoder(encoding)
    
class VariationalAutoEncoder(torch.nn.Module):
    def __init__(self, 
                 num_components,
                 num_levels,
                 window_size):
        super(VariationalAutoEncoder, self).__init__()
        self.EmbeddingLayer = AutoEncoderEmbedding(num_components, num_levels)
        self.fc1 = torch.nn.Linear(window_size * (num_components+num_levels+1), 100)
        self.fc21 = torch.nn.Linear(100, 10)
        self.fc22 = torch.nn.Linear(100, 10)
        self.fc3 = torch.nn.Linear(10, 100)
        self.fc4 = torch.nn.Linear(100, window_size * (num_components+num_levels+1))
        
    def encode(self, embedding):
        h1 = torch.nn.functional.relu(self.fc1(embedding))
        return self.fc21(h1), self.fc22(h1)

    def reparametrize(self, mu, logvar):
        std = logvar.mul(0.5).exp_()
        if torch.cuda.is_available():
            eps = torch.cuda.FloatTensor(std.size()).normal_()
        else:
            eps = torch.FloatTensor(std.size()).normal_()
        eps = torch.autograd.Variable(eps)
        return eps.mul(std).add_(mu)

    def decode(self, z):
        h3 = torch.nn.functional.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, input_dict):
        embedding_matrix = self.EmbeddingLayer(input_dict)
        embedding = embedding_matrix.view(embedding_matrix.size(0), -1)
        
        mu, logvar = self.encode(embedding)
        z = self.reparametrize(mu, logvar)
        return embedding, self.decode(z), mu, logvar
    
reconstruction_function = torch.nn.MSELoss(reduction='sum')

def loss_function(recon_x, x, mu, logvar):
    """
    recon_x: generating images
    x: origin images
    mu: latent mean
    logvar: latent log variance
    """
    BCE = reconstruction_function(recon_x, x)  # mse loss
    # loss = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar)
    KLD = torch.sum(KLD_element).mul_(-0.5)
    # KL divergence
    return BCE + KLD

In [11]:
batch_cnt = 0
learning_rate = 2e-3
num_epochs = 10
total_loss = 0
thresh = 0.02
# training_losses = []

model = AutoEncoder(num_components, num_levels, window_size+1).cuda()
# model.load_state_dict(torch.load('../checkpoint/bgl_ae_08_epoch10.pth'))
criterion = torch.nn.MSELoss()
# model = VariationalAutoEncoder(num_components, num_levels, window_size+1).cuda()
# criterion = loss_function

eval_criterion = torch.nn.MSELoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

for epoch in range(num_epochs):
    model.train()
    
    for batch in dataloader_train:
        batch_cnt += 1
        
        batch_embedding, output = model(batch)
        batch_loss = criterion(output, batch_embedding)
        # batch_embedding, output, mu, logvar = model(batch)
        # batch_loss = criterion(output, batch_embedding, mu, logvar)
        
        total_loss += batch_loss.mean()
        
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        
    print(f'[{epoch+1}|{num_epochs}] Training finished, training loss: {total_loss/batch_cnt :.3f}.')
    
    TOP = 0
    TON = 0
    TP = 0
    FP = 0

    model = model.eval()

    for batch in dataloader_test:
        batch_size = len(batch['anomaly'])
        batch_embedding, output = model(batch)
        # batch_embedding, output, mu, logvar = model(batch)
        batch_loss = eval_criterion(output, batch_embedding).mean(axis=1)

        pred = torch.lt(batch_loss, thresh).tolist()
        is_anomaly = batch['anomaly']
        TOP += batch_size - sum(is_anomaly)

        for ind in range(batch_size):
            if pred[ind]:
                if is_anomaly[ind]:
                    FP += 1
                else:
                    TP += 1

        precision = TP / (TP + FP)
        recall = TP / TOP
        F1 = 2 * precision * recall / (precision + recall)
        
    print(f'[{epoch+1}|{num_epochs}] Evaluation done, FP: {FP}, precision: {precision: .3f}, recall: {recall :.3f}, F1-measure: {F1: .3f}.')
    torch.save(model.state_dict(), f'../checkpoint/tmp/bgl_ae_05_epoch{epoch+1}.pth')

[1|10] Training finished, training loss: 0.066.
[1|10] Evaluation done, FP: 35, precision:  1.000, recall: 0.790, F1-measure:  0.883.
[2|10] Training finished, training loss: 0.065.
[2|10] Evaluation done, FP: 37, precision:  1.000, recall: 0.791, F1-measure:  0.883.
[3|10] Training finished, training loss: 0.065.
[3|10] Evaluation done, FP: 42617, precision:  0.974, recall: 0.865, F1-measure:  0.916.
[4|10] Training finished, training loss: 0.064.
[4|10] Evaluation done, FP: 42623, precision:  0.974, recall: 0.865, F1-measure:  0.917.
[5|10] Training finished, training loss: 0.064.
[5|10] Evaluation done, FP: 42681, precision:  0.974, recall: 0.866, F1-measure:  0.917.
[6|10] Training finished, training loss: 0.064.
[6|10] Evaluation done, FP: 42660, precision:  0.974, recall: 0.866, F1-measure:  0.917.
[7|10] Training finished, training loss: 0.064.
[7|10] Evaluation done, FP: 42625, precision:  0.974, recall: 0.866, F1-measure:  0.917.
[8|10] Training finished, training loss: 0.064.

In [66]:
criterion = torch.nn.MSELoss(reduction='none')
training_losses = []

for batch in dataloader_train:
    batch_cnt += 1
    batch_embedding, output = model(batch)
    batch_loss = criterion(output, batch_embedding).mean(axis=1).tolist()
    
    training_losses.extend(batch_loss)

assert(len(training_losses) == len(dataset_train))

# import matplotlib.pyplot as plt
# plt.hist(training_losses)
# plt.show()

# Fit a Gaussion distribution on training losses
import statistics
mean = statistics.mean(training_losses)
stddev = statistics.stdev(training_losses)
print(f'mean = {mean}, standard deviation = {stddev}.')

In [23]:
batch_cnt = 0
# normal_cnt = 0
# anomaly_cnt = 0
total_loss = 0

model = model.eval()

for batch in dataloader_test:
    batch_cnt += 1
    batch_embedding, output = model(batch)
    batch_loss = criterion(output, batch_embedding).mean(axis=1).tolist()
    
print(f'Evaluation finished, evaluation loss: {total_loss/batch_cnt :.3f}.')

Evaluation finished, evaluation loss: 0.269.


In [9]:
print(level2ind)
ind2level = {ind:level for level, ind in level2ind.items()}



In [8]:
ind2level

{0: 'INFO',
 1: 'FATAL',
 3: 'SEVERE',
 4: 'ERROR',
 5: 'Kill',
 6: 'FAILURE',
 7: 'single',
 8: 'microseconds',
 9: '0x00544eb8,'}

In [36]:
# Directly use LEVEL to determine whether an instance is normal

TOP = 0
TON = 0
TP = 0
FP = 0

for batch in dataloader_test:
    batch_size = len(batch['anomaly'])
    is_anomaly = batch['anomaly']
    batch_levels = [batch['levels'][ind][-1] for ind in range(batch_size)]
    
    TOP += batch_size - sum(is_anomaly)
    
    for ind in range(batch_size):
        level = ind2level.get(batch_levels[ind], 'UNKNOWN')
        if level != 'FATAL' and level != 'ERROR': # predicted as normal
            if is_anomaly[ind]:
                FP += 1
            else:
                TP += 1
    
precision = TP / (TP + FP)
recall = TP / TOP
F1 = 2 * precision * recall / (precision + recall)
print(f'Precision: {precision: .3f}, Recall: {recall :.3f}, F1-measure: {F1: .3f}.')

Precision:  1.000, Recall: 0.913, F1-measure:  0.955.


In [10]:
TOP / len(dataset_test)

0.9436136336414951

In [21]:
TOP = 0
TON = 0
TP = 0
FP = 0

model = AutoEncoder(num_components, num_levels, window_size+1).cuda()
model.load_state_dict(torch.load('../checkpoint/tmp/bgl_ae_05_epoch2.pth'))
# model.load_state_dict(torch.load('../model/autoencoder_10.pth'))
model.eval()

criterion = torch.nn.MSELoss(reduction='none')
thresh = 0.06

for batch in dataloader_test:
    batch_size = len(batch['anomaly'])
    batch_embedding, output = model(batch)
    batch_loss = criterion(output, batch_embedding).mean(axis=1)
    
    pred = torch.lt(batch_loss, thresh).tolist()
    is_anomaly = batch['anomaly']
    TOP += batch_size - sum(is_anomaly)
    
    for ind in range(batch_size):
        if pred[ind]:
            if is_anomaly[ind]:
                FP += 1
            else:
                TP += 1
    
precision = TP / (TP + FP)
recall = TP / TOP
F1 = 2 * precision * recall / (precision + recall)
print(f'FP: {FP}, Precision: {precision: .3f}, Recall: {recall :.3f}, F1-measure: {F1: .3f}.')

FP: 937, Precision:  0.999, Recall: 0.799, F1-measure:  0.888.


In [None]:
## Directly apply autoencoder to anomaly detection
TOP = 0
TON = 0
TP = 0
FP = 0

model = AutoEncoder(num_components, num_levels, window_size+1).cuda()
model.load_state_dict(torch.load('../checkpoint/autoencoder_10.pth'))
model.eval()

criterion = torch.nn.MSELoss(reduction='none')
session_dict = {}
thresh = 0.03

for batch in dataloader_test:
    batch_size = len(batch['anomaly'])
    batch_embedding, output = model(batch)
    batch_loss = criterion(output, batch_embedding).mean(axis=1)
    
    pred = torch.lt(batch_loss, thresh).tolist()
    is_anomaly = batch['anomaly']
    
    for ind in range(batch_size):
        session_key = batch['session_key'][ind]
        if session_key not in session_dict:
            session_dict[session_key] = {'anomaly': False, 'matched': True}
        session_dict[session_key]['anomaly'] |= is_anomaly[ind]
        session_dict[session_key]['matched'] &= pred[ind]
        
for key, session_info in session_dict.items():
    if session_info['anomaly']:
        TOP += 1
        if not session_info['matched']:
            TP += 1
    else:
        TON += 1
        if not session_info['matched']:
            FP += 1
                
FN = TOP - TP
precision = TP / (TP + FP)
recall = TP / TOP
F1 = 2 * precision * recall / (precision + recall)
print(f'TOP: {TOP}, TON: {TON}, FP: {FP}, FN: {FN}, Precision: {precision: .3f}, Recall: {recall :.3f}, F1-measure: {F1: .3f}.')

RuntimeError: Error(s) in loading state_dict for AutoEncoder:
	size mismatch for EmbeddingLayer.component_embedder.weight: copying a param with shape torch.Size([8, 7]) from checkpoint, the shape in current model is torch.Size([12, 11]).
	size mismatch for EmbeddingLayer.level_embedder.weight: copying a param with shape torch.Size([3, 2]) from checkpoint, the shape in current model is torch.Size([8, 7]).
	size mismatch for encoder.0.weight: copying a param with shape torch.Size([64, 110]) from checkpoint, the shape in current model is torch.Size([64, 209]).
	size mismatch for decoder.6.weight: copying a param with shape torch.Size([110, 64]) from checkpoint, the shape in current model is torch.Size([209, 64]).
	size mismatch for decoder.6.bias: copying a param with shape torch.Size([110]) from checkpoint, the shape in current model is torch.Size([209]).

In [34]:
criterion = torch.nn.MSELoss(reduction='none')
loss = criterion(output, batch_embedding)

In [38]:
loss.mean()

tensor(17.1385, device='cuda:0', grad_fn=<MeanBackward0>)

In [52]:
loss.mean(axis=1)

tensor([2.6772e-02, 2.6814e-02, 2.6704e-02, 1.0258e-01, 1.0325e-01, 2.0002e+00,
        2.0007e+00, 3.8456e+00, 3.8460e+00, 5.9550e+00, 5.9552e+00, 6.8634e+00,
        6.8634e+00, 7.5539e+00, 7.6425e+00, 7.6427e+00, 6.6280e+00, 6.6281e+00,
        4.7721e+00, 4.7722e+00, 4.1142e+00, 4.1144e+00, 3.2010e+00, 3.2010e+00,
        2.5124e+00, 2.3785e+00, 2.3778e+00, 1.5177e+00, 1.5188e+00, 1.5234e+00,
        1.5189e+00, 5.6686e-03, 5.6032e-03, 5.1649e-06, 5.6587e-06, 2.4496e-03,
        2.4540e-03, 9.2625e+01, 1.4530e+02, 1.4531e+02, 1.4531e+02, 1.4530e+02,
        1.4530e+02, 1.4531e+02, 1.4531e+02, 1.4531e+02, 1.4531e+02, 1.4531e+02,
        5.2730e+01, 5.4766e-06, 5.9373e-06, 5.2084e-06, 5.2146e-06, 5.5956e-06,
        5.5772e-06, 5.4947e-06, 5.4557e-06, 5.9535e-06, 5.2146e-06, 5.2013e-06,
        5.5992e-06, 5.5539e-06, 5.4766e-06, 5.9373e-06, 5.2084e-06, 5.2146e-06,
        5.2013e-06, 5.5992e-06, 5.5539e-06, 5.4766e-06, 1.0992e-01, 1.1044e-01,
        1.1020e-01, 1.1026e-01, 1.2302e-

In [59]:
torch.save(model.state_dict(), '../model/autoencoder_10.pth')