In [12]:
import os
import pandas as pd
import torch

# Parsed BGL log CSV, addressed relative to the notebook's working directory.
path = '../data/BGL/BGL_parsed_result_04.csv'

# Sanity check before loading: prints True only when the CSV is a regular file.
csv_is_present = os.path.isfile(path)
print(csv_is_present)

True


In [2]:
# Load the parsed log; rows with a missing template get an empty string so
# that later string handling never sees NaN in the 'Templates' column.
BGL_parsed_result = pd.read_csv(path).fillna({'Templates': ''})
BGL_parsed_result.head()

Unnamed: 0,EventId,Templates,Parameters,LogMessage,Anomaly,Timestamp,Session
0,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11
1,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11
2,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11
3,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11
4,1750000,instruction cache parity error corrected,[],instruction cache parity error corrected,False,1117838570,R02-M1-N0-C:J12-U11


In [4]:
# Project-local helper that builds the token vocabulary for the parsed log.
from vocab import buildVocab

# Pretrained word vectors (presumably the fastText wiki-news 300-d file — confirm).
pretrain_path = '../data/wiki-news-300d-1M.vec'
# Returns the embedding matrix that later cells index by token id.
# NOTE(review): later cells read BGL_parsed_result['TokensId'], a column that is
# not in the CSV preview above — presumably buildVocab adds it in place; verify
# in vocab.py.
embedding_matrix = buildVocab(BGL_parsed_result, pretrain_path)

2023-02-16 20:30:27,746 - INFO - buildVocab identified 1035 unique tokens in structured log Dataframe.
2023-02-16 20:31:45,950 - INFO - 802 tokens can be converted to their corresponding semantic embeddings.
2023-02-16 20:31:45,953 - INFO - Number of out-of-vocabulary words is 233.


In [5]:
# Project-local helper that splits the log into training and testing sessions.
from partition import partition

# Partitioning configuration, passed positionally to partition() below.
filter_abnormal = True          # the run log reports this as enabled for the training data
partition_method = 'timestamp'  # NOTE(review): the run log mentions "partitionByOrder" — confirm which strategy this selects
session_size = 200              # presumably the number of log lines per session — verify in partition.py
train_ratio = 0.01              # presumably the fraction of sessions used for training (run log: 237 of 23591)

# num_classes is used by a later cell as the boundary between training and
# testing event ids (EventId < num_classes is treated as seen in training).
# NOTE(review): this implies partition() re-numbers EventId in place — verify.
session_train, session_test, num_classes = partition(BGL_parsed_result, partition_method, train_ratio, filter_abnormal, session_size)
print(f'Training data has {num_classes} unique templates.')

2023-02-16 20:33:17,731 - INFO - filter_abnormal enabled when creating training data.
2023-02-16 20:33:27,075 - INFO - partitionByOrder done, 23591 sessions are generated.
2023-02-16 20:33:27,077 - INFO - Sequential Partitioning done. 1686 event ids are identified.
2023-02-16 20:33:27,078 - INFO - Number of training and testing sessions are 237 and 23354


Training data has 48 unique templates.


In [13]:
# Collect one (template, token ids) pair per event id, split into templates
# seen in training (EventId < num_classes) and templates that only occur in
# the testing data (EventId >= num_classes).
#
# Only the first row of every EventId is needed, so the frame is deduplicated
# once instead of performing two Series lookups per log line. This also avoids
# using a positional counter as an index label, which silently picks the wrong
# row (or raises) when the DataFrame index is not the default 0..n-1 range.
first_occurrences = BGL_parsed_result.drop_duplicates(subset='EventId', keep='first')

training_eventid_templates = {}
testing_eventid_templates = {}

# .tolist() yields plain Python scalars, so the dict keys stay Python ints and
# the insertion order is the order of first appearance in the log.
for event_id, template, tokens_id in zip(
    first_occurrences['EventId'].tolist(),
    first_occurrences['Templates'].tolist(),
    first_occurrences['TokensId'].tolist(),
):
    if num_classes <= event_id:
        testing_eventid_templates[event_id] = (template, tokens_id)
    else:
        training_eventid_templates[event_id] = (template, tokens_id)

uniq_training_templates = list(training_eventid_templates.values())
uniq_testing_templates = list(testing_eventid_templates.values())

In [14]:
# Preview the first ten (template, token ids) pairs seen in the training data.
uniq_training_templates[:10]

[('instruction cache parity error corrected', [594, 620, 360, 92, 647]),
 ('MidplaneSwitchController performing bit sparing on <*> bit <*>',
  [10, 660, 577, 143, 587, 269, 143]),
 ('generating <*>', [171]),
 ('<*> ddr errors s detected and corrected on rank <*> symbol <*> bit <*>',
  [703, 782, 125, 643, 276, 647, 269, 680, 104, 143]),
 ('<*> L3 EDRAM error s dcr <*> detected and corrected',
  [510, 92, 125, 643, 276, 647]),
 ('CE sym <*> at <*> mask <*>', [666, 4, 683, 494]),
 ('total of <*> ddr error s detected and corrected',
  [88, 636, 703, 92, 125, 643, 276, 647]),
 ('ddr: activating redundant bit <*> rank <*> symbol <*>',
  [703, 732, 724, 143, 680, 104]),
 ('ddr: excessive soft failures consider replacing the card',
  [703, 206, 749, 516, 463, 696, 128, 94]),
 ('ciod: Error loading <*> <*> <*> <*> <*> <*> invalid or missing program image No such file or directory',
  [38, 623, 706, 137, 323, 326, 708, 627, 46, 380, 137, 69])]

In [15]:
# Preview the first ten (template, token ids) pairs that only occur in the testing data.
uniq_testing_templates[:10]

[('idoproxydb has been started: Name: <*> Input parameters: -enableflush -loguserinfo db.properties BlueGene1',
  [293, 356, 113, 345, 297, 402, 668, 711, 160, 544]),
 ('ciodb has been restarted.', [293, 356, 646]),
 ('mmcs_db_server has been started: <*> mmcs_db_server --useDatabase BGL --dbproperties <*> --iolog bgl BlueLight logs BGL --reconnect-blocks all',
  [668,
   416,
   293,
   356,
   113,
   668,
   416,
   398,
   406,
   797,
   160,
   325,
   333,
   797,
   285,
   759,
   349]),
 ('ciod: LOGIN chdir <*> <*> <*> <*> <*> failed: No such file or directory',
  [663, 107, 480, 627, 46, 380, 137, 69]),
 ('ciod: failed to read message prefix on control stream CioStream socket to <*>',
  [480, 445, 499, 640, 786, 269, 427, 225, 182, 443, 453, 445]),
 ('<*> L3 directory error s dcr <*> detected and corrected',
  [510, 69, 92, 125, 643, 276, 647]),
 ('data TLB error interrupt', [630, 578, 92, 765]),
 ('machine check interrupt', [317, 405, 765]),
 ('machine check status register

In [16]:
# Keep only the token-id list of every (template, token ids) pair.
training_tokens_id = [token_ids for _, token_ids in uniq_training_templates]
testing_tokens_id = [token_ids for _, token_ids in uniq_testing_templates]

In [32]:
def templateEmbedding(tokens_id):
    '''
    Aggregate the embeddings of a template's tokens into a single vector.

    tokens_id selects rows of the global embedding_matrix. A template with
    at least one token is represented by the mean of its rows; a template
    with no tokens is represented by the sum of its (zero) rows, because the
    mean over zero rows would be NaN.
    '''
    token_rows = embedding_matrix[tokens_id]
    aggregate = torch.mean if len(tokens_id) > 0 else torch.sum
    return aggregate(token_rows, axis=0)

# Cosine similarity taken along dim 1, with a small eps guarding zero-norm vectors.
cosine_func = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
# One embedding row per unique training template, in the order of training_tokens_id.
training_template_embeddings = torch.vstack(tuple(map(templateEmbedding, training_tokens_id)))

def nearestTemplate(tokens_id, k=5):
    '''
    Return the training templates most similar to the given template.

    Parameters
    ----------
    tokens_id : token ids of the query template, as accepted by
        templateEmbedding.
    k : maximum number of neighbours to return. Defaults to 5, the value
        that was previously hard-coded.

    Returns
    -------
    list of template strings taken from uniq_training_templates, ordered from
    the highest to the lowest cosine similarity. The list holds
    min(k, number of training templates) entries.
    '''
    curr_embedding = templateEmbedding(tokens_id)
    cos_sim = cosine_func(training_template_embeddings, curr_embedding)

    # torch.topk raises when k exceeds the number of candidates, so clamp k
    # to the number of training templates (and never below zero).
    top_count = max(0, min(k, cos_sim.shape[0]))
    _, top_k = torch.topk(cos_sim, top_count)

    return [uniq_training_templates[entry][0] for entry in top_k.tolist()]

In [36]:
# Report, for every testing-only template, its closest training templates.
for test_template, test_tokens in uniq_testing_templates:
    neighbours = nearestTemplate(test_tokens)
    print(f'Nearest templates of \n{test_template} \n are \n {neighbours}')

Nearest templates of 
idoproxydb has been started: Name: <*> Input parameters: -enableflush -loguserinfo db.properties BlueGene1 
 are 
 ['ciod: Error loading <*> <*> <*> <*> <*> <*> invalid or missing program image No such file or directory', '<*> ddr errors s detected and corrected on rank <*> symbol <*> bit <*>', '<*> tree receiver <*> in re-synch state event s dcr <*> detected', 'total of <*> ddr error s detected and corrected', '<*> L3 EDRAM error s dcr <*> detected and corrected']
Nearest templates of 
ciodb has been restarted. 
 are 
 ['<*> ddr errors s detected and corrected on rank <*> symbol <*> bit <*>', 'total of <*> ddr error s detected and corrected', '<*> L3 EDRAM error s dcr <*> detected and corrected', '<*> tree receiver <*> in re-synch state event s dcr <*> detected', 'ciod: Error loading <*> <*> <*> <*> <*> <*> invalid or missing program image No such file or directory']
Nearest templates of 
mmcs_db_server has been started: <*> mmcs_db_server --useDatabase BGL --dbp