In [1]:
import numpy as np
import pickle as pk
import random

train_file = "data/hdfs_train"
test_normal_file = "data/hdfs_test_normal"
test_abnormal_file = "data/hdfs_test_abnormal"

SOS = "<SOS>"
EOS = "<EOS>"

balance_test_set = True

# Train set

In [2]:
events_set = set()
events = list()

for data_line in open(train_file, "rt"):
    event_trace = data_line.strip("\n").split()
    for event in event_trace:
        events_set.add(event)
    events.append(event_trace)
    
events_set.add(SOS)
events_set.add(EOS)

### Note: We need to know the full alphabet to map correctly

In [3]:
normal_events = list()
abnormal_events = list()

for data_line in open(test_abnormal_file, "rt"):
    event_trace = data_line.strip("\n").split()
    for event in event_trace:
        events_set.add(event)
    abnormal_events.append(event_trace)

if balance_test_set:
    counter = 0
    for data_line in open(test_normal_file, "rt"):
        counter += 1
    ratio = len(abnormal_events) / counter
    
for data_line in open(test_normal_file, "rt"):
    if balance_test_set and random.uniform(0, 1) > ratio:
        continue
    event_trace = data_line.strip("\n").split()
    for event in event_trace:
        events_set.add(event)
    normal_events.append(event_trace)

In [4]:
len(normal_events), len(abnormal_events)

(16856, 16838)

In [5]:
event_dict = {event: i for i, event in enumerate(events_set)}
alphabet_size = len(event_dict)

X_train = list()
X_train_oh = list() # shifted right to predict
for i in range(len(events)):
    trace = events[i]
    trace_oh = np.zeros((len(trace)+1, alphabet_size))
    
    encoded_trace = list()
    encoded_trace.insert(0, event_dict[SOS])
    
    for j, event in enumerate(trace):
        mapped_e = event_dict[event]
        encoded_trace.append(mapped_e)
        trace_oh[j][mapped_e] = 1

    trace_oh[-1][event_dict[EOS]] = 1
    X_train_oh.append(trace_oh)
    X_train.append(np.array(encoded_trace))
X_train[5:8], X_train_oh[5:8]

([array([20,  8,  6,  6,  6,  9,  9,  9, 19, 15, 19, 15, 19, 15,  2,  2,  1,
          2,  1,  2,  2,  2,  1,  1,  2,  2, 10, 10, 10,  7,  7,  7]),
  array([20,  6,  8,  6,  6,  9,  9, 19, 15, 19, 15, 19, 15,  9, 10, 10, 10,
          7,  7,  7]),
  array([20,  8,  6,  6,  6,  9,  9,  9, 19, 15, 19, 15, 19, 15,  1,  1,  2,
         28, 10, 10, 10,  7,  7,  7])],
 [array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0

In [6]:
event_dict

{'1': 0,
 '4': 1,
 '3': 2,
 '13': 3,
 '25': 4,
 '19': 5,
 '5': 6,
 '21': 7,
 '22': 8,
 '26': 9,
 '23': 10,
 '27': 11,
 '17': 12,
 '20': 13,
 '28': 14,
 '9': 15,
 '24': 16,
 '14': 17,
 '7': 18,
 '11': 19,
 '<SOS>': 20,
 '8': 21,
 '15': 22,
 '6': 23,
 '18': 24,
 '12': 25,
 '10': 26,
 '16': 27,
 '2': 28,
 '<EOS>': 29}

In [7]:
pk.dump({
    "X": X_train,
    "X_oh": X_train_oh,
    "alphabet_size": alphabet_size,
    "event_mapping": event_dict,
    },
    open("train_data.pk", "wb")   
)

# Test set

In [8]:
X_test = list()
X_test_oh = list() # shifted right to predict
labels = list()

for i in range(len(abnormal_events)):
    trace = abnormal_events[i]
    trace_oh = np.zeros((len(trace)+1, alphabet_size))
    
    encoded_trace = list()
    encoded_trace.append(event_dict[SOS])
    
    for j, event in enumerate(trace):
        mapped_e = event_dict[event]
        encoded_trace.append(mapped_e)
        trace_oh[j][mapped_e] = 1

    trace_oh[-1][event_dict[EOS]] = 1
    X_test_oh.append(trace_oh)
    X_test.append(np.array(encoded_trace))
    labels.append(1)

for i in range(len(normal_events)):
    trace = normal_events[i]
    trace_oh = np.zeros((len(trace)+1, alphabet_size))
    
    encoded_trace = list()
    encoded_trace.append(event_dict[SOS])
    
    for j, event in enumerate(trace):
        mapped_e = event_dict[event]
        encoded_trace.append(mapped_e)
        trace_oh[j][mapped_e] = 1

    trace_oh[-1][event_dict[EOS]] = 1
    X_test_oh.append(trace_oh)
    X_test.append(np.array(encoded_trace))
    labels.append(0)

np.unique(labels, return_counts=True)

(array([0, 1]), array([16856, 16838]))

In [9]:
zipped = list(zip(X_test, X_test_oh, labels))
random.shuffle(zipped)
X_test, X_test_oh, labels = zip(*zipped)

In [10]:
import pickle as pk


pk.dump({
    "X": X_test,
    "X_oh": X_test_oh,
    "y": labels,
    "alphabet_size": alphabet_size,
    "event_mapping": event_dict,
    },
    open("test_data.pk", "wb")   
)

In [11]:
!ls

data			    preprocess_deeplog_data.ipynb  train_model.ipynb
evaluate_predictions.ipynb  preprocess_logpai_data.ipynb   X_pred.pk
model.keras		    test_data.pk
predict_and_save.ipynb	    train_data.pk
