In [1]:
!ls data

BGL_preprocessed.less_templates.csv  labels.txt  sequences.dat
BGL_preprocessed.log_structured.csv  README.md	 X_train.dat


In [2]:
import numpy as np

trace_file = "data/sequences.dat"
label_file = "data/labels.txt"

SOS = "<SOS>"
EOS = "<EOS>"

In [3]:
def get_label(label_line: list):
    if "1" in set(label_line.split()):
        return 1
    return 0

In [4]:
import copy

label_dict = {"Normal": 0, "Anomaly": 1}
label_dict_inv = {v: k for k, v in label_dict.items()}
events_set = set()

events = list()
labels = list()

for i, (data_line, label_line) in enumerate(zip(open(trace_file, "rt"), open(label_file, "rt"))):
    if i==0:
        continue
    event_trace = data_line.split()[2:]
    if len(event_trace) < 2:
        continue
    
    for event in event_trace:
        events_set.add(event)
    events.append(copy.copy(event_trace))
    
    label = get_label(label_line)
    labels.append(copy.copy(label))
events_set.add(SOS)
events_set.add(EOS)

In [5]:
event_dict = {event: i for i, event in enumerate(events_set)}
alphabet_size = len(event_dict)

#events_oh = list()
for i in range(len(events)):
    trace = events[i]
#    trace_oh = np.zeros((len(trace)+1, alphabet_size))
    for j, event in enumerate(trace):
        events[i][j] = event_dict[event]
#        trace_oh[j][event_dict[event]] = 1
#    events[i].insert(0, event_dict[SOS])
#    trace_oh[-1][event_dict[EOS]] = 1
#    events_oh.append(trace_oh)
#events[5:8], events_oh[5:8]

In [6]:
event_dict

{'107': 0,
 '168': 1,
 '109': 2,
 '186': 3,
 '95': 4,
 '47': 5,
 '202': 6,
 '98': 7,
 '245': 8,
 '170': 9,
 '125': 10,
 '167': 11,
 '216': 12,
 '20': 13,
 '318': 14,
 '207': 15,
 '270': 16,
 '300': 17,
 '189': 18,
 '278': 19,
 '263': 20,
 '289': 21,
 '102': 22,
 '84': 23,
 '302': 24,
 '23': 25,
 '128': 26,
 '89': 27,
 '178': 28,
 '280': 29,
 '149': 30,
 '297': 31,
 '240': 32,
 '248': 33,
 '234': 34,
 '71': 35,
 '235': 36,
 '249': 37,
 '242': 38,
 '219': 39,
 '236': 40,
 '313': 41,
 '86': 42,
 '26': 43,
 '271': 44,
 '58': 45,
 '277': 46,
 '104': 47,
 '131': 48,
 '42': 49,
 '187': 50,
 '198': 51,
 '12': 52,
 '273': 53,
 '191': 54,
 '99': 55,
 '111': 56,
 '78': 57,
 '43': 58,
 '213': 59,
 '3': 60,
 '37': 61,
 '45': 62,
 '82': 63,
 '228': 64,
 '250': 65,
 '101': 66,
 '304': 67,
 '67': 68,
 '49': 69,
 '75': 70,
 '92': 71,
 '155': 72,
 '<EOS>': 73,
 '152': 74,
 '146': 75,
 '25': 76,
 '136': 77,
 '281': 78,
 '134': 79,
 '112': 80,
 '157': 81,
 '272': 82,
 '259': 83,
 '100': 84,
 '48': 85,
 '2

In [7]:
len(events), np.unique(np.array(labels),return_counts=True)

(4050403, (array([0, 1]), array([3296098,  754305])))

In [8]:
_, (n_normal, n_anomalies) = np.unique(np.array(labels),return_counts=True)
n_normal, n_anomalies

(3296098, 754305)

In [9]:
import random

X_train = list()
#X_train_oh = list()
X_test = list()
#X_test_oh = list()
y_test = list()

n_normal_test = n_anomalies # we want test set to be balanced in labels
p_test = n_normal_test / (n_normal - n_anomalies)
for event, label in zip(events, labels):
    p = random.uniform(0, 1)

    if label==1 or p <= p_test:
        X_test.append(event)
        #X_test_oh.append(event_oh)
        y_test.append(label)
    else:        
        X_train.append(event)
        #X_train_oh.append(event_oh)

len(X_train), len(X_test), np.unique(y_test, return_counts=True)

(2318181, 1732222, (array([0, 1]), array([977917, 754305])))

In [10]:
import pickle as pk

pk.dump({
    "X": X_train,
    "alphabet_size": alphabet_size,
    "event_mapping": event_dict,
    },
    open("train_data.pk", "wb") ,
)

#pk.dump(
#   X_train[:300000],
#    open("X_train.pk", "wb"),
#    protocol=4
#)
#pk.dump(
#    X_train_oh,
#    open("X_train_oh.pk", "wb"),
#    protocol=4
#)
#pk.dump(
#   alphabet_size,
#    open("alphabet_size.pk", "wb"),
#    protocol=4
#)

del X_train, events, labels

In [11]:
def unison_shuffled_copies(a, b):
    assert(len(a)==len(b))
    tmp = list(zip(a, b))
    random.shuffle(tmp)
    a, b = zip(*tmp)
    return list(a), list(b)

X_test, y_test = unison_shuffled_copies(X_test, y_test)

In [12]:
pk.dump({
    "X": X_test,
    "y": y_test,
    "alphabet_size": alphabet_size,
    "event_mapping": event_dict,
    },
    open("test_data.pk", "wb"),
)