In [1]:
!ls data

BGL_preprocessed.less_templates.csv  sequences.dat
BGL_preprocessed.standard.csv	     sequences_less_templates.dat
distilled_model_mu_large	     X_test.dat
labels_less_templates.txt	     X_train.dat
labels.txt			     X_train_less_sequences.dat
README.md			     y_test.txt


In [2]:
import numpy as np

trace_file = "data/sequences_less_templates.dat"
label_file = "data/labels_less_templates.txt"

SOS = "<SOS>"
EOS = "<EOS>"

In [3]:
def get_label(label_line: list):
    if "1" in set(label_line.split()):
        return 1
    return 0

In [4]:
import copy

label_dict = {"Normal": 0, "Anomaly": 1}
label_dict_inv = {v: k for k, v in label_dict.items()}
events_set = set()

events = list()
labels = list()

for i, (data_line, label_line) in enumerate(zip(open(trace_file, "rt"), open(label_file, "rt"))):
    if i==0:
        continue
    event_trace = data_line.split()[2:]
    if len(event_trace) < 2:
        continue
    
    for event in event_trace:
        events_set.add(event)
    events.append(copy.copy(event_trace))
    
    label = get_label(label_line)
    labels.append(copy.copy(label))
events_set.add(SOS)
events_set.add(EOS)

In [5]:
event_dict = {event: i for i, event in enumerate(events_set)}
alphabet_size = len(event_dict)

#events_oh = list()
for i in range(len(events)):
    trace = events[i]
#    trace_oh = np.zeros((len(trace)+1, alphabet_size))
    for j, event in enumerate(trace):
        events[i][j] = event_dict[event]
#        trace_oh[j][event_dict[event]] = 1
#    events[i].insert(0, event_dict[SOS])
#    trace_oh[-1][event_dict[EOS]] = 1
#    events_oh.append(trace_oh)
#events[5:8], events_oh[5:8]

In [6]:
event_dict

{'110': 0,
 '156': 1,
 '261': 2,
 '16': 3,
 '64': 4,
 '314': 5,
 '39': 6,
 '81': 7,
 '186': 8,
 '80': 9,
 '244': 10,
 '56': 11,
 '36': 12,
 '1': 13,
 '188': 14,
 '77': 15,
 '312': 16,
 '253': 17,
 '133': 18,
 '68': 19,
 '112': 20,
 '172': 21,
 '310': 22,
 '305': 23,
 '12': 24,
 '308': 25,
 '255': 26,
 '85': 27,
 '13': 28,
 '311': 29,
 '108': 30,
 '29': 31,
 '218': 32,
 '139': 33,
 '207': 34,
 '243': 35,
 '285': 36,
 '262': 37,
 '87': 38,
 '214': 39,
 '115': 40,
 '42': 41,
 '111': 42,
 '304': 43,
 '217': 44,
 '232': 45,
 '53': 46,
 '208': 47,
 '121': 48,
 '259': 49,
 '223': 50,
 '138': 51,
 '197': 52,
 '307': 53,
 '45': 54,
 '99': 55,
 '270': 56,
 '181': 57,
 '234': 58,
 '227': 59,
 '17': 60,
 '146': 61,
 '37': 62,
 '102': 63,
 '113': 64,
 '35': 65,
 '136': 66,
 '293': 67,
 '179': 68,
 '273': 69,
 '251': 70,
 '94': 71,
 '41': 72,
 '199': 73,
 '283': 74,
 '128': 75,
 '79': 76,
 '271': 77,
 '298': 78,
 '288': 79,
 '89': 80,
 '302': 81,
 '101': 82,
 '295': 83,
 '249': 84,
 '290': 85,
 '228

In [7]:
len(events), np.unique(np.array(labels),return_counts=True)

(3576870, (array([0, 1]), array([3296098,  280772])))

In [8]:
_, (n_normal, n_anomalies) = np.unique(np.array(labels),return_counts=True)
n_normal, n_anomalies

(3296098, 280772)

In [9]:
import random

X_train = list()
#X_train_oh = list()
X_test = list()
#X_test_oh = list()
y_test = list()

n_normal_test = n_anomalies # we want test set to be balanced in labels
p_test = n_normal_test / (n_normal - n_anomalies)
for event, label in zip(events, labels):
    p = random.uniform(0, 1)

    if label==1 or p <= p_test:
        X_test.append(event)
        #X_test_oh.append(event_oh)
        y_test.append(label)
    else:        
        X_train.append(event)
        #X_train_oh.append(event_oh)

len(X_train), len(X_test), np.unique(y_test, return_counts=True)

(2989284, 587586, (array([0, 1]), array([306814, 280772])))

In [10]:
import pickle as pk

pk.dump({
    "X": X_train,
    "alphabet_size": alphabet_size,
    "event_mapping": event_dict,
    },
    open("train_data.pk", "wb") ,
)

#pk.dump(
#   X_train[:300000],
#    open("X_train.pk", "wb"),
#    protocol=4
#)
#pk.dump(
#    X_train_oh,
#    open("X_train_oh.pk", "wb"),
#    protocol=4
#)
#pk.dump(
#   alphabet_size,
#    open("alphabet_size.pk", "wb"),
#    protocol=4
#)

del X_train, events, labels

In [11]:
def unison_shuffled_copies(a, b):
    assert(len(a)==len(b))
    tmp = list(zip(a, b))
    random.shuffle(tmp)
    a, b = zip(*tmp)
    return list(a), list(b)

X_test, y_test = unison_shuffled_copies(X_test, y_test)

In [12]:
pk.dump({
    "X": X_test,
    "y": y_test,
    "alphabet_size": alphabet_size,
    "event_mapping": event_dict,
    },
    open("test_data.pk", "wb"),
)