In [1]:
"""
Pipelines for processing raw logs to structured data
including sampling (optional), log parsing, log sequence generation by windowing, and train test splitting
"""


import os
import pickle

from dataset import SimpleParserFactory, split_train_test_aiia, generate_test_set_aiia, sample_raw_data


# ---------------------------------------------------------------------------
# Pipeline configuration: every setting used by the cells below lives in the
# ``options`` dict.
# ---------------------------------------------------------------------------
options = dict()

# Dataset name and root directories ("~" is expanded further down).
options["dataset_name"] = "aiia"
options["data_dir"] = "~/.dataset/"
options["output_dir"] = "~/.output/"

# Name of the normal-log file handed to the parser and to the train/test split.
options["log_file"] = "normal.txt"

# Parser selection and the comma-separated field list of one raw log line.
options["parser_type"] = "drain"
options["log_format"] = "Id,Content"

# Patterns passed to the parser factory as ``regex``.
# Raw strings keep "\d" / "\w" from being treated as (invalid) string escapes;
# the resulting pattern text is unchanged.
REGEX1 = r'(0x)[0-9a-fA-F]+'
# NOTE(review): the dots below are unescaped, so they match ANY character, not
# just a literal "." -- looks like an IPv4 pattern; confirm before tightening,
# because changing it would change the parsed templates.
REGEX2 = r'\d+.\d+.\d+.\d+'
REGEX3 = r'(/[-\w]+)+'
REGEX4 = r'\d+'
options["regex"] = [REGEX1, REGEX2, REGEX3, REGEX4]
options["keep_para"] = False

# Parser hyper-parameters, forwarded positionally to the parser factory.
# presumably the Drain similarity threshold / tree depth / max children -- TODO confirm
options["st"] = 0.3
options["depth"] = 3
options["max_child"] = 100
options["tau"] = 0.5

# Windowing and train/test split settings.
options["window_type"] = "sliding_aiia"
options["window_size"] = 50
options["step_size"] = 5
options["train_size"] = 0.7

# Evaluation log files: evalue-0.txt ... evalue-19.txt
options["evalue_files"] = ["evalue-" + str(i) + ".txt" for i in range(0, 20)]


# ---------------------------------------------------------------------------
# Resolve directories: expand "~" and append the dataset sub-directory.
# ---------------------------------------------------------------------------
options["output_dir"] = os.path.expanduser(options["output_dir"])
options["data_dir"] = os.path.expanduser(options["data_dir"])

options["data_dir"] = os.path.join(options["data_dir"], options["dataset_name"] + "/")
options["output_dir"] = os.path.join(options["output_dir"], options["dataset_name"] + "/")

# The pickled parser lives next to the other outputs. Deriving the path from
# output_dir (instead of hard-coding /root/...) keeps it valid for any user.
options["parserPickle_path"] = os.path.join(options["output_dir"], "parser.pkl")

# exist_ok=True already tolerates an existing directory, so no pre-check is needed.
os.makedirs(options["output_dir"], exist_ok=True)

In [2]:

# parse normal logs
if options["parser_type"] is not None:
    # Turn "Id,Content" into "<Id> <Content>". The result is kept in a local
    # variable so options["log_format"] is not overwritten: re-running this
    # cell would otherwise wrap the already-wrapped string a second time.
    log_format_pattern = " ".join(f"<{field}>" for field in options["log_format"].split(","))

    parser = SimpleParserFactory.create_parser(
        options["data_dir"], options["output_dir"], options["parser_type"], log_format_pattern,
        options["regex"], options["keep_para"],
        options["st"], options["depth"], options["max_child"], options["tau"])
    parser.parse(options["log_file"])

    # Persist the parser object so a later cell can reload it to parse the
    # evaluation logs.
    with open(options["parserPickle_path"], "wb") as f:
        pickle.dump(parser, f)


Parsing file: /root/.dataset/aiia/normal.txt
Total size after encoding is 263292 263292
Parsing done. [Time taken: 0:03:04.876182]


In [4]:
# split normal to train and valid set

# Every argument of split_train_test_aiia is read from ``options`` under the
# same name, so list the names once and forward them as keyword arguments.
split_arg_names = (
    "data_dir",
    "output_dir",
    "log_file",
    "dataset_name",
    "window_type",
    "window_size",
    "step_size",
    "train_size",
)
split_train_test_aiia(**{arg_name: options[arg_name] for arg_name in split_arg_names})



Loading /root/.output/aiia/normal.txt_structured.csv
process 52000 time window
There are 52650 instances (sliding windows) in this dataset

Saving /root/.output/aiia/train
training size 36855

Saving /root/.output/aiia/test_normal
test normal size 15795


In [6]:
# parse evalue logs
# Reload the parser stored at options["parserPickle_path"].
# NOTE: pickle.load can execute arbitrary code contained in the file -- only
# load a pickle that this notebook itself produced.
with open(options["parserPickle_path"], "rb") as f:
    parser = pickle.load(f)

# options["log_format"] is intentionally left untouched in this loop: the
# parser is called with a file path only, and re-wrapping the option on every
# iteration would just keep nesting "<...>" around the stored value.
for evalue_file in options["evalue_files"]:
    print(f"Now processing {evalue_file}.")
    evaluefile_path = "evalue/" + evalue_file
    parser.parse(evaluefile_path)

Now processing evalue-0.txt.
Parsing file: /root/.dataset/aiia/evalue/evalue-0.txt
Total size after encoding is 200 200
Parsing done. [Time taken: 0:00:00.096656]
Now processing evalue-1.txt.
Parsing file: /root/.dataset/aiia/evalue/evalue-1.txt
Total size after encoding is 200 200
Parsing done. [Time taken: 0:00:00.053904]
Now processing evalue-2.txt.
Parsing file: /root/.dataset/aiia/evalue/evalue-2.txt
Total size after encoding is 200 200
Parsing done. [Time taken: 0:00:00.054818]
Now processing evalue-3.txt.
Parsing file: /root/.dataset/aiia/evalue/evalue-3.txt
Total size after encoding is 200 200
Parsing done. [Time taken: 0:00:00.055191]
Now processing evalue-4.txt.
Parsing file: /root/.dataset/aiia/evalue/evalue-4.txt
Total size after encoding is 200 200
Parsing done. [Time taken: 0:00:00.055055]
Now processing evalue-5.txt.
Parsing file: /root/.dataset/aiia/evalue/evalue-5.txt
Total size after encoding is 200 200
Parsing done. [Time taken: 0:00:00.054998]
Now processing evalue-

In [7]:
# generate test set for each evalue files

# The windowing settings do not change between files; gather them once.
window_kwargs = {
    setting: options[setting]
    for setting in ("window_type", "window_size", "step_size")
}

for evalue_file in options["evalue_files"]:
    print(f"Now processing {evalue_file}.")
    evaluefile_path = f"evalue/{evalue_file}"
    generate_test_set_aiia(output_dir=options["output_dir"],
                           log_file=evaluefile_path,
                           **window_kwargs)

Now processing evalue-0.txt.

Loading /root/.output/aiia/evalue/evalue-0.txt_structured.csv

There are 31 instances (sliding windows) in this dataset

Saving /root/.output/aiia/evalue/evalue-0.txt.test
test set size                                              eventids
0   [529e337f, 021fe350, 55c69cea, 6e8a4e1d, 529e3...
1   [529e337f, 529e337f, 529e337f, c488b4d1, c488b...
2   [c488b4d1, c488b4d1, 59c44720, 60f6a87c, 6ba3b...
3   [d3e17449, 13d46b69, d3e17449, 13d46b69, d3e17...
4   [529e337f, 529e337f, 021fe350, 55c69cea, 6e8a4...
5   [60f6a87c, 6ba3b096, 60f6a87c, 6ba3b096, 55cef...
6   [d3e17449, 13d46b69, d3e17449, 13d46b69, d3e17...
7   [529e337f, 021fe350, 55c69cea, 6e8a4e1d, 529e3...
8   [c488b4d1, c488b4d1, c488b4d1, c488b4d1, c488b...
9   [529e337f, 529e337f, 529e337f, 529e337f, 529e3...
10  [3d16c6a6, 41931a58, aa48d060, c488b4d1, f4365...
11  [b9e7313b, 13d46b69, d3e17449, 13d46b69, d3e17...
12  [6e8a4e1d, 529e337f, 529e337f, 529e337f, 529e3...
13  [5aeefaf4, 4d0090a6, c48