In [None]:
import re
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from google.colab import drive
from typing import List

In [None]:
# Mount Google Drive at /content/drive; every dataset and image path used
# below lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Number of distinct event IDs; the HDFS cells build the column names
# E1..E{NUM_EVENTS} and size their per-event arrays from it.
# NOTE(review): the BGL cell further down reassigns NUM_EVENTS, so re-run this
# cell before re-running any HDFS cell after the BGL section has executed.
NUM_EVENTS = 29

In [None]:
def get_event_trace(event_trace_str: str) -> List[str]:
    """Split a delimited trace string into its event tokens.

    The first and last characters are discarded (they are expected to be the
    enclosing brackets, as in ``"[E5,E22]"``) and the remainder is split on
    commas. Tokens are returned untouched; no whitespace is stripped.
    """
    inner = event_trace_str[1:-1]
    tokens = inner.split(",")
    return tokens

def get_last_event(event: List[str]) -> int:
    """Return the numeric ID of the final token of a trace.

    The first character of the last token is dropped (the ``E`` prefix in
    tokens such as ``"E22"``) and the rest is parsed as an integer.
    """
    final_token = event[-1]
    numeric_part = final_token[1:]
    return int(numeric_part)

def plot_hist(columns, percentages, fig_path, xlabel, ylabel, title, ylim):
    """Draw an annotated bar chart, save it as a PNG and display it.

    Args:
        columns: Bar positions / tick labels passed to ``plt.bar``.
        percentages: Bar heights; each value is also written above its bar
            with two decimals and a trailing ``%``.
        fig_path: Destination of the saved PNG (300 dpi).
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        title: Figure title.
        ylim: Upper bound of the y axis (the lower bound is fixed at 0).
    """
    plt.figure(figsize=(10, 6))
    bar_container = plt.bar(columns, percentages)

    # Axis decoration.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.ylim(0, ylim)
    plt.tight_layout()

    # Write each value slightly above its bar, rotated like the tick labels.
    label_style = dict(ha='center', va='bottom', rotation=90)
    for rect, value in zip(bar_container, percentages):
        x_center = rect.get_x() + rect.get_width() / 2
        y_text = rect.get_height() + 0.2
        plt.text(x_center, y_text, f'{value:.2f}%', **label_style)

    plt.savefig(fig_path, format='png', dpi=300)
    plt.show()

In [None]:
# Load the HDFS event occurrence matrix; its "Label" column holds the strings
# "Success" / "Fail".
df = pd.read_csv('/content/drive/MyDrive/licenta/work/datasets/HDFS_v1/preprocessed/Event_occurrence_matrix.csv')

# Encode the labels as +1 ("Success") / -1 ("Fail") in a NEW array.
# Writing into `df["Label"].values` would modify `df` behind the reader's back
# and fails on a read-only array when pandas Copy-on-Write is active.
label_codes = df["Label"].map({"Success": 1, "Fail": -1})
if label_codes.isna().any():
    unexpected = df.loc[label_codes.isna(), "Label"].unique()
    raise ValueError(f"Unexpected values in 'Label' column: {unexpected!r}")
labels = label_codes.to_numpy(dtype=np.int8)

normal_traces_count = np.sum(labels == 1)
abnormal_traces_count = np.sum(labels == -1)

# Fraction of rows labelled "Fail"; printed below as a percentage.
outlier_ratio = abnormal_traces_count / (normal_traces_count + abnormal_traces_count)

print(f"normal_traces_count: {normal_traces_count}")
print(f"abnormal_traces_count: {abnormal_traces_count}")
print(f"outlier_ratio: {outlier_ratio * 100}")

In [None]:
# Names of the per-event columns of the occurrence matrix: E1 .. E{NUM_EVENTS}.
event_columns = [f"E{i}" for i in range(1, NUM_EVENTS + 1)]
# Column-wise totals over all rows of `df`; reused by the "fatality" cell below.
event_count = df[event_columns].sum().to_numpy()

# Share of each event in the grand total, in percent.
total_events = event_count.sum()
event_percentages = (event_count / total_events) * 100

fig_path = '/content/drive/MyDrive/licenta/work/images/event_occurrences.png'
plot_hist(
    columns=event_columns,
    percentages=event_percentages,
    fig_path=fig_path,
    xlabel="Events",
    # Spelling fixed: the label is rendered into the saved figure.
    ylabel="Event occurrences",
    title='Percentage Occurrence of Each Event',
    ylim=18,
)

In [None]:
# Parse every trace and keep the numeric ID of its final event.
event_traces_df = pd.read_csv('/content/drive/MyDrive/licenta/work/datasets/HDFS_v1/preprocessed/Event_traces.csv')
event_traces_df["Features"] = event_traces_df["Features"].apply(get_event_trace)
last_event_series = event_traces_df["Features"].apply(get_last_event)

# Number of traces ending with each event ID 1..NUM_EVENTS, counted in one
# pass. IDs that never appear get 0; IDs outside the range are ignored.
last_trace_event = (
    last_event_series
    .value_counts()
    .reindex(range(1, NUM_EVENTS + 1), fill_value=0)
    .to_numpy(dtype=float)
)

event_columns = [f"E{i}" for i in range(1, NUM_EVENTS + 1)]
total_traces = np.sum(last_trace_event)
event_percentages = (last_trace_event / total_traces) * 100

fig_path = '/content/drive/MyDrive/licenta/work/images/last_event_occurrences.png'
plot_hist(
    columns=event_columns,
    percentages=event_percentages,
    fig_path=fig_path,
    xlabel="Events",
    # Spelling fixed: the label is rendered into the saved figure.
    ylabel="Event occurrences",
    title='Percentage Occurrence of Each Event on the Last Position in a Trace',
    ylim=100,
)


In [None]:
# Parse every trace, then keep the final event ID of the traces labelled "Fail".
event_traces_df = pd.read_csv('/content/drive/MyDrive/licenta/work/datasets/HDFS_v1/preprocessed/Event_traces.csv')
event_traces_df["Features"] = event_traces_df["Features"].apply(get_event_trace)
# .loc selects rows and column in one step instead of chained indexing.
is_fail = event_traces_df["Label"] == "Fail"
last_event_series = event_traces_df.loc[is_fail, "Features"].apply(get_last_event)

# Number of "Fail" traces ending with each event ID 1..NUM_EVENTS, counted in
# one pass. IDs that never appear get 0; IDs outside the range are ignored.
last_trace_event = (
    last_event_series
    .value_counts()
    .reindex(range(1, NUM_EVENTS + 1), fill_value=0)
    .to_numpy(dtype=float)
)

event_columns = [f"E{i}" for i in range(1, NUM_EVENTS + 1)]
total_traces = np.sum(last_trace_event)
event_percentages = (last_trace_event / total_traces) * 100

fig_path = '/content/drive/MyDrive/licenta/work/images/last_event_occurrences_anomalous_trace.png'
plot_hist(
    columns=event_columns,
    percentages=event_percentages,
    fig_path=fig_path,
    xlabel="Events",
    # Spelling fixed: the label is rendered into the saved figure.
    ylabel="Event occurrences",
    title='Percentage Occurrence of Each Event on the Last Position in an Anomalous Trace',
    ylim=70,
)


In [None]:
# Parse every trace, then keep the final event ID of the traces labelled "Fail".
event_traces_df = pd.read_csv('/content/drive/MyDrive/licenta/work/datasets/HDFS_v1/preprocessed/Event_traces.csv')
event_traces_df["Features"] = event_traces_df["Features"].apply(get_event_trace)
# .loc selects rows and column in one step instead of chained indexing.
is_fail = event_traces_df["Label"] == "Fail"
last_event_series_abnormal_trace = event_traces_df.loc[is_fail, "Features"].apply(get_last_event)

# Number of "Fail" traces ending with each event ID 1..NUM_EVENTS.
last_trace_event_abnormal = (
    last_event_series_abnormal_trace
    .value_counts()
    .reindex(range(1, NUM_EVENTS + 1), fill_value=0)
    .to_numpy(dtype=float)
)

event_columns = [f"E{i}" for i in range(1, NUM_EVENTS + 1)]
# "Fatality" of an event: "Fail" traces ending with it, relative to its total
# number of occurrences (`event_count`, computed in the event-occurrence cell
# from the occurrence matrix). Events that never occur are reported as 0
# instead of producing NaN/inf and a RuntimeWarning from a division by zero.
event_percentages = np.divide(
    last_trace_event_abnormal,
    event_count,
    out=np.zeros(NUM_EVENTS),
    where=event_count > 0,
) * 100

fig_path = '/content/drive/MyDrive/licenta/work/images/event_fatality.png'
plot_hist(
    columns=event_columns,
    percentages=event_percentages,
    fig_path=fig_path,
    xlabel="Events",
    ylabel="Fatality",
    title="Percentage Fatality of Each Event",
    ylim=110,
)

In [None]:
def get_log_template(template: str):
    """Convert an HDFS event template into a regular-expression string.

    Every ``[*]`` wildcard becomes a greedy capture group ``(.*)``. All text
    between wildcards is escaped with ``re.escape``, so characters such as
    ``.``, ``$``, ``(`` or ``+`` in a template are matched literally instead
    of being read as regex operators. The earlier version escaped only the
    asterisk, and did so with an invalid string escape sequence.

    Args:
        template: Template text using ``[*]`` as the wildcard marker.

    Returns:
        The pattern as a string, ready for ``re.compile``.
    """
    literal_parts = template.split("[*]")
    return "(.*)".join(re.escape(part) for part in literal_parts)


# Compile one regex per HDFS event template.
event_templates_df = pd.read_csv('/content/drive/MyDrive/licenta/work/datasets/HDFS_v1/preprocessed/HDFS.log_templates.csv')
event_templates = event_templates_df["EventTemplate"].apply(get_log_template)
template_res = [re.compile(template) for template in event_templates]

logs_file_path = "/content/drive/MyDrive/licenta/work/datasets/HDFS_v1/HDFS.log"

# Count how many raw log lines are fully matched by at least one template.
cnt_matches = 0
cnt_logs = 0
with open(logs_file_path, "r") as file:
    for log in file:
        log = log.strip()
        cnt_logs += 1
        # any() stops at the first matching template, so each line is counted
        # at most once.
        if any(template_re.fullmatch(log) for template_re in template_res):
            cnt_matches += 1

print(f"{cnt_matches}/{cnt_logs} matches")

In [None]:
import re

def get_log_template(template: str):
    """Convert an HDFS event template into a regular-expression string.

    Every ``[*]`` wildcard becomes a greedy capture group ``(.*)``. All text
    between wildcards is escaped with ``re.escape``, so characters such as
    ``.``, ``$``, ``(`` or ``+`` in a template are matched literally instead
    of being read as regex operators. The earlier version escaped only the
    asterisk, and did so with an invalid string escape sequence.

    Args:
        template: Template text using ``[*]`` as the wildcard marker.

    Returns:
        The pattern as a string, ready for ``re.compile``.
    """
    literal_parts = template.split("[*]")
    return "(.*)".join(re.escape(part) for part in literal_parts)

# Sample HDFS log lines used to try a single template by hand.
logs = [
    '081109 203518 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906',
    '081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010',
    '081109 203518 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906',
    '081109 203519 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010',
    '081109 203519 145 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.14.224:42420 dest: /10.250.14.224:50010',
    '081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_-1608999687919862906 terminating',
    '081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-1608999687919862906 terminating',
    '081109 203519 145 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.10.6',
    '081109 203519 145 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.19.102',
    '081109 203519 147 INFO dfs.DataNode$PacketResponder: PacketResponder 0 for block blk_-1608999687919862906 terminating'
]

template = "[*]BLOCK* NameSystem[*]allocateBlock:[*]"

# Show the generated pattern, then compile it.
template_pattern = get_log_template(template)
print(template_pattern)
template_re = re.compile(template_pattern)

# Report every sample line the template matches in full, with its captures.
for log in logs:
    match = template_re.fullmatch(log)
    if match is None:
        continue
    parameters = match.groups()
    print(f"Log: {log}")
    print("Extracted Parameters:", parameters)
    print("-" * 40)

In [None]:
def get_log_template(template: str):
    """Convert a BGL event template into a regular-expression string.

    Every ``<*>`` wildcard becomes a greedy capture group ``(.*)``. All text
    between wildcards is escaped with ``re.escape``, so any regex
    metacharacter in a template (``.``, ``(``, ``[``, ``$`` and also ``*``,
    ``+``, ``?``, ``|``, ``{``, ``^`` and the backslash, which the earlier
    hand-written list missed) is matched literally. The pattern is prefixed
    with ``"(.*) "`` so that it can absorb whatever precedes the message in
    the raw log line.

    Trailing whitespace of the template is dropped, as before. An empty or
    whitespace-only template yields a bare ``(.*)``.

    Args:
        template: Template text using ``<*>`` as the wildcard marker.

    Returns:
        The pattern as a string, ready for ``re.compile``.
    """
    literal_parts = template.rstrip().split("<*>")
    body = "(.*)".join(re.escape(part) for part in literal_parts)
    # Without a body the old implementation's final strip() also removed the
    # space of the prefix; keep that result.
    return "(.*) " + body if body else "(.*)"

# Compile one regex per BGL event template.
event_templates_df = pd.read_csv('/content/drive/MyDrive/licenta/work/datasets/BGL/BGL_templates.csv')
event_templates = event_templates_df["EventTemplate"].apply(get_log_template)
template_res = [re.compile(template) for template in event_templates]
# The template at position i gets event ID i + 1; the extra, last ID is
# assigned to lines that no template matches.
NUM_EVENTS = len(template_res) + 1

logs_file_path = "/content/drive/MyDrive/licenta/work/datasets/BGL/BGL.log"
labels = []
events = []

with open(logs_file_path, "r") as file:
    cnt_matches = 0
    cnt_logs = 0
    for log in file:
        log = log.strip()
        if not log:
            # A blank line has neither a label marker nor a message; the old
            # `log[0]` lookup raised IndexError here.
            continue
        cnt_logs += 1
        for index, template_re in enumerate(template_res):
            if template_re.fullmatch(log):
                cnt_matches += 1
                events.append(index + 1)
                break
        else:
            # No template matched: record the catch-all ID and show the line.
            events.append(NUM_EVENTS)
            print(cnt_logs)
            print(log)
        # Lines starting with "-" are labelled 1, all others -1.
        labels.append(1 if log.startswith("-") else -1)
        if cnt_logs % 100000 == 0:
            # Progress separator every 100000 processed lines.
            print("-" * 100)

    print(f"{cnt_matches}/{cnt_logs} matches")

In [None]:
labels = np.array(labels)
# NUM_EVENTS is derived from the template file, so the largest event ID is not
# guaranteed to fit in uint8; astype(np.uint8) would silently wrap IDs above
# 255. Use the smallest unsigned type that can hold NUM_EVENTS instead
# (still uint8 whenever it fits).
events = np.asarray(events, dtype=np.min_scalar_type(NUM_EVENTS))

# Lines labelled 1 are counted as normal, lines labelled -1 as abnormal.
normal_logs_count = np.sum(labels == 1)
abnormal_logs_count = np.sum(labels == -1)
outlier_ratio = abnormal_logs_count / (normal_logs_count + abnormal_logs_count)
print(f"normal_logs_count: {normal_logs_count}")
print(f"abnormal_logs_count: {abnormal_logs_count}")
print(f"outlier_ratio: {outlier_ratio}")
print(f"logs_count: {normal_logs_count + abnormal_logs_count}")

In [None]:
# Event IDs 1..NUM_EVENTS and, for each, the number of log lines carrying it.
event_list = np.arange(1, NUM_EVENTS + 1)
# One pass over `events` instead of one full comparison per event ID. Slot 0
# is dropped because IDs start at 1.
event_occurrences = np.bincount(events, minlength=NUM_EVENTS + 1)[1:NUM_EVENTS + 1].astype(float)

num_logs = event_occurrences.sum()
percentages = (event_occurrences / num_logs) * 100

plt.figure(figsize=(10, 6))
plt.bar(event_list, percentages)
plt.xlabel('Events')
# Spelling fixed: the label is rendered into the saved figure.
plt.ylabel('Event occurrences')
plt.title('Histogram of Event Occurrences (in Percentages)')
fig_path = '/content/drive/MyDrive/licenta/work/images/event_occurrence_bgl.png'
plt.savefig(fig_path, format='png', dpi=300)
plt.show()

In [None]:
import re

def get_log_template(template: str):
    """Convert a BGL event template into a regular-expression string.

    Every ``<*>`` wildcard becomes a greedy capture group ``(.*)``. All text
    between wildcards is escaped with ``re.escape``, so any regex
    metacharacter in a template is matched literally; the earlier
    hand-written list covered only ``.``, ``(``, ``)`` and ``$`` and used
    invalid string escape sequences. The pattern is prefixed with ``"(.*) "``
    so that it can absorb whatever precedes the message in the raw log line.

    Trailing whitespace of the template is dropped, because a pattern ending
    in whitespace cannot fully match a stripped log line. An empty or
    whitespace-only template yields a bare ``(.*)``.

    Args:
        template: Template text using ``<*>`` as the wildcard marker.

    Returns:
        The pattern as a string, ready for ``re.compile``.
    """
    literal_parts = template.rstrip().split("<*>")
    body = "(.*)".join(re.escape(part) for part in literal_parts)
    return "(.*) " + body if body else "(.*)"

# Sample BGL log line used to try a single template by hand.
logs = [
    '- 1117961702 2005.06.05 R11-M0-NB-C:J07-U11 2005-06-05-01.55.02.516307 R11-M0-NB-C:J07-U11 RAS KERNEL FATAL floating pt ex mode 0 enable......0'
]

template = "floating pt ex mode <*> <*>"

# Show the generated pattern, then compile it.
template_pattern = get_log_template(template)
print(template_pattern)
template_re = re.compile(template_pattern)

# Report every sample line the template matches in full, with its captures.
for log in logs:
    match = template_re.fullmatch(log)
    if match is None:
        continue
    parameters = match.groups()
    print(f"Log: {log}")
    print("Extracted Parameters:", parameters)
    print("-" * 40)