In [None]:
from sklearnex import patch_sklearn
patch_sklearn()

import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pyod.models.ocsvm import OCSVM
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.auto_encoder import AutoEncoder
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, average_precision_score
from typing import List
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Model

In [None]:
def metrics_hist(precision, recall, f1_score, auroc, auprc, fpr, fnr, fig_name):
    """Draw a bar chart of the evaluation metrics, save it as a PNG and show it.

    Every metric is multiplied by 100 and plotted on a 0-100 axis, so the
    values are expected to be fractions (e.g. 0.87 for 87%).

    Args:
        precision, recall, f1_score, auroc, auprc, fpr, fnr: metric values.
        fig_name: file name (without extension) of the image written to
            ``../images/metrics/``.
    """
    metrics = ["Precision", "Recall", "F1-score", "AUROC", "AUPRC", "FPR", "FNR"]
    values = [precision, recall, f1_score, auroc, auprc, fpr, fnr]

    percentages = [v * 100 for v in values]
    colors = ["blue", "green", "red", "orange", "magenta", "yellow", "pink"]

    fig, ax = plt.subplots(figsize=(5.5, 10))
    bars = ax.bar(metrics, percentages, color=colors, width=1)
    ax.set_ylim(0, 100)

    # Annotate every bar with its value, slightly above the top of the bar.
    for bar, value in zip(bars, percentages):
        ax.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height() + 1, f'{value:.2f}%', ha='center', va='bottom')

    ax.set_title('Metrics')
    ax.set_ylabel('Percentage')

    # savefig does not create missing directories, so make sure the target exists.
    output_dir = "../images/metrics"
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f"{output_dir}/{fig_name}.png")
    plt.show()
    # Release the figure so repeated experiments do not accumulate open figures.
    plt.close(fig)

In [None]:
def compute_metrics(y_true, y_pred, inliers_label, outliers_label):
    """Print and return detection metrics, with outliers as the positive class.

    Both label vectors are converted to 0 (inlier) / 1 (outlier) on fresh
    arrays; the arrays passed in by the caller are left untouched.
    AUROC and AUPRC are computed from the hard 0/1 predictions, not from
    continuous anomaly scores.

    Args:
        y_true: ground-truth labels, each equal to inliers_label or outliers_label.
        y_pred: predicted labels in the same encoding.
        inliers_label: value that marks an inlier.
        outliers_label: value that marks an outlier.

    Returns:
        Tuple ``(precision, recall, f1, auroc, auprc, fpr, fnr)``. ``fpr`` /
        ``fnr`` are NaN when there are no true inliers / no true outliers.

    Raises:
        ValueError: if either vector contains a value that is neither label.
    """
    def to_binary(values, name):
        # Derive both masks from the original values so the two label
        # replacements can never interfere (e.g. inliers=1, outliers=0).
        values = np.asarray(values)
        is_outlier = values == outliers_label
        is_inlier = values == inliers_label
        if not np.all(is_outlier | is_inlier):
            raise ValueError(f"{name} contains values other than {inliers_label!r} and {outliers_label!r}")
        return is_outlier.astype(int)

    y_true = to_binary(y_true, "y_true")
    y_pred = to_binary(y_pred, "y_pred")

    print(f"true_inliers: {np.sum(y_true == 0)}")
    print(f"true_outliers: {np.sum(y_true == 1)}")
    print(f"pred_inliers: {np.sum(y_pred == 0)}")
    print(f"pred_outliers: {np.sum(y_pred == 1)}")

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auroc = roc_auc_score(y_true, y_pred)
    auprc = average_precision_score(y_true, y_pred)

    # Pin the label set so the matrix is always 2x2, even when one class is
    # absent from both vectors; otherwise the 4-way unpacking below fails.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    fpr = fp / (fp + tn) if (fp + tn) > 0 else float("nan")
    fnr = fn / (fn + tp) if (fn + tp) > 0 else float("nan")

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print(f"AUROC: {auroc}")
    print(f"AUPRC: {auprc}")
    print(f"False Positive Rate (FPR): {fpr}")
    print(f"False Negative Rate (FNR): {fnr}")
    print("Confusion Matrix:")
    print(matrix)
    print(f"tp: {tp}, fp: {fp}, tn: {tn}, fn: {fn}")

    return precision, recall, f1, auroc, auprc, fpr, fnr

In [None]:
def label_data(inliers_label, outliers_label, labels):
    """Re-encode 0/1 window labels with the label values a detector uses.

    Input value 1 becomes ``outliers_label`` and input value 0 becomes
    ``inliers_label``; any other value is carried over unchanged. The input
    array is not modified. Class counts and ratios are printed.

    Args:
        inliers_label: value to use for normal windows (input value 0).
        outliers_label: value to use for anomalous windows (input value 1).
        labels: array-like of 0/1 labels.

    Returns:
        A new ``np.int8`` array with the re-encoded labels.
    """
    source = np.asarray(labels)
    # astype returns a copy, so the caller's array is never written to.
    relabeled = source.astype(np.int8)
    # Both masks come from the untouched source: assigning sequentially on the
    # same array would re-map freshly written values (e.g. inliers=1, outliers=0
    # would turn every label into an inlier).
    relabeled[source == 1] = outliers_label
    relabeled[source == 0] = inliers_label

    num_inliers = np.sum(relabeled == inliers_label)
    num_outliers = np.sum(relabeled == outliers_label)
    total = num_inliers + num_outliers
    inliers_ratio = num_inliers / total if total > 0 else float("nan")
    outliers_ratio = num_outliers / total if total > 0 else float("nan")
    print(f"num_inliers: {num_inliers}, inliers_ratio: {inliers_ratio}")
    print(f"num_outliers: {num_outliers}, outliers_ratio: {outliers_ratio}")
    return relabeled

In [None]:
def get_events_from_logs(file_path):
    """Map every BGL log line to an event id and cache the result as CSV.

    If ``file_path`` already exists it is read and returned as-is. Otherwise
    the templates in ``../datasets/BGL/BGL_templates.csv`` are compiled to
    regular expressions, ``../datasets/BGL/BGL.log`` is scanned line by line,
    and the resulting table is written to ``file_path``.

    Args:
        file_path: path of the CSV cache to read from / write to.

    Returns:
        DataFrame with columns ``LogId`` (1-based line counter), ``Event``
        (1-based index of the first matching template, or
        ``number of templates + 1`` when nothing matches), ``Label`` (0 when
        the line starts with "-", 1 otherwise) and ``TimeInterval``
        (difference between the second space-separated field of this line and
        of the previous one; 0 for the first line).
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    def get_log_template(template: str):
        # re.escape neutralises every regex metacharacter (not just a
        # hand-picked subset) and avoids invalid "\." style string escapes.
        # Trailing whitespace is dropped before escaping so it is not turned
        # into a mandatory literal space.
        escaped = re.escape(template.rstrip())
        # "<*>" is the template wildcard; restore it as a capture group.
        escaped = escaped.replace(re.escape("<*>"), "(.*)")
        # The leading group absorbs the fields in front of the message.
        return ("(.*) " + escaped).strip()

    event_templates_df = pd.read_csv('../datasets/BGL/BGL_templates.csv')
    event_templates = event_templates_df["EventTemplate"].apply(get_log_template)
    template_res = [re.compile(template) for template in event_templates]
    # One extra id is reserved for lines that match no template.
    num_events = len(template_res) + 1

    logs_file_path = "../datasets/BGL/BGL.log"
    num_logs = 4747963  # only used as the progress-bar total
    labels = []
    events = []
    log_ids = []
    time_intervals = []

    with open(logs_file_path, "r") as file:
        cnt_matches = 0
        cnt_logs = 0
        last_time = None
        for log in tqdm(file, total=num_logs, desc="Preprocessing logs"):
            log = log.strip()
            if not log:
                # A blank line has no timestamp field to parse; skip it.
                continue
            current_time = int(log.split(" ")[1])
            cnt_logs += 1

            event_id = num_events
            for index, template_re in enumerate(template_res):
                if template_re.fullmatch(log):
                    cnt_matches += 1
                    event_id = index + 1
                    break

            if last_time is None:
                # First line: there is no predecessor, so the interval is 0.
                last_time = current_time

            events.append(event_id)
            log_ids.append(cnt_logs)
            labels.append(0 if log[0] == "-" else 1)
            time_intervals.append(current_time - last_time)
            last_time = current_time

        print(f"{cnt_matches}/{cnt_logs} matches")

    df = pd.DataFrame({
        "LogId": log_ids,
        "Event": events,
        "Label": labels,
        "TimeInterval": time_intervals
    })
    df.to_csv(file_path, index=False)

    return df

In [None]:
def get_event_traces(window_size=256, step_size=128):
    """Build (or load) sliding-window event traces and return them MinMax-scaled.

    Each window is the concatenation of ``window_size`` event ids followed by
    the ``window_size`` matching time intervals. Windows that run past the end
    of the log are zero-padded. A window is labelled 1 when the labels of the
    log lines it covers sum to more than 0, else 0.

    The unscaled windows are cached in
    ``../datasets/BGL/event_traces/window_size=<w>-step_size=<s>.npz``. Scaling
    is applied after loading/building, so a cache hit and a fresh build return
    the same kind of data.

    Args:
        window_size: number of log lines per window.
        step_size: offset between the starts of consecutive windows.

    Returns:
        ``(windows, window_labels)``: the scaled 2-D feature array and an
        ``np.int8`` label vector.
    """
    file_name = f"window_size={window_size}-step_size={step_size}.npz"
    file_path = f"../datasets/BGL/event_traces/{file_name}"

    if os.path.exists(file_path):
        with np.load(file_path) as cached:
            windows = cached["windows"]
            window_labels = cached["labels"]
    else:
        bgl_events_df = get_events_from_logs("../datasets/BGL/BGL_events.csv")
        events = bgl_events_df["Event"].values
        labels = bgl_events_df["Label"].values
        time_intervals = bgl_events_df["TimeInterval"].values

        num_events = events.shape[0]
        window_labels = []
        windows = []

        for i in tqdm(range(0, num_events, step_size), desc="Creating event traces windows"):
            if i + window_size <= num_events:
                window = np.concatenate([events[i:i + window_size], time_intervals[i:i + window_size]], axis=0)
                label = np.sum(labels[i:i + window_size], axis=0)
            else:
                # Tail window: pad both halves with zeros up to window_size.
                padding = np.array([0] * (i + window_size - num_events))
                window = np.concatenate([events[i:], padding, time_intervals[i:], padding])
                label = np.sum(labels[i:], axis=0)
            window_labels.append(1 if label > 0 else 0)
            windows.append(window)

        windows = np.array(windows)
        window_labels = np.array(window_labels).astype(np.int8)
        # np.savez does not create missing directories.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        np.savez(file_path, windows=windows, labels=window_labels)

    # The cache stores raw windows, so scale on both paths; previously a cache
    # hit returned unscaled data while a fresh build returned scaled data.
    scaler = MinMaxScaler()
    windows = scaler.fit_transform(windows)

    return windows, window_labels


In [None]:
def get_event_occurrence(window_size=1000, step_size=50, num_events=378):
    """Build (or load) per-window event-count vectors and return them MinMax-scaled.

    Each window becomes a vector of length ``num_events`` whose entry ``k - 1``
    is the number of log lines in the window with event id ``k``. Windows at
    the end of the log cover fewer than ``window_size`` lines. A window is
    labelled 1 when the labels of the log lines it covers sum to more than 0.

    The unscaled counts are cached in
    ``../datasets/BGL/event_occurrence/window_size=<w>-step_size=<s>.npz``.
    Scaling is applied after loading/building, so a cache hit and a fresh
    build return the same kind of data.

    Args:
        window_size: number of log lines per window.
        step_size: offset between the starts of consecutive windows.
        num_events: number of distinct event ids (ids run from 1 to num_events).

    Returns:
        ``(windows, window_labels)``: the scaled 2-D count matrix and an
        ``np.int8`` label vector.
    """
    def count_events(window, num_events):
        # bincount tallies all ids in one pass; slot 0 is dropped because ids
        # start at 1, and ids above num_events are cut off by the slice.
        counts = np.bincount(window, minlength=num_events + 1)
        return counts[1:num_events + 1].astype(float)

    file_name = f"window_size={window_size}-step_size={step_size}.npz"
    file_path = f"../datasets/BGL/event_occurrence/{file_name}"

    if os.path.exists(file_path):
        with np.load(file_path) as cached:
            windows = cached["windows"]
            window_labels = cached["labels"]
    else:
        bgl_events_df = get_events_from_logs("../datasets/BGL/BGL_events.csv")
        events = bgl_events_df["Event"].values
        labels = bgl_events_df["Label"].values

        num_logs = events.shape[0]
        windows = []
        window_labels = []

        for i in tqdm(range(0, num_logs, step_size), desc="Creating event occurrence windows"):
            # Slicing past the end simply yields the shorter tail window.
            windows.append(count_events(events[i:i + window_size], num_events))
            label = np.sum(labels[i:i + window_size], axis=0)
            window_labels.append(1 if label > 0 else 0)

        windows = np.array(windows)
        window_labels = np.array(window_labels).astype(np.int8)
        # np.savez does not create missing directories.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        np.savez(file_path, windows=windows, labels=window_labels)

    # The cache stores raw counts, so scale on both paths; previously a cache
    # hit returned unscaled data while a fresh build returned scaled data.
    scaler = MinMaxScaler()
    windows = scaler.fit_transform(windows)

    return windows, window_labels

In [None]:
def ensemble_ocsvm(classifier, data, labels, inliers_label, outliers_label, num_models=5, kernel="rbf", train_size=0.2, contamination=0.3, gamma="scale", random_state=None):
    """Train several one-class SVMs on random subsets and combine them by majority vote.

    Every model is fitted on a fresh random ``train_size`` split of ``data``
    and then predicts all of ``data``. A sample is labelled as an outlier only
    when strictly more than half of the models flag it; ties (possible with an
    even ``num_models``) resolve to inlier.

    Args:
        classifier: estimator class accepting ``kernel``, ``nu`` and ``gamma``
            and exposing ``fit`` / ``predict``.
        data: 2-D feature array.
        labels: unused; kept so existing callers keep working.
        inliers_label: value written for samples voted inlier.
        outliers_label: value the models emit for outliers, also written for
            samples voted outlier.
        num_models: number of models in the ensemble.
        kernel, gamma: forwarded to ``classifier``.
        train_size: forwarded to ``train_test_split``.
        contamination: forwarded to ``classifier`` as ``nu``.
        random_state: optional integer seed; model ``i`` is split with
            ``random_state + i``. ``None`` keeps the splits unseeded.

    Returns:
        Array with one ``inliers_label`` / ``outliers_label`` entry per row of ``data``.
    """
    models = []

    for model_index in tqdm(range(num_models), desc="Training One-Clss SVMs"):
        # Offset the seed per model so the subsets differ while staying reproducible.
        seed = None if random_state is None else random_state + model_index
        x_train, _ = train_test_split(data, train_size=train_size, random_state=seed)
        ocsvm = classifier(kernel=kernel, nu=contamination, gamma=gamma)
        ocsvm.fit(x_train)
        models.append(ocsvm)

    outlier_votes = np.zeros(data.shape[0], dtype=int)
    for model in tqdm(models, desc="Computing predictions"):
        outlier_votes += (model.predict(data) == outliers_label).astype(int)

    # Strict majority. The previous ">= num_models // 2" threshold let a
    # minority (2 of 5) decide, and with a single model flagged every sample.
    is_outlier = outlier_votes * 2 > num_models
    pred_labels = np.where(is_outlier, outliers_label, inliers_label)

    return pred_labels

In [None]:
def ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, kernel="rbf", train_size=0.2, contamination=0.3, gamma="scale", window_size=256, step_size=128, experiment_type="occurrence_matrix"):
    """Train a single one-class SVM on a train split and report test metrics.

    Args:
        classifier: estimator class accepting ``kernel``, ``nu`` and ``gamma``.
        inliers_label: label the classifier emits for inliers.
        outliers_label: label the classifier emits for outliers.
        title: file name (without extension) of the saved metrics plot.
        num_events: number of event ids, used for "event_occurrence" features.
        kernel, gamma: forwarded to ``classifier``.
        train_size: fraction (or count) of windows used for training.
        contamination: forwarded to ``classifier`` as ``nu``.
        window_size, step_size: sliding-window parameters of the features.
        experiment_type: "event_occurrence" or "event_traces". The default
            value is not one of the supported types, so it must be passed
            explicitly.

    Raises:
        ValueError: if ``experiment_type`` is not supported. Previously the
            function returned silently without running anything.
    """
    if experiment_type == "event_occurrence":
        data, labels = get_event_occurrence(window_size, step_size, num_events)
    elif experiment_type == "event_traces":
        data, labels = get_event_traces(window_size, step_size)
    else:
        raise ValueError(
            f"Unsupported experiment_type {experiment_type!r}; expected 'event_occurrence' or 'event_traces'"
        )
    labels = label_data(inliers_label, outliers_label, labels)

    # Fixed seed keeps the train/test split identical across runs.
    x_train, x_test, _, y_test = train_test_split(data, labels, train_size=train_size, random_state=42)
    ocsvm = classifier(kernel=kernel, nu=contamination, gamma=gamma)

    # One-class training: the labels are only used for evaluation.
    ocsvm.fit(x_train)
    y_test_pred = ocsvm.predict(x_test)

    precision, recall, f1, auroc, auprc, fpr, fnr = compute_metrics(y_test, y_test_pred, inliers_label, outliers_label)
    metrics_hist(precision, recall, f1, auroc, auprc, fpr, fnr, title)


In [None]:
def ensemble_ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, num_models=5, kernel="rbf", train_size=0.2, contamination=0.3, gamma="scale", window_size=256, step_size=128, experiment_type="event_occurrence"):
    """Run the one-class SVM ensemble on the whole dataset and report metrics.

    The ensemble predicts every window (including those its members were
    trained on), and the metrics are computed over all windows.

    Args:
        classifier: estimator class accepting ``kernel``, ``nu`` and ``gamma``.
        inliers_label: label the classifier emits for inliers.
        outliers_label: label the classifier emits for outliers.
        title: file name (without extension) of the saved metrics plot.
        num_events: number of event ids, used for "event_occurrence" features.
        num_models: number of models in the ensemble.
        kernel, gamma: forwarded to ``classifier``.
        train_size: fraction (or count) of windows each model is trained on.
        contamination: forwarded to ``classifier`` as ``nu``.
        window_size, step_size: sliding-window parameters of the features.
        experiment_type: "event_occurrence" or "event_traces".

    Raises:
        ValueError: if ``experiment_type`` is not supported. Previously the
            function returned silently without running anything.
    """
    if experiment_type == "event_occurrence":
        data, labels = get_event_occurrence(window_size, step_size, num_events)
    elif experiment_type == "event_traces":
        data, labels = get_event_traces(window_size, step_size)
    else:
        raise ValueError(
            f"Unsupported experiment_type {experiment_type!r}; expected 'event_occurrence' or 'event_traces'"
        )
    labels = label_data(inliers_label, outliers_label, labels)

    pred_labels = ensemble_ocsvm(
        classifier=classifier,
        data=data,
        labels=labels,
        inliers_label=inliers_label,
        outliers_label=outliers_label,
        num_models=num_models,
        kernel=kernel,
        train_size=train_size,
        contamination=contamination,
        gamma=gamma)

    precision, recall, f1, auroc, auprc, fpr, fnr = compute_metrics(labels, pred_labels, inliers_label, outliers_label)
    metrics_hist(precision, recall, f1, auroc, auprc, fpr, fnr, title)

In [None]:
def deep_svdd_experiment(inliers_label, outliers_label, title, num_events, train_size=0.2, window_size=256, step_size=128, experiment_type="occurrence_matrix",
                         c=None, use_ae=False, hidden_neurons=None, hidden_activation='relu', output_activation='sigmoid', optimizer='adam', epochs=100, 
                         batch_size=32, dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1, preprocessing=True, verbose=1, random_state=None, contamination=0.1):
    """Train a PyOD DeepSVDD model on a train split and report test metrics.

    Args:
        inliers_label: label the model emits for inliers.
        outliers_label: label the model emits for outliers.
        title: file name (without extension) of the saved metrics plot.
        num_events: number of event ids, used for "event_occurrence" features.
        train_size: fraction (or count) of windows used for training.
        window_size, step_size: sliding-window parameters of the features.
        experiment_type: "event_occurrence" or "event_traces". The default
            value is not one of the supported types, so it must be passed
            explicitly.
        c, use_ae, hidden_neurons, hidden_activation, output_activation,
        optimizer, epochs, batch_size, dropout_rate, l2_regularizer,
        validation_size, preprocessing, verbose, random_state, contamination:
            forwarded unchanged to ``DeepSVDD``.

    Raises:
        ValueError: if ``experiment_type`` is not supported. Previously the
            function returned silently without running anything.
    """
    if experiment_type == "event_occurrence":
        data, labels = get_event_occurrence(window_size, step_size, num_events)
    elif experiment_type == "event_traces":
        data, labels = get_event_traces(window_size, step_size)
    else:
        raise ValueError(
            f"Unsupported experiment_type {experiment_type!r}; expected 'event_occurrence' or 'event_traces'"
        )
    labels = label_data(inliers_label, outliers_label, labels)

    # Fixed seed keeps the train/test split identical across runs.
    x_train, x_test, _, y_test = train_test_split(data, labels, train_size=train_size, random_state=42)
    model = DeepSVDD(c=c, use_ae=use_ae, hidden_neurons=hidden_neurons, hidden_activation=hidden_activation, output_activation=output_activation,
                     optimizer=optimizer, epochs=epochs, batch_size=batch_size, dropout_rate=dropout_rate, l2_regularizer=l2_regularizer, 
                     validation_size=validation_size, preprocessing=preprocessing, verbose=verbose, random_state=random_state, contamination=contamination)

    # Unsupervised training: the labels are only used for evaluation.
    model.fit(x_train)
    y_test_pred = model.predict(x_test)

    precision, recall, f1, auroc, auprc, fpr, fnr = compute_metrics(y_test, y_test_pred, inliers_label, outliers_label)
    metrics_hist(precision, recall, f1, auroc, auprc, fpr, fnr, title)

In [None]:
def ae_ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, kernel="rbf", gamma="scale", train_size=0.2, window_size=256, step_size=128, experiment_type="occurrence_matrix",
                         hidden_neurons=None, hidden_activation='relu', output_activation='sigmoid', optimizer='adam', epochs=100, 
                         batch_size=32, dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1, preprocessing=True, verbose=1, random_state=None, contamination=0.1):
    """Encode windows with a PyOD AutoEncoder, then fit a one-class SVM on the codes.

    The autoencoder is fitted on the train split; an intermediate layer of its
    model is used as an encoder for both splits, and the one-class SVM is
    trained and evaluated on those encoded features.

    Args:
        classifier: estimator class accepting ``kernel``, ``gamma`` and ``nu``.
        inliers_label: label the classifier emits for inliers.
        outliers_label: label the classifier emits for outliers.
        title: file name (without extension) of the saved metrics plot.
        num_events: number of event ids, used for "event_occurrence" features.
        kernel, gamma: forwarded to ``classifier``.
        train_size: fraction (or count) of windows used for training.
        window_size, step_size: sliding-window parameters of the features.
        experiment_type: "event_occurrence" or "event_traces". The default
            value is not one of the supported types, so it must be passed
            explicitly.
        hidden_neurons, hidden_activation, output_activation, optimizer,
        epochs, batch_size, dropout_rate, l2_regularizer, validation_size,
        preprocessing, verbose, random_state: forwarded to ``AutoEncoder``.
        contamination: forwarded to ``AutoEncoder`` and used as ``nu`` of the
            one-class SVM.

    Raises:
        ValueError: if ``experiment_type`` is not supported. Previously the
            function returned silently without running anything.
    """
    if experiment_type == "event_occurrence":
        data, labels = get_event_occurrence(window_size, step_size, num_events)
    elif experiment_type == "event_traces":
        data, labels = get_event_traces(window_size, step_size)
    else:
        raise ValueError(
            f"Unsupported experiment_type {experiment_type!r}; expected 'event_occurrence' or 'event_traces'"
        )
    labels = label_data(inliers_label, outliers_label, labels)

    # Fixed seed keeps the train/test split identical across runs.
    x_train, x_test, _, y_test = train_test_split(data, labels, train_size=train_size, random_state=42)
    autoencoder = AutoEncoder(hidden_neurons=hidden_neurons, hidden_activation=hidden_activation, output_activation=output_activation,
                     optimizer=optimizer, epochs=epochs, batch_size=batch_size, dropout_rate=dropout_rate, l2_regularizer=l2_regularizer, 
                     validation_size=validation_size, preprocessing=preprocessing, verbose=verbose, random_state=random_state, 
                     contamination=contamination)

    autoencoder.fit(x_train)

    # NOTE(review): this assumes layer index len(hidden_neurons) // 2 of the
    # fitted model is the bottleneck. If the model interleaves other layers
    # (e.g. dropout) the index points elsewhere - confirm against
    # autoencoder.model_.summary().
    bottleneck_layer = len(hidden_neurons) // 2 if hidden_neurons else 1
    encoder_model = Model(inputs=autoencoder.model_.input, outputs=autoencoder.model_.get_layer(index=bottleneck_layer).output)

    x_train_encoded = encoder_model.predict(x_train)
    x_test_encoded = encoder_model.predict(x_test)

    ocsvm = classifier(kernel=kernel, gamma=gamma, nu=contamination)
    ocsvm.fit(x_train_encoded)

    y_test_pred = ocsvm.predict(x_test_encoded)

    precision, recall, f1, auroc, auprc, fpr, fnr = compute_metrics(y_test, y_test_pred, inliers_label, outliers_label)
    metrics_hist(precision, recall, f1, auroc, auprc, fpr, fnr, title)

In [None]:
# ---------------------------------------------------------------
# Experiment: scikit-learn One-Class SVM on event-occurrence data
# ---------------------------------------------------------------

# Feature extraction settings
experiment_type = "event_occurrence"
window_size, step_size = 512, 32
num_events = 378

# Model and split settings (contamination is passed to the SVM as nu)
kernel, gamma = "rbf", "scale"
contamination = 0.3
train_size = 0.5

# OneClassSVM predicts +1 for inliers and -1 for outliers.
ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    title=f"bgl-scikit-{experiment_type}-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
    num_events=num_events,
    experiment_type=experiment_type,
    window_size=window_size,
    step_size=step_size,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
)

In [None]:
# -----------------------------------------------------------
# Experiment: scikit-learn One-Class SVM on event-trace data
# -----------------------------------------------------------

# Feature extraction settings
experiment_type = "event_traces"
window_size, step_size = 256, 128
num_events = 378

# Model and split settings (contamination is passed to the SVM as nu)
kernel, gamma = "rbf", "scale"
contamination = 0.3
train_size = 0.8

# OneClassSVM predicts +1 for inliers and -1 for outliers.
ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    title=f"bgl-scikit-{experiment_type}-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
    num_events=num_events,
    experiment_type=experiment_type,
    window_size=window_size,
    step_size=step_size,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
)

In [None]:
# ------------------------------------------------------------------------
# Experiment: scikit-learn One-Class SVM ensemble on event-occurrence data
# ------------------------------------------------------------------------

# Feature extraction settings
experiment_type = "event_occurrence"
window_size, step_size = 512, 32
num_events = 378

# Ensemble and model settings (contamination is passed to each SVM as nu)
num_models = 5
kernel, gamma = "rbf", "scale"
contamination = 0.2
train_size = 0.3

# OneClassSVM predicts +1 for inliers and -1 for outliers.
ensemble_ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    title=f"bgl-scikit-{experiment_type}-ensemble-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
    num_events=num_events,
    experiment_type=experiment_type,
    window_size=window_size,
    step_size=step_size,
    num_models=num_models,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
)

In [None]:
# -------------------------------------------------------------------
# Experiment: scikit-learn One-Class SVM ensemble on event-trace data
# -------------------------------------------------------------------

# Feature extraction settings
experiment_type = "event_traces"
window_size, step_size = 256, 32
num_events = 378

# Ensemble and model settings (contamination is passed to each SVM as nu)
num_models = 5
kernel, gamma = "rbf", "scale"
contamination = 0.3
train_size = 0.3

# OneClassSVM predicts +1 for inliers and -1 for outliers.
ensemble_ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    title=f"bgl-scikit-{experiment_type}-ensemble-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
    num_events=num_events,
    experiment_type=experiment_type,
    window_size=window_size,
    step_size=step_size,
    num_models=num_models,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
)

In [None]:
# ----------------------------------------------------
# Experiment: PyOD Deep-SVDD on event-occurrence data
# ----------------------------------------------------

# Feature extraction settings
experiment_type = "event_occurrence"
window_size, step_size = 512, 32
num_events = 378

# Training settings
epochs, batch_size = 20, 32
contamination = 0.3
train_size = 0.8

# This experiment uses 0 for inliers and 1 for outliers.
deep_svdd_experiment(
    inliers_label=0,
    outliers_label=1,
    title=f"bgl-deep-svdd-{experiment_type}-train_size={train_size}-epochs={epochs}-contamination={contamination}-window_size={window_size}-step_size={step_size}",
    num_events=num_events,
    experiment_type=experiment_type,
    window_size=window_size,
    step_size=step_size,
    train_size=train_size,
    epochs=epochs,
    batch_size=batch_size,
    contamination=contamination,
    verbose=2,
)

In [None]:
############################################################################
################ PyOd Event Occurrence AE-OCSVM Experiment #################
############################################################################

train_size = 0.8
num_events = 378
contamination = 0.3
experiment_type = "event_occurrence"
epochs = 20
batch_size = 32
window_size = 512
step_size = 32
gamma = "auto"
kernel = "rbf"
hidden_neurons = [64, 32, 32, 64]


# The title is prefixed "bgl-" like every other experiment in this notebook
# (it was "hdfs-", although the data loaded here is BGL), and the
# hidden_neurons list defined above is now actually passed to the experiment
# instead of being silently ignored.
ae_ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1, 
    title=f"bgl-ae-ocsvm-{experiment_type}-train_size={train_size}-epochs={epochs}-contamination={contamination}-window_size={window_size}-step_size={step_size}",
    num_events=num_events, 
    kernel=kernel,
    gamma=gamma,
    train_size=train_size,  
    experiment_type=experiment_type,
    hidden_neurons=hidden_neurons,
    epochs=epochs,
    batch_size=batch_size, 
    verbose=2, 
    contamination=contamination,
    window_size=window_size,
    step_size=step_size)