In [None]:
from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np
import pandas as pd
import torch as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List
from pyod.models.ocsvm import OCSVM
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.auto_encoder_torch import AutoEncoder
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Model

In [None]:
def metrics_hist(precision, recall, f1_score, auroc, auprc, fpr, fnr, fig_name):
    """Draw a bar chart of the seven metrics (as percentages), save it and show it.

    Args:
        precision, recall, f1_score, auroc, auprc, fpr, fnr: metric values
            expressed as fractions; each one is multiplied by 100 for display.
            Note that the ``f1_score`` parameter shadows the scikit-learn
            function of the same name inside this function only.
        fig_name: file stem of the PNG written to ``../images/metrics/``.

    Returns:
        None. The figure is written to disk and displayed.
    """
    import os  # local import: only needed to prepare the output directory

    metrics = ["Precision", "Recall", "F1-score", "AUROC", "AUPRC", "FPR", "FNR"]
    values = [precision, recall, f1_score, auroc, auprc, fpr, fnr]

    percentages = [v * 100 for v in values]
    colors = ["blue", "green", "red", "orange", "magenta", "yellow", "pink"]

    plt.figure(figsize=(5, 10))
    bars = plt.bar(metrics, percentages, color=colors, width=1)
    plt.ylim(0, 100)

    # Print the exact percentage just above each bar.
    for bar, value in zip(bars, percentages):
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2.0, yval + 1, f'{value:.2f}%', ha='center', va='bottom')

    plt.title('Metrics')
    plt.ylabel('Percentage')

    output_path = f"../images/metrics/{fig_name}.png"
    # savefig does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError after the whole experiment has run.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path)
    plt.show()

In [None]:
def compute_metrics(y_true, y_pred, inliers_label, outliers_label):
    """Print and return binary detection metrics, treating outliers as the positive class.

    Both label arrays are converted to a 0 (inlier) / 1 (outlier) encoding on
    fresh arrays, so the caller's ``y_true`` and ``y_pred`` are left untouched.

    Args:
        y_true: ground-truth labels, containing only ``inliers_label`` and
            ``outliers_label``.
        y_pred: predicted labels in the same encoding as ``y_true``.
        inliers_label: value that marks an inlier in both arrays.
        outliers_label: value that marks an outlier in both arrays.

    Returns:
        Tuple ``(precision, recall, f1, auroc, auprc, fpr, fnr)``.
        AUROC and AUPRC are computed from the hard 0/1 predictions, not from
        continuous anomaly scores.

    Raises:
        ValueError: if either array contains a value that is neither
            ``inliers_label`` nor ``outliers_label``.
    """

    def _to_binary(values, name):
        # One-shot mapping: a sequential "inlier -> 0, then outlier -> 1"
        # relabelling corrupts the data when the two encodings overlap
        # (e.g. inliers_label=1, outliers_label=0 turns everything into 1).
        values = np.asarray(values)
        is_outlier = values == outliers_label
        is_inlier = values == inliers_label
        if not np.all(is_outlier | is_inlier):
            raise ValueError(
                f"{name} contains labels other than {inliers_label!r} and {outliers_label!r}")
        return is_outlier.astype(int)

    y_true = _to_binary(y_true, "y_true")
    y_pred = _to_binary(y_pred, "y_pred")

    print(f"true_inliers: {np.sum(y_true == 0)}")
    print(f"true_outliers: {np.sum(y_true == 1)}")
    print(f"pred_inliers: {np.sum(y_pred == 0)}")
    print(f"pred_outliers: {np.sum(y_pred == 1)}")

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auroc = roc_auc_score(y_true, y_pred)
    auprc = average_precision_score(y_true, y_pred)

    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking below
    # cannot fail; the matrix is computed once and reused for printing.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print(f"AUROC: {auroc}")
    print(f"AUPRC: {auprc}")
    print(f"False Positive Rate (FPR): {fpr}")
    print(f"False Negative Rate (FNR): {fnr}")
    print("Confusion Matrix:")
    print(matrix)
    print(f"tp: {tp}, fp: {fp}, tn: {tn}, fn: {fn}")

    return precision, recall, f1, auroc, auprc, fpr, fnr

In [None]:
def data_stats(inliers_label, outliers_label, labels):
    """Print the number of inliers and outliers in ``labels`` and each group's share.

    The ratios are taken over the inlier and outlier counts combined, so any
    other label value is ignored. Nothing is returned.
    """
    inlier_count = np.sum(labels == inliers_label)
    outlier_count = np.sum(labels == outliers_label)
    total = inlier_count + outlier_count

    print(f"num_inliers: {inlier_count}, inliers_ratio: {inlier_count / total}")
    print(f"num_outliers: {outlier_count}, outliers_ratio: {outlier_count / total}")

In [None]:
def label_data(inliers_label, outliers_label, labels):
    """Encode "Success"/"Fail" labels as integers and print the class balance.

    Args:
        inliers_label: integer that replaces every "Success" entry.
        outliers_label: integer that replaces every "Fail" entry.
        labels: NumPy array of label strings. It is not modified.

    Returns:
        A new ``np.int8`` array with the encoded labels. The ``astype`` call
        raises if an entry cannot be converted to an integer, i.e. if a label
        is neither "Success" nor "Fail".
    """
    # Work on a copy: the array usually comes from `df["Label"].values`, which
    # is read-only when pandas Copy-on-Write is active, and writing into it
    # would otherwise also alter the caller's data.
    encoded = labels.copy()
    encoded[labels == "Success"] = inliers_label
    encoded[labels == "Fail"] = outliers_label
    encoded = encoded.astype(np.int8)
    data_stats(inliers_label, outliers_label, encoded)
    return encoded

In [None]:
def ensemble_ocsvm(classifier, data, labels, inliers_label, outliers_label, num_models=5, kernel="rbf", train_size=0.2, contamination=0.3, gamma="scale"):
    """Train several One-Class SVMs on random subsamples and combine them by majority vote.

    Args:
        classifier: estimator class, called as
            ``classifier(kernel=..., nu=..., gamma=...)``; instances must
            provide ``fit`` and ``predict``.
        data: feature matrix; every model predicts on all of it.
        labels: labels aligned with ``data``. They are only passed to
            ``train_test_split`` and never used for fitting.
        inliers_label: value the classifier's ``predict`` uses for inliers.
        outliers_label: value the classifier's ``predict`` uses for outliers.
        num_models: number of models in the ensemble.
        kernel, gamma: forwarded to the classifier.
        train_size: size of each random training subsample.
        contamination: forwarded to the classifier as ``nu``.

    Returns:
        Array with one entry per row of ``data``: ``outliers_label`` where a
        strict majority of the models predicted an outlier, otherwise
        ``inliers_label``.
    """
    models = []

    for _ in tqdm(range(num_models), desc="Training One-Clss SVMs"):
        # random_state=None gives every model a different subsample.
        x_train = train_test_split(data, labels, train_size=train_size, random_state=None)[0]
        ocsvm = classifier(kernel=kernel, nu=contamination, gamma=gamma)
        ocsvm.fit(x_train)
        models.append(ocsvm)

    # Count outlier votes per sample.
    outlier_votes = np.zeros(data.shape[0], dtype=int)
    for model in tqdm(models, desc="Computing predictions"):
        outlier_votes += (model.predict(data) == outliers_label)

    # Strict majority. The previous `>= num_models // 2` threshold flagged a
    # sample with only 2 of 5 votes, and with a single model the threshold was
    # 0, so every sample was reported as an outlier.
    is_outlier = outlier_votes * 2 > num_models

    return np.where(is_outlier, outliers_label, inliers_label)

In [None]:
def occurrence_matrix_preprocessing(inliers_label, outliers_label, num_events):
    """Load the HDFS event occurrence matrix and return scaled features with integer labels.

    Args:
        inliers_label: integer assigned to "Success" rows.
        outliers_label: integer assigned to "Fail" rows.
        num_events: number of event columns to read (``E1`` .. ``E<num_events>``).

    Returns:
        ``(features, labels)``: the min-max scaled event columns and the
        labels produced by ``label_data``.
    """
    occurrence_df = pd.read_csv('../datasets/HDFS_v1/preprocessed/Event_occurrence_matrix.csv')

    # Select E1..E<num_events> and scale each column with MinMaxScaler.
    wanted_columns = [f"E{idx}" for idx in range(1, num_events + 1)]
    features = MinMaxScaler().fit_transform(occurrence_df[wanted_columns].values)

    encoded_labels = label_data(inliers_label, outliers_label, occurrence_df["Label"].values)

    return features, encoded_labels

In [None]:
def event_traces_preprocessing(inliers_label, outliers_label, window_size, step_size):
    """Cut every HDFS event trace into sliding windows and return them scaled, with labels.

    Windows start every ``step_size`` events. Each window is the concatenation
    of a slice of event ids, the matching slice of time intervals and the
    trace latency; a window that runs past the end of the trace is padded with
    zeros after both slices. A window is labelled ``outliers_label`` only when
    it reaches the end of a trace whose label is "Fail"; every other window is
    labelled ``inliers_label``.

    Returns:
        ``(windows, window_labels)``: the min-max scaled window matrix and the
        per-window labels, in the same row order.
    """

    def encode_label(raw_label: str) -> int:
        if raw_label == "Fail":
            return outliers_label
        return inliers_label

    def parse_events(raw_trace: str) -> List[str]:
        # Drop the first and last character (the enclosing brackets), split on commas.
        inner = raw_trace[1:-1]
        return inner.split(",")

    def parse_intervals(raw_intervals: str) -> List[float]:
        return list(map(float, raw_intervals[1:-1].split(",")))

    def windows_for_trace(row):
        intervals = row["TimeInterval"]
        trace_latency = row["Latency"]
        trace_failed = row["Label"] == outliers_label

        # Event names are "<letter><number>"; keep only the numeric part.
        event_ids = [int(name[1:]) for name in row["Features"]]
        n_events = len(event_ids)
        # NOTE(review): presumably TimeInterval has one entry fewer than
        # Features, and this 0 aligns the two lists - confirm against the CSV.
        intervals.append(0)

        trace_windows = []
        trace_labels = []
        for start in range(0, n_events, step_size):
            stop = start + window_size
            overshoot = stop - n_events  # > 0 when the window runs past the trace
            if overshoot > 0:
                padding = [0] * overshoot
                window = event_ids[start:] + padding + intervals[start:] + padding + [trace_latency]
            else:
                window = event_ids[start:stop] + intervals[start:stop] + [trace_latency]

            reaches_end = overshoot >= 0
            if reaches_end and trace_failed:
                trace_labels.append(outliers_label)
            else:
                trace_labels.append(inliers_label)
            trace_windows.append(window)

        return trace_labels, trace_windows

    traces_df = pd.read_csv('../datasets/HDFS_v1/preprocessed/Event_traces.csv')
    traces_df["Label"] = traces_df["Label"].apply(encode_label)
    traces_df["Features"] = traces_df["Features"].apply(parse_events)
    traces_df["TimeInterval"] = traces_df["TimeInterval"].apply(parse_intervals)

    # One (labels, windows) pair per trace; flatten both across traces.
    per_trace = traces_df.apply(windows_for_trace, axis=1)
    windows = np.concatenate(per_trace.apply(lambda pair: pair[1]).values, axis=0)
    window_labels = np.concatenate(per_trace.apply(lambda pair: pair[0]).values, axis=0)

    windows = MinMaxScaler().fit_transform(windows)

    data_stats(inliers_label, outliers_label, window_labels)

    return windows, window_labels

In [None]:
def ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, kernel="rbf", train_size=0.2, contamination=0.3, gamma="scale", window_size=30, step_size=1, experiment_type="occurrence_matrix"):
    """Fit one One-Class SVM on a training split and report metrics on the held-out split.

    Args:
        classifier: estimator class, built as ``classifier(kernel=, nu=, gamma=)``.
        inliers_label, outliers_label: label values used by the classifier's
            predictions; the ground truth is encoded the same way.
        title: file stem of the saved metrics figure.
        num_events: number of event columns (occurrence-matrix features only).
        kernel, gamma: forwarded to the classifier.
        train_size: share of the data used for fitting.
        contamination: forwarded to the classifier as ``nu``.
        window_size, step_size: windowing parameters (event-trace features only).
        experiment_type: ``"occurrence_matrix"`` or ``"event_trace"``; any
            other value makes the function return without doing anything.
    """
    if experiment_type == "occurrence_matrix":
        features, targets = occurrence_matrix_preprocessing(inliers_label, outliers_label, num_events)
    elif experiment_type == "event_trace":
        features, targets = event_traces_preprocessing(inliers_label, outliers_label, window_size, step_size)
    else:
        return None

    # Fixed seed so the train/test split is the same on every run.
    train_features, test_features, _, test_targets = train_test_split(
        features, targets, train_size=train_size, random_state=42)

    detector = classifier(kernel=kernel, nu=contamination, gamma=gamma)
    detector.fit(train_features)
    test_predictions = detector.predict(test_features)

    scores = compute_metrics(test_targets, test_predictions, inliers_label, outliers_label)
    metrics_hist(*scores, title)

In [None]:
def ensemble_ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, num_models=5, kernel="rbf", train_size=0.2, contamination=0.3, gamma="scale", window_size=30, step_size=1, experiment_type="occurrence_matrix"):
    """Run the One-Class SVM ensemble on the whole dataset and report its metrics.

    The ensemble predicts on all of the data (there is no held-out split),
    and the metrics are computed against all labels.

    Args:
        classifier: estimator class handed to ``ensemble_ocsvm``.
        inliers_label, outliers_label: label values used by the classifier.
        title: file stem of the saved metrics figure.
        num_events: number of event columns (occurrence-matrix features only).
        num_models, kernel, train_size, contamination, gamma: forwarded to
            ``ensemble_ocsvm``.
        window_size, step_size: windowing parameters (event-trace features only).
        experiment_type: ``"occurrence_matrix"`` or ``"event_trace"``; any
            other value makes the function return without doing anything.
    """
    if experiment_type == "occurrence_matrix":
        features, targets = occurrence_matrix_preprocessing(inliers_label, outliers_label, num_events)
    elif experiment_type == "event_trace":
        features, targets = event_traces_preprocessing(inliers_label, outliers_label, window_size, step_size)
    else:
        return None

    ensemble_options = dict(
        classifier=classifier,
        data=features,
        labels=targets,
        inliers_label=inliers_label,
        outliers_label=outliers_label,
        num_models=num_models,
        kernel=kernel,
        train_size=train_size,
        contamination=contamination,
        gamma=gamma,
    )
    predicted = ensemble_ocsvm(**ensemble_options)

    scores = compute_metrics(targets, predicted, inliers_label, outliers_label)
    metrics_hist(*scores, title)

In [None]:
def deep_svdd_experiment(inliers_label, outliers_label, title, num_events, train_size=0.2, window_size=30, step_size=1, experiment_type="occurrence_matrix",
                         c=None, use_ae=False, hidden_neurons=None, hidden_activation='relu', output_activation='sigmoid', optimizer='adam', epochs=100, 
                         batch_size=32, dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1, preprocessing=True, verbose=1, random_state=None, contamination=0.1):
    """Fit a PyOD ``DeepSVDD`` model on a training split and report metrics on the rest.

    Args:
        inliers_label, outliers_label: label values used for ground truth and
            expected from the model's ``predict``.
        title: file stem of the saved metrics figure.
        num_events: number of event columns (occurrence-matrix features only).
        train_size: share of the data used for fitting.
        window_size, step_size: windowing parameters (event-trace features only).
        experiment_type: ``"occurrence_matrix"`` or ``"event_trace"``; any
            other value makes the function return without doing anything.
        c ... contamination: forwarded unchanged to ``DeepSVDD``.
    """
    if experiment_type == "occurrence_matrix":
        features, targets = occurrence_matrix_preprocessing(inliers_label, outliers_label, num_events)
    elif experiment_type == "event_trace":
        features, targets = event_traces_preprocessing(inliers_label, outliers_label, window_size, step_size)
    else:
        return None

    # Fixed seed so the train/test split is the same on every run.
    train_features, test_features, _, test_targets = train_test_split(
        features, targets, train_size=train_size, random_state=42)

    svdd_options = dict(
        c=c,
        use_ae=use_ae,
        hidden_neurons=hidden_neurons,
        hidden_activation=hidden_activation,
        output_activation=output_activation,
        optimizer=optimizer,
        epochs=epochs,
        batch_size=batch_size,
        dropout_rate=dropout_rate,
        l2_regularizer=l2_regularizer,
        validation_size=validation_size,
        preprocessing=preprocessing,
        verbose=verbose,
        random_state=random_state,
        contamination=contamination,
    )
    detector = DeepSVDD(**svdd_options)
    detector.fit(train_features)
    test_predictions = detector.predict(test_features)

    scores = compute_metrics(test_targets, test_predictions, inliers_label, outliers_label)
    metrics_hist(*scores, title)

In [None]:
def ae_ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, kernel="rbf", gamma="scale", train_size=0.2, window_size=30, step_size=1, experiment_type="occurrence_matrix",
                         hidden_neurons=None, hidden_activation='relu', output_activation='sigmoid', optimizer='adam', epochs=100, 
                         batch_size=32, dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1, preprocessing=True, verbose=1, random_state=None, contamination=0.1):
    """Train a Keras-backed PyOD autoencoder, then fit a One-Class SVM on its encoded features.

    Args:
        classifier: One-Class SVM class, built as ``classifier(kernel=, gamma=, nu=)``.
        inliers_label, outliers_label: label values used by the classifier's
            predictions; the ground truth is encoded the same way.
        title: file stem of the saved metrics figure.
        num_events: number of event columns (occurrence-matrix features only).
        kernel, gamma: forwarded to the classifier.
        train_size: share of the data used for fitting.
        window_size, step_size: windowing parameters (event-trace features only).
        experiment_type: ``"occurrence_matrix"`` or ``"event_trace"``; any
            other value makes the function return without doing anything.
        hidden_neurons ... random_state: forwarded to the autoencoder.
        contamination: forwarded to the autoencoder and, as ``nu``, to the
            classifier.
    """
    # The notebook's import cell binds the name `AutoEncoder` twice (from
    # pyod.models.auto_encoder and then pyod.models.auto_encoder_torch), so the
    # module-level name refers to the torch class. This function passes
    # Keras-style arguments and reads `model_.input`, so import the class it
    # actually needs under an unambiguous local name.
    from pyod.models.auto_encoder import AutoEncoder as KerasAutoEncoder

    if experiment_type == "occurrence_matrix":
        data, labels = occurrence_matrix_preprocessing(inliers_label, outliers_label, num_events)
    elif experiment_type == "event_trace":
        data, labels = event_traces_preprocessing(inliers_label, outliers_label, window_size, step_size)
    else:
        return

    x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=train_size, random_state=42)
    autoencoder = KerasAutoEncoder(hidden_neurons=hidden_neurons, hidden_activation=hidden_activation, output_activation=output_activation,
                     optimizer=optimizer, epochs=epochs, batch_size=batch_size, dropout_rate=dropout_rate, l2_regularizer=l2_regularizer, 
                     validation_size=validation_size, preprocessing=preprocessing, verbose=verbose, random_state=random_state, 
                     contamination=contamination)

    autoencoder.fit(x_train)

    # Truncate the fitted network at the layer taken to be the bottleneck.
    # NOTE(review): this index assumes one Keras layer per entry of
    # hidden_neurons; if the model interleaves other layers (e.g. dropout) the
    # index points elsewhere - check autoencoder.model_.summary().
    bottleneck_layer = len(hidden_neurons) // 2 if hidden_neurons else 1
    encoder_model = Model(inputs=autoencoder.model_.input, outputs=autoencoder.model_.get_layer(index=bottleneck_layer).output)

    # NOTE(review): with preprocessing=True the autoencoder may have been
    # trained on standardised inputs, while the raw splits are encoded here -
    # confirm against the PyOD implementation.
    x_train_encoded = encoder_model.predict(x_train)
    x_test_encoded = encoder_model.predict(x_test)

    ocsvm = classifier(kernel=kernel, gamma=gamma, nu=contamination)
    ocsvm.fit(x_train_encoded)

    y_test_pred = ocsvm.predict(x_test_encoded)

    precision, recall, f1, auroc, auprc, fpr, fnr = compute_metrics(y_test, y_test_pred, inliers_label, outliers_label)
    metrics_hist(precision, recall, f1, auroc, auprc, fpr, fnr, title)

In [None]:
def ae_torch_ocsvm_experiment(classifier, inliers_label, outliers_label, title, num_events, kernel="rbf", gamma="scale", train_size=0.2, window_size=30, step_size=1, experiment_type="occurrence_matrix",
                         hidden_neurons=None, hidden_activation='relu', batch_norm=True, learning_rate=0.001, epochs=100, batch_size=32, dropout_rate=0.2, weight_decay=1e-05, preprocessing=True, 
                         contamination=0.1, device=None):
    """Train a torch-backed PyOD autoencoder, then fit a One-Class SVM on its encoded features.

    Args:
        classifier: One-Class SVM class, built as ``classifier(kernel=, gamma=, nu=)``.
        inliers_label, outliers_label: label values used by the classifier's
            predictions; the ground truth is encoded the same way.
        title: file stem of the saved metrics figure.
        num_events: number of event columns (occurrence-matrix features only).
        kernel, gamma: forwarded to the classifier.
        train_size: share of the data used for fitting.
        window_size, step_size: windowing parameters (event-trace features only).
        experiment_type: ``"occurrence_matrix"`` or ``"event_trace"``; any
            other value makes the function return without doing anything.
        hidden_neurons ... weight_decay, preprocessing, device: forwarded to
            the autoencoder.
        contamination: forwarded to the autoencoder and, as ``nu``, to the
            classifier.
    """
    # Import under an explicit local name so this function does not depend on
    # which of the two `AutoEncoder` imports in the notebook's import cell
    # happens to be bound last.
    from pyod.models.auto_encoder_torch import AutoEncoder as TorchAutoEncoder

    if experiment_type == "occurrence_matrix":
        data, labels = occurrence_matrix_preprocessing(inliers_label, outliers_label, num_events)
    elif experiment_type == "event_trace":
        data, labels = event_traces_preprocessing(inliers_label, outliers_label, window_size, step_size)
    else:
        return

    x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=train_size, random_state=42)
    autoencoder = TorchAutoEncoder(hidden_neurons=hidden_neurons, hidden_activation=hidden_activation, batch_norm=batch_norm, learning_rate=learning_rate,
                            epochs=epochs, batch_size=batch_size, dropout_rate=dropout_rate, weight_decay=weight_decay, preprocessing=preprocessing, 
                            contamination=contamination, device=device)

    autoencoder.fit(x_train)

    # NOTE(review): slicing assumes autoencoder.model_ is sliceable like
    # nn.Sequential and that the bottleneck sits at this index - confirm
    # against the installed PyOD version.
    bottleneck_layer = len(hidden_neurons) // 2 + 1 if hidden_neurons else 2
    encoder_model = autoencoder.model_[:bottleneck_layer] 

    # `nn` is this notebook's alias for the `torch` package.
    # Inference mode: without eval() dropout and batch-norm stay in training
    # mode and the encodings are not deterministic; no_grad() skips autograd.
    # NOTE(review): with preprocessing=True the autoencoder may have been
    # trained on standardised inputs, while the raw splits are encoded here -
    # confirm against the PyOD implementation.
    encoder_model.eval()
    with nn.no_grad():
        x_train_encoded = encoder_model(nn.tensor(x_train).float()).detach().numpy()
        x_test_encoded = encoder_model(nn.tensor(x_test).float()).detach().numpy()

    # The former `encoder_model.predict(...)` calls were leftovers from the
    # Keras variant: a torch module has no `predict`, and they overwrote the
    # encodings computed above.

    ocsvm = classifier(kernel=kernel, gamma=gamma, nu=contamination)
    ocsvm.fit(x_train_encoded)

    y_test_pred = ocsvm.predict(x_test_encoded)

    precision, recall, f1, auroc, auprc, fpr, fnr = compute_metrics(y_test, y_test_pred, inliers_label, outliers_label)
    metrics_hist(precision, recall, f1, auroc, auprc, fpr, fnr, title)

In [None]:
# ----------------------------------------------------------------------
# Experiment: scikit-learn OneClassSVM (default experiment_type)
# ----------------------------------------------------------------------
# Inliers are encoded as 1 and outliers as -1 for this classifier.

num_events = 29
train_size = 0.1
contamination = 0.1
kernel, gamma = "rbf", "scale"

ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    num_events=num_events,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
    title=f"hdfs-scikit-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}",
)


In [None]:
# ----------------------------------------------------------------------
# Experiment: PyOD OCSVM (default experiment_type)
# ----------------------------------------------------------------------
# Inliers are encoded as 0 and outliers as 1 for this classifier.

num_events = 29
train_size = 0.8
contamination = 0.1
kernel, gamma = "rbf", "scale"

ocsvm_experiment(
    classifier=OCSVM,
    inliers_label=0,
    outliers_label=1,
    num_events=num_events,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
    title=f"hdfs-pyod-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}",
)


In [None]:
# ----------------------------------------------------------------------
# Experiment: ensemble of scikit-learn OneClassSVMs (default experiment_type)
# ----------------------------------------------------------------------
# Inliers are encoded as 1 and outliers as -1 for this classifier.

num_events = 29
num_models = 5
train_size = 0.01
contamination = 0.3
kernel, gamma = "rbf", "scale"

ensemble_ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    num_events=num_events,
    num_models=num_models,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
    title=f"hdfs-scikit-ensemble-ocsvm-num_models={num_models}-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}",
)

In [None]:
# ----------------------------------------------------------------------
# Experiment: ensemble of PyOD OCSVMs (default experiment_type)
# ----------------------------------------------------------------------
# Inliers are encoded as 0 and outliers as 1 for this classifier.

num_events = 29
num_models = 5
train_size = 0.01
contamination = 0.3
kernel, gamma = "rbf", "scale"

ensemble_ocsvm_experiment(
    classifier=OCSVM,
    inliers_label=0,
    outliers_label=1,
    num_events=num_events,
    num_models=num_models,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
    title=f"hdfs-pyod-ensemble-ocsvm-num_models={num_models}-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}",
)

In [None]:
# ----------------------------------------------------------------------
# Experiment: scikit-learn OneClassSVM on event-trace windows
# ----------------------------------------------------------------------
# Inliers are encoded as 1 and outliers as -1 for this classifier.

num_events = 29
train_size = 0.01
contamination = 0.3
kernel, gamma = "rbf", "scale"
window_size, step_size = 30, 1

ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1,
    experiment_type="event_trace",
    num_events=num_events,
    window_size=window_size,
    step_size=step_size,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
    title=f"hdfs-scikit-event-trace-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
)

In [None]:
# ----------------------------------------------------------------------
# Experiment: PyOD OCSVM on event-trace windows
# ----------------------------------------------------------------------
# Inliers are encoded as 0 and outliers as 1 for this classifier.

num_events = 29
train_size = 0.01
contamination = 0.3
kernel, gamma = "rbf", "scale"
window_size, step_size = 30, 1

ocsvm_experiment(
    classifier=OCSVM,
    inliers_label=0,
    outliers_label=1,
    experiment_type="event_trace",
    num_events=num_events,
    window_size=window_size,
    step_size=step_size,
    train_size=train_size,
    kernel=kernel,
    gamma=gamma,
    contamination=contamination,
    title=f"hdfs-pyod-event-trace-ocsvm-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
)

In [None]:
################################################################################
################ PyOD Ensemble Event-Trace OCSVMs Experiment ###################
################################################################################

train_size = 0.01
num_events = 29
kernel = "rbf"
contamination = 0.3
num_models = 5
gamma = "scale"
window_size = 30
step_size = 1


ensemble_ocsvm_experiment(
    classifier=OCSVM, 
    inliers_label=0, 
    outliers_label=1, 
    # The title is the figure's file name. Include the windowing parameters,
    # as the other event-trace cells do, so runs with different window
    # settings do not overwrite each other's figure.
    title=f"hdfs-pyod-event-trace-ensemble-ocsvm-num_models={num_models}-train_size={train_size}-kernel={kernel}-nu={contamination}-gamma={gamma}-window_size={window_size}-step_size={step_size}",
    num_events=num_events, 
    num_models=num_models, 
    kernel=kernel, 
    train_size=train_size, 
    contamination=contamination, 
    gamma=gamma,
    window_size=window_size,
    step_size=step_size,
    experiment_type="event_trace")

In [None]:
# ----------------------------------------------------------------------
# Experiment: PyOD Deep-SVDD
# ----------------------------------------------------------------------
# Inliers are encoded as 0 and outliers as 1 for this model.

experiment_type = "occurrence_matrix"
num_events = 29
train_size = 0.2
contamination = 0.3
epochs, batch_size = 5, 32
window_size, step_size = 30, 1

deep_svdd_experiment(
    inliers_label=0,
    outliers_label=1,
    experiment_type=experiment_type,
    num_events=num_events,
    window_size=window_size,
    step_size=step_size,
    train_size=train_size,
    epochs=epochs,
    batch_size=batch_size,
    contamination=contamination,
    verbose=2,
    title=f"hdfs-deep-svdd-{experiment_type}-train_size={train_size}-epochs={epochs}-contamination={contamination}-window_size={window_size}-step_size={step_size}",
)

In [None]:
############################################################
################ PyOD AE-OCSVM Experiment ##################
############################################################

train_size = 0.2
num_events = 29
contamination = 0.3
experiment_type = "occurrence_matrix"
epochs = 5
batch_size = 32
gamma = "auto"
kernel = "rbf"
hidden_neurons = [20, 10, 20]


ae_ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1, 
    title=f"hdfs-ae-ocsvm-{experiment_type}-train_size={train_size}-epochs={epochs}-contamination={contamination}",
    num_events=num_events, 
    kernel=kernel,
    gamma=gamma,
    train_size=train_size,  
    experiment_type=experiment_type,
    # hidden_neurons was defined above but never forwarded, so the autoencoder
    # silently ran with its default layer sizes.
    hidden_neurons=hidden_neurons,
    epochs=epochs,
    batch_size=batch_size, 
    verbose=2, 
    contamination=contamination)

In [None]:
#################################################################
################ PyOD Torch AE-OCSVM Experiment #################
#################################################################

train_size = 0.8
num_events = 29
contamination = 0.3
experiment_type = "event_trace"
epochs = 20
batch_size = 32
gamma = "auto"
kernel = "rbf"
hidden_neurons = [20, 10, 20]
window_size = 30
step_size = 1


ae_torch_ocsvm_experiment(
    classifier=OneClassSVM,
    inliers_label=1,
    outliers_label=-1, 
    title=f"hdfs-torch-ae-ocsvm-{experiment_type}-train_size={train_size}-epochs={epochs}-contamination={contamination}-window_size={window_size}-step_size={step_size}",
    num_events=num_events, 
    kernel=kernel,
    gamma=gamma,
    train_size=train_size,  
    experiment_type=experiment_type,
    # hidden_neurons was defined above but never forwarded, so the autoencoder
    # silently ran with its default layer sizes.
    # NOTE(review): confirm how the torch AutoEncoder interprets this list
    # (full symmetric layout vs. encoder half only) before relying on it.
    hidden_neurons=hidden_neurons,
    epochs=epochs,
    batch_size=batch_size,  
    contamination=contamination,
    window_size=window_size,
    step_size=step_size)