### Use Antagonist to train a symptom detection model 

#### Reproducibility

In [1]:
# Libraries whose random number generators are pinned below.
import random

import numpy as np
import torch

# Seed the Python, NumPy and PyTorch generators with the same fixed value
# so that repeated runs of the notebook start from the same random state.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Additionally request deterministic algorithm implementations from PyTorch.
torch.use_deterministic_algorithms(True)

#### Dataset preparation

Note: the dataset needs to be downloaded using the script `download_SMD_dataset.sh` in the `scripts/antagonist-ml` folder.

In [2]:
import os 
from utils import SMD

In [3]:
# Data directory, three levels above the working directory. The path is built
# from portable components rather than hard-coded back-slashes so that the
# notebook resolves it on Linux/macOS as well as on Windows.
data_folder = os.path.join(os.pardir, os.pardir, os.pardir, "data")

# Dataset helper (SMD, imported from utils) pointed at the
# 'ServerMachineDataset' sub-folder of the data directory.
db = SMD(dataset_folder=os.path.join(data_folder, 'ServerMachineDataset'))

In [4]:
# Training split of "Group 1"; the second return value is not needed here.
dataframes_train, _ = db.read_dataset(
    group_name="Group 1",
    train=True,
    retrieve_labels=True,
)

# Non-training split of the same group. `files` is kept because its entries
# are used further down as the identifier of the matching dataframe.
dataframes, files = db.read_dataset(
    group_name="Group 1",
    train=False,
    retrieve_labels=True,
)

In [5]:
# Work on a single service, picked by position in the loaded lists.
service_idx = 0
service_id = files[service_idx]
df = dataframes[service_idx]

# Split the test frame: the 'label' column goes to `labels`, while `df` keeps
# every column except the last two, plus 'timestamp'.
# `labels` must be taken before `df` is narrowed.
labels = df[['label']]
df = df[[*df.columns[:-2], 'timestamp']]

# Training frame of the same service: every column except the last one,
# plus 'timestamp'.
df_train = dataframes_train[service_idx]
df_train = df_train[[*df_train.columns[:-1], 'timestamp']]

In [6]:
# Fetch the interpretation labels of the selected service from the dataset helper.
# NOTE(review): the structure of the returned value is not visible in this
# notebook — confirm against utils.SMD before relying on it.
network_incidents = db.get_interpretation_labels(service_id)

#### ML Model Loading

In [7]:
import datetime
import pandas as pd
from ml_ad import AENetworkAnomaly

In [8]:
# Reference date of this run (midnight). It names the model folder and is the
# upper bound for the rows used in training and evaluation.
current_day = datetime.datetime(year=2020, month=1, day=14)

In [9]:
# Models live in a 'models' directory under the data folder.
# The directory is created on first use and left untouched if it already exists.
models_folder = os.path.join(data_folder, "models")
os.makedirs(models_folder, exist_ok=True)

In [10]:
# The model name carries the run date as YYYYMMDD (ae_model_20200114 for the
# date configured above); its folder sits inside the models directory.
model_name = 'ae_model_' + current_day.strftime("%Y%m%d")
model_folder = os.path.join(models_folder, model_name)

In [None]:
if os.path.exists(model_folder):
    # A model for this day has already been stored: load it instead of retraining.
    ml_model = AENetworkAnomaly.load(model_folder)
else:
    # No stored model yet: train a new one.
    # The number of inputs is every column of `df` except 'timestamp'.
    ml_model = AENetworkAnomaly(n_inputs=df.shape[1]-1)

    # Select the rows that are both before the current day and labelled
    # normal (label == 0). `df` and `labels` are slices of the same frame, so
    # the two masks share one index and can be combined directly. This avoids
    # assuming that the time-filtered rows are exactly the first rows of
    # `labels`.
    # NOTE(review): the comparison relies on pandas interpreting the ctime()
    # string as a date — confirm 'timestamp' is a datetime column.
    mask_today = df["timestamp"] < current_day.ctime()
    mask_train = mask_today & (labels["label"] == 0)

    # Training set = original training data + normal rows seen up to today.
    df_today = pd.concat([df_train, df.loc[mask_train]], ignore_index=True)

    # Train the model on the feature columns only.
    X_train = df_today.drop('timestamp',axis=1).values
    ml_model.fit(X_train)

    # Persist the model so the next run takes the load branch above.
    ml_model.store(model_folder)

#### Comparing with champion models

In [12]:
from sklearn.metrics import classification_report

In [13]:
# Baseline ("champion") model that the current model is compared against.
# It is looked up by a fixed name inside the models directory.
champion_model_name = 'ae_model_20200113'
ch_models_folder = os.path.join(models_folder, champion_model_name)

# NOTE(review): unlike the current model, there is no existence check before
# loading — presumably this folder must already be present; confirm.
ch_model = AENetworkAnomaly.load(ch_models_folder)

In [14]:
# Evaluation window: rows strictly before the current day.
# One boolean mask selects both the features and the labels. They stay
# aligned row by row even if the selected rows are not the first rows of the
# frame, which the previous positional slice of `labels` assumed.
# NOTE(review): when the current model is trained in this notebook, the normal
# rows of this window are also part of its training data.
eval_mask = df["timestamp"] < current_day.ctime()
eval_data = df.loc[eval_mask]
eval_labels = labels.loc[eval_mask, "label"]

In [15]:
# Feature matrix for evaluation: every column except 'timestamp'.
X_eval = eval_data.drop(columns='timestamp').values

# Predictions of the current model and of the champion on the same inputs,
# cast to integers so they can be compared with the labels.
y_pred_curr = ml_model.predict(X_eval, aggregate=True).astype(int)
y_pred_champ = ch_model.predict(X_eval, aggregate=True).astype(int)

In [16]:
# Champion model: per-class precision/recall/F1 on the evaluation window
# (the rows before current_day, not the whole test set).
# zero_division=1 sets the value reported for a metric whose denominator is zero.
print(classification_report(eval_labels,y_pred_champ,zero_division=1))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     17163
           1       0.79      0.67      0.72      1557

    accuracy                           0.96     18720
   macro avg       0.88      0.83      0.85     18720
weighted avg       0.96      0.96      0.96     18720



In [17]:
# Current model: same report on the same evaluation window, for a side-by-side
# comparison with the champion model.
print(classification_report(eval_labels,y_pred_curr,zero_division=1))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     17163
           1       0.85      0.65      0.74      1557

    accuracy                           0.96     18720
   macro avg       0.91      0.82      0.86     18720
weighted avg       0.96      0.96      0.96     18720

