### Use Antagonist to train a symptom detection model 

#### Reproducibility

Set seeds to ensure reproducible results.

In [None]:
# Seed every random number generator used in this notebook so that repeated
# runs produce the same results.
import random

import numpy as np
import torch

# Python standard-library generator
random.seed(0)

# NumPy global generator
np.random.seed(0)

# PyTorch generator, plus a request for deterministic algorithms
torch.manual_seed(0)
torch.use_deterministic_algorithms(True)

#### Dataset preparation

Note: the dataset needs to be loaded into InfluxDB before running the cells below.

In [None]:
import datetime
import pandas as pd
from utils import SMDInfluxDB

In [None]:
# Import the antagonist_ml methods.
# The parent of the working directory is appended to the module search path
# first (presumably that is where the antagonist_ml package lives — confirm).
import sys 
sys.path.append("..")
from antagonist_ml.service import get_network_symptoms_labels, store_network_anomalies_labels, store_network_symptom_labels

In [None]:
# Dataset selection: machine group and machine identifier
group, machine_id = "Group-1", "machine-1-1"

In [None]:
# Query window: the 365 days ending now, wide enough to read the whole dataset
end = datetime.datetime.now()
start = end - datetime.timedelta(weeks=52, days=1)

In [None]:
# Read the dataset of the selected machine between `start` and `end`
# through the SMDInfluxDB helper
db = SMDInfluxDB()
dataframes, machines = db.read_dataset(
    machine_name=machine_id, start_date=start, end_date=end
)

In [None]:
# Work on the first returned dataframe, keeping every column after the first
# one and placing "timestamp" last
# NOTE(review): this presumes "timestamp" is the first column — confirm
df = dataframes[0]
df = df[[*df.columns[1:], "timestamp"]]

In [None]:
# Fetch the symptom labels with source type "human" tagged with this machine,
# restricted to the start/end query window (sent as epoch seconds)
ground_truth = get_network_symptoms_labels(
    "localhost:5001",
    tags={"machine": machine_id},
    source_type="human",
    start_timestamp=start.timestamp(),
    end_timestamp=end.timestamp(),
)

In [None]:
# Build one binary label per row: 1 when the row's timestamp lies inside any
# ground-truth symptom interval (both bounds included), 0 otherwise
timestamps = df["timestamp"]
y_label = np.zeros(len(df))

for symptom in ground_truth:
    # Interval bounds are given in epoch seconds and interpreted as UTC
    window_start = pd.Timestamp(symptom["start-time"], unit="s", tz="UTC")
    window_end = pd.Timestamp(symptom["end-time"], unit="s", tz="UTC")
    in_window = timestamps.between(window_start, window_end)
    y_label[in_window.to_numpy()] = 1

df_labels = pd.DataFrame({"label": y_label})

#### ML Model Loading

In [None]:
import os
import datetime
from ml_ad import AENetworkAnomaly

In [None]:
# Cut-off dates used to simulate predicting the following day: the "current
# day" is 33 days after the earliest timestamp in the dataset
one_day = datetime.timedelta(days=1)
current_day = df["timestamp"].min() + 33 * one_day
next_day = current_day + one_day

In [None]:
# Create the models folder (no error if it already exists).
# The data folder sits three directory levels above the working directory.
# The path is assembled from components instead of being written with literal
# backslashes: those are path separators only on Windows, so elsewhere the
# literal named a single oddly-named directory inside the working directory.
# NOTE(review): notebooks sharing this folder must build the path the same way.
data_folder = os.path.join(os.pardir, os.pardir, os.pardir, "data")
models_folder = os.path.join(data_folder, "models")
os.makedirs(models_folder, exist_ok=True)

In [None]:
# Dated name for the model trained up to the current day (ae_model_YYYYMMDD)
model_name = "ae_model_" + current_day.strftime("%Y%m%d")
# NOTE(review): the folder uses the fixed name 'new_model', not model_name —
# confirm this is intended
model_folder = os.path.join(models_folder, "new_model")

In [None]:
if os.path.exists(model_folder):
    # Load the cached model if it exists
    ml_model = AENetworkAnomaly.load(model_folder)
else:
    # Create a new model with one input per column other than "timestamp"
    ml_model = AENetworkAnomaly(n_inputs=df.shape[1] - 1)

    # Training set: rows strictly before the current day.
    # The cut-off is compared as a timestamp, not as a ctime() string, so
    # timezone and sub-second precision are kept and no implicit string
    # parsing is involved.
    df_today = df.loc[df["timestamp"] < current_day]

    # Train the model on the feature columns only
    X_train = df_today.drop("timestamp", axis=1).values
    ml_model.fit(X_train)

    # Cache the trained model so later runs take the load branch above
    ml_model.store(model_folder)

#### Comparing with the champion model (the model currently deployed)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# The champion model (the one trained in the detection notebook) is stored in
# its own sub-folder of the models folder; load it from there
champion_model_name = "ae_model_champion"
ch_models_folder = os.path.join(models_folder, champion_model_name)
ch_model = AENetworkAnomaly.load(ch_models_folder)

In [None]:
# Filter evaluation data and labels: rows strictly before the current day.
# The cut-off is compared as a timestamp, not as a ctime() string, so
# timezone and sub-second precision are kept.
eval_mask = df["timestamp"] < current_day
eval_data = df.loc[eval_mask]
# Pick the labels with the same row mask (by position) rather than a head
# slice, so labels stay aligned with the selected rows even when the data is
# not sorted by timestamp.
eval_labels = df_labels.loc[eval_mask.to_numpy(), "label"]

In [None]:
# Score the same evaluation features with the new model and with the champion,
# casting both sets of predictions to integers
X_eval = eval_data.drop(columns="timestamp").values
y_pred_curr = ml_model.predict(X_eval, aggregate=True).astype(int)
y_pred_champ = ch_model.predict(X_eval, aggregate=True).astype(int)

In [None]:
# Champion model performance metrics
champion_report = classification_report(
    eval_labels, y_pred_champ, zero_division=1
)
print(champion_report)

In [None]:
# New model performance metrics
new_model_report = classification_report(
    eval_labels, y_pred_curr, zero_division=1
)
print(new_model_report)