### Use Antagonist to train a symptom detection model 

#### ToDo

- [OK] Change threshold to detect symptoms
- [OK] Use training data from a metric in addition to current data
- [OK] Remove known anomalies from training data 
- Generate a list of network incidents by aggregating the symptoms
- Use API from the plugin to get and store labels

#### Reproducibility

In [1]:
# Seed every random number generator used by the notebook with the same value.
import random

import numpy as np
import torch

SEED = 0

# Python
random.seed(SEED)

# Numpy
np.random.seed(SEED)

# Torch: besides seeding, ask torch to use deterministic algorithms only.
torch.manual_seed(SEED)
torch.use_deterministic_algorithms(True)

#### Dataset preparation

Note: the dataset needs to be downloaded using the script `download_SMD_dataset.sh` in the `scripts/antagonist-ml` folder.

In [2]:
from data_utils import SMD

In [3]:
import os

# Location of the downloaded dataset. It can be overridden with the
# SMD_DATASET_FOLDER environment variable so the notebook also runs on machines
# where the data does not live under the original author's D: drive.
DATASET_FOLDER = os.environ.get(
    "SMD_DATASET_FOLDER", r"D:\antagonist\data\ServerMachineDataset"
)
db = SMD(dataset_folder=DATASET_FOLDER)

In [4]:
# Load the train and the test split of the same machine group, labels included.
# The second return value is only kept for the test split (it is indexed later
# to obtain the service identifier).
group_name = "Group 1"
dataframes_train, _ = db.read_dataset(
    group_name=group_name, train=True, retrieve_labels=True
)
dataframes, files = db.read_dataset(
    group_name=group_name, train=False, retrieve_labels=True
)


In [5]:
# Pick one service (machine) of the group and build its feature frames.
service_idx = 0
service_id = files[service_idx]

# Test split: drop the last two columns, keep the labels apart and put
# `timestamp` back as the LAST column (the model helpers rely on that position).
test_frame = dataframes[service_idx]
test_feature_columns = test_frame.columns[:-2].tolist()
labels = test_frame[['label']]
df = test_frame[test_feature_columns + ['timestamp']]

# Train split: drop only the last column, then append `timestamp` the same way.
train_frame = dataframes_train[service_idx]
train_feature_columns = train_frame.columns[:-1].tolist()
df_train = train_frame[train_feature_columns + ['timestamp']]

In [6]:
# Interpretation labels of the selected service, as returned by the dataset helper.
# NOTE(review): presumably the ground-truth anomaly intervals — confirm in data_utils.
# The name `network_incidents` is reassigned by later cells of this notebook.
network_incidents = db.get_interpretation_labels(service_id)

#### Train model utils

In [7]:
import pandas as pd
from auto_encoder import Vanilla_AE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [8]:
# Configuration read by train_model().

# Number of model inputs: every column of `df` except one (the trailing timestamp).
n_inputs = df.shape[1] - 1
# Passed to Vanilla_AE as `layer_sizes`
# (presumably the widths of the encoder layers — confirm in auto_encoder).
layer_sizes = [8, 4, 2]
# The following values are forwarded unchanged to `ae.fit(...)`.
lr = 0.005
batch_size = 32
epochs = 40
validation_split = 0.2
early_stopping = True
patience = 3
# Quantile of the absolute training residuals used to derive the detection threshold.
Q = 0.99  # residual cut

In [9]:
def train_model(df: pd.DataFrame):
    """Fit a scaler and an auto-encoder on ``df`` and derive detection thresholds.

    The last column of ``df`` (the timestamp) is not used as a feature. The
    model architecture and the training options are read from the notebook
    configuration (``n_inputs``, ``layer_sizes``, ``lr``, ``batch_size``, ...).

    Parameters
    ----------
    df : pd.DataFrame
        Training samples, feature columns first and the timestamp last.

    Returns
    -------
    tuple
        ``(scaler, ae, threshold)``: the fitted ``StandardScaler``, the fitted
        ``Vanilla_AE`` and, for each feature, the ``Q`` quantile of the absolute
        training residuals multiplied by 5/2.
    """
    # Everything but the trailing timestamp column
    features = df.values[:, :-1]

    # Standardise the features; the same scaler is reused at prediction time
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Build and train the auto-encoder
    fit_options = dict(
        early_stopping=early_stopping,
        validation_split=validation_split,
        epochs=epochs,
        lr=lr,
        batch_size=batch_size,
        verbose=0,
        shuffle=True,
        patience=patience,
        delta=0.001,
    )
    ae = Vanilla_AE(n_inputs=n_inputs, layer_sizes=layer_sizes)
    ae.fit(features_scaled, **fit_options)

    # Absolute reconstruction error on the training data, one column per feature
    reconstruction = ae.predict(features_scaled)
    residuals_train = pd.DataFrame(features_scaled - reconstruction).abs()

    # Per-feature cut: Q quantile of the residuals, enlarged by a factor 5/2
    threshold = residuals_train.quantile(Q, axis=0) * 5 / 2

    return scaler, ae, threshold

In [10]:
def find_consecutive_true_np(arr):
    """Locate the runs of consecutive True values in every column of ``arr``.

    Parameters
    ----------
    arr : np.ndarray
        Two-dimensional boolean array (rows x columns).

    Returns
    -------
    list
        One list per column holding ``(start, end)`` row-index pairs, where
        ``start`` is the first True row of a run and ``end`` is the row right
        after its last True row (half-open interval).
    """

    def _runs(column):
        # Pad with False on both sides so runs touching the borders are closed
        padded = np.r_[False, column, False]
        # Positions where the value flips: even hits open a run, odd hits close it
        edges = np.flatnonzero(padded[:-1] != padded[1:])
        return list(zip(edges[0::2], edges[1::2]))

    return [_runs(arr[:, col]) for col in range(arr.shape[1])]

def predict(df, scaler, ae, threshold, aggregate=False):
    """Flag the samples whose reconstruction error exceeds the threshold.

    Parameters
    ----------
    df : pd.DataFrame
        Samples to score; the last column (timestamp) is ignored.
    scaler, ae, threshold
        The objects returned by ``train_model``.
    aggregate : bool
        When False (default) return one boolean per sample and per feature;
        when True return one boolean per sample, set as soon as any feature
        exceeds its threshold.

    Returns
    -------
    np.ndarray
        Boolean array of symptoms, 2-D or 1-D depending on ``aggregate``.
    """
    features_scaled = scaler.transform(df.values[:, :-1])

    # Absolute reconstruction error, one column per feature
    reconstruction = ae.predict(features_scaled)
    abs_residuals = pd.DataFrame(features_scaled - reconstruction).abs()

    exceeded = (abs_residuals > threshold).values

    if aggregate:
        return exceeded.any(axis=1)
    return exceeded

In [11]:
def eval_model(df, labels, scaler, ae, threshold):
    """Compute the binary F1 score of the detector on labelled samples.

    A sample is predicted as an outlier when at least one of its features has a
    reconstruction error above its threshold (same rule as
    ``predict(..., aggregate=True)``).

    Parameters
    ----------
    df : pd.DataFrame
        Samples to score; the last column (timestamp) is ignored.
    labels : pd.DataFrame
        Frame with a ``label`` column aligned row by row with ``df``.
    scaler, ae, threshold
        The objects returned by ``train_model``.

    Returns
    -------
    float
        ``f1_score`` with ``average='binary'``.
    """
    # Reuse the prediction helper rather than repeating the residual computation
    predicted_outlier = predict(df, scaler, ae, threshold, aggregate=True).astype(int)
    expected_outlier = labels['label'].values

    return f1_score(expected_outlier, predicted_outlier, average='binary')

#### Iterative process simulation

Every day the model is retrained with the new data and the new labels.

In [12]:
import datetime
from collections import defaultdict

In [13]:
# Simulation window: from the first sample to the end of the last sampled day.
# The int64 view of the timestamps is divided by 10**9 to obtain epoch seconds
# (assumes the column has nanosecond resolution — TODO confirm its dtype).
# NOTE(review): fromtimestamp() returns naive datetimes in the local time zone.
NS_PER_SECOND = 10**9

first_sample_ns = df['timestamp'].astype('int64').min()
last_day_end_ns = df['timestamp'].dt.ceil('D').astype('int64').max()

start_date = datetime.datetime.fromtimestamp(first_sample_ns / NS_PER_SECOND)
end_date = datetime.datetime.fromtimestamp(last_day_end_ns / NS_PER_SECOND)

In [17]:
# Day-by-day simulation: on each day boundary the model trained so far predicts
# the symptoms of the day that just ended, then it is retrained on everything
# seen up to that boundary.
scaler, ae, threshold = None, None, None
previous_day = None

# NOTE(review): the original cell left the loop with a bare `break` right after
# the first prediction day, which looks like a debugging shortcut. That
# behaviour is kept but made explicit; set the flag to False to simulate the
# whole period.
STOP_AFTER_FIRST_PREDICTION = True

# {day boundary -> {metric index -> [[start_epoch_s, end_epoch_s], ...]}}
symptoms_predictions = dict()

for current_day in pd.date_range(start=start_date, end=end_date, freq="D"):
    current_day = datetime.datetime.fromtimestamp(current_day.timestamp())
    # All the samples strictly before the current day boundary
    df_today = df.loc[df["timestamp"] < current_day.ctime()]

    if df_today.shape[0] == 0:
        # first day
        continue

    # Predict symptoms for the current day
    if scaler is not None and previous_day is not None:
        df_pred = df.loc[
            (df["timestamp"] >= previous_day.ctime())
            & (df["timestamp"] < current_day.ctime())
        ]
        y_pred = predict(df_pred, scaler, ae, threshold, aggregate=False)

        intervals = find_consecutive_true_np(y_pred)

        # Collect the symptoms of ALL metrics for this day in a single mapping.
        # The mapping used to be re-created inside the per-metric loop, so every
        # metric with symptoms erased the symptoms of the metrics before it.
        day_predictions = defaultdict(list)
        for metric_id, metric_intervals in enumerate(intervals):
            for first_idx, stop_idx in metric_intervals:
                # Intervals are half-open, hence `stop_idx - 1` for the last sample
                day_predictions[metric_id].append(
                    [
                        df_pred["timestamp"].iloc[first_idx].timestamp(),
                        df_pred["timestamp"].iloc[stop_idx - 1].timestamp(),
                    ]
                )

        # As before, days without any symptom get no entry at all
        if day_predictions:
            symptoms_predictions[current_day] = day_predictions

        if STOP_AFTER_FIRST_PREDICTION:
            break

    # Retrain the model on the available data removing anomalies (simulating human validation)
    # NOTE(review): slicing `labels` by row count assumes `df` is sorted by
    # timestamp, so that the first rows are exactly the rows in `df_today`.
    mask_train = labels[: df_today.shape[0]]["label"] == 0
    df_today = pd.concat([df_train, df_today[mask_train]], ignore_index=True)
    scaler, ae, threshold = train_model(df_today)
    previous_day = current_day

Early stopping at epoch 8


In [18]:
# Score the most recently trained model against the labels of the whole test set
f1 = eval_model(df=df, labels=labels, scaler=scaler, ae=ae, threshold=threshold)
print(f"F1 score: {round(f1,4)}")

F1 score: 0.3014


### Convert to objects

In [30]:
# Aggregate symptoms in network incidents according to the OR rule:
# symptoms whose time ranges overlap, whatever their metric, form one incident.
# NOTE: `network_incidents` is reused as a working name here and therefore no
# longer holds the value assigned to it earlier in the notebook.
incidents_predictions = {}
for model_date, symptoms in symptoms_predictions.items():
    print(model_date.strftime('%Y-%m-%d'))

    # flatten the per-metric symptoms into (metric_id, start, end) records,
    # sorted by starting timestamp
    day_symptoms = sorted(
        (
            (metric_id, symptom[0], symptom[1])
            for metric_id, symptoms_list in symptoms.items()
            for symptom in symptoms_list
        ),
        key=lambda record: record[1],
    )

    if len(day_symptoms) == 0:
        continue

    # create a list of incidents in the form
    # [[start_timestamp, end_timestamp, [symptom1, symptom2]], ...]
    # Incidents are lists, not tuples: the end of the open incident has to be
    # updated in place when an overlapping symptom extends it, and item
    # assignment on a tuple raises TypeError.
    first_symptom = day_symptoms[0]
    network_incidents = [[first_symptom[1], first_symptom[2], [first_symptom]]]
    for symptom in day_symptoms[1:]:
        open_incident = network_incidents[-1]
        # if overlapping add to the current incident, new incident otherwise
        if symptom[1] <= open_incident[1]:
            open_incident[2].append(symptom)
            open_incident[1] = max(open_incident[1], symptom[2])
        else:
            network_incidents.append([symptom[1], symptom[2], [symptom]])

    incidents_predictions[model_date] = network_incidents

2020-01-03


In [None]:
import uuid

In [36]:
# Convert the predicted incidents into three flat record lists:
# network anomalies, symptoms, and the symptom -> incident links.
network_anomalies = []
symptoms = []
symptoms_to_network_anomalies = []


def _format_epoch(epoch_seconds):
    """Render epoch seconds as a local-time 'YYYY-MM-DDTHH:MM:SS' string."""
    return datetime.datetime.fromtimestamp(epoch_seconds).strftime("%Y-%m-%dT%H:%M:%S")


for model_date, network_incidents in incidents_predictions.items():
    # The day boundary identifies the model that produced the predictions
    model_day = model_date.strftime("%Y-%m-%d")
    model_version = int(model_date.timestamp())

    for incident_start, incident_end, incident_symptoms in network_incidents:
        ni_uuid = str(uuid.uuid4())
        anomaly_record = {
            'author': {
                'author_type': 'algorithm',
                'name': "ae_continual_learning",
                'version': model_version,
            },
            'description': model_day,
            'start': incident_start,
            'end': incident_end,
            'id': ni_uuid,
            'state': 'potential',
            'version': 1,
        }
        network_anomalies.append(anomaly_record)

        for metric_id, symptom_start, symptom_end in incident_symptoms:
            symptom_uuid = str(uuid.uuid4())
            symptom_record = {
                "confidence-score": 1.0,
                "description": "Automatic generated symptom",
                "start-time": _format_epoch(symptom_start),
                "end-time": _format_epoch(symptom_end),
                "event-id": str(uuid.uuid4()),
                "id": symptom_uuid,
                "source-name": f"ae_{model_day}",
                "source-type": "algorithm",
                "tags": {
                    "machine": service_id,
                    "metric_id": metric_id,
                },
            }
            symptoms.append(symptom_record)

            # Link the symptom to the incident it belongs to
            link_record = {
                "symptom-id": symptom_uuid,
                "incident-id": ni_uuid,
            }
            symptoms_to_network_anomalies.append(link_record)

In [37]:
# Inspect the generated network-anomaly records.
network_anomalies

[{'author': {'author_type': 'algorithm',
   'name': 'ae_continual_learning',
   'version': 1578009600},
  'description': '2020-01-03',
  'start': 1577925900.0,
  'end': 1577925960.0,
  'id': '1fe81d3f-c082-43aa-8012-9793d1f86ba5',
  'state': 'potential',
  'version': 1},
 {'author': {'author_type': 'algorithm',
   'name': 'ae_continual_learning',
   'version': 1578009600},
  'description': '2020-01-03',
  'start': 1577926200.0,
  'end': 1577926200.0,
  'id': 'f70debdc-c346-4cf6-9b60-3666aa26ad47',
  'state': 'potential',
  'version': 1},
 {'author': {'author_type': 'algorithm',
   'name': 'ae_continual_learning',
   'version': 1578009600},
  'description': '2020-01-03',
  'start': 1577928240.0,
  'end': 1577928240.0,
  'id': 'aa42090f-fb91-41fb-8e07-3ae219c0bf0b',
  'state': 'potential',
  'version': 1},
 {'author': {'author_type': 'algorithm',
   'name': 'ae_continual_learning',
   'version': 1578009600},
  'description': '2020-01-03',
  'start': 1577938440.0,
  'end': 1577938440.0,
 

In [38]:
#    "action": "Reachability",
#    "cause": "Peer Down",
#    "pattern": "drop",
#    "plane": "control",
#    "reason": "Withdraw",
# Inspect the generated symptom records (the commented keys above are not set on them).
symptoms

[{'confidence-score': 1.0,
  'description': 'Automatic generated symptom',
  'start-time': '2020-01-02T00:45:00',
  'end-time': '2020-01-02T00:46:00',
  'event-id': 'fc1501be-066f-4dbe-94ac-fc68632b4b58',
  'id': '45a52fde-f646-4df1-9f4b-a5624f0c8bd2',
  'source-name': 'ae_2020-01-03',
  'source-type': 'algorithm',
  'tags': {'machine': 'machine-1-1.txt', 'metric_id': 33}},
 {'confidence-score': 1.0,
  'description': 'Automatic generated symptom',
  'start-time': '2020-01-02T00:50:00',
  'end-time': '2020-01-02T00:50:00',
  'event-id': 'aaf5d962-378c-45ab-bf75-891ddc64ba75',
  'id': '7f629060-4941-4df8-9f24-ede16173ec7d',
  'source-name': 'ae_2020-01-03',
  'source-type': 'algorithm',
  'tags': {'machine': 'machine-1-1.txt', 'metric_id': 33}},
 {'confidence-score': 1.0,
  'description': 'Automatic generated symptom',
  'start-time': '2020-01-02T01:24:00',
  'end-time': '2020-01-02T01:24:00',
  'event-id': '79cce49b-a022-46d2-b018-868782bd4821',
  'id': '2003a711-74e2-4619-9944-f05b6a3b

In [39]:
# Inspect the symptom -> incident links.
symptoms_to_network_anomalies

[{'symptom-id': '45a52fde-f646-4df1-9f4b-a5624f0c8bd2',
  'incident-id': '1fe81d3f-c082-43aa-8012-9793d1f86ba5'},
 {'symptom-id': '7f629060-4941-4df8-9f24-ede16173ec7d',
  'incident-id': 'f70debdc-c346-4cf6-9b60-3666aa26ad47'},
 {'symptom-id': '2003a711-74e2-4619-9944-f05b6a3b9132',
  'incident-id': 'aa42090f-fb91-41fb-8e07-3ae219c0bf0b'},
 {'symptom-id': 'dd5a2084-c76c-472c-aac8-319f6ff5a4fe',
  'incident-id': '4f7de320-3bb3-4414-9820-76e527fc50dd'},
 {'symptom-id': 'c48d4323-79ae-42cd-a64d-c8598cfb12f0',
  'incident-id': '21005306-4907-470e-a48d-8cde30dfa9e2'}]