In [1]:
import re
import sys
import time
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

# Extend the import search path with the sibling ``anomaly_detection`` directory
# before the project-local imports below run (presumably where ``timeinf`` and
# ``detectors`` live -- TODO confirm).
path_root = '../anomaly_detection/'
sys.path.append(path_root)

from timeinf.utils import block_time_series, sync_time_block_index
from detectors import InfluenceFunctionDetector

from typing import Any, Dict, List, Optional, Union
from kronfluence.task import Task
from kronfluence.analyzer import Analyzer, prepare_model

# Use float64 as the default floating-point dtype for all tensors/modules created below.
torch.set_default_dtype(torch.float64)
# Prefer the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def construct_regression_mlp(input_size, output_size) -> nn.Module:
    """Build the regression model: one biased linear layer inside ``nn.Sequential``.

    Args:
        input_size: Number of input features of the linear layer.
        output_size: Number of output features of the linear layer.

    Returns:
        An ``nn.Sequential`` containing a single ``nn.Linear(input_size, output_size)``.
    """
    linear_head = nn.Linear(input_size, output_size, bias=True)
    return nn.Sequential(linear_head)

def train(
    model: nn.Module,
    dataset: data.Dataset,
    batch_size: int,
    num_train_epochs: int,
    learning_rate: float,
    weight_decay: float,
    seed: int = 0,
    disable_tqdm: bool = True,
) -> nn.Module:
    """Fit ``model`` on ``dataset`` with plain SGD and a mean-squared-error loss.

    Args:
        model: Module to optimise; it is updated in place.
        dataset: Dataset yielding ``(inputs, targets)`` pairs.
        batch_size: Mini-batch size. Batches are shuffled and an incomplete
            final batch is dropped (``drop_last=True``).
        num_train_epochs: Number of passes over the data loader.
        learning_rate: SGD learning rate.
        weight_decay: SGD weight-decay coefficient.
        seed: Seed applied to ``torch`` and ``random`` before optimisation starts.
        disable_tqdm: When True, the per-epoch progress bar is suppressed.

    Returns:
        The same ``model`` instance, left in training mode.
    """
    loader = data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    # Seeding happens after the loader is built but before any batch is drawn.
    torch.manual_seed(seed)
    random.seed(seed)
    sgd = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    batches_per_epoch = len(loader)

    model.train()
    for epoch_idx in range(num_train_epochs):
        running_loss = 0.0
        with tqdm(loader, unit="batch", disable=disable_tqdm) as progress:
            for inputs, targets in progress:
                progress.set_description(f"Epoch {epoch_idx}")
                sgd.zero_grad(set_to_none=True)
                batch_loss = F.mse_loss(model(inputs), targets)
                running_loss += batch_loss.detach().float()
                batch_loss.backward()
                sgd.step()
                # Displayed value is the running loss sum divided by the total
                # number of batches in the epoch.
                progress.set_postfix(loss=running_loss.item() / batches_per_epoch)
    return model

In [3]:
class RegressionTask(Task):
    """Kronfluence task description for the squared-error regression model."""

    def compute_train_loss(
        self,
        batch: Any,
        model: nn.Module,
        sample: bool = False,
    ) -> torch.Tensor:
        """Return the summed squared error of ``model`` on ``batch``.

        Args:
            batch: An ``(inputs, targets)`` pair.
            model: Module producing predictions from ``inputs``.
            sample: When False, the loss is computed against the batch targets.
                When True, targets are instead drawn from a normal distribution
                centred on the (detached) predictions with standard deviation
                ``sqrt(0.5)``.

        Returns:
            Scalar tensor holding the sum-reduced MSE loss.
        """
        inputs, targets = batch
        predictions = model(inputs)
        if sample:
            # Sample the outputs from the model's prediction for true Fisher.
            with torch.no_grad():
                regression_targets = torch.normal(predictions.detach(), std=math.sqrt(0.5))
        else:
            regression_targets = targets
        return F.mse_loss(predictions, regression_targets, reduction="sum")

    def compute_measurement(
        self,
        batch: Any,
        model: nn.Module,
    ) -> torch.Tensor:
        """Measurement is the same summed training loss, evaluated on the true targets."""
        return self.compute_train_loss(batch, model, sample=False)

    def get_influence_tracked_modules(self) -> Optional[List[str]]:
        """Return None so that influence scores are computed on all available modules."""
        return None

    def get_attention_mask(self, batch: Any) -> Optional[Union[Dict[str, torch.Tensor], torch.Tensor]]:
        """Return None: no attention mask is used."""
        return None

In [4]:
# Locate the dataset folder and read the table of labeled anomalies.
dataset = "SMAP_MSL"
data_path = Path("../data/multivariate/") / dataset
test_df = pd.read_csv(data_path / "labeled_anomalies.csv")

# Keep only the SMAP spacecraft rows, then exclude channel "P-2".
smap_df = test_df[test_df["spacecraft"] == "SMAP"]
df = smap_df[smap_df["chan_id"] != "P-2"]

In [5]:
class Config:
    """Fixed settings object handed to ``InfluenceFunctionDetector``."""

    def __init__(self):
        # A fresh mapping (and a fresh ``dimensions`` list) is built per instance,
        # then copied onto the instance attribute by attribute.
        settings = {
            "win_size": 100,
            "data_path": '../anomaly_detection/data_processed/SMAP',
            "dimensions": [0],
            "dataset": 'SMAP',
            "verbose": False,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)
        
# Channels to evaluate: every SMAP channel except "P-2".
df = smap_df[smap_df["chan_id"] != "P-2"]

# Detector instance; only its ``evaluate`` method is used further down.
config = Config()
detector = InfluenceFunctionDetector(config)

# Hyper-parameters passed to ``train`` for each per-channel model.
train_batch_size, num_train_epochs = 128, 100
learning_rate, weight_decay = 5e-3, 1e-3

In [None]:
# Per-channel results, keyed by channel id; turned into a DataFrame after the loop.
len_test_dict, len_anomaly_dict, len_ratio_dict = {}, {}, {}
prec_dict, rec_dict, f1_dict, auc_dict = {}, {}, {}, {}
time_dict = {}

# Loop-invariant settings, hoisted out of the channel loop.
block_length = 100
# Follow the CUDA/CPU fallback chosen for ``device`` instead of hard-coding GPU use.
use_cpu = device.type == "cpu"

# Iterate the channel ids directly (the original iterated the ``.iloc`` indexer).
for channel in df.chan_id:

    # --- Load the test series (optionally restricted to the configured dimensions) ---
    ts_test = np.load(data_path/"test"/f"{channel}.npy")
    if config.dimensions is not None:
        ts_test = ts_test[:, config.dimensions]
    seq_len = len(ts_test)

    # --- Parse the labeled anomaly intervals: consecutive integers form (start, end) pairs ---
    raw_sequences = df.loc[df.chan_id == channel].anomaly_sequences.to_numpy().item()
    bounds = [int(token) for token in re.findall(r'\d+', raw_sequences)]
    anomaly_intervals = list(zip(bounds[0::2], bounds[1::2]))

    # --- Point-wise ground truth ---
    ground_truth = np.zeros(seq_len)
    for start, end in anomaly_intervals:
        # NOTE(review): the slice excludes index ``end`` while the plots shade up to
        # ``end`` -- confirm whether the labeled end index is inclusive (would need end + 1).
        ground_truth[start:end] = 1.

    # --- Plot the raw series with the labeled anomalies shaded ---
    series_fig, series_ax = plt.subplots(figsize=(16, 1))
    series_ax.plot(ts_test, c="k", linewidth=.5)
    for start, end in anomaly_intervals:
        series_ax.axvspan(start, end, facecolor='red', alpha=.2)

    anomaly_len = ground_truth.sum()
    anomaly_ratio = anomaly_len / seq_len
    print(f"anomaly ratio is {anomaly_ratio * 100.:.3f} %.")

    len_test_dict[channel] = seq_len
    len_anomaly_dict[channel] = anomaly_len
    len_ratio_dict[channel] = anomaly_ratio

    # --- Build the (block -> next value) regression dataset ---
    # NOTE(review): the argument-less ``squeeze()`` drops every size-1 axis of X_train;
    # presumably X_train is (n_blocks, block_length, 1) here -- confirm against block_time_series.
    X_train, Y_train = block_time_series(ts_test, block_length)
    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train).squeeze(), torch.tensor(Y_train)
    )

    print(f"start detection for channel {channel} ..")
    start_time = time.time()

    # --- Train the per-channel model ---
    # NOTE(review): the model weights are initialised here, before ``train`` seeds torch,
    # so the initial weights depend on the RNG state left by the previous iteration.
    n_dim = Y_train.shape[-1]
    model = construct_regression_mlp(block_length, n_dim)
    model = train(
        model,
        dataset=train_dataset,
        batch_size=train_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        seed=0,
        disable_tqdm=True,
    )

    # --- Self-influence scores via kronfluence ---
    task = RegressionTask()
    model = prepare_model(model, task)
    analyzer = Analyzer(
        analysis_name=f"smap_{channel}",
        model=model,
        task=task,
        cpu=use_cpu,
        disable_tqdm=False,
    )
    factors_name = f"smap_{channel}_factor"
    scores_name = f"smap_{channel}_self_score"
    # The model is retrained on every run, so refit the factors as well instead of
    # reusing factors cached by an earlier run (the scores below are already overwritten).
    analyzer.fit_all_factors(
        factors_name=factors_name,
        dataset=train_dataset,
        overwrite_output_dir=True,
    )
    analyzer.compute_self_scores(
        scores_name=scores_name,
        factors_name=factors_name,
        train_dataset=train_dataset,
        overwrite_output_dir=True,
    )
    self_scores = analyzer.load_self_scores(scores_name=scores_name)
    anomaly_scores = self_scores["all_modules"]

    end_time = time.time()
    elapsed_time = round(end_time - start_time, 3)

    # --- Plot the anomaly scores ---
    # Scores are evaluated against ``ground_truth[block_length:]`` below, i.e. score i
    # belongs to time step ``block_length + i``; shift the x-axis so the shaded intervals
    # (given in original time indices) line up with the scores.
    score_fig, score_ax = plt.subplots(figsize=(16, 1))
    score_index = np.arange(len(anomaly_scores)) + block_length
    score_ax.plot(score_index, anomaly_scores, c="k", linewidth=.5)
    for start, end in anomaly_intervals:
        score_ax.axvspan(start, end, facecolor='red', alpha=.2)
    score_ax.set_ylabel("anomaly score")
    plt.show()
    # Release both figures so they do not accumulate across channels.
    plt.close(series_fig)
    plt.close(score_fig)

    # --- Evaluate and record metrics ---
    prec, rec, f1, auc = detector.evaluate(ground_truth[block_length:], anomaly_scores, anomaly_ratio)

    prec_dict[channel] = prec
    rec_dict[channel] = rec
    f1_dict[channel] = f1
    auc_dict[channel] = auc
    time_dict[channel] = elapsed_time

# Assemble the per-channel results into one table: the channel id (the dict keys)
# becomes the leading "Dataset" column and the index is a plain 0..n-1 range.
smap_metrics = (
    pd.DataFrame(
        {
            "Num_of_Test": len_test_dict,
            "Len_of_Anomaly": len_anomaly_dict,
            "Anomaly_Ratio": len_ratio_dict,
            "Precision": prec_dict,
            "Recall": rec_dict,
            "F1": f1_dict,
            "AUC": auc_dict,
            'Detection_Time(s)': time_dict,
        }
    )
    .rename_axis("Dataset")
    .reset_index()
)

# Persist the table next to the notebook.
smap_metrics.to_csv('smap_ekfac.csv')