# Setup

In [None]:
# setup darts
!pip install "darts @ git+https://github.com/unit8co/darts.git@master" seaborn openpyxl

In [None]:
# Download data from data source to local
import io
import os
import requests
import shutil
import zipfile

import numpy as np
import pandas as pd

In [None]:
# Local folders that hold the downloaded datasets.
data_dir = "data"
anom_dir = os.path.join(data_dir, "anomaly_detection")
# `exist_ok=True` makes re-running this cell a no-op and avoids the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
for dir_path in [data_dir, anom_dir]:
    os.makedirs(dir_path, exist_ok=True)

## Data Download and Preprocessing - Anomaly Detection - ECG

In [None]:
def download_ecg():
    """Download the SVDB archive and keep only the recording `842.test.csv`.

    The function is idempotent: if the processed CSV already exists in
    `anom_dir` nothing is downloaded. Otherwise the zip archive is fetched
    (unless a previously downloaded copy is still on disk), extracted, the
    single CSV of interest is copied next to the archive, and both the
    extracted folder and the archive are removed again.

    Raises:
        RuntimeError: if the server does not answer the download request
            with HTTP status 200.
    """
    # URL of the zip file
    zip_url = "https://my.hidrive.com/api/sharelink/download?id=lmCmAjUP"

    file_path = os.path.join(anom_dir, "svdb.zip")
    unzipped_path = os.path.join(anom_dir, "multivariate")
    processed_path = os.path.join(anom_dir, "842.test.csv")

    if os.path.exists(processed_path):
        print("File already downloaded.")
        return

    if os.path.exists(file_path):
        print("Zip file already downloaded.")
    else:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(zip_url, timeout=60)

        # Stop here on failure: continuing would only crash later with an
        # unrelated "file not found" error when opening the missing archive.
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to download. (HTTP status {response.status_code})"
            )

        # Save the zip file to the local drive
        with open(file_path, "wb") as file:
            file.write(response.content)
        print("Zip file downloaded successfully.")

    # Extract the zip file
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(anom_dir)

    # Keep only the recording used in this notebook, then clean up.
    df = pd.read_csv(os.path.join(unzipped_path, "SVDB", "842.test.csv"))
    df.to_csv(processed_path, index=False)
    shutil.rmtree(unzipped_path, ignore_errors=True)
    os.remove(file_path)
    print("Zip file extracted successfully.")

In [None]:
# Fetch the ECG sample file; download_ecg() prints its own status messages.
print("Downloading ECG Data..")
download_ecg()

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# ECG Arrhythmia detection

This project aims to demonstrate the Anomaly Detection module implemented in Darts
### Dataset
The MIT-BIH Supraventricular Arrhythmia Database (SVDB) contains 2 channels and 78 half-hour ECG recordings obtained from 47 subjects between 1975 and 1979. To keep the data lean, we only stored the data of patient 842.

### Task
Develop an anomaly detection model to identify arrhythmia in the ECG signal.

## Task #1
### Load data of a patient into a darts timeseries object

In [None]:
import os
from darts import TimeSeries

# Read the patient recording, then split it into the two ECG channels
# and the anomaly label column.
fpath = os.path.join("data", "anomaly_detection", "842.test.csv")
timeseries = TimeSeries.from_csv(fpath, time_col="timestamp")
ts_ecg = timeseries[["ECG1", "ECG2"]]
ts_anomaly = timeseries["is_anomaly"]

### Visualize signal and anomaly

In [None]:
import matplotlib.pyplot as plt

# Draw the first ECG channel and the anomaly flags on one shared axis.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ts_ecg["ECG1"].plot(ax=ax, label="ECG1", lw=0.5)
# The flags are offset by -2 (presumably to keep them clear of the ECG trace).
(ts_anomaly - 2).plot(ax=ax, label="is_anomaly", color="r", lw=0.5)

## Task #2
### Identify a region of ~15000 datapoints with anomalies (otherwise training takes a while) and visualize it

In [None]:
# Identify a subset for demonstration
# (intended as slice bounds into the full series; end - start = 15000 points)
start, end = 15000, 30000
# Create subset time series ecg and anomaly object

"""
Extract a subset from `ts_ecg` and `ts_anomaly` using the start and end points from above:

Hint: you can slice TimeSeries like this:  `series[idx_left:idx_right]`
"""
ts_ecg_subset= # TO FILL
ts_anomaly_subset = # TO FILL

In [None]:
# Plot the selected window: ECG1 plus the rescaled anomaly flags.
fig, ax = plt.subplots(figsize=(10, 5))
ts_ecg_subset["ECG1"].plot(label="ECG1", lw=1.0)
# Flags are halved and shifted by -1.5 (presumably so they sit apart from the trace).
(ts_anomaly_subset / 2 - 1.5).plot(label="is_anomaly", color="r")

## Task #3
### Create training and test sets (e.g., 10k/5k)

In [None]:
"""
Extract a train and test set from `ts_ecg_subset` and `ts_anomaly_subset`.

The train set should end at index 10'000, and test set at index 15'000
"""
# The split indices are meant relative to the `*_subset` series created in
# Task #2, not to the full recording.
train_end, test_end = # TO FILL 
ts_ecg_train = # TO FILL (extract from the `*_subset` series)
ts_ecg_test =  # TO FILL
ts_anomaly_test = # TO FILL

In [None]:
# Show the train and test portions of ECG1 together with the test-set anomaly flags.
fig, ax = plt.subplots(figsize=(10, 5))
ts_ecg_train["ECG1"].plot(label="train", lw=1.0)
ts_ecg_test["ECG1"].plot(label="test", lw=1.0)
# Same rescaling of the flags as in the subset plot (halved, shifted by -1.5).
(ts_anomaly_test / 2 - 1.5).plot(label="is_anomaly", color="r")

## Task #4
### Assess data properties such as periodicity and identify most common period.

In [None]:
from darts.utils.statistics import plot_acf

# Autocorrelation of ECG1 up to lag 220, with lag 92 passed as `m`
# so that the candidate period is marked on the plot.
plot_acf(
    ts=ts_ecg_subset["ECG1"],
    max_lag=220,
    m=92,
)

In [None]:
# Identified most common period
# (in time steps; same value as the `m=92` lag highlighted in the ACF plot above)
period = 92

## Task #5
### Develop an anomaly detection model step by step by (see figure below):
1. Create a forecasting model based on the train timeseries ECG data
2. Compute historical forecasting for the test timeseries ECG data
3. Compute anomaly scores using 2 different scores based on the forecasted and actual ECG signal

<img src="images/ad_inside_anomaly_model.png" alt="Image" width="60%" height="60%">

Links:
- Forecasting models: https://unit8co.github.io/darts/generated_api/darts.models.forecasting.html
- Scorers: https://unit8co.github.io/darts/generated_api/darts.ad.scorers.html?highlight=scorer#

#### Create a Forecasting model

In [None]:
from darts.models import LinearRegressionModel

"""
Create a `LinearRegressionModel` and fit it on `ts_ecg_train`

Setup the model like this:
- use the last `period` steps of the target series as model input (hint: `lags`) 
- train the model to predict one step at a time (hint: `output_chunk_length`)

Documentation: https://unit8co.github.io/darts/generated_api/darts.models.forecasting.linear_regression_model.html#linear-regression-model
"""

# create the model
# (the fitted `forecasting_model` is reused by the scoring cells further down)
forecasting_model = # TO FILL

# train the forecasting model on the training dataset `ts_ecg_train`
forecasting_model.fit(
    # TO FILL
)

Okay and let's compute the historical forecasts and the residuals of the forecasts.

In [None]:
# One-pass historical forecasts on the test series with the already fitted model.
ts_ecg_test_predicted = forecasting_model.historical_forecasts(
    ts_ecg_test,
    retrain=False,
)

# Residuals of these forecasts = (y_true - y_pred)
ts_ecg_residuals = forecasting_model.residuals(
    ts_ecg_test,
    historical_forecasts=ts_ecg_test_predicted,
)

# Top panel: predicted vs. actual signal; bottom panel: their difference.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10, 10))
ts_ecg_test_predicted["ECG1"].plot(ax=ax1, label="ECG1_predicted")
ts_ecg_test["ECG1"].plot(ax=ax1, color="r", label="ECG1_test")
ts_ecg_residuals["ECG1"].plot(ax=ax2, color="b", label="Difference")

#### Use a NormScorer for scoring

In [None]:
from darts.ad.scorers import NormScorer
from darts.ad.utils import eval_metric_from_scores

"""
Create a NormScorer and compute the anomaly scores on the test set (between the actual values and 
historical forecasts).

The scorer should use:
- a norm order of `1`
- it should compare components jointly for scoring (hint parameter: `component_wise`)

Documentation:
- Scorer(): https://unit8co.github.io/darts/generated_api/darts.ad.scorers.norm_scorer.html
- Scorer.score_from_prediction(): https://unit8co.github.io/darts/generated_api/darts.ad.scorers.norm_scorer.html#darts.ad.scorers.norm_scorer.NormScorer.score_from_prediction
"""
# create the scorer
# NOTE: when replacing a `# TO FILL` placeholder, end the argument with a
# comma, otherwise the following keyword argument is a syntax error.
scorer = NormScorer(
    ord= # TO FILL
    component_wise= # TO FILL
)

# compute the anomaly scores between the actual test series and the historical forecasts for that series
scores = scorer.score_from_prediction(
    series= # TO FILL
    pred_series= # TO FILL
)
scores.plot(label='Anomaly Score')

In [None]:
from darts.ad.utils import eval_metric_from_scores

"""
Evaluate the computed anomaly score using utility methods in darts
"""
# NOTE: the commas after the hints below are part of the comments; add a real
# comma after each value you fill in.
eval_metric_from_scores(
    pred_scores= # TO FILL (hint: the predicted scores), 
    anomalies= # TO FILL (hint: the actual anomalies), 
    metric= # TO FILL (e.g. 'AUC_ROC')
)

#### Use fittable KMeansScorer for scoring
The Norm scorer calculates the norm between the predicted and actual time series point-wise. Since predicting the peaks of the ECG signals is challenging for the forecasting model, the biggest differences between the actual and predicted values are mostly found at the peak locations.

To overcome the issue from point-wise comparisons, we'll use a windowing approach. This extracts vectors/chunks from an input series by applying a sliding window of width `window` to it. Currently, the scorers that support windowing are `KMeansScorer`, `PyODScorer` and `WassersteinScorer`. They are also trainable, meaning that we have to fit them first on a set of window vectors/chunks. You want the training set to be anomaly-free!

To fit the scorers, you can either:
- 1) fit it only on some actual time series with `scorer.fit()`, or
- 2) fit it on the difference between some actual series and model forecasts for those series with `scorer.fit_from_prediction()`.

The scoring on new data also has two options (pick the same option as done when fitting): 
- 1) score on some new time series with `scorer.score()`, or
- 2) score on the difference between some new series and model forecasts for those series with `scorer.score_from_prediction()`.
 
In option 2), the scorers simply compute some difference function (e.g. absolute difference, ...) to reduce the actual series and forecasts into one series. After that, internally it works exactly the same as option 1).

Let's look at `KMeansScorer` as an example. It fits `k` centroids on the windowed anomaly-free input data. To score how anomalous a new series is, it computes the closest centroid distance for each window of that series. If there were any anomalies, then the distances should be larger than any of the anomaly-free training windows.

The figure below illustrates how the KMeansScorer works when applied directly to a time series with option 1) (or option 2) after taking the difference).

#### Training & Scoring
<img src="images/kmeansscorer.png" alt="Image" width="70%" height="70%">

<img src="images/ad_windowing.png" alt="Image" width="70%" height="70%">

In the example below, we will use the previously developed forecasting model to create historical predictions for the train dataset and train the KMeansScorer on the absolute difference between the actual training and forecasted training datasets (default).

The difference function can be changed with parameter `diff_fn`. It can be any of Darts ["per time step" metrics](https://unit8co.github.io/darts/generated_api/darts.metrics.html).  By default, it uses `darts.metrics.ae` (absolute difference).

In [None]:
# Historical forecasts on the train series; they are used afterwards to fit the scorer.
ts_ecg_train_predicted = forecasting_model.historical_forecasts(
    ts_ecg_train,
    retrain=False,
)

In [None]:
from darts.ad.scorers import KMeansScorer
"""
Create a KMeansScorer, and train it on the train set (on the actual series and historical forecasts).

The scorer should use a window size of `50`.

Documentation:
- KMeansScorer: https://unit8co.github.io/darts/generated_api/darts.ad.scorers.kmeans_scorer.html
"""

# create the scorer
scorer = KMeansScorer(
    window= # TO FILL
    component_wise=False,
    ranom_state=42,
)

# train the scorer on the train set
scorer.fit_from_prediction(
    series= # TO FILL
    pred_series= # TO FILL
)

In [None]:
"""
Compute the scores on the test set (on the actual series and historical forecasts).
"""

# score on the test set
# (uses the `scorer` fitted in the previous cell; remember the comma after
# the first argument you fill in)
scores = scorer.score_from_prediction(
    series= # TO FILL
    pred_series= # TO FILL
)

# plot the results
scores.plot(label='Anomaly Score')

In [None]:
# compute a metric for the scores
# NOTE(review): `window` here is 2*period, while the exercise above asks for a
# KMeansScorer window of 50 - confirm which value is intended for evaluation.
eval_metric_from_scores(
    pred_scores=scores,
    anomalies=ts_anomaly_test,
    window=2*period,
    metric='AUC_ROC'
)

## Task #6
### Develop the anomaly detection models by using the Forecasting Anomaly Model via dedicated Darts API interface
This exercise aims to illustrate the power of the darts anomaly detection module by hiding all of the previously made steps under the hood into one dedicated AnomalyModel and corresponding APIs

We'll use the already pretrained forecasting model, but you can also give an un-trained model and call `ForecastingAnomalyModel.fit()` with `allow_model_training=True`.

In [None]:
from darts.ad.anomaly_model.forecasting_am import ForecastingAnomalyModel
from darts.ad.scorers import NormScorer, KMeansScorer

# Instantiate the anomaly model with: one forecasting model, and one or more scorers (and corresponding parameters)
# The order of the scorers in the list determines the order of the returned
# score series (index 0 -> NormScorer, index 1 -> KMeansScorer).
anomaly_model = ForecastingAnomalyModel(
    model=forecasting_model,
    scorer=[
         NormScorer(ord=1),
         KMeansScorer(k=50, window=2*period, component_wise=False, random_state=42)
    ],
)

"""
Fit the Forecasting Anomaly Model on the train set `ts_ecg_train`.
"""

# fit the forecasting anomaly model
# (end the value you fill in for `series` with a comma)
anomaly_model.fit(
    series= # TO FILL
    allow_model_training=False  # (we use a pre-trained model)
)

In [None]:
"""
Compute the scores on the test set. Use the actual test target series `ts_ecg_test`.
"""

# compute scores on the test set and return historical forecasts in one step
anomaly_scores, predictions = anomaly_model.score(
    series= # TO FILL, 
    return_model_prediction=True  # (also return historical forecasts)
)

# plot the actual test series, and historical forecasts
fig, ax plt.subplots()
ts_ecg_test['ECG1'].plot(label='test', lw=1.)
predictions['ECG1'].plot(label='prediction', lw=1.)
plt.show()

# plot the scores from each scorer
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))
anomaly_scores[0].plot(ax=ax1, label="NormScorer") # indeces corresponding to the scorers
anomaly_scores[1].plot(ax=ax2, label="KMeansScorer") # indeces corresponding to the scorers
plt.show()

### Leverage the inbuilt darts visualization tool to evaluate and show anomalies

In [None]:
# Plot the test series, the scorer outputs and the true anomalies,
# and report AUC_ROC for the scorers.
anomaly_model.show_anomalies(series=ts_ecg_test, anomalies=ts_anomaly_test, metric="AUC_ROC")

## Task #7
### Use a detector to binarize the anomaly scores

Currently there are two types of Detectors in Darts:
- QuantileDetector: flag points as anomalous if they are outside upper and lower quantile bound
- ThresholdDetector: flag points as anomalous if they are above/below a maximum/minimum threshold value

Link:
- https://unit8co.github.io/darts/generated_api/darts.ad.detectors.html?highlight=detector

In [None]:
from darts.ad.detectors import QuantileDetector, ThresholdDetector

# Quantile-based detector with bounds at the 0 and 0.70 quantiles.
detector = QuantileDetector(low_quantile=0, high_quantile=0.70)

# Each binary series is shifted by a different offset (0, -2, -4) so the
# three curves do not overlap in the plot.
fig, ax = plt.subplots()
(detector.fit_detect(anomaly_scores[0]) - 0).plot(
    lw=1.0, label="NormScorer - detected_anomaly"
)
(detector.fit_detect(anomaly_scores[1]) - 2).plot(
    lw=1.0, label="KMeanScorer - detected_anomaly"
)
(ts_anomaly_test - 4).plot(color="r", lw=1, label="is_anomaly")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

In [None]:
# Fixed-threshold detector with bounds 0 and 0.42.
detector = ThresholdDetector(low_threshold=0, high_threshold=0.42)

fig, ax = plt.subplots()
detector.detect(anomaly_scores[0]).plot(lw=1, label="NormScorer - detected_anomaly")
# Disabled variant for the second scorer, kept for reference:
#(detector.detect(anomaly_scores[1])-2).plot(lw=1, label='KMeanScorer - detected_anomaly')
(ts_anomaly_test - 2).plot(color="r", lw=1, label="is_anomaly")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")