# Baseline implementation of relevant error rate-based change-detection algorithms

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Data Loading

In [2]:
# Load data
train1 = pd.read_csv("/Users/emmatosato/Documents/UNI Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data/train1.csv")
train2 = pd.read_csv("/Users/emmatosato/Documents/UNI Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data/train2.csv")
test1 = pd.read_csv("/Users/emmatosato/Documents/UNI Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data/test1.csv")
test2 = pd.read_csv("/Users/emmatosato/Documents/UNI Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data/test2.csv")
label1 = pd.read_csv("/Users/emmatosato/Documents/UNI Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data/label1.csv")
label2 = pd.read_csv("/Users/emmatosato/Documents/UNI Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data/label2.csv")

In [None]:
# Definitions of functions
def calculate_delay_of_detection(true_change_indexes, detected_indexes):
    delays = [index - detected_index for detected_index, index in zip(detected_indexes, true_change_indexes)]
    average_delay = sum(delays) / len(delays)
    return average_delay

def calculate_false_detection_rate(true_change_indexes, detected_indexes):
    total_drifts = len(true_change_indexes)
    total_detected = len(detected_indexes)
    false_detections = total_detected - total_drifts
    fdr = false_detections / total_drifts
    return fdr

def calculate_miss_detection_rate(true_change_indexes, detected_indexes):
    total_drifts = len(true_change_indexes)
    total_detected = len(detected_indexes)
    mdr = (total_drifts - total_detected) / total_drifts
    return mdr

def calculate_rate_of_drift(detected_indexes, total_time):
    total_detected = len(detected_indexes)
    rod = total_detected / total_time
    return rod

## CUMSUM Algorithm

Pseudocode used as reference in the below article.

Reference : Thomas Flynn and Shinjae Yoo. *Change Detection with the Kernel Cumulative Sum Algorithm*

In [22]:
def cusum_algorithm(data, threshold, drift, reference_values):
    n = len(data)
    cumulative_sum = np.zeros(n)

    for i in range(1, n):
        log_likelihood_ratio = np.log(data.iloc[i] / reference_values.iloc[i - 1])
        cumulative_sum[i] = max(0, cumulative_sum[i - 1] + log_likelihood_ratio.iloc[i] - drift)
        

    change_points = np.where(cumulative_sum > threshold)[0]

    return cumulative_sum, change_points

In [23]:
# Extract relevant features for training and testing
features_train = train1.iloc[:, 1:]  
features_test = test1.iloc[:, 1:]
# Label vector
true_labels = label1["label"]
reference_values = features_train.mean()

# Train the CUSUM detector on the normal behavior
threshold_value = 10  
drift_value = 0.5     

cumulative_sum, _ = cusum_algorithm(features_train, threshold_value, drift_value, reference_values)


  cumulative_sum[i] = max(0, cumulative_sum[i - 1] + log_likelihood_ratio.iloc[i] - drift)


IndexError: single positional indexer is out-of-bounds

In [None]:
# Apply CUSUM detector on the test set
_, change_points = cusum_algorithm(features_test, threshold_value, drift_value, reference_values)

In [None]:
# Evaluate Performance
average_delay = calculate_delay_of_detection(np.where(true_labels == 1)[0], change_points)
false_detection_rate = calculate_false_detection_rate(np.where(true_labels == 1)[0], change_points)
miss_detection_rate = calculate_miss_detection_rate(np.where(true_labels == 1)[0], change_points)

# Assuming total_time is the total duration of your test set
rate_of_drift = calculate_rate_of_drift(change_points, len(df_test))

# Print or use the metrics as needed
print(f"Average Delay of Detection: {average_delay}")
print(f"False Detection Rate: {false_detection_rate}")
print(f"Miss Detection Rate: {miss_detection_rate}")
print(f"Rate of Drift: {rate_of_drift}")

## ADaptive WINdowing (ADWIN)

**ADWIN (ADaptive WINdowing)** is an adaptive sliding window algorithm for detecting change, and keeping updated statistics about a data stream.
ADWIN allows algorithms not adapted for drifting data, to be resistant to this phenomenon.

The general idea is to keep statistics from a window of variable size while detecting concept drift.

The algorithm will decide the size of the window by cutting the statistics' window at different points and analysing the average of some statistic over these two windows. If the absolute value of the difference between the two averages surpasses a pre-defined threshold, change is detected at that point and all data before that time is discarded.

*References*: 
Thomas, Flynn, and Yoo Shinjae. Change Detection with the Kernel Cumulative Sum Algorithm. ArXiv E-prints, 2017, https://doi.org/10.48550/arXiv.1903.01661.

In [None]:
adwin = ADWIN(delta=0.002)
data = train1['P1_FCV01D']


for value in data:
    estimated_mean = adwin.add_element(value)
    print(f"Estimated mean: {estimated_mean}")

## Early Drift Detection Method (EDDM)

This method works by keeping track of the average distance between two errors instead of only the error rate. For this, it is necessary to keep track of the running average distance and the running standard deviation, as well as the maximum distance and the maximum standard deviation.

The algorithm works similarly to the DDM algorithm. Like DDM, there are two threshold values that define the borderline between no change, warning zone, and drift detected.
These are as follows:

**Warning zone**
$$ \frac{(p_i + 2 * s_i)}{(p_{max} + 2 * s_{max})} < \alpha $$ 

**Change detected**

$$\frac{(p_i + 2 * s_i)}{(p_{max} + 2 * s_{max})} < \beta $$ 




$\alpha$ and  $\beta$ are set to 0.95 and 0.9, respectively.


<br><br>

*References:*
Early Drift Detection Method. Manuel Baena-Garcia, Jose Del Campo-Avila, Raúl Fidalgo, Albert Bifet, Ricard Gavalda, Rafael Morales-Bueno. In Fourth International Workshop on Knowledge Discovery from Data Streams, 2006.

In [27]:
class ErrorBasedChangeDetection:
    def __init__(self, alpha=0.95, beta=0.9, min_num_instances=30):
        self.alpha = alpha
        self.beta = beta
        self.min_num_instances = min_num_instances
        self.reset()

    def reset(self):
        self.n = 1
        self.num_errors = 0
        self.d = 0
        self.last_d = 0
        self.mean = 0.0
        self.std_temp = 0.0
        self.m2s_max = 0.0
        self.estimation = 0.0
        self.in_concept_change = 0
        self.in_warning_zone = 0
        self.delay = 0

    def add_element(self, prediction):
        if self.in_concept_change:
            self.reset()

        self.in_concept_change = 0
        self.n += 1

        if prediction == 1.0:
            self.in_warning_zone = 0
            self.delay = 0
            self.num_errors += 1
            self.last_d = self.d
            self.d = self.n - 1
            distance = self.d - self.last_d
            old_mean = self.mean
            self.mean = self.mean + (float(distance) - self.mean) / self.num_errors
            self.estimation = self.mean
            self.std_temp = self.std_temp + (distance - self.mean) * (distance - old_mean)
            std = np.sqrt(self.std_temp / self.num_errors)
            m2s = self.mean + 2 * std

            if self.n < self.min_num_instances:
                return

            if m2s > self.m2s_max:
                self.m2s_max = m2s
            else:
                p = m2s / self.m2s_max
                if (self.num_errors > self.min_num_instances) and (p < self.alpha):
                    self.in_concept_change = 1

                elif (self.num_errors > self.min_num_instances) and (p < self.beta):
                    self.in_warning_zone = 1

                else:
                    self.in_warning_zone = 0

    def detected_warning_zone(self):
        return self.in_warning_zone

    def detected_change(self):
        return self.in_concept_change


In [46]:
eddm = ErrorBasedChangeDetection()

In [None]:
train1.iloc[1:]

In [51]:
# Training
for c in range(train1.shape[1]+1):
    for r in range(train1.shape[0]):
        eddm.add_element(train1.iloc[r,c])
        if eddm.detected_warning_zone():
            print('Warning zone has been detected in data: ' + str(train1.iloc[r,c]) + ' - of index: ' + str(r))
        if eddm.detected_change():
            print('Change has been detected in data: ' + str(train1.iloc[r,c]) + ' - of index: ' + str(r))

IndexError: index 86 is out of bounds for axis 0 with size 86