
# Detección de Drift a partir de la incertidumbre


## Introducción

En este notebook se presenta un algoritmo de detección de drift a partir de cambios en la incertidumbre del modelo. En este caso se usaran datos provenientes de un problema de regresión. Para un problema de clasificación se aplicaria el algoritmo de la misma forma. Al final tambien se ofrece una comparativa con algunos algoritmos del estado del arte.

## Preparación del dataset

In [4]:
!pip install river numpy==1.23.5

from river import preprocessing, linear_model, optim
import matplotlib.pyplot as plt
from river import utils, stats
from river.drift import ADWIN, HDDM_W, DDM, EDDM, HDDM_A, KSWIN, PageHinkley
import math
import numpy as np
import pandas as pd
from functools import reduce

def generateRegressionDataSet(intercept=[1,0], domain=range(0,100), noise=[0 for x in range(0,100)], columns=['x', 'y']):
  data = []
  for x, single_noise in zip(domain, noise):
    data.append([x, x*intercept[0]+intercept[1]+ single_noise])
  return pd.DataFrame(data, columns=columns)

def mergeDataSetsSudden(dataset1, dataset2):
  res = []
  for data in dataset1.values.tolist():
    res.append(data)
  for data in dataset2.values.tolist():
    res.append(data)
  return pd.DataFrame(data=res, columns=dataset1.columns)

def generateRegressionDataSetWithSuddenDrift(intercept, domain=[range(0,100), range(100,200)], noise=[[0 for x in range(0,100)], [0 for x in range(0,100)]], columns=['x', 'y']):
  datasets = []
  for d in zip(domain, noise, intercept):
    dataset = generateRegressionDataSet(intercept=d[2], domain=d[0], noise=d[1], columns=columns)
    datasets.append(dataset)
  return reduce(mergeDataSetsSudden, datasets)

def generateRegressionWithGradualDrift(intercept, domain=[range(0,130), range(70,200)], transition_in=[1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,0,0], transition_out=[0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,0,1,1], noise=[[0 for x in range(0,130)], [0 for x in range(0,130)]], columns=['x', 'y']):
  datasets = []
  for d in zip(domain, noise, intercept):
    dom = [x for index, x in enumerate(d[0]) if ((index < len(transition_in) and transition_in[index] == 1) or (index > len(transition_in) and index<len(d[0])-len(transition_out)) or (index>len(d[0])-len(transition_out) and transition_out[index - len(d[0])+len(transition_out)] == 1))] 
    nos = [x for index, x in enumerate(d[1]) if ((index < len(transition_in) and transition_in[index] == 1) or (index > len(transition_in) and index<len(d[1])-len(transition_out)) or (index>len(d[1])-len(transition_out) and transition_out[index - len(d[1])+len(transition_out)] == 1))] 
    dataset = generateRegressionDataSet(intercept=d[2], domain=dom, noise=nos, columns=columns)
    datasets.append(dataset)    
  return reduce(mergeDataSetsSudden, datasets) 

def mergeDataSetsIncremental(dataset1, dataset2, combine_fn, steps=20):
  res = []
  for data in dataset1.values.tolist()[0:len(dataset1)-steps]:
    res.append(data)
  for i in range(0,steps):
    item1 = dataset1.values.tolist()[len(dataset1)-steps + i]
    item2 = dataset2.values.tolist()[i]
    res.append(combine_fn(item1, item2, i, steps))
  for data in dataset2.values.tolist()[steps:]:
    res.append(data)
  return pd.DataFrame(data=res, columns=dataset1.columns)

def generateRegressionDataSetWithIncrementalDrift(intercept, combine_fn, domain=[range(0,110), range(90,200)], noise=[[0 for x in range(0,110)], [0 for x in range(0,110)]], columns=['x', 'y']):
  datasets = []
  for d in zip(domain, noise, intercept):
    dataset = generateRegressionDataSet(intercept=d[2], domain=d[0], noise=d[1], columns=columns)
    datasets.append(dataset)
  return reduce(lambda d1, d2: mergeDataSetsIncremental(d1, d2, combine_fn=combine_fn, steps=20), datasets)

def normalizeData(dataset):
  return (dataset-dataset.mean())/dataset.std()

random_state = np.random.RandomState(42)

dataset1 = normalizeData(generateRegressionDataSetWithSuddenDrift(intercept=[[0,10], [0,30]], noise=[random_state.normal(0,2, 100), random_state.normal(0,2, 100)]))
dataset2 = normalizeData(generateRegressionWithGradualDrift(intercept=[[0,10], [0,30]], noise=[random_state.normal(0,2, 130), random_state.normal(0,2, 130)]))
dataset3 = normalizeData(generateRegressionDataSetWithIncrementalDrift(intercept=[[0,10], [0,30]], combine_fn=lambda item1, item2, pos, steps: [item1[0], (1 - pos/steps)*item1[1] + (pos/steps)*item2[1]], noise=[random_state.normal(0,2, 130), random_state.normal(0,2, 130)]))
dataset4 = normalizeData(generateRegressionDataSetWithSuddenDrift(intercept=[[1,0], [4,0]], noise=[random_state.normal(0,40, 100), random_state.normal(0,40, 100)]))
dataset5 = normalizeData(generateRegressionWithGradualDrift(intercept=[[1,0], [4,0]], noise=[random_state.normal(0,40, 130), random_state.normal(0,40, 130)]))
dataset6 = normalizeData(generateRegressionDataSetWithIncrementalDrift(intercept=[[1,0], [4,0]], combine_fn=lambda item1, item2, pos, steps: [item1[0], (1 - pos/steps)*item1[1] + (pos/steps)*item2[1]], noise=[random_state.normal(0,40, 130), random_state.normal(0,40, 130)]))
dataset7 = normalizeData(generateRegressionDataSetWithSuddenDrift(intercept=[[1,0], [4,-300]], noise=[random_state.normal(0,40, 100), random_state.normal(0,40, 100)]))
dataset8 = normalizeData(generateRegressionWithGradualDrift(intercept=[[1,0], [4,-300]], noise=[random_state.normal(0,40, 130), random_state.normal(0,40, 130)]))
dataset9 = normalizeData(generateRegressionDataSetWithIncrementalDrift(intercept=[[1,0], [4,-300]], combine_fn=lambda item1, item2, pos, steps: [item1[0], (1 - pos/steps)*item1[1] + (pos/steps)*item2[1]], noise=[random_state.normal(0,40, 130), random_state.normal(0,40, 130)]))


datasets = [
    [dataset1, dataset2, dataset3],
    [dataset4, dataset5, dataset6],
    [dataset7, dataset8, dataset9]]



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Cálculo de la incertidumbre

In [5]:


def getUncertaintyByQuantiles(dataset):
  sorted_dataset = dataset.sort_values(by=['x'])
  X = list(map(lambda x: {'x': x}, sorted_dataset['x']))
  Y =  sorted_dataset['y'].values.tolist()
  model = preprocessing.StandardScaler() | linear_model.LinearRegression(optimizer=optim.Adam())

  uncert = { 
      'uncert95': preprocessing.StandardScaler() | preprocessing.TargetStandardScaler(
          regressor=linear_model.LinearRegression(intercept_lr=0, optimizer=optim.SGD(0.03), loss=optim.losses.Quantile(alpha=0.95))),
      'uncert05': preprocessing.StandardScaler() | preprocessing.TargetStandardScaler(
          regressor=linear_model.LinearRegression(intercept_lr=0, optimizer=optim.SGD(0.03), loss=optim.losses.Quantile(alpha=0.05)))
  }
  res = {'x': [], 'y':[], 'uncert95': [], 'uncert05':[], 'metric': [], 'y_pred':[]}
  
  for x,y in zip(X,Y):
    res.get('x').append(x.get('x'))
    res.get('y').append(y)
    y_pred = model.predict_one(x)
    res.get('y_pred').append(y_pred)
    model.learn_one(x,y)


    for key in uncert.keys():
      uncert_model = uncert.get(key)
      y_pred = uncert_model.predict_one(x)
      res.get(key).append(y_pred)
      uncert_model.learn_one(x,y)
  return res



## Algoritmo de detección de Drift



In [6]:


def detectDrift(upper, lower, window_size=30, threshold=2):
  mean = utils.Rolling(stats.Mean(), window_size=window_size)
  max = stats.RollingMax(window_size)

  drifts = []
  diffs = np.array(upper) - np.array(lower)

  for index, diff in enumerate(diffs):
    if (mean.update(diff).get() + threshold < diff) and (max.get() == 0):
      max.update(1)
      drifts.append(index)
    else:
      max.update(0)
    
  return drifts 

drift_info = []

for index, dataset in  enumerate([dataset1,dataset2,dataset3,dataset4,dataset5,dataset6,dataset7,dataset8,dataset9]):


  data = getUncertaintyByQuantiles(dataset1)
  drift_detectors = [ADWIN(), HDDM_W(), EDDM(), HDDM_A(), KSWIN(), PageHinkley(), DDM()]
  drift_detector_names = ['ADWIN', 'HDDM_W', 'EDDM', 'HDDM_A', 'KSWIN', 'PageHinkley', 'DDM']

  for drift_detector, drift_detector_name in zip(drift_detectors, drift_detector_names):
    dd = []
    for i, y, y_pred in zip(range(len(data.get('y'))),data.get('y'), data.get('y_pred')):
      drift_detector.update(abs(y-y_pred))
      if drift_detector.drift_detected:
        if hasattr(drift_detector, 'reset') and callable(getattr(drift_detector, 'reset')):
          drift_detector.reset()
        drift_info.append({'algorithm': drift_detector_name, 'dataset': 1, 'position': i })
  
  for threshold in [0.3, 0.4, 0.5, 0.6, 0.7, 1.0, 1.5]:
    for d in detectDrift(data['uncert95'], data['uncert05'], threshold=threshold):
      drift_info.append({'algorithm': 'TFM {}'.format(threshold), 'dataset': 1, 'position': d })

drift_dataframe = pd.DataFrame(drift_info)
drift_dataframe['real_drift'] = (drift_dataframe['position'] <= 130) & (drift_dataframe['position'] >= 70)
drift_dataframe['distance'] = abs(100 - drift_dataframe['position'].abs())
drift_dataframe['distance2'] = drift_dataframe['distance'] * drift_dataframe['distance']

algorithms = list(set(drift_dataframe['algorithm'].tolist()))
algorithms.sort()


pd.DataFrame([{'algoritmo': algorithm, 
               'detecciones': len(drift_dataframe[(drift_dataframe['algorithm']==algorithm)]),
               'TP': len(drift_dataframe[(drift_dataframe['algorithm']==algorithm) & (drift_dataframe['real_drift'] == True)]),
               'TP%': "{:10.2f}".format(len(drift_dataframe[(drift_dataframe['algorithm']==algorithm) & (drift_dataframe['real_drift'] == True)]) / len(drift_dataframe[(drift_dataframe['algorithm']==algorithm)])),
               'FP': len(drift_dataframe[(drift_dataframe['algorithm']==algorithm) & (drift_dataframe['real_drift'] == False)]),
               'TP-\u03BC(d\u00b2)': "{:10.2f}".format(drift_dataframe[(drift_dataframe['algorithm']==algorithm) & (drift_dataframe['real_drift'] == True)]['distance2'].mean()),
               'TP-\u221a\u03BC(d\u00b2)': "{:10.2f}".format(math.sqrt(drift_dataframe[(drift_dataframe['algorithm']==algorithm) & (drift_dataframe['real_drift'] == True)]['distance2'].mean())),
               'TP-\u03BC(d)': "{:10.2f}".format(drift_dataframe[(drift_dataframe['algorithm']==algorithm) & (drift_dataframe['real_drift'] == True)]['distance'].mean()),} for algorithm in algorithms])



Unnamed: 0,algoritmo,detecciones,TP,TP%,FP,TP-μ(d²),TP-√μ(d²),TP-μ(d)
0,DDM,9,9,1.0,0,36.0,6.0,6.0
1,HDDM_A,9,9,1.0,0,1.0,1.0,1.0
2,HDDM_W,9,9,1.0,0,9.0,3.0,3.0
3,KSWIN,9,9,1.0,0,289.0,17.0,17.0
4,TFM 0.3,18,9,0.5,9,4.0,2.0,2.0
5,TFM 0.4,18,9,0.5,9,9.0,3.0,3.0
6,TFM 0.5,18,9,0.5,9,9.0,3.0,3.0
7,TFM 0.6,9,9,1.0,0,16.0,4.0,4.0
8,TFM 0.7,9,9,1.0,0,25.0,5.0,5.0
9,TFM 1.0,9,9,1.0,0,49.0,7.0,7.0
