In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from river.drift import ADWIN, PageHinkley, KSWIN
import warnings
warnings.filterwarnings("ignore")

# -----------------------------
# Lightweight custom detectors
# -----------------------------
class DDM:
    """Drift Detection Method on a 0/1 error stream"""
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.var = 0.0
        self.n = 0
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.var = 0.0
        else:
            # Welford update
            prev_mean = self.mean
            self.mean += (error - prev_mean) / self.n
            self.var += (error - prev_mean) * (error - self.mean)

        if self.n < 2:
            std = 0.0
        else:
            std = np.sqrt(self.var / (self.n - 1))

        if self.n >= self.min_num_instances:
            # drift?
            if self.mean + std > self.mean_min + self.drift_level * self.std_min:
                return True
            # update best-so-far
            if self.mean + std < self.mean_min + self.warning_level * self.std_min:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, std)
        return False


class PCDM:
    """Very small permutation test on a single stream value (e.g., PC1)"""
    def __init__(self, window_size=50, n_permutations=50, alpha=0.01):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.ref = []
        self.cur = []

    def add_element(self, v):
        self.cur.append(v)
        if len(self.cur) > self.window_size:
            self.cur.pop(0)
        if len(self.ref) < self.window_size:
            self.ref.append(v)
            return False
        if len(self.cur) < self.window_size:
            return False

        ref = np.array(self.ref, float)
        cur = np.array(self.cur, float)
        obs = abs(ref.mean() - cur.mean())
        both = np.concatenate([ref, cur])
        cnt = 0
        for _ in range(self.n_permutations):
            np.random.shuffle(both)
            if abs(both[:self.window_size].mean() - both[self.window_size:].mean()) >= obs:
                cnt += 1
        p = cnt / self.n_permutations
        drift = p < self.alpha
        if drift:
            self.ref = self.cur.copy()
        return drift

# -----------------------------
# Core routine (one dataset)
# -----------------------------
def run_one_dataset(data_file, data_type,
                    base_window=800,    # initial fit window
                    roll_window=100,    # rolling anomaly-rate window (for recovery)
                    adapt_window=500,   # samples to refit after drift
                    consensus_k=3,      # #detectors needed for consensus
                    tolerance=25,       # DDA: match tolerance to consensus
                    epsilon_sigma=2.0,  # AL: baseline + 2*std
                    verbose=True):

    # --- 1) Load & pick features
    df = pd.read_csv(data_file)
    if data_type == "keystroke":
        feats_all = ["dwell_time", "flight_time", "up_down_time", "session_duration", "rhythm"]
    else:
        # mouse: pick what exists
        candidates = ["speed", "distance", "delta_x", "delta_y"]
        feats_all = [c for c in candidates if c in df.columns]
        if not feats_all:  # fallback common minimal set
            raise ValueError("No mouse features found among speed/distance/delta_x/delta_y")

    df = df.dropna(subset=feats_all)
    X = df[feats_all].values.astype(float)
    print(f"{data_type.capitalize()} dataset loaded: {len(X)} rows, features={feats_all}")

    if len(X) <= base_window + roll_window + 10:
        raise ValueError("Dataset too small for chosen windows. Reduce base_window/roll_window or use more data.")

    # --- 2) Scale + PCA(1) to get a compact stream signal
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    pca = PCA(n_components=1, random_state=42)
    pc1 = pca.fit_transform(Xs[:base_window]).ravel()

    # --- 3) One-class models on the baseline
    ocsvm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.05)  # tighter boundary -> alerts when shifting
    iso   = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
    ocsvm.fit(Xs[:base_window])
    iso.fit(Xs[:base_window])

    # establish baseline anomaly rate & std over a clean slice (just before streaming)
    base_slice = Xs[base_window:base_window+roll_window]
    base_pred_oc = ocsvm.predict(base_slice)          # +1 inlier, -1 outlier
    base_anom_rate = np.mean(base_pred_oc == -1)
    # small std estimate: binomial approx with n=roll_window
    base_std = np.sqrt(max(base_anom_rate * (1 - base_anom_rate), 1e-6) / roll_window)
    recovery_threshold = base_anom_rate + epsilon_sigma * base_std

    # --- 4) Drift detectors
    adwin_pc  = ADWIN(delta=0.002)
    ph_pc     = PageHinkley(threshold=30, alpha=0.01)
    ks_pc     = KSWIN(alpha=0.01, window_size=100, stat_size=30)
    ddm_err   = DDM(min_num_instances=30, warning_level=2.0, drift_level=3.0)
    pcdm_pc   = PCDM(window_size=50, n_permutations=50, alpha=0.01)

    adwin_oc  = ADWIN(delta=0.002)  # on ocsvm score
    adwin_iso = ADWIN(delta=0.002)  # on iso score

    detectors = ["ADWIN_PC", "PH_PC", "KSWIN_PC", "DDM_ERR", "PCDM_PC", "ADWIN_OCSVM", "ADWIN_ISO"]
    det_points = {d: [] for d in detectors}

    # --- 5) Streaming + consensus + adaptation
    consensus_points = []
    AL_values = []

    # buffers for rolling
    roll_preds = []  # OCSVM anomaly flags in rolling window to compute recovery
    # keep an index where we are "waiting for recovery"
    waiting_recovery = False
    recovery_start_idx = None

    # Helper for anomaly scores (sign so that larger = more anomalous)
    def ocsvm_score(x):
        # decision_function: positive -> inlier; negative -> outlier; lower = more anomalous
        return -ocsvm.decision_function(x.reshape(1, -1))[0]

    def iso_score(x):
        # isolationForest.decision_function: higher = more normal; lower = more anomalous
        return -iso.decision_function(x.reshape(1, -1))[0]

    # start stream from base_window
    for i in range(base_window, len(Xs)):
        x = Xs[i]
        v_pc1 = float(pca.transform(x.reshape(1, -1))[0, 0])

        # anomaly flags / scores
        oc_pred = ocsvm.predict(x.reshape(1, -1))[0]      # +1/-1
        err_flag = 1 if oc_pred == -1 else 0              # 1 => anomaly (used by DDM)

        s_oc = ocsvm_score(x)
        s_iso = iso_score(x)

        # update rolling anomaly rate buffer
        roll_preds.append(err_flag)
        if len(roll_preds) > roll_window:
            roll_preds.pop(0)
        roll_rate = np.mean(roll_preds) if roll_preds else 0.0

        # per-detector triggers this step
        fired = []

        # update detectors
        if adwin_pc.update(v_pc1):   fired.append("ADWIN_PC")
        if ph_pc.update(v_pc1):      fired.append("PH_PC")
        if ks_pc.update(v_pc1):      fired.append("KSWIN_PC")
        if ddm_err.add_element(err_flag): fired.append("DDM_ERR")
        if pcdm_pc.add_element(v_pc1):    fired.append("PCDM_PC")
        if adwin_oc.update(s_oc):    fired.append("ADWIN_OCSVM")
        if adwin_iso.update(s_iso):  fired.append("ADWIN_ISO")

        # record detector points
        for d in fired:
            det_points[d].append(i)

        # consensus drift?
        if len(fired) >= consensus_k:
            consensus_points.append(i)

            # ADAPT: refit PCA + models on most recent window (use what we have)
            start_refit = max(0, i - adapt_window)
            Xref = Xs[start_refit:i]
            if len(Xref) >= 50:  # minimal size
                pca = PCA(n_components=1, random_state=42)
                pca.fit(Xref)

                ocsvm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.05)
                iso = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
                ocsvm.fit(Xref)
                iso.fit(Xref)

                # re-init streaming detectors (fresh regime)
                adwin_pc  = ADWIN(delta=0.002)
                ph_pc     = PageHinkley(threshold=30, alpha=0.01)
                ks_pc     = KSWIN(alpha=0.01, window_size=100, stat_size=30)
                ddm_err   = DDM(min_num_instances=30, warning_level=2.0, drift_level=3.0)
                pcdm_pc   = PCDM(window_size=50, n_permutations=50, alpha=0.01)
                adwin_oc  = ADWIN(delta=0.002)
                adwin_iso = ADWIN(delta=0.002)

                # baseline & threshold recompute from immediate post-adaptation window (if possible)
                base_start = max(0, i - roll_window)
                base_slice = Xs[base_start:i] if i - base_start >= roll_window else Xs[max(0, i-2*roll_window):i]
                if len(base_slice) >= roll_window:
                    base_pred_oc = ocsvm.predict(base_slice)
                    base_anom_rate = np.mean(base_pred_oc == -1)
                    base_std = np.sqrt(max(base_anom_rate * (1 - base_anom_rate), 1e-6) / roll_window)
                    recovery_threshold = base_anom_rate + 2.0 * base_std

            # start AL timer: wait until roll_rate <= threshold
            waiting_recovery = True
            recovery_start_idx = i

        # measure AL if waiting
        if waiting_recovery and len(roll_preds) == roll_window:
            if roll_rate <= recovery_threshold:
                AL_values.append(i - recovery_start_idx)
                waiting_recovery = False
                recovery_start_idx = None

    # --- 6) Metrics: DDA per detector vs consensus points
    # collapse consensus points (avoid duplicates within tolerance)
    collapsed = []
    for t in consensus_points:
        if not collapsed or (t - collapsed[-1]) > tolerance:
            collapsed.append(t)
    consensus_points = collapsed

    def dda_for(det_list, consensus, tol):
        if len(det_list) == 0:
            return 0.0
        correct = 0
        j = 0
        for d in det_list:
            # advance consensus pointer
            while j < len(consensus) and consensus[j] < d - tol:
                j += 1
            if j < len(consensus) and abs(consensus[j] - d) <= tol:
                correct += 1
        return correct / max(1, len(det_list))

    dda_scores = {d: dda_for(det_points[d], consensus_points, tolerance) for d in detectors}

    # --- 7) Print results
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(Xs)} | Base window: {base_window} | Roll window: {roll_window} | Adapt window: {adapt_window}")
    print(f"Detectors: {detectors}")
    print(f"Consensus K: {consensus_k} | Match tolerance (DDA): ±{tolerance} samples")
    print("------------------------------------------")
    print(f"Total consensus drift points: {len(consensus_points)}")
    if consensus_points[:10]:
        print(f"First consensus points: {consensus_points[:10]}")
    print("------------------------------------------")
    print("Per-detector detections:")
    for d in detectors:
        print(f"  {d:12s}: {len(det_points[d])} detections  |  DDA vs consensus: {dda_scores[d]:.3f}")
    print("------------------------------------------")
    if AL_values:
        print(f"AL (Adaptation Latency) count: {len(AL_values)}")
        print(f"AL mean: {np.mean(AL_values):.1f} | median: {np.median(AL_values):.1f} | min: {np.min(AL_values)} | max: {np.max(AL_values)}")
        print(f"First 10 ALs: {AL_values[:10]}")
    else:
        print("No AL measured (no consensus drifts or no recovery observed).")
    print("==========================================\n")

    # Return (if you want to inspect programmatically)
    return {
        "detector_points": det_points,
        "consensus_points": consensus_points,
        "dda": dda_scores,
        "AL": AL_values,
        "baseline_rate": base_anom_rate,
        "recovery_threshold": recovery_threshold
    }

# -----------------------------
# Run both datasets
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/imputed_keystroke_data.csv",
    "mouse": "/Users/festusedward-n/Documents/Datasets/mouse_modified_trimmed_clean_imputed.csv",
}

# You can tweak per-dataset windows if needed (mouse is larger)
results_all = {}
for dtype, path in datasets.items():
    if dtype == "mouse":
        res = run_one_dataset(path, dtype,
                              base_window=1500, roll_window=200, adapt_window=800,
                              consensus_k=3, tolerance=30, epsilon_sigma=2.0)
    else:
        res = run_one_dataset(path, dtype,
                              base_window=800, roll_window=100, adapt_window=500,
                              consensus_k=3, tolerance=25, epsilon_sigma=2.0)
    results_all[dtype] = res


Keystroke dataset loaded: 19996 rows, features=['dwell_time', 'flight_time', 'up_down_time', 'session_duration', 'rhythm']

Data type: keystroke
Samples: 19996 | Base window: 800 | Roll window: 100 | Adapt window: 500
Detectors: ['ADWIN_PC', 'PH_PC', 'KSWIN_PC', 'DDM_ERR', 'PCDM_PC', 'ADWIN_OCSVM', 'ADWIN_ISO']
Consensus K: 3 | Match tolerance (DDA): ±25 samples
------------------------------------------
Total consensus drift points: 0
------------------------------------------
Per-detector detections:
  ADWIN_PC    : 0 detections  |  DDA vs consensus: 0.000
  PH_PC       : 0 detections  |  DDA vs consensus: 0.000
  KSWIN_PC    : 0 detections  |  DDA vs consensus: 0.000
  DDM_ERR     : 18760 detections  |  DDA vs consensus: 0.000
  PCDM_PC     : 275 detections  |  DDA vs consensus: 0.000
  ADWIN_OCSVM : 0 detections  |  DDA vs consensus: 0.000
  ADWIN_ISO   : 0 detections  |  DDA vs consensus: 0.000
------------------------------------------
No AL measured (no consensus drifts or no re

In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN

# -----------------------------
# Custom Detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class EDDM:
    def __init__(self, min_num_instances=30, warning_level=0.95, drift_level=0.9):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.n = 0
        self.last_error = 0
        self.distances = []
        self.drift_detected = False
        self.max_mean = 0.0

    def add_element(self, prediction, true_label):
        error = 1 if prediction != true_label else 0
        if self.n > 0 and error == 1:
            self.distances.append(self.n - self.last_error)
        if error == 1:
            self.last_error = self.n
        self.n += 1
        if len(self.distances) > 1:
            mean = np.mean(self.distances)
            std = np.std(self.distances)
            if self.n >= self.min_num_instances:
                m = (mean + 2 * std) / self.max_mean if self.max_mean > 0 else float("inf")
                if m < self.drift_level:
                    self.drift_detected = True
                else:
                    self.drift_detected = False
                if not self.drift_detected:
                    self.max_mean = max(self.max_mean, mean + 2 * std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=100, alpha=0.01):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"\n{data_type.capitalize()} dataset loaded: {len(df)} rows")

    # Choose features dynamically
    if data_type == "keystroke":
        features = ["dwell_time", "flight_time", "up_down_time", "session_duration", "rhythm"]
    else:  # mouse
        possible_features = ["speed", "distance", "delta_x", "delta_y"]
        features = [f for f in possible_features if f in df.columns]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize detectors
    detectors = {
        "ADWIN": ADWIN(delta=0.002),
        "PageHinkley": PageHinkley(threshold=30, alpha=0.01),
        "KSWIN": KSWIN(alpha=0.005, window_size=100),
        "DDM": DDM(),
        "EDDM": EDDM(),
        "PCDM": PCDM()
    }

    drift_points = {name: [] for name in detectors}

    # Sequentially feed samples
    for i, x in enumerate(X_scaled):
        for name, det in detectors.items():
            if name in ["DDM", "EDDM"]:
                pred = 0 if i % 2 == 0 else 1
                drift = det.add_element(pred, 1) if name == "EDDM" else det.add_element(pred)
            elif name == "PCDM":
                drift = det.add_element(np.mean(x))
            else:
                drift = det.update(np.mean(x))
            if drift:
                drift_points[name].append(i)

    # Print results directly
    print(f"\nDrift Points Summary for {data_type}:")
    for det, points in drift_points.items():
        print(f"  {det}: {len(points)} drifts detected, first 5 at {points[:5]}")
    return drift_points


# -----------------------------
# Run for both datasets
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/imputed_keystroke_data.csv",
    "mouse": "/Users/festusedward-n/Documents/Datasets/mouse_modified_trimmed_clean_imputed.csv",
}

for dtype, path in datasets.items():
    detect_natural_drift(path, dtype)



Keystroke dataset loaded: 19996 rows

Drift Points Summary for keystroke:
  ADWIN: 0 drifts detected, first 5 at []
  PageHinkley: 0 drifts detected, first 5 at []
  KSWIN: 0 drifts detected, first 5 at []
  DDM: 0 drifts detected, first 5 at []
  EDDM: 0 drifts detected, first 5 at []
  PCDM: 547 drifts detected, first 5 at [90, 105, 116, 356, 380]

Mouse dataset loaded: 252397 rows

Drift Points Summary for mouse:
  ADWIN: 0 drifts detected, first 5 at []
  PageHinkley: 0 drifts detected, first 5 at []
  KSWIN: 0 drifts detected, first 5 at []
  DDM: 0 drifts detected, first 5 at []
  EDDM: 0 drifts detected, first 5 at []
  PCDM: 5055 drifts detected, first 5 at [60, 71, 84, 98, 108]


In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + AL pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")

    # Choose features dynamically
    if data_type == "keystroke":
        features = ["dwell_time", "flight_time", "up_down_time", "session_duration", "rhythm"]
    else:  # mouse
        possible_features = ["speed", "distance", "delta_x", "delta_y"]
        features = [f for f in possible_features if f in df.columns]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values = []

    in_recovery = False
    recovery_start = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency: start recovery after drift
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i

        if in_recovery:
            # Check if models recover (low anomaly rate over last 200 samples)
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL = i - recovery_start
                    AL_values.append(AL)
                    in_recovery = False

    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features}")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections")
    if AL_values:
        print(f"\nAdaptation Latency (mean over drifts): {np.mean(AL_values):.2f} samples")
    else:
        print("\nNo AL measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values


# -----------------------------
# Run for both datasets
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/imputed_keystroke_data.csv",
    "mouse": "/Users/festusedward-n/Documents/Datasets/mouse_modified_trimmed_clean_imputed.csv",
}

for dtype, path in datasets.items():
    drift_points, AL = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 19996 rows

Data type: keystroke
Samples: 19996 | Features: ['dwell_time', 'flight_time', 'up_down_time', 'session_duration', 'rhythm']
------------------------------------------
  ADWIN_PC    : 0 detections
  PH_PC       : 0 detections
  KSWIN_PC    : 0 detections
  DDM_ERR     : 19721 detections
  PCDM_PC     : 902 detections
  ADWIN_OCSVM : 0 detections
  ADWIN_ISO   : 0 detections

Adaptation Latency (mean over drifts): 222.98 samples

Mouse dataset loaded: 252397 rows

Data type: mouse
Samples: 252397 | Features: ['speed', 'distance']
------------------------------------------
  ADWIN_PC    : 0 detections
  PH_PC       : 0 detections
  KSWIN_PC    : 0 detections
  DDM_ERR     : 251311 detections
  PCDM_PC     : 8079 detections
  ADWIN_OCSVM : 0 detections
  ADWIN_ISO   : 0 detections

Adaptation Latency (mean over drifts): 201.00 samples



In [10]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")

    # Choose features dynamically
    if data_type == "keystroke":
        features = ["dwell_time", "flight_time", "up_down_time", "session_duration", "rhythm"]
    else:  # mouse
        possible_features = ["speed", "distance", "delta_x", "delta_y"]
        features = [f for f in possible_features if f in df.columns]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation metrics
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            # Baseline before drift
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL = i - recovery_start
                    AL_values.append(AL)

                    # DDA = approximated as AL
                    DDA_values.append(AL)

                    # RA = accuracy after recovery vs baseline
                    if baseline_acc is not None:
                        rec_acc = 1 - np.mean(recent_errs)
                        RA_values.append(rec_acc / baseline_acc * 100)

                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    # Summary
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features}")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples "
              f"({np.mean(AL_values)/len(X_scaled)*100:.2f}%)")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values


# -----------------------------
# Run for both datasets
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/imputed_keystroke_data.csv",
    "mouse": "/Users/festusedward-n/Documents/Datasets/mouse_modified_trimmed_clean_imputed.csv",
}

for dtype, path in datasets.items():
    drift_points, AL, DDA, RA, FAR = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 19996 rows

Data type: keystroke
Samples: 19996 | Features: ['dwell_time', 'flight_time', 'up_down_time', 'session_duration', 'rhythm']
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 19721 detections (FAR=98.62%)
  PCDM_PC     : 902 detections (FAR=4.51%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 222.97 samples (1.12%)
Detection Delay Accuracy (DDA): 222.97 samples
Recovery Accuracy (RA): 99.96%

Mouse dataset loaded: 252397 rows

Data type: mouse
Samples: 252397 | Features: ['speed', 'distance']
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 251311 detections (FAR=99.57%)
  PCDM_PC     : 8004 detections (FAR=3.17%)
  ADWIN_OCSVM 

In [13]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")

    # Choose features dynamically
    if data_type == "keystroke":
        features = ["dwell_time", "flight_time", "up_down_time", "session_duration", "rhythm"]
    else:  # mouse
        possible_features = ["speed", "distance", "delta_x", "delta_y"]
        features = [f for f in possible_features if f in df.columns]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models (looser params for realism)
    ocsvm = OneClassSVM(nu=0.05, gamma=0.1)
    iso = IsolationForest(contamination=0.05, random_state=42)
    ocsvm.fit(X_scaled[:min(1000, len(X_scaled))])
    iso.fit(X_scaled[:min(1000, len(X_scaled))])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None

    # Track error rates
    all_errors = []

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0
        error = max(err_ocsvm, err_iso)
        all_errors.append(error)

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(error)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation metrics
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            # Baseline before drift
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.15:  # relaxed threshold
                    AL = i - recovery_start
                    AL_values.append(AL)
                    DDA_values.append(AL)

                    if baseline_acc is not None:
                        rec_acc = 1 - np.mean(recent_errs)
                        improvement = max(0, rec_acc - baseline_acc)
                        RA_values.append(improvement * 100)

                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    # Extra metrics
    AER = np.mean(all_errors) * 100  # average error rate
    drift_density = sum(len(v) for v in drift_points.values()) / total_samples * 100

    # Summary
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features}")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")

    print(f"Average Error Rate (AER): {AER:.2f}%")
    print(f"Drift Density (DD): {drift_density:.2f}%")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values, AER, drift_density


# -----------------------------
# Run for both datasets
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/imputed_keystroke_data.csv",
    "mouse": "/Users/festusedward-n/Documents/Datasets/mouse_modified_trimmed_clean_imputed.csv",
}

for dtype, path in datasets.items():
    detect_natural_drift(path, dtype)


Keystroke dataset loaded: 19996 rows

Data type: keystroke
Samples: 19996 | Features: ['dwell_time', 'flight_time', 'up_down_time', 'session_duration', 'rhythm']
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 19069 detections (FAR=95.36%)
  PCDM_PC     : 889 detections (FAR=4.45%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 224.70 samples
Detection Delay Accuracy (DDA): 224.70 samples
Recovery Accuracy (RA): 2.20%
Average Error Rate (AER): 9.01%
Drift Density (DD): 99.81%

Mouse dataset loaded: 252397 rows

Data type: mouse
Samples: 252397 | Features: ['speed', 'distance']
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 251310 detections (FAR=99.57%)
  PCDM_P