In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + AL pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")  # show first 20 column names

    # Dynamically select numeric features (exclude IDs)
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values = []

    in_recovery = False
    recovery_start = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL = i - recovery_start
                    AL_values.append(AL)
                    in_recovery = False

    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features used: {features[:10]} ...")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections")
    if AL_values:
        print(f"\nAdaptation Latency (mean over drifts): {np.mean(AL_values):.2f} samples")
    else:
        print("\nNo AL measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    drift_points, AL = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features used: ['H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e'] ...
------------------------------------------
  ADWIN_PC    : 0 detections
  PH_PC       : 0 detections
  KSWIN_PC    : 0 detections
  DDM_ERR     : 17663 detections
  PCDM_PC     : 599 detections
  ADWIN_OCSVM : 0 detections
  ADWIN_ISO   : 0 detections

Adaptation Latency (mean over drifts): 2740.17 samples



In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + AL pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # Dynamically select numeric features
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models (global, not per-feature)
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize drift detectors per feature
    detectors = {}
    for f in features:
        detectors[f"ADWIN_{f}"] = ADWIN(delta=0.01)
        detectors[f"PH_{f}"] = PageHinkley(threshold=10, alpha=0.01)
        detectors[f"KSWIN_{f}"] = KSWIN(alpha=0.1, window_size=100)
        detectors[f"PCDM_{f}"] = PCDM(alpha=0.05)
    detectors["DDM_ERR"] = DDM()  # error-based only once

    drift_points = {name: [] for name in detectors}
    AL_values = []

    in_recovery = False
    recovery_start = None

    for i, x in enumerate(X_scaled):
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for j, f in enumerate(features):
            val = x[j]
            for name, det in detectors.items():
                if name == "DDM_ERR":
                    drift = det.add_element(err_ocsvm or err_iso)
                elif name.startswith("PCDM") and name.endswith(f):
                    drift = det.add_element(val)
                elif name.startswith("ADWIN") and name.endswith(f):
                    drift = det.update(val)
                elif name.startswith("PH") and name.endswith(f):
                    drift = det.update(val)
                elif name.startswith("KSWIN") and name.endswith(f):
                    drift = det.update(val)
                else:
                    continue

                if drift:
                    drift_detected = True
                    drift_points[name].append(i)

        # Adaptation Latency
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL = i - recovery_start
                    AL_values.append(AL)
                    in_recovery = False

    # -----------------------------
    # Summary per feature
    # -----------------------------
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {len(features)}")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        if len(pts) > 0:
            print(f"  {det:15}: {len(pts)} detections")
    if AL_values:
        print(f"\nAdaptation Latency (mean over drifts): {np.mean(AL_values):.2f} samples")
    else:
        print("\nNo AL measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    drift_points, AL = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: 31
------------------------------------------
  PCDM_H.period  : 583 detections
  PCDM_DD.period.t: 359 detections
  PCDM_UD.period.t: 344 detections
  PCDM_H.t       : 566 detections
  PCDM_DD.t.i    : 338 detections
  PCDM_UD.t.i    : 323 detections
  PCDM_H.i       : 560 detections
  PCDM_DD.i.e    : 321 detections
  PCDM_UD.i.e    : 296 detections
  PCDM_H.e       : 549 detections
  PCDM_DD.e.five : 407 detections
  PCDM_UD.e.five : 412 detections
  PCDM_H.five    : 488 detections
  PCDM_DD.five.Shift.r: 381 detections
  PCDM_UD.five.Shift.r: 362 detections
  PCDM_H.Shift.r : 589 detections
  PCDM_DD.Shift.r.o: 416 detections
  PCDM_UD.Sh

In [10]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + AL pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # Select numeric features
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            # Save baseline accuracy before drift
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL = i - recovery_start
                    AL_values.append(AL)

                    # DDA = detection delay (here approximated as AL itself)
                    DDA_values.append(AL)

                    # RA = recovery accuracy compared to baseline
                    if baseline_acc is not None:
                        rec_acc = 1 - np.mean(recent_errs)
                        RA_values.append(rec_acc / baseline_acc * 100)

                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features[:10]} ...")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    drift_points, AL, DDA, RA, FAR = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: ['H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e'] ...
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 17663 detections (FAR=86.58%)
  PCDM_PC     : 604 detections (FAR=2.96%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 3248.80 samples
Detection Delay Accuracy (DDA): 3248.80 samples
Recovery Accuracy (RA): 99.65%



In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size:]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # For CMU: drop IDs and keep numeric timing features (H., DD., UD.)
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.05, gamma="scale")   # more relaxed than 0.01
    iso = IsolationForest(contamination=0.05, random_state=42)  # more realistic contamination
    ocsvm.fit(X_scaled[:2000])  # bigger reference set
    iso.fit(X_scaled[:2000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            # Save baseline accuracy before drift
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.1:  # allow up to 10% anomaly rate
                    AL = i - recovery_start
                    AL_values.append(AL)

                    # DDA ~ AL
                    DDA_values.append(AL)

                    # RA = recovery accuracy compared to baseline
                    if baseline_acc is not None:
                        rec_acc = 1 - np.mean(recent_errs)
                        RA_values.append(rec_acc / baseline_acc * 100)

                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {len(features)}")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values


# -----------------------------
# Run for CMU DSL dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    drift_points, AL, DDA, RA, FAR = detect_natural_drift(path, dtype)
        # Average Error Rate (AER) from OCSVM
    all_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled]
    AER = np.mean(all_errs) * 100

    # Drift Density (DD) using PCDM
    DD = len(drift_points["PCDM_PC"]) / total_samples * 100

    print(f"Average Error Rate (AER): {AER:.2f}%")
    print(f"Drift Density (DD): {DD:.2f}%")



Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: 31
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 15580 detections (FAR=76.37%)
  PCDM_PC     : 628 detections (FAR=3.08%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 718.27 samples
Detection Delay Accuracy (DDA): 718.27 samples
Recovery Accuracy (RA): 99.88%



NameError: name 'X_scaled' is not defined

In [23]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # Select numeric features (exclude IDs/labels)
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None

    # Track errors for AER
    all_errs = []

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0
        all_errs.append(err_ocsvm or err_iso)

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation metrics
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            # Baseline accuracy before drift
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0
                           for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0
                               for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL = i - recovery_start
                    AL_values.append(AL)
                    DDA_values.append(AL)  # here approximated as AL
                    if baseline_acc is not None:
                        rec_acc = 1 - np.mean(recent_errs)
                        RA_values.append(rec_acc / baseline_acc * 100)
                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    # Average Error Rate (AER)
    AER = np.mean(all_errs) * 100

    # Drift Density (DD) from PCDM
    DD = len(drift_points["PCDM_PC"]) / total_samples * 100

    # Summary
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features[:10]} ...")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")

    print(f"Average Error Rate (AER): {AER:.2f}%")
    print(f"Drift Density (DD): {DD:.2f}%")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values, AER, DD


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: ['H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e'] ...
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 17663 detections (FAR=86.58%)
  PCDM_PC     : 608 detections (FAR=2.98%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 2346.71 samples
Detection Delay Accuracy (DDA): 2346.71 samples
Recovery Accuracy (RA): 100.35%
Average Error Rate (AER): 44.59%
Drift Density (DD): 2.98%



In [25]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # Select numeric features (exclude identifiers)
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values, AER_values = [], [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0
        avg_err = (err_ocsvm + err_iso) / 2.0
        AER_values.append(avg_err)

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                post_acc = 1 - np.mean(recent_errs)
                baseline_err = 1 - baseline_acc if baseline_acc is not None else None
                post_err = np.mean(recent_errs)

                if post_err < 0.05:  # recovery condition
                    AL = i - recovery_start
                    AL_values.append(AL)
                    DDA_values.append(AL)

                    # ✅ FIXED: Normalized RA (0–100%)
                    if baseline_err is not None and baseline_err > 0:
                        RA = (1 - (post_err / baseline_err)) * 100
                        RA_values.append(max(min(RA, 100), 0))  # bound between 0 and 100

                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features[:10]} ...")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
        print(f"Average Error Rate (AER): {np.mean(AER_values)*100:.2f}%")
        print(f"Drift Density (DD): {np.mean([len(pts) for pts in drift_points.values()])/total_samples*100:.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    drift_points, AL, DDA, RA, FAR = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: ['H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e'] ...
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 17663 detections (FAR=86.58%)
  PCDM_PC     : 584 detections (FAR=2.86%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 2354.14 samples
Detection Delay Accuracy (DDA): 2354.14 samples
Recovery Accuracy (RA): 25.05%
Average Error Rate (AER): 29.73%
Drift Density (DD): 12.78%



In [27]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # Select numeric features
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency & Recovery
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i

        if in_recovery:
            window_size = 500  # smoother window
            if i - recovery_start > window_size:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-window_size:i]]
                rec_acc = 1 - np.mean(recent_errs)

                if rec_acc > 0.90:  # recovered if accuracy >90%
                    AL = i - recovery_start
                    AL_values.append(AL)
                    DDA_values.append(AL)
                    RA_values.append(rec_acc * 100)  # absolute RA %
                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    # Summary
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {len(X_scaled)} | Features: {features[:10]} ...")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")

    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    drift_points, AL, DDA, RA, FAR = detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: ['H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e'] ...
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 17663 detections (FAR=86.58%)
  PCDM_PC     : 613 detections (FAR=3.00%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 2019.50 samples
Detection Delay Accuracy (DDA): 2019.50 samples
Recovery Accuracy (RA): 93.75%



In [29]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import ADWIN, PageHinkley, KSWIN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# -----------------------------
# Custom detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float('inf')
        self.std_min = float('inf')

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )
        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


class PCDM:
    def __init__(self, window_size=50, n_permutations=50, alpha=0.05):
        self.window_size = window_size
        self.n_permutations = n_permutations
        self.alpha = alpha
        self.reference_window = []
        self.current_window = []
        self.drift_detected = False

    def add_element(self, value):
        self.current_window.append(value)
        if len(self.current_window) > self.window_size:
            self.current_window.pop(0)
        if len(self.reference_window) < self.window_size:
            self.reference_window.append(value)
            return False
        if len(self.current_window) == self.window_size:
            stat, p_value = self._permutation_test()
            self.drift_detected = p_value < self.alpha
            if self.drift_detected:
                self.reference_window = self.current_window.copy()
        return self.drift_detected

    def _permutation_test(self):
        ref = np.array(self.reference_window)
        curr = np.array(self.current_window)
        observed_diff = np.abs(np.mean(ref) - np.mean(curr))
        combined = np.concatenate([ref, curr])
        perm_diffs = []
        for _ in range(self.n_permutations):
            np.random.shuffle(combined)
            perm_ref = combined[: self.window_size]
            perm_curr = combined[self.window_size :]
            perm_diffs.append(np.abs(np.mean(perm_ref) - np.mean(perm_curr)))
        p_value = np.sum(np.array(perm_diffs) >= observed_diff) / self.n_permutations
        return observed_diff, p_value


# -----------------------------
# Drift detection + metrics pipeline
# -----------------------------
def detect_natural_drift(data_file, data_type):
    df = pd.read_csv(data_file)
    print(f"{data_type.capitalize()} dataset loaded: {len(df)} rows")
    print(f"Available columns: {list(df.columns)[:20]} ...")

    # Select numeric features (exclude metadata)
    exclude_cols = ["subject", "sessionIndex", "rep"]
    features = [col for col in df.columns if col not in exclude_cols]

    if not features:
        raise ValueError(f"No valid features found in {data_type} dataset!")

    X = df[features].dropna().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base anomaly models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:1000])
    iso.fit(X_scaled[:1000])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "PCDM_PC": PCDM(alpha=0.05),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values, DDA_values, RA_values = [], [], []
    FAR_values = {}

    in_recovery = False
    recovery_start = None
    baseline_acc = None
    error_rates = []

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0
        error_rates.append(err_ocsvm or err_iso)

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "PCDM_PC":
                drift = det.add_element(x_val)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency
        if drift_detected and not in_recovery:
            in_recovery = True
            recovery_start = i
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[max(0, i-200):i]]
            baseline_acc = 1 - np.mean(recent_errs) if len(recent_errs) > 0 else None

        if in_recovery:
            if i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:  # recovered
                    AL = i - recovery_start
                    AL_values.append(AL)
                    DDA_values.append(AL)
                    if baseline_acc is not None:
                        rec_acc = 1 - np.mean(recent_errs)
                        RA_values.append(rec_acc / max(baseline_acc, 1e-6) * 100)
                    in_recovery = False

    # FAR = detections / total samples
    total_samples = len(X_scaled)
    for det, pts in drift_points.items():
        FAR_values[det] = len(pts) / total_samples * 100

    # AER = mean error rate
    AER = np.mean(error_rates) * 100

    # Drift Density (DD) = (#drift detections by PCDM) / total_samples * 100
    DD = len(drift_points["PCDM_PC"]) / total_samples * 100

    # Summary
    print("\n=== SUMMARY ===============================")
    print(f"Data type: {data_type}")
    print(f"Samples: {total_samples} | Features: {features[:10]} ...")
    print("------------------------------------------")
    for det, pts in drift_points.items():
        print(f"  {det:12}: {len(pts)} detections (FAR={FAR_values[det]:.2f}%)")
    if AL_values:
        print(f"\nAdaptation Latency (AL): {np.mean(AL_values):.2f} samples")
        print(f"Detection Delay Accuracy (DDA): {np.mean(DDA_values):.2f} samples")
        print(f"Recovery Accuracy (RA): {np.mean(RA_values):.2f}%")
        print(f"Average Error Rate (AER): {AER:.2f}%")
        print(f"Drift Density (DD): {DD:.2f}%")
    else:
        print("\nNo AL/DDA/RA measured (no recovery detected).")
    print("==========================================\n")

    return drift_points, AL_values, DDA_values, RA_values, FAR_values, AER, DD


# -----------------------------
# Run for CMU keystroke dataset
# -----------------------------
datasets = {
    "keystroke": "/Users/festusedward-n/Documents/Datasets/DSL-StrongPasswordData 2.csv"
}

for dtype, path in datasets.items():
    detect_natural_drift(path, dtype)


Keystroke dataset loaded: 20400 rows
Available columns: ['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o'] ...

Data type: keystroke
Samples: 20400 | Features: ['H.period', 'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e', 'H.e'] ...
------------------------------------------
  ADWIN_PC    : 0 detections (FAR=0.00%)
  PH_PC       : 0 detections (FAR=0.00%)
  KSWIN_PC    : 0 detections (FAR=0.00%)
  DDM_ERR     : 17663 detections (FAR=86.58%)
  PCDM_PC     : 605 detections (FAR=2.97%)
  ADWIN_OCSVM : 0 detections (FAR=0.00%)
  ADWIN_ISO   : 0 detections (FAR=0.00%)

Adaptation Latency (AL): 2740.67 samples
Detection Delay Accuracy (DDA): 2740.67 samples
Recovery Accuracy (RA): 100.24%
Average Error Rate (AER): 44.59%
Drift Density (DD): 2.97%

