In [19]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from river.drift import ADWIN, PageHinkley, KSWIN

# -----------------------------
# Feature Extraction
# -----------------------------
def extract_mouse_features(file_path):
    df = pd.read_csv(file_path)

    # Ensure column names are clean
    df.columns = df.columns.str.strip()

    required = {"client timestamp", "x", "y"}
    if not required.issubset(df.columns):
        raise ValueError(f"Missing required columns in {file_path}")

    # Compute deltas
    df["dx"] = df["x"].diff().fillna(0)
    df["dy"] = df["y"].diff().fillna(0)
    df["dt"] = df["client timestamp"].diff().replace(0, np.nan).fillna(1)

    # Features
    df["distance"] = np.sqrt(df["dx"]**2 + df["dy"]**2)
    df["speed"] = df["distance"] / df["dt"]

    return df[["speed", "distance", "dx", "dy"]].fillna(0)


# -----------------------------
# Custom Drift Detectors
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, warning_level=2.0, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.warning_level = warning_level
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float("inf")
        self.std_min = float("inf")

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
            self.std = 0.0
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )

        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


# -----------------------------
# Drift Detection on Features
# -----------------------------
def detect_natural_drift_from_features(features):
    X = features.values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Base models
    ocsvm = OneClassSVM(nu=0.01, gamma="scale")
    iso = IsolationForest(contamination=0.01, random_state=42)
    ocsvm.fit(X_scaled[:min(1000, len(X_scaled))])
    iso.fit(X_scaled[:min(1000, len(X_scaled))])

    # Initialize detectors
    detectors = {
        "ADWIN_PC": ADWIN(delta=0.01),
        "PH_PC": PageHinkley(threshold=10, alpha=0.01),
        "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
        "DDM_ERR": DDM(),
        "ADWIN_OCSVM": ADWIN(delta=0.01),
        "ADWIN_ISO": ADWIN(delta=0.01),
    }

    drift_points = {name: [] for name in detectors}
    AL_values = []
    in_recovery, recovery_start = False, None

    for i, x in enumerate(X_scaled):
        x_val = np.mean(x)
        err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
        err_iso = 1 if iso.predict([x])[0] == -1 else 0

        drift_detected = False
        for name, det in detectors.items():
            if name == "DDM_ERR":
                drift = det.add_element(err_ocsvm or err_iso)
            elif name == "ADWIN_OCSVM":
                drift = det.update(err_ocsvm)
            elif name == "ADWIN_ISO":
                drift = det.update(err_iso)
            else:
                drift = det.update(x_val)
            if drift:
                drift_detected = True
                drift_points[name].append(i)

        # Adaptation Latency (example: using OCSVM recovery)
        if drift_detected and not in_recovery:
            in_recovery, recovery_start = True, i

        if in_recovery and i - recovery_start > 200:
            recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
            if np.mean(recent_errs) < 0.05:
                AL = i - recovery_start
                AL_values.append(AL)
                in_recovery = False

    return drift_points, AL_values


# -----------------------------
# Main Loop Over Users
# -----------------------------
root = "/Users/festusedward-n/Documents/Datasets/DFL_Mouse_Dynamics_Dataset/Users"
summary = []

for user in sorted(os.listdir(root)):
    user_path = os.path.join(root, user)
    if not os.path.isdir(user_path):
        continue

    print(f"\n=== Processing {user} ===")
    user_results = {"user": user, "files": 0, "detections": 0, "AL_mean": None}

    drift_counts, all_AL = [], []

    for file in os.listdir(user_path):
        if not file.endswith(".CSV"):
            continue
        file_path = os.path.join(user_path, file)

        try:
            feats = extract_mouse_features(file_path)
            if feats.empty:
                print(f"⚠ Skipped {file_path} (no usable data)")
                continue

            drift_points, AL = detect_natural_drift_from_features(feats)
            user_results["files"] += 1
            user_results["detections"] += sum(len(v) for v in drift_points.values())
            if AL:
                all_AL.extend(AL)

        except Exception as e:
            print(f"❌ Error in {file_path}: {e}")

    if all_AL:
        user_results["AL_mean"] = np.mean(all_AL)

    summary.append(user_results)


# -----------------------------
# Aggregate Summary
# -----------------------------
print("\n=== AGGREGATE SUMMARY ACROSS USERS ===")
df_summary = pd.DataFrame(summary)
print(df_summary)



=== Processing User1 ===

=== Processing User10 ===

=== Processing User11 ===

=== Processing User12 ===

=== Processing User13 ===

=== Processing User14 ===

=== Processing User15 ===

=== Processing User16 ===

=== Processing User17 ===

=== Processing User18 ===

=== Processing User19 ===

=== Processing User2 ===
❌ Error in /Users/festusedward-n/Documents/Datasets/DFL_Mouse_Dynamics_Dataset/Users/User2/2018_11_11__18_32_58.CSV: Error tokenizing data. C error: Expected 5 fields in line 265, saw 6


=== Processing User20 ===

=== Processing User21 ===

=== Processing User3 ===

=== Processing User4 ===

=== Processing User5 ===

=== Processing User6 ===

=== Processing User7 ===

=== Processing User8 ===

=== Processing User9 ===

=== AGGREGATE SUMMARY ACROSS USERS ===
      user  files  detections      AL_mean
0    User1     31      983299   296.963220
1   User10      1           0          NaN
2   User11     46     1543683   368.131144
3   User12     17     3194083   283.215500


In [29]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from river.drift import ADWIN, PageHinkley, KSWIN
from concurrent.futures import ProcessPoolExecutor, as_completed

# -----------------------------
# Feature Extraction
# -----------------------------
def extract_mouse_features(file_path):
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()

    required = {"client timestamp", "x", "y"}
    if not required.issubset(df.columns):
        return pd.DataFrame()

    df["dx"] = df["x"].diff().fillna(0)
    df["dy"] = df["y"].diff().fillna(0)
    df["dt"] = df["client timestamp"].diff().replace(0, np.nan).fillna(1)

    df["distance"] = np.sqrt(df["dx"]**2 + df["dy"]**2)
    df["speed"] = df["distance"] / df["dt"]

    return df[["speed", "distance", "dx", "dy"]].fillna(0)


# -----------------------------
# Custom DDM
# -----------------------------
class DDM:
    def __init__(self, min_num_instances=30, drift_level=3.0):
        self.min_num_instances = min_num_instances
        self.drift_level = drift_level
        self.mean = 0.0
        self.std = 0.0
        self.n = 0
        self.drift_detected = False
        self.mean_min = float("inf")
        self.std_min = float("inf")

    def add_element(self, error):
        self.n += 1
        if self.n == 1:
            self.mean = error
        else:
            old_mean = self.mean
            self.mean += (error - old_mean) / self.n
            self.std = np.sqrt(
                (self.std**2 * (self.n - 1) + (error - self.mean) * (error - old_mean)) / self.n
            )

        if self.n >= self.min_num_instances:
            if self.mean + self.std > self.mean_min + self.drift_level * self.std_min:
                self.drift_detected = True
            else:
                self.drift_detected = False
            if not self.drift_detected:
                self.mean_min = min(self.mean_min, self.mean)
                self.std_min = min(self.std_min, self.std)
        return self.drift_detected


# -----------------------------
# Drift Detection per File
# -----------------------------
def process_file(file_path):
    try:
        feats = extract_mouse_features(file_path)
        if feats.empty:
            return {"detections": 0, "AL": []}

        X = feats.values
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        ocsvm = OneClassSVM(nu=0.01, gamma="scale")
        iso = IsolationForest(contamination=0.01, random_state=42)
        train_size = min(500, len(X_scaled))
        ocsvm.fit(X_scaled[:train_size])
        iso.fit(X_scaled[:train_size])

        detectors = {
            "ADWIN_PC": ADWIN(delta=0.01),
            "PH_PC": PageHinkley(threshold=10, alpha=0.01),
            "KSWIN_PC": KSWIN(alpha=0.1, window_size=100),
            "DDM_ERR": DDM(),
            "ADWIN_OCSVM": ADWIN(delta=0.01),
            "ADWIN_ISO": ADWIN(delta=0.01),
        }

        drift_points = {name: [] for name in detectors}
        AL_values = []
        in_recovery, recovery_start = False, None

        for i, x in enumerate(X_scaled):
            x_val = np.mean(x)
            err_ocsvm = 1 if ocsvm.predict([x])[0] == -1 else 0
            err_iso = 1 if iso.predict([x])[0] == -1 else 0

            drift_detected = False
            for name, det in detectors.items():
                if name == "DDM_ERR":
                    drift = det.add_element(err_ocsvm or err_iso)
                elif name == "ADWIN_OCSVM":
                    drift = det.update(err_ocsvm)
                elif name == "ADWIN_ISO":
                    drift = det.update(err_iso)
                else:
                    drift = det.update(x_val)
                if drift:
                    drift_detected = True
                    drift_points[name].append(i)

            if drift_detected and not in_recovery:
                in_recovery, recovery_start = True, i
            if in_recovery and i - recovery_start > 200:
                recent_errs = [1 if ocsvm.predict([z])[0] == -1 else 0 for z in X_scaled[i-200:i]]
                if np.mean(recent_errs) < 0.05:
                    AL_values.append(i - recovery_start)
                    in_recovery = False

        return {"detections": sum(len(v) for v in drift_points.values()), "AL": AL_values}

    except Exception as e:
        return {"detections": 0, "AL": [], "error": str(e)}


# -----------------------------
# Main Parallel Loop Over Users
# -----------------------------
root = "/Users/festusedward-n/Documents/Datasets/DFL_Mouse_Dynamics_Dataset/Users"
summary = []
MAX_FILES = 10  # limit per user for testing

for user in sorted(os.listdir(root)):
    user_path = os.path.join(root, user)
    if not os.path.isdir(user_path):
        continue

    print(f"\n=== Processing {user} ===")
    user_results = {"user": user, "files": 0, "detections": 0, "AL_mean": None}
    all_AL = []

    files = [os.path.join(user_path, f) for f in os.listdir(user_path) if f.endswith(".CSV")]
    if MAX_FILES:
        files = files[:MAX_FILES]

    with ProcessPoolExecutor() as executor:
        futures = {executor.submit(process_file, f): f for f in files}
        for idx, future in enumerate(as_completed(futures), 1):
            file = os.path.basename(futures[future])
            result = future.result()
            print(f"   → [{idx}/{len(files)}] {file}")

            if "error" in result:
                print(f"     ❌ Error: {result['error']}")
            else:
                user_results["files"] += 1
                user_results["detections"] += result["detections"]
                all_AL.extend(result["AL"])

    if all_AL:
        user_results["AL_mean"] = np.mean(all_AL)
    summary.append(user_results)
    print(f"--- {user} finished: {user_results['files']} files, {user_results['detections']} detections, AL={user_results['AL_mean']}")

print("\n=== AGGREGATE SUMMARY ACROSS USERS ===")
df_summary = pd.DataFrame(summary)
print(df_summary)



=== Processing User1 ===


Process SpawnProcess-8:
Process SpawnProcess-5:
Process SpawnProcess-4:
Process SpawnProcess-1:
Process SpawnProcess-6:
Process SpawnProcess-3:
Process SpawnProcess-7:
Process SpawnProcess-2:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/pytorch_env_310/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/pytorch_env_310/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/pytorch_env_310/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/pytorch_env_310/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/envs/pytorch_env_310/lib/python3.10/concurrent/futures/process.py", line 240, in _process_worker
    call_item = call_queue.

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.