In [8]:
from pathlib import Path

# The data folder is expected to sit beside the notebook's working directory.
# `.parent` yields the same path as `.parents[0]` but does not raise IndexError
# when the working directory is the filesystem root.
RAW = Path.cwd().parent / "data"
print("RAW =", RAW)
print("Exists =", RAW.exists())
# List the contents only when the folder is really there, so this diagnostic cell
# can report "Exists = False" instead of dying with FileNotFoundError. Sorting
# keeps the listing stable, since iterdir() yields entries in arbitrary order.
print("Files:", sorted(p.name for p in RAW.iterdir()) if RAW.is_dir() else [])

RAW = /Users/will/Documents/Behavioural-Biometrics-Data-Collection/Behavioural-Biometrics-Analysis/data
Exists = True
Files: ['auth_windows_1ea8d3a866aa482f951c9734e8232bd4.csv', 'events_1ea8d3a866aa482f951c9734e8232bd4.csv']


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Resolve the data folder beside the notebook's working directory.
RAW = Path.cwd().parent / "data"
if not RAW.exists():
    # Explicit raise instead of a bare `assert`, so the guard is not stripped
    # when Python runs with -O. Exception type and message are unchanged.
    raise AssertionError(f"Data folder not found: {RAW}")

# Hex suffix shared by both CSV file names; kept in one place so the
# auth/events pair cannot drift apart when the files are swapped out.
FILE_TAG = "1ea8d3a866aa482f951c9734e8232bd4"

auth = pd.read_csv(RAW / f"auth_windows_{FILE_TAG}.csv")
events = pd.read_csv(RAW / f"events_{FILE_TAG}.csv")

print("auth rows:", len(auth))
print("events rows:", len(events))


auth rows: 7
events rows: 663


In [10]:
# Column names that must be present in the auth_windows table.
REQUIRED_AUTH = [
    "sessionId",
    "participantId",
    "windowIndex",
    "windowStartMs",
    "windowEndMs",
]

# Column names that must be present in the events table.
REQUIRED_EVENTS = [
    "sessionId",
    "participantId",
    "t",
    "ms",
    "dt",
    "tISO",
]

def require(df, cols, name):
    """Fail loudly when ``df`` lacks any of the columns listed in ``cols``.

    Args:
        df: Object exposing a ``columns`` collection (a DataFrame in this notebook).
        cols: Iterable of column names that must all be present.
        name: Label for the table, used only in the error message.

    Raises:
        AssertionError: Listing every absent column, in the order given by ``cols``.
    """
    available = df.columns
    absent = [col for col in cols if col not in available]
    if not absent:
        return
    raise AssertionError(f"[HARD FAIL] {name} missing columns: {absent}")

# Structural check first: each table must expose its required columns.
for table, needed, label in (
    (auth, REQUIRED_AUTH, "auth_windows"),
    (events, REQUIRED_EVENTS, "events"),
):
    require(table, needed, label)

# Identifier columns may not contain missing values in either table.
# Order matches the column-by-column, auth-before-events sequence of checks.
for id_col in ("sessionId", "participantId"):
    for label, table in (("auth_windows", auth), ("events", events)):
        if table[id_col].isna().any():
            raise AssertionError(f"[HARD FAIL] {label} missing {id_col}")

# Headline counts are taken from auth_windows only.
print("Participants:", auth["participantId"].nunique())
print("Sessions:", auth["sessionId"].nunique())

Participants: 1
Sessions: 1


In [11]:
# A (participant, session, window index) triple may appear only once.
window_key = ["participantId", "sessionId", "windowIndex"]
if auth.duplicated(subset=window_key).any():
    raise AssertionError("[HARD FAIL] Duplicate window rows")

# Every window has to end strictly after it starts.
# NOTE(review): rows where either bound is NaN compare False here and therefore
# pass this check — confirm that is intended.
non_positive_span = auth["windowEndMs"] <= auth["windowStartMs"]
if non_positive_span.any():
    raise AssertionError("[HARD FAIL] windowEndMs <= windowStartMs")

# Number of distinct window indices recorded per session.
wc = auth.groupby("sessionId")["windowIndex"].nunique()
print(wc.describe())

# Sessions with fewer than 6 distinct windows are reported but do not stop the run.
short_sessions = wc[wc < 6]
if not short_sessions.empty:
    print("[SOFT FAIL] Incomplete sessions detected:")
    display(short_sessions)

count    1.0
mean     7.0
std      NaN
min      7.0
25%      7.0
50%      7.0
75%      7.0
max      7.0
Name: windowIndex, dtype: float64


In [12]:
# Plausibility bounds (lowest allowed, highest allowed) for the soft checks below.
# Presumably milliseconds — TODO confirm against the feature extractor.
IKT_BOUNDS = (40, 2000)
TAP_RT_BOUNDS = (80, 1500)

# Typing IKTs should be human-plausible
# NOTE(review): this filter also matches derived features such as
# `typing_drift_ikt` and `coupling_var_ikt`, which are the columns flagged in the
# recorded run — confirm the same range is really meant to apply to them.
ikt_cols = [c for c in auth.columns if "ikt" in c and "pct" not in c]
for c in ikt_cols:
    if (auth[c] < IKT_BOUNDS[0]).any() or (auth[c] > IKT_BOUNDS[1]).any():
        print(f"[SOFT FAIL] Implausible IKT in {c}")

# Reaction times
# `tap_rt_mean` is not among the required columns, so guard the lookup: a soft
# check must not abort the cell before the hard percentage check below has run.
if "tap_rt_mean" not in auth.columns:
    print("[SOFT FAIL] tap_rt_mean column missing; tap RT check skipped")
elif (auth["tap_rt_mean"] < TAP_RT_BOUNDS[0]).any() or (auth["tap_rt_mean"] > TAP_RT_BOUNDS[1]).any():
    print("[SOFT FAIL] Implausible tap RT")

# Percentages
# Missing values compare False on both sides, so they do not trigger the failure.
pct_cols = [c for c in auth.columns if c.endswith("_pct")]
for c in pct_cols:
    if (auth[c] < 0).any() or (auth[c] > 100).any():
        raise AssertionError(f"[HARD FAIL] % out of bounds in {c}")

[SOFT FAIL] Implausible IKT in typing_drift_ikt
[SOFT FAIL] Implausible IKT in coupling_var_ikt


In [13]:
# Event timing: `ms` must never be negative.
# Columns are read with brackets rather than attribute access, because short
# names such as `dt` and `t` are easy to confuse with pandas attributes.
negative_ms = events["ms"].lt(0)
if negative_ms.any():
    raise AssertionError("[HARD FAIL] Negative event ms")

# `dt` may be missing for some rows; those are dropped before the sign check.
known_dt = events["dt"].dropna()
if known_dt.lt(0).any():
    raise AssertionError("[HARD FAIL] Negative dt")

# Frequency of each event type, most common first.
print("Event types:")
display(events["t"].value_counts())

Event types:


t
key                219
before_input       192
target_move         94
tap_hit             93
word_shown          28
typing_submit       27
tap_miss             6
typing_reaction      1
session_start        1
typing_end           1
tapping_end          1
Name: count, dtype: int64

In [14]:
# Count events per (session, event type); combinations that never occur become 0.
ev = events.groupby(["sessionId", "t"]).size().unstack(fill_value=0)

# Sort the event-type columns into two families by case-insensitive substring.
event_types = list(ev.columns)
key_like = [name for name in event_types if "key" in name.lower()]
tap_like = [
    name
    for name in event_types
    if any(token in name.lower() for token in ("tap", "target"))
]

# Per-session totals for each family.
ev["n_key"] = ev[key_like].sum(axis=1)
ev["n_tap"] = ev[tap_like].sum(axis=1)

# Sessions with fewer than 30 events in either family are reported, not rejected.
too_few = ev["n_key"].lt(30) | ev["n_tap"].lt(30)
bad = ev[too_few]
if not bad.empty:
    print("[SOFT FAIL] Low-engagement sessions:")
    display(bad[["n_key", "n_tap"]])

In [15]:
# Fraction of missing values in each numeric auth_windows column.
num = auth.select_dtypes(include=[np.number])
miss = num.isna().mean()

# Show the 15 columns with the highest share of missing values.
display(miss.sort_values(ascending=False).head(15))

# Drift & recovery features SHOULD be NA-heavy
expected_na = [
    "typing_error_recovery_wrong_median",
    "tap_error_recovery_miss_median",
    "typing_drift_ikt",
    "tap_drift_rt",
]

# A feature listed above is suspicious when less than this fraction of its values is missing.
MIN_EXPECTED_NA = 0.1

for c in expected_na:
    # `miss` only covers numeric columns; report an absent or non-numeric
    # feature instead of letting the lookup raise KeyError.
    if c not in miss.index:
        print(f"[SOFT FAIL] {c} not among numeric columns; missingness not checked")
        continue
    if miss[c] < MIN_EXPECTED_NA:
        # The dash was mis-encoded as "â€”" in the original message.
        print(f"[SOFT FAIL] {c} rarely missing — check logic")

tap_error_recovery_miss_median        1.000000
typing_error_recovery_wrong_median    1.000000
coupling_var_ratio                    0.857143
typing_drift_ikt                      0.571429
tap_drift_rt                          0.571429
typing_ikt_within_std                 0.428571
typing_ikt_global_clipped_pct         0.428571
typing_ikt_within_iqr                 0.428571
typing_ikt_global_std                 0.428571
typing_ikt_global_mean                0.428571
typing_ikt_global_p95                 0.428571
typing_ikt_global_iqr                 0.428571
coupling_var_rt                       0.428571
coupling_var_ikt                      0.428571
typing_ikt_within_clipped_pct         0.428571
dtype: float64