# 01 Sanity Check (Diagnostics Only)

Purpose: quick visual checks on raw `auth_windows.csv` files.

This notebook is intentionally non-gating; launch/pass/fail logic now lives in scripts:
- `scripts/validate_raw_sessions.py`
- `scripts/run_qc.py`


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def find_sessions_dir(start: Path) -> Path:
    """Locate the ``data/raw/sessions`` directory at or above *start*.

    The resolved form of ``start`` is checked first, then each of its parents
    from nearest to farthest. The first ``data/raw/sessions`` that is a
    directory wins.

    Args:
        start: Path to begin the upward search from.

    Returns:
        The first matching ``data/raw/sessions`` directory.

    Raises:
        FileNotFoundError: If no such directory exists at or above ``start``.
    """
    root = start.resolve()
    for p in (root, *root.parents):
        candidate = p / 'data' / 'raw' / 'sessions'
        # is_dir() rather than exists(): a stray *file* named "sessions" must
        # not shadow a real sessions directory further up the tree.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError('Could not find data/raw/sessions')

# Discover every per-session auth_windows.csv beneath the sessions directory.
SESSIONS_DIR = find_sessions_dir(Path.cwd())
files = sorted(SESSIONS_DIR.glob('*/auth_windows.csv'))
print('Sessions dir:', SESSIONS_DIR)
print('auth_windows files:', len(files))

# Stack all sessions into one frame, tagging each row with the name of the
# folder it was read from; fall back to an empty frame when nothing was found.
if files:
    df = pd.concat(
        [
            pd.read_csv(csv_path).assign(sessionFolder=csv_path.parent.name)
            for csv_path in files
        ],
        ignore_index=True,
    )
else:
    df = pd.DataFrame()
n_rows, n_cols = len(df), len(df.columns)
print('Rows:', n_rows, 'Cols:', n_cols)
df.head()


In [None]:
# Schema check and per-session window counts (diagnostic output only).
if df.empty:
    print('No data found.')
else:
    # Report which of the expected identifying/timing columns are absent.
    required = ['participantId', 'sessionId', 'windowIndex', 'windowStartMs', 'windowEndMs']
    missing_required = [c for c in required if c not in df.columns]
    print('Missing required columns:', missing_required)

    print('\nWindow count per session:')
    if {'participantId', 'sessionId', 'windowIndex'}.issubset(df.columns):
        # Number of distinct windowIndex values per (participantId, sessionId),
        # sorted ascending so the sessions with the fewest windows come first.
        counts = df.groupby(['participantId', 'sessionId'])['windowIndex'].nunique().sort_values()
        display(counts.describe())
        display(counts.head(10))
    else:
        # Say why nothing follows the header instead of silently skipping.
        print('Skipped: participantId, sessionId and windowIndex columns are all needed.')


In [None]:
# Per-column missingness summary, most-missing columns first.
if not df.empty:
    # Fraction of NaN values per column, highest first.
    miss = df.isna().mean().sort_values(ascending=False)
    miss_tbl = pd.DataFrame({'missing_%': (100 * miss).round(1), 'non_missing_n': df.notna().sum()})
    # The two Series above carry their labels in different orders, and the
    # DataFrame constructor does not promise to keep the first one's order.
    # Re-apply the missingness order so head(25) shows the worst columns.
    miss_tbl = miss_tbl.reindex(miss.index)
    display(miss_tbl.head(25))


In [None]:
# Histograms of the expected numeric feature columns that are actually present.
if not df.empty:
    expected_features = (
        'typing_ikt_global_mean',
        'typing_ikt_global_std',
        'tap_rt_mean',
        'tap_rt_std',
        'typing_accuracy_pct',
        'tap_miss_rate_pct',
        'typing_drift_ikt',
        'tap_drift_rt',
    )
    plot_cols = [c for c in expected_features if c in df.columns]

    n = len(plot_cols)
    if n:
        # Two plots per row; an odd count leaves one unused trailing axis.
        grid_rows = (n + 1) // 2
        fig, axes = plt.subplots(grid_rows, 2, figsize=(12, 4 * grid_rows))
        axes = np.asarray(axes).ravel()
        for ax, col in zip(axes, plot_cols):
            # Coerce to numeric and drop anything that is missing or unparseable.
            x = pd.to_numeric(df[col], errors='coerce').dropna()
            ax.hist(x, bins=30)
            ax.set(title=col, xlabel=col, ylabel='count')
        # Hide whatever axes were not used by a feature.
        for ax in axes[n:]:
            ax.axis('off')
        plt.tight_layout()
        plt.show()
    else:
        print('No expected numeric feature columns found.')
