# 04 Model Readiness

Purpose: assess whether `data/processed/windows.csv` is suitable for baseline model experiments.


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def find_windows_csv(start: Path) -> Path:
    """Locate ``data/processed/windows.csv`` at or above *start*.

    The search checks ``start`` itself (resolved to an absolute path) and then
    each of its parent directories in turn, so the nearest match wins.

    Args:
        start: Directory from which to begin the upward search.

    Returns:
        Path of the first ``data/processed/windows.csv`` that is a regular file.

    Raises:
        FileNotFoundError: If no such file exists at ``start`` or any ancestor.
    """
    origin = start.resolve()
    relative_target = Path('data') / 'processed' / 'windows.csv'
    for base in (origin, *origin.parents):
        candidate = base / relative_target
        # is_file() rather than exists(): a directory that happens to carry the
        # same name cannot be read as a CSV and must not end the search.
        if candidate.is_file():
            return candidate
    raise FileNotFoundError('Could not find data/processed/windows.csv')

# Locate the dataset by searching upward from the current working directory,
# load it, and report where it came from and how large it is.
WINDOWS_CSV = find_windows_csv(Path.cwd())
windows = pd.read_csv(WINDOWS_CSV)

n_rows, n_cols = windows.shape
print('windows.csv:', WINDOWS_CSV)
print('rows:', n_rows, 'cols:', n_cols)

# Last expression of the cell: the notebook renders the first few rows.
windows.head()


In [None]:
# Headline counts: total rows plus distinct participants and sessions.
# A count is reported as 0 when its identifier column is absent.
summary = {'rows': len(windows)}
for label, id_col in (('participants', 'participantId'), ('sessions', 'sessionId')):
    if id_col in windows.columns:
        summary[label] = windows[id_col].nunique(dropna=True)
    else:
        summary[label] = 0
print(summary)

# Number of windows per (participant, session) pair, smallest groups first.
group_keys = ['participantId', 'sessionId']
if all(key in windows.columns for key in group_keys):
    group_sizes = windows.groupby(group_keys).size()
    counts = group_sizes.rename('n_windows').reset_index()
    display(counts.sort_values('n_windows'))


In [None]:
# Fraction of missing values per column, worst columns first.
is_missing = windows.isna()
missing = is_missing.mean().sort_values(ascending=False)

# Show the raw fraction next to a percentage rounded to one decimal place.
missing_df = missing.to_frame(name='missing_frac')
missing_df['missing_%'] = missing.mul(100).round(1)
display(missing_df)


In [None]:
# Identifier / bookkeeping columns that are not treated as features.
non_feature_cols = {'participantId', 'sessionId', 'windowIndex'}
feature_cols = [name for name in windows.columns if name not in non_feature_cols]

# Keep only the features whose dtype pandas considers numeric.
is_numeric = pd.api.types.is_numeric_dtype
num_cols = [name for name in feature_cols if is_numeric(windows[name])]

print('Numeric features:', len(num_cols))
print(num_cols)


In [None]:
# Compact distribution summary with one row per numeric feature; the cell is
# a no-op when no numeric features were found.
if num_cols:
    shown_stats = ['mean', 'std', 'min', '50%', 'max']
    numeric_view = windows[num_cols]
    desc = numeric_view.describe().transpose()
    display(desc[shown_stats])


In [None]:
# Bar chart of how many windows each participant contributes, largest first.
# Only participantId is used below, so the guard does not require sessionId;
# requiring it would skip the chart for data that can be plotted perfectly well.
if 'participantId' in windows.columns:
    by_pid = windows.groupby('participantId').size().sort_values(ascending=False)
    if by_pid.empty:
        # Nothing to draw: either there are no rows, or every participantId is
        # missing (groupby drops missing keys by default).
        print('No participant windows to plot')
    else:
        plt.figure(figsize=(8, 3))
        by_pid.plot(kind='bar')
        plt.title('Windows per Participant')
        plt.ylabel('n_windows')
        plt.tight_layout()
        plt.show()


In [None]:
# Thresholds for the readiness verdict, named so they can be tuned in one place.
MIN_ROWS = 100
MIN_PARTICIPANTS = 5
MAX_MISSING_FRAC = 0.8

readiness_flags = []

if len(windows) < MIN_ROWS:
    readiness_flags.append(f'LOW_ROWS: fewer than {MIN_ROWS} windows')

# A missing participantId column counts as zero participants instead of
# skipping the check, so the verdict cannot be GOOD when the participant
# criterion could not be evaluated.
if 'participantId' in windows.columns:
    n_participants = windows['participantId'].nunique(dropna=True)
else:
    n_participants = 0
if n_participants < MIN_PARTICIPANTS:
    readiness_flags.append(f'LOW_PARTICIPANTS: fewer than {MIN_PARTICIPANTS} participants')

# `missing` holds the per-column missing fraction computed in the missingness
# cell. If it is empty, max() is NaN and the comparison is False, so no flag
# is raised.
if missing.max() > MAX_MISSING_FRAC:
    readiness_flags.append(
        f'HIGH_MISSINGNESS: at least one column >{MAX_MISSING_FRAC:.0%} missing'
    )

if readiness_flags:
    print('Model readiness: LIMITED')
    for flag in readiness_flags:
        print('-', flag)
else:
    print('Model readiness: GOOD for baseline experiments')
