# 01 — Metadata inventory & run_summary (ds003029)

Mục tiêu:
- Quét toàn bộ metadata BIDS (không cần tải `*.eeg`)
- Tạo `eda_outputs/ds003029_run_summary.csv` + `eda_outputs/ds003029_event_vocab.csv`

Đầu vào:
- Dataset tại `EEG/ds003029` (BIDS iEEG)

Đầu ra:
- `eda_outputs/ds003029_run_summary.csv`
- `eda_outputs/ds003029_event_vocab.csv`

In [None]:
from __future__ import annotations

import sys
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

# Make src/ importable (works from repo root or notebooks/).
# Start from the current working directory; when it has no src/ folder but its
# parent does, treat the parent as the workspace instead.
ws = Path.cwd().resolve()
if not (ws / 'src').exists() and (ws.parent / 'src').exists():
    ws = ws.parent
src_dir = (ws / 'src').resolve()
# Guard the insert so re-running this cell does not stack duplicate entries
# at the front of sys.path.
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Project imports have to come after the sys.path adjustment above.
from ds003029_eda.paths import get_paths
from ds003029_eda.run_summary import build_run_summary, export_run_summary

paths = get_paths()
# parents=True also creates any missing intermediate directories, so the cell
# does not raise FileNotFoundError when the parent of outputs_dir is absent.
paths.outputs_dir.mkdir(parents=True, exist_ok=True)
print('Workspace:', paths.workspace)
print('Dataset root:', paths.dataset_root)
print('Outputs:', paths.outputs_dir)

In [None]:
# Build the run summary and the event vocabulary from the dataset root.
result = build_run_summary(paths.dataset_root)
run_summary, event_vocab = result.run_summary, result.event_vocab

# Headline counts. The subject count falls back to 'n/a' when the summary has
# no 'subject' column.
print('runs:', len(run_summary))
if 'subject' in run_summary.columns:
    print('subjects:', run_summary['subject'].nunique())
else:
    print('subjects:', 'n/a')
print('runs with onset:', int(run_summary['has_onset'].sum()))
print(
    'runs with onset+offset:',
    int((run_summary['has_onset'] & run_summary['has_offset']).sum()),
)
print(
    'runs with eeg content present (heuristic):',
    int(run_summary['eeg_content_present'].sum()),
)

# Preview the first rows of both tables in the notebook output.
display(run_summary.head())
display(event_vocab.head(30))

# Export, then echo the absolute paths of the two CSV files.
# NOTE(review): the file names below are assumed to match what
# export_run_summary writes — confirm against its implementation.
export_run_summary(paths.outputs_dir, result)
for csv_name in ('ds003029_run_summary.csv', 'ds003029_event_vocab.csv'):
    print('Wrote:', (paths.outputs_dir / csv_name).resolve())

## Quick EDA plots (sanity)
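Các plot dưới đây dùng để sanity-check: phân bố `sfreq`, phân bố `n_channels`, và mức độ bao phủ (coverage) của seizure markers (số run có onset, có cả onset+offset) cùng với `eeg_content_present`.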

In [None]:
# Sanity-check figure: two histograms and one bar chart, side by side.
rs = run_summary.copy()
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# First two panels: histogram of a summary column with missing values dropped.
# Each spec is (column, panel title, x-axis label).
hist_specs = (
    ('sfreq', 'Sampling frequency (sfreq)', 'Hz'),
    ('n_channels', 'Channel count (n_channels)', '#channels'),
)
for ax, (column, title, xlabel) in zip(axes, hist_specs):
    ax.hist(rs[column].dropna(), bins=20)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

# Third panel: total number of runs next to the number of runs for which each
# flag (or flag combination) is set.
onset_flag = rs['has_onset']
onset_and_offset = onset_flag & rs['has_offset']
counts = pd.Series({
    'total': len(rs),
    'has_onset': int(onset_flag.sum()),
    'has_onset+offset': int(onset_and_offset.sum()),
    'eeg_content_present': int(rs['eeg_content_present'].sum()),
})
coverage_ax = axes[2]
coverage_ax.bar(counts.index, counts.values)
coverage_ax.set_title('Coverage')
# Tilt the category labels slightly.
coverage_ax.tick_params(axis='x', rotation=20)

plt.tight_layout()
plt.show()