# Day 4 — Phase 1 ROI Attractor Sandbox

Focus on quickly prototyping Takens embeddings inside language-related ROIs for a single subject/story.

## Phase 1 Checklist
- load ROI × time fMRI matrices for selected narratives
- collapse to language-sensitive ROIs per hemisphere
- visualize ROI time courses and build Takens attractors
- compare simple attractor summaries across stories as a sanity check

In [None]:
import sys
from pathlib import Path

# The parent of the current working directory is taken to be the repository
# root (presumably the notebook is launched from a direct subdirectory of the
# repo -- confirm if you run it from elsewhere).
REPO_ROOT = Path.cwd().parent
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    # Prepend rather than append so repo-local packages are found first.
    sys.path.insert(0, _repo_root_entry)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

try:
    import h5py
except ImportError as exc:
    raise RuntimeError('Install h5py before running this notebook.') from exc

try:
    import nibabel.freesurfer.io as fsio
    HAVE_NIBABEL = True
except ImportError:
    HAVE_NIBABEL = False
    print('Nibabel not installed; run `pip install nibabel` for ROI surface labels.')

from src.io_ds003020 import list_stories_for_subject

plt.style.use('seaborn-v0_8-darkgrid')

# --- Dataset locations -------------------------------------------------------
DATA_ROOT = Path('/bucket/PaoU/seann/openneuro/ds003020')
_DERIVATIVE_ROOT = DATA_ROOT / 'derivative'
PREPROC_ROOT = _DERIVATIVE_ROOT / 'preprocessed_data'
FREESURFER_ROOT = _DERIVATIVE_ROOT / 'freesurfer_subjdir'

# --- Subject / acquisition settings ------------------------------------------
SUBJECT_ID = 'sub-UTS01'  # change as needed
# Subject label without the 'sub-' prefix; used below as the directory name
# under both PREPROC_ROOT and FREESURFER_ROOT.
SUBJECT_FS = SUBJECT_ID.replace('sub-', '')
# Multiplied by the sample index to build the 'Time' column (plotted in seconds).
TR = 2.0

# --- Output location (created on import of this cell) ------------------------
RESULTS_DIR = REPO_ROOT.joinpath('derivatives', 'results', f'day4_{SUBJECT_ID}').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- ROI and story selection --------------------------------------------------
# The same annotation label names are requested in both hemispheres.
_LANGUAGE_ROI_NAMES = (
    'parsopercularis',
    'parstriangularis',
    'superiortemporal',
    'middletemporal',
    'temporalpole',
    'bankssts',
    'inferiorparietal',
    'supramarginal',
)
LANGUAGE_ROIS = {hemi: list(_LANGUAGE_ROI_NAMES) for hemi in ('lh', 'rh')}
DEFAULT_STORIES = ['adventuresinsayingyes', 'adollshouse']  # edit per subject


In [None]:
# Tabulate the stories found for this subject and flag which of them have a
# preprocessed `.hf5` file on disk.
story_records = list_stories_for_subject(DATA_ROOT, SUBJECT_ID)
story_df = pd.DataFrame(story_records)
if story_df.empty:
    raise RuntimeError(f'No stories located for {SUBJECT_ID}.')

# Expected file location: <PREPROC_ROOT>/<SUBJECT_FS>/<story_id>.hf5
story_df['preproc_path'] = story_df['story_id'].map(
    lambda sid: PREPROC_ROOT / SUBJECT_FS / f'{sid}.hf5'
)
story_df['has_hf5'] = story_df['preproc_path'].map(Path.exists)

preview_columns = ['story_id', 'session', 'run', 'has_hf5']
display(story_df[preview_columns].head())


In [None]:
def build_language_roi_map(subject_fs: str, rois: dict, atlas: str = 'aparc.annot'):
    """Look up the vertices belonging to each requested ROI, per hemisphere.

    Requires nibabel to read FreeSurfer annotation files.

    Args:
        subject_fs: Subject directory name under ``FREESURFER_ROOT``.
        rois: Mapping ``{'lh': [roi_name, ...], 'rh': [...]}``; a missing
            hemisphere key is treated as an empty list.
        atlas: Annotation file suffix; files are read from
            ``<FREESURFER_ROOT>/<subject_fs>/label/<hemi>.<atlas>``.

    Returns:
        ``(roi_vertices, hemi_counts)`` where ``roi_vertices`` maps
        ``'<hemi>-<roi>'`` to an array of vertex indices and ``hemi_counts``
        maps each hemisphere to the length of its label array.

        NOTE(review): the indices are positions within each hemisphere's own
        label array -- no offset is added for ``rh``. Confirm this matches
        the column layout of whatever matrix you index with them.

    Raises:
        RuntimeError: If nibabel could not be imported.
        FileNotFoundError: If an annotation file is missing.
    """
    if not HAVE_NIBABEL:
        raise RuntimeError('Nibabel missing; install it to map ROIs.')

    label_dir = FREESURFER_ROOT / subject_fs / 'label'
    roi_vertices = {}
    hemi_counts = {}

    for hemi in ('lh', 'rh'):
        annot_path = label_dir / f'{hemi}.{atlas}'
        if not annot_path.exists():
            raise FileNotFoundError(f'Missing annotation: {annot_path}')

        labels, _, raw_names = fsio.read_annot(str(annot_path))
        # Label names arrive as bytes; decode them and remember each position.
        index_by_name = {
            raw.decode('utf-8'): position for position, raw in enumerate(raw_names)
        }
        hemi_counts[hemi] = labels.shape[0]

        for roi in rois.get(hemi, []):
            # Try the bare name first, then two prefixed spellings.
            aliases = (roi, f'ctx-{hemi}-{roi}', f'{hemi}-{roi}')
            label_index = next(
                (index_by_name[alias] for alias in aliases if alias in index_by_name),
                None,
            )
            if label_index is None:
                print(f'ROI `{roi}` not found in {annot_path.name}; skipping.')
                continue

            members = np.flatnonzero(labels == label_index)
            if members.size == 0:
                print(f'ROI `{roi}` has no vertices; skipping.')
                continue

            roi_vertices[f'{hemi}-{roi}'] = members

    return roi_vertices, hemi_counts


In [None]:
def load_story_bold(subject_fs: str, story_id: str):
    """Read the whole ``data`` dataset of one story's preprocessed file.

    The file is expected at ``<PREPROC_ROOT>/<subject_fs>/<story_id>.hf5``.

    Returns:
        The in-memory contents of the ``data`` dataset; per the original
        author's note its shape is (TR, voxels/vertices).

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    h5_path = PREPROC_ROOT / subject_fs / f'{story_id}.hf5'
    if h5_path.exists():
        with h5py.File(h5_path, 'r') as handle:
            # Slicing copies the dataset into memory before the file closes.
            return handle['data'][:]
    raise FileNotFoundError(f'Missing preprocessed file: {h5_path}')

def extract_roi_timeseries(bold_matrix: np.ndarray, roi_map: dict, tr: float):
    """Average the columns of ``bold_matrix`` that belong to each ROI.

    Args:
        bold_matrix: 2-D array; rows are samples, columns are indexed by the
            values in ``roi_map``.
        roi_map: Mapping of ROI name to a sequence of column indices.
        tr: Spacing between samples; ``Time`` is ``sample_index * tr``.

    Returns:
        DataFrame with a leading ``Time`` column followed by one column per
        ROI that had at least one in-range index. Indices below 0 or at/after
        the column count are dropped, and ROIs left with none are omitted.
        ``frame.attrs['source']`` is ``'roi_mean'`` when any ROI column is
        present and ``'empty'`` otherwise.
    """
    n_tr, n_vertices = bold_matrix.shape

    roi_means = {}
    for roi_name, raw_indices in roi_map.items():
        indices = np.asarray(raw_indices, dtype=int)
        in_range = indices[(indices >= 0) & (indices < n_vertices)]
        if in_range.size:
            roi_means[roi_name] = bold_matrix[:, in_range].mean(axis=1)

    frame = pd.DataFrame(roi_means, index=pd.RangeIndex(n_tr))
    frame.insert(0, 'Time', np.arange(n_tr) * tr)
    frame.attrs['source'] = 'empty' if frame.shape[1] <= 1 else 'roi_mean'
    return frame

def compute_pca_components(bold_matrix: np.ndarray, tr: float, n_components: int = 6):
    """Project ``bold_matrix`` onto its leading principal components.

    Each column is mean-centred over rows, the centred matrix is decomposed
    with a thin SVD, and the component scores ``U * S`` are returned.

    Args:
        bold_matrix: 2-D array; rows are samples.
        tr: Spacing between samples; ``Time`` is ``sample_index * tr``.
        n_components: Maximum number of components to keep (must be >= 1).
            Fewer are returned when the matrix has fewer singular values.

    Returns:
        DataFrame with a leading ``Time`` column followed by ``PC1``,
        ``PC2``, ...; ``frame.attrs['source']`` is ``'pca_fallback'``.

    Raises:
        ValueError: If ``n_components`` is less than 1.
    """
    # Zero would yield a frame with no rows, and a negative value would be
    # interpreted as "all but the last k" by the slices below.
    if n_components < 1:
        raise ValueError(f'n_components must be >= 1, got {n_components}')

    centered = bold_matrix - bold_matrix.mean(axis=0, keepdims=True)
    u, s, _ = np.linalg.svd(centered, full_matrices=False)
    scores = u[:, :n_components] * s[:n_components]

    # Build from the 2-D score array so the row count always follows the input.
    frame = pd.DataFrame(
        scores,
        columns=[f'PC{i + 1}' for i in range(scores.shape[1])],
    )
    frame.insert(0, 'Time', np.arange(scores.shape[0]) * tr)
    frame.attrs['source'] = 'pca_fallback'
    return frame

def zscore(arr: np.ndarray):
    """Standardise ``arr`` to zero mean and unit (population) standard deviation.

    The input is converted to a float array first. An empty input is returned
    as an empty array. When the standard deviation is zero or not finite the
    values are only mean-centred, which avoids dividing by zero.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return values

    centered = values - values.mean()
    scale = values.std()
    if np.isfinite(scale) and scale != 0:
        return centered / scale
    return centered


In [None]:
# Reuse an ROI map already present in the session; build it only when absent.
if 'roi_vertices' not in globals():
    print('ROI map not found; building with LANGUAGE_ROIS ...')
    roi_vertices, hemi_counts = build_language_roi_map(
        SUBJECT_FS,
        LANGUAGE_ROIS,
        atlas='aparc.annot'
    )

# When hemisphere sizes are known, the data matrix is expected to have as many
# columns as both hemispheres combined; otherwise the first story sets the bar.
expected_vertices = sum(hemi_counts.values()) if 'hemi_counts' in globals() else None

STORY_IDS = DEFAULT_STORIES
story_roi_timeseries = {}
for story_id in STORY_IDS:
    bold = load_story_bold(SUBJECT_FS, story_id)

    if expected_vertices is None:
        expected_vertices = bold.shape[1]
    elif bold.shape[1] != expected_vertices:
        # Mismatch is reported but not fatal; extraction still proceeds.
        print(f'Warning: vertex/voxel count mismatch for {story_id} ({bold.shape[1]} vs {expected_vertices})')

    roi_frame = extract_roi_timeseries(bold, roi_vertices, TR)
    only_time_column = roi_frame.shape[1] <= 1
    if only_time_column:
        print(f'No ROI vertices matched for {story_id}; falling back to PCA components.')
        roi_frame = compute_pca_components(bold, TR, n_components=6)

    story_roi_timeseries[story_id] = roi_frame
    source = roi_frame.attrs.get('source', 'unknown')
    print(f'{story_id}: {roi_frame.shape[0]} TRs, {roi_frame.shape[1] - 1} features ({source})')

if len(story_roi_timeseries) == 0:
    raise RuntimeError('No ROI time series extracted.')


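_If none of the FreeSurfer vertex indices fall inside a story's preprocessed matrix (so no ROI column can be extracted), the notebook falls back to PCA components (`compute_pca_components`) so that downstream attractor diagnostics still run. A vertex/voxel count mismatch on its own only prints a warning, and out-of-range indices are silently dropped, so treat ROI means as provisional until the mapping is verified. Update `build_language_roi_map` or provide a custom index list once you have a definitive ROI → data mapping._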

In [None]:
# Plot up to six z-scored feature time courses, one subplot per feature, with
# every story overlaid. Feature names are taken from the first story's frame.
first_story_id, first_frame = next(iter(story_roi_timeseries.items()))
feature_cols = [col for col in first_frame.columns if col != 'Time']
if not feature_cols:
    raise RuntimeError('No features available for plotting.')
selected_features = feature_cols[:6]
n_rows = len(selected_features)

# squeeze=False always yields a 2-D array of axes, so a single-row figure
# needs no special case; flatten it to iterate row by row.
fig, axes = plt.subplots(
    n_rows, 1, figsize=(10, 2.2 * n_rows), sharex=True, squeeze=False
)
axes = axes.ravel()

for axis, feature in zip(axes, selected_features):
    for story_id, roi_frame in story_roi_timeseries.items():
        axis.plot(
            roi_frame['Time'],
            zscore(roi_frame[feature]),
            label=story_id
        )
    # The line break must be the escape sequence; a raw newline inside a
    # single-quoted string is a syntax error.
    axis.set_ylabel(f'{feature}\n(z-score)')
    axis.axhline(0, color='black', linewidth=0.8, alpha=0.5)

axes[-1].set_xlabel('Time (s)')
# One legend column per plotted story.
axes[0].legend(loc='upper right', ncol=len(story_roi_timeseries))
fig.suptitle('Feature dynamics across stories', y=1.02)
plt.tight_layout()


In [None]:
def takens_embedding(series: pd.Series, E: int = 3, tau: int = 1):
    """Build a time-delay embedding of a 1-D series.

    Args:
        series: Sequence of samples; converted to a float array.
        E: Embedding dimension (number of output columns).
        tau: Delay between successive columns, in samples.

    Returns:
        Array of shape ``(len(series) - (E - 1) * tau, E)``. Row ``i`` is
        ``[x[i], x[i + tau], ..., x[i + (E - 1) * tau]]``, i.e. column ``k``
        is the series shifted forward by ``k * tau`` samples.

    Raises:
        ValueError: If the series has no more than ``(E - 1) * tau`` samples.
    """
    data = np.asarray(series, dtype=float)
    span = tau * (E - 1)  # samples consumed by the longest delay
    if data.size <= span:
        raise ValueError(f'Not enough samples for embedding: len={data.size}, E={E}, tau={tau}')

    n_points = data.size - span
    lagged = [data[offset:offset + n_points] for offset in range(0, E * tau, tau)]
    return np.stack(lagged, axis=1)


In [None]:
# Overlay every story's 3-D delay embedding of one feature in a single figure.
target_feature = selected_features[0]
E = 3
TAU = 1
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection='3d')
for story_id, roi_frame in story_roi_timeseries.items():
    embedded = takens_embedding(zscore(roi_frame[target_feature]), E=E, tau=TAU)
    ax.plot(embedded[:, 0], embedded[:, 1], embedded[:, 2], label=story_id, alpha=0.8)
# takens_embedding stacks forward delays: column k holds x(t + k*tau), so the
# second and third axes are later samples, not earlier ones.
ax.set_xlabel('x(t)')
ax.set_ylabel('x(t+τ)')
ax.set_zlabel('x(t+2τ)')
ax.set_title(f'{target_feature} attractor (E={E}, τ={TAU})')
ax.legend(loc='upper right')
plt.tight_layout()


In [None]:
# One 3-D delay-embedding figure per story for the selected feature.
for story_id, roi_frame in story_roi_timeseries.items():
    embedded = takens_embedding(zscore(roi_frame[target_feature]), E=E, tau=TAU)
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(111, projection='3d')
    ax.plot(embedded[:, 0], embedded[:, 1], embedded[:, 2], color='tab:blue', alpha=0.85)
    # takens_embedding stacks forward delays: column k holds x(t + k*tau), so
    # the second and third axes are later samples, not earlier ones.
    ax.set_xlabel('x(t)')
    ax.set_ylabel('x(t+τ)')
    ax.set_zlabel('x(t+2τ)')
    ax.set_title(f'{story_id} — {target_feature} (E={E}, τ={TAU})')
    plt.tight_layout()


In [None]:
import itertools

# Embed the target feature once per story; both tables below reuse the arrays.
story_embeddings = {
    story_id: takens_embedding(zscore(roi_frame[target_feature]), E=E, tau=TAU)
    for story_id, roi_frame in story_roi_timeseries.items()
}

# Per-story summary: mean and standard deviation along each embedding axis.
summary_rows = []
for story_id, points in story_embeddings.items():
    center = points.mean(axis=0)
    scatter = points.std(axis=0)
    summary_rows.append({
        'story_id': story_id,
        'centroid_x': center[0],
        'centroid_y': center[1],
        'centroid_z': center[2],
        'spread_x': scatter[0],
        'spread_y': scatter[1],
        'spread_z': scatter[2],
    })
summary_df = pd.DataFrame(summary_rows)
display(summary_df)

# Pairwise comparison: truncate both embeddings to the shorter length, then
# take the root-mean-square of the element-wise differences (over all rows
# and all three coordinates).
pair_rows = []
for (story_a, emb_a), (story_b, emb_b) in itertools.combinations(story_embeddings.items(), 2):
    n_common = min(len(emb_a), len(emb_b))
    delta = emb_a[:n_common] - emb_b[:n_common]
    pair_rows.append({
        'feature': target_feature,
        'story_a': story_a,
        'story_b': story_b,
        'rms_distance': np.sqrt(np.mean(delta ** 2)),
    })
pair_df = pd.DataFrame(pair_rows)
display(pair_df)


### Next steps
- Sweep additional ROIs (`target_feature = ...`) and compare attractor summaries to decide whether to pool stories.
- Expand `pair_df` into a matrix for quick visual comparison (heatmap).
- Stage convergent cross mapping (CCM) runs per ROI pair and log outputs for later Phase 2 cross-subject checks.