# Day 5 — Language ROI Attractors (pycortex-aligned)


Focus: pull language ROI masks from the pycortex database that ships with ds003020, align them with the ~81k-vertex preprocessed matrices, and generate Takens attractors for sample stories.


## Prerequisites
- `pip install pycortex` (once per environment).
- Ensure `/bucket/.../ds003020/derivative/pycortex-db` and `preprocessed_data` are mounted.
- Confirm `derivative/subject_xfms.json` contains your subject (e.g., `"UTS01": "UTS01_auto"`).


In [None]:
import sys
from pathlib import Path

# The parent of the working directory is put on sys.path (once) so that the
# `src.*` modules imported further down can be resolved.
REPO_ROOT = Path.cwd().parent
if sys.path.count(str(REPO_ROOT)) == 0:
    sys.path.insert(0, str(REPO_ROOT))


In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# pycortex is treated as optional at import time: availability is recorded in
# HAVE_PYCORTEX instead of failing here, and a hint is printed when it is missing.
try:
    import cortex
    HAVE_PYCORTEX = True
except ImportError:
    HAVE_PYCORTEX = False
    print('pycortex missing — install with `pip install pycortex`.')

import h5py

# Project-local helpers; they resolve because REPO_ROOT was added to sys.path.
from src.io_ds003020 import list_stories_for_subject
from src.qc_viz import ensure_dir

# Global matplotlib style for every figure produced by this notebook.
plt.style.use('seaborn-v0_8-darkgrid')


In [None]:
# Dataset layout: everything is resolved relative to DATA_ROOT/derivative.
DATA_ROOT = Path('/bucket/PaoU/seann/openneuro/ds003020')
PREPROC_ROOT = DATA_ROOT / 'derivative' / 'preprocessed_data'
PYCORTEX_DB = DATA_ROOT / 'derivative' / 'pycortex-db'
FREESURFER_SUBJECTS = DATA_ROOT / 'derivative' / 'freesurfer_subjdir'
TRANSFORM_PATH = DATA_ROOT / 'derivative' / 'subject_xfms.json'

SUBJECT_ID = 'sub-UTS01'  # change as needed
SUBJECT_FS = SUBJECT_ID.replace('sub-', '')  # subject name without the `sub-` prefix
TR = 2.0  # multiplied by the sample index to build the `Time` column (plotted as seconds)
STORY_IDS = ['adventuresinsayingyes', 'adollshouse']

ROI_FILTER = None  # None -> ROIs are auto-selected; or {'lh': [...], 'rh': [...]} to limit ROIs

# Per-subject output directory and the on-disk cache file for the ROI masks.
RESULTS_DIR = ensure_dir(REPO_ROOT / 'derivatives' / 'results' / f'day5_{SUBJECT_ID}')
ROI_CACHE = RESULTS_DIR / f'{SUBJECT_ID}_roi_masks.npz'


In [None]:
# Fail fast: everything below needs both pycortex and the dataset's filestore.
if not HAVE_PYCORTEX:
    raise RuntimeError('Install pycortex before fetching ROI masks.')
if not PYCORTEX_DB.exists():
    raise FileNotFoundError(f'Missing pycortex DB at {PYCORTEX_DB}')

# Point the pycortex configuration attributes at the dataset's filestore/subject.
cortex.config.default_db = str(PYCORTEX_DB)
cortex.config.default_filestore = str(PYCORTEX_DB)
cortex.config.default_subject = SUBJECT_FS
cortex.database.default_filestore = str(PYCORTEX_DB)
# rebuild the global database handles so downstream API uses the dataset filestore
cortex.database.db = cortex.database.Database(str(PYCORTEX_DB))
cortex.db = cortex.database.db
# Rebind the `db` attribute on the dataset submodules as well. Best effort: an
# ImportError at any step silently skips the remaining rebinding.
try:
    import cortex.dataset
    cortex.dataset.db = cortex.database.db
    import cortex.dataset.braindata as _braindata
    _braindata.db = cortex.database.db
    import cortex.dataset.views as _views
    _views.db = cortex.database.db
except ImportError:
    pass
# clear cached subject list and force a refresh so `UTS01` is visible
def _refresh_pycortex_subjects():
    """Reset the cached subject list, print a preview, and check SUBJECT_FS is present.

    Raises:
        KeyError: if SUBJECT_FS is not among the subjects of the rebuilt database.
    """
    # NOTE(review): `_subjects` is a private attribute; presumably the cache
    # behind `db.subjects` — confirm against the installed pycortex version.
    cortex.database.db._subjects = None
    subjects = cortex.database.db.subjects
    print(f'pycortex subjects: {list(subjects.keys())[:5]}... (total={len(subjects)})')
    if SUBJECT_FS not in subjects:
        raise KeyError(f'pycortex filestore {PYCORTEX_DB} does not contain subject {SUBJECT_FS}')

_refresh_pycortex_subjects()

# Look up the transform name registered for this subject in subject_xfms.json.
with TRANSFORM_PATH.open() as fh:
    transform_map = json.load(fh)
TRANSFORM_ID = transform_map.get(SUBJECT_FS)
if TRANSFORM_ID is None:
    raise KeyError(f'No transform entry for {SUBJECT_FS} in {TRANSFORM_PATH}')

print(f'Subject {SUBJECT_ID}: using transform {TRANSFORM_ID}')



In [None]:
from pathlib import Path
from typing import Dict, Optional

# Alternative spellings per ROI key. When an ROI is looked up, the key itself is
# tried first and then each entry of its list, in order.
ROI_NAME_OVERRIDES = {
    # Map shorthand to pycortex atlas names if needed
    'parsopercularis': ['parsopercularis', 'G_front_inf-Opercular'],
    'parstriangularis': ['parstriangularis', 'G_front_inf-Triangul'],
    'superiortemporal': ['superiortemporal', 'G_temporal_sup'],
    'middletemporal': ['middletemporal', 'G_temporal_middle'],
    'temporalpole': ['temporalpole', 'Pole_temporal'],
    'bankssts': ['bankssts', 'S_temporal_sup-Lateral'],
    'inferiorparietal': ['inferiorparietal', 'G_pariet_inf-Angular'],
    'supramarginal': ['supramarginal', 'G_pariet_inf-Supramar'],
}

# Values passed as the `type=` keyword when querying pycortex, tried in this order.
PYCORTEX_ROI_TYPES = ('atlas', 'labels', 'freesurfer')
# Format templates used to combine a hemisphere ('lh'/'rh') and an ROI token into
# candidate label names; the bare token is the last resort.
ROI_LABEL_VARIANTS = (
    '{hemi}-{token}',
    '{hemi}_{token}',
    '{hemi}.{token}',
    '{token}-{hemi}',
    '{token}_{hemi}',
    '{token}.{hemi}',
    'ctx-{hemi}-{token}',
    'ctx_{hemi}_{token}',
    'ctx.{hemi}.{token}',
    'ctx-{token}',
    'ctx_{token}',
    '{token}',
)


def _call_with_xfm(func, subject_fs: str, value, roi_type: str, transform: str):
    """Call a pycortex helper, trying xfmname/transform keywords as needed."""
    if transform is None:
        return func(subject_fs, value, type=roi_type)
    last_error = None
    for key in ('xfmname', 'transform'):
        try:
            return func(subject_fs, value, type=roi_type, **{key: transform})
        except TypeError as exc:
            if 'unexpected keyword argument' not in str(exc):
                last_error = exc
                break
            last_error = exc
            continue
    if last_error is not None:
        raise last_error
    return func(subject_fs, value, type=roi_type)


def _format_roi_label(entry) -> str:
    if isinstance(entry, (tuple, list)):
        return '-'.join(str(part) for part in entry)
    return str(entry)


def list_available_pyctx_rois(subject_fs: str, transform: str):
    """Collect the ROI entries pycortex reports for each type in PYCORTEX_ROI_TYPES.

    A preview of up to ten entries is printed per type. Types whose query raises
    are reported and skipped; types with no entries are omitted from the result.
    ``transform`` is accepted but not used by this function.

    Returns:
        dict mapping ROI type to the entries returned by ``cortex.db.get_rois``.
    """
    found: Dict[str, list] = {}
    for roi_type in PYCORTEX_ROI_TYPES:
        try:
            entries = cortex.db.get_rois(subject_fs, type=roi_type)
        except Exception as exc:
            print(f'Unable to fetch ROI list via pycortex ({roi_type}): {exc}')
            continue
        if entries:
            found[roi_type] = entries
            head = ', '.join(_format_roi_label(item) for item in entries[:10])
            tail = ' ...' if len(entries) > 10 else ''
            print(f'pycortex {roi_type} entries ({len(entries)}): {head}{tail}')
    return found


def _auto_roi_spec(available_by_type: Dict[str, list]) -> Dict[str, list]:
    """Derive a ``{hemi: [roi, ...]}`` spec from labels that carry a hemisphere prefix.

    Each entry is formatted as text and matched case-insensitively against the
    prefixes ``lh-``, ``lh_``, ``lh.``, ``ctx-lh-``, ``ctx_lh_``, ``ctx.lh.``
    (and the ``rh`` equivalents). The text after the prefix, stripped of
    surrounding whitespace and in its original case, becomes the ROI name.
    Entries without a recognised prefix or with an empty remainder are ignored,
    and duplicates within a hemisphere are dropped.
    """
    prefix_templates = (
        '{hemi}-',
        '{hemi}_',
        '{hemi}.',
        'ctx-{hemi}-',
        'ctx_{hemi}_',
        'ctx.{hemi}.',
    )
    # Probing order: every 'lh' prefix first, then every 'rh' prefix.
    hemi_prefixes = [
        (hemi, template.format(hemi=hemi))
        for hemi in ('lh', 'rh')
        for template in prefix_templates
    ]

    spec: Dict[str, list] = {}
    for entries in available_by_type.values():
        for entry in entries:
            label = _format_roi_label(entry)
            folded = label.lower()
            hit = next(
                ((hemi, prefix) for hemi, prefix in hemi_prefixes if folded.startswith(prefix)),
                None,
            )
            if hit is None:
                continue
            hemi, prefix = hit
            remainder = label[len(prefix):].strip()
            if not remainder:
                continue
            bucket = spec.setdefault(hemi, [])
            if remainder not in bucket:
                bucket.append(remainder)
    return spec


def _normalize_label(text: str) -> str:
    return ''.join(ch for ch in text.lower() if ch.isalnum())


def _match_token_to_label(token: str, label_names) -> Optional[str]:
    """Find the label whose normalized form equals *token* or one of its overrides.

    The token itself is tried first, followed by its ROI_NAME_OVERRIDES entries
    in order. Comparison uses ``_normalize_label`` on both sides. When several
    labels normalize to the same text, the last one in *label_names* wins.

    Returns:
        The matching label as spelled in *label_names*, or ``None``.
    """
    by_normalized = {}
    for name in label_names:
        by_normalized[_normalize_label(name)] = name

    for spelling in (token, *ROI_NAME_OVERRIDES.get(token, [])):
        wanted = _normalize_label(spelling)
        if wanted in by_normalized:
            return by_normalized[wanted]
    return None


def _candidate_roi_labels(hemi: str, roi: str) -> list:
    """Return the ordered, de-duplicated label spellings to probe for one ROI."""
    exact = [str(token).strip() for token in (roi, *ROI_NAME_OVERRIDES.get(roi, []))]
    # Lower-cased spellings keep their historical priority; the spellings as
    # written follow so that mixed-case labels can also be matched.
    spellings = dict.fromkeys([text.lower() for text in exact] + exact)

    labels = []
    for spelling in spellings:
        # A tuple (not a set) keeps the probing order identical on every run.
        separator_variants = dict.fromkeys((
            spelling,
            spelling.replace('-', '_'),
            spelling.replace('-', '.'),
            spelling.replace('_', '-'),
        ))
        for variant in separator_variants:
            labels.extend(
                template.format(hemi=hemi, token=variant)
                for template in ROI_LABEL_VARIANTS
            )
        labels.append((hemi, spelling))
    return list(dict.fromkeys(labels))


def _build_masks_from_pycortex(
    subject_fs: str,
    roi_spec: Optional[Dict[str, list]],
    transform: str,
    available_by_type: Dict[str, list],
):
    """Build one boolean mask per requested ROI by probing the pycortex database.

    Args:
        subject_fs: subject name passed to pycortex.
        roi_spec: ``{hemi: [roi, ...]}``; when empty or ``None`` the spec is
            derived from ``available_by_type`` via ``_auto_roi_spec``.
        transform: transform name forwarded through ``_call_with_xfm``.
        available_by_type: ROI entries per pycortex type, used for
            auto-selection and for the error preview.

    Returns:
        dict mapping ``'<hemi>-<roi>'`` to a flat boolean array. The first
        candidate label / ROI type combination that yields a non-empty mask wins.

    Raises:
        KeyError: if no spec can be derived, or an ROI yields no non-empty mask.
    """
    working_spec = roi_spec
    if not working_spec:
        working_spec = _auto_roi_spec(available_by_type)
        if not working_spec:
            raise KeyError('Unable to derive hemisphere-specific ROI labels from pycortex DB entries.')
        print(f'Auto-selected {sum(len(v) for v in working_spec.values())} ROI labels from pycortex DB.')

    masks: Dict[str, np.ndarray] = {}
    for hemi, roi_list in working_spec.items():
        for roi in roi_list:
            key = f'{hemi}-{roi}'
            mask = None
            for cand in _candidate_roi_labels(hemi, roi):
                for roi_type in PYCORTEX_ROI_TYPES:
                    try:
                        raw = _call_with_xfm(cortex.db.get_roi_mask, subject_fs, cand, roi_type, transform)
                    except Exception:
                        # Deliberate best effort: most spellings do not exist
                        # in the database, so a failed probe is the normal case.
                        continue
                    arr = np.asarray(raw)
                    if arr.ndim == 2:
                        # NOTE(review): assumes a 2-D result stacks hemispheres
                        # with lh first and rh last — confirm against pycortex.
                        arr = arr[0] if hemi == 'lh' else arr[-1]
                    arr = arr.astype(bool).ravel()
                    if arr.any():
                        mask = arr
                        print(f'{key}: {arr.sum()} vertices (pycortex label `{cand}` [{roi_type}])')
                        break
                if mask is not None:
                    break
            if mask is None:
                preview = {
                    roi_type: [
                        _format_roi_label(name)
                        for name in names[:10]
                    ]
                    for roi_type, names in available_by_type.items()
                }
                raise KeyError(f'ROI {key} not found in pycortex DB. Available entries sample: {preview}')
            masks[key] = mask
    return masks


def _load_freesurfer_annotations(subject_fs: str, atlas: str = 'aparc'):
    """Read ``lh``/``rh`` ``<hemi>.<atlas>.annot`` files from the subject's label directory.

    Returns:
        ``{hemi: {'labels': ..., 'names': [...], 'name_to_index': {...}}}`` where
        ``labels`` is the first value returned by ``read_annot``, ``names`` are
        the decoded label names, and ``name_to_index`` maps each name to its
        position in ``names``.

    Raises:
        FileNotFoundError: the label directory or an annotation file is missing.
        RuntimeError: nibabel cannot be imported.
    """
    label_dir = FREESURFER_SUBJECTS / subject_fs / 'label'
    if not label_dir.exists():
        raise FileNotFoundError(f'Missing FreeSurfer labels for {subject_fs}: {label_dir}')
    try:
        from nibabel.freesurfer import read_annot
    except ImportError as exc:
        raise RuntimeError('Install nibabel to enable FreeSurfer ROI fallback (e.g. `pip install nibabel`).') from exc

    per_hemi = {}
    for hemi in ('lh', 'rh'):
        annot_file = label_dir / f'{hemi}.{atlas}.annot'
        if not annot_file.exists():
            raise FileNotFoundError(f'Missing FreeSurfer annotation file: {annot_file}')
        vertex_labels, _ctab, raw_names = read_annot(str(annot_file))
        # Names may come back as bytes; store plain str either way.
        names = [
            raw.decode('utf-8') if isinstance(raw, bytes) else str(raw)
            for raw in raw_names
        ]
        per_hemi[hemi] = {
            'labels': vertex_labels,
            'names': names,
            'name_to_index': dict(zip(names, range(len(names)))),
        }
    return per_hemi


def _auto_roi_spec_from_fs(fs_annotations: Dict[str, Dict[str, object]]) -> Dict[str, list]:
    auto_spec: Dict[str, list] = {}
    for hemi, info in fs_annotations.items():
        names = info['names']
        keep = []
        for name in names:
            lower = name.lower()
            if not name or lower in {'unknown', 'corpuscallosum'}:
                continue
            keep.append(name)
        auto_spec[hemi] = keep
    return auto_spec


def _build_masks_from_freesurfer(
    subject_fs: str,
    roi_spec: Optional[Dict[str, list]],
):
    """Build per-ROI ``(lh_mask, rh_mask)`` pairs from the FreeSurfer annotations.

    When ``roi_spec`` is empty or ``None`` the spec is derived from the
    annotation names. For every ROI both hemisphere masks are allocated at full
    length and only the ROI's own hemisphere has vertices set.

    Returns:
        dict mapping ``'<hemi>-<roi>'`` to a ``(lh_mask, rh_mask)`` tuple of
        boolean arrays.

    Raises:
        KeyError: no spec could be derived, or an ROI matches no annotation name.
    """
    annotations = _load_freesurfer_annotations(subject_fs)
    spec = roi_spec
    if not spec:
        spec = _auto_roi_spec_from_fs(annotations)
        if not spec:
            raise KeyError('Unable to derive ROI labels from FreeSurfer annotations.')
        print(f'Auto-selected {sum(len(v) for v in spec.values())} ROI labels from FreeSurfer annotations.')

    n_lh = annotations['lh']['labels'].shape[0]
    n_rh = annotations['rh']['labels'].shape[0]

    out: Dict[str, tuple[np.ndarray, np.ndarray]] = {}
    for hemi, roi_list in spec.items():
        hemi_info = annotations[hemi]
        vertex_labels = hemi_info['labels']
        index_of = hemi_info['name_to_index']
        for roi in roi_list:
            match_name = _match_token_to_label(roi, index_of.keys())
            if match_name is None:
                preview = list(index_of.keys())[:10]
                raise KeyError(f'ROI {hemi}-{roi} not found in FreeSurfer annotation. Sample entries: {preview}')
            hemi_indices = np.where(vertex_labels == index_of[match_name])[0]
            lh_mask = np.zeros(n_lh, dtype=bool)
            rh_mask = np.zeros(n_rh, dtype=bool)
            # Anything other than 'lh' is written to the right-hemisphere mask.
            target = lh_mask if hemi == 'lh' else rh_mask
            target[hemi_indices] = True
            if hemi_indices.size:
                print(f'{hemi}-{roi}: {hemi_indices.size} vertices (FreeSurfer label `{match_name}`)')
            else:
                print(f'Warning: ROI {hemi}-{roi} matched label `{match_name}` but contains no vertices.')
            out[f'{hemi}-{roi}'] = (lh_mask, rh_mask)
    return out


def _resample_fs_masks_to_transform(
    fs_masks: Dict[str, tuple[np.ndarray, np.ndarray]],
    subject_fs: str,
    transform: str,
) -> Dict[str, np.ndarray]:
    if transform is None:
        raise RuntimeError('A valid pycortex transform is required to resample FreeSurfer ROI masks.')
    try:
        import cortex
    except ImportError as exc:
        raise RuntimeError('Install pycortex to resample FreeSurfer ROI masks (e.g. `pip install pycortex`).') from exc

    resampled: Dict[str, np.ndarray] = {}
    for key, (lh_mask, rh_mask) in fs_masks.items():
        surf = cortex.Vertex((lh_mask.astype(float), rh_mask.astype(float)), subject_fs)
        mapped = surf.to_xfm(transform)
        data = mapped.data
        if isinstance(data, tuple):
            arrays = []
            for arr in data:
                arr = np.asarray(arr)
                if arr.ndim == 2:
                    arrays.append(arr[0].ravel())
                    arrays.append(arr[1].ravel())
                else:
                    arrays.append(arr.ravel())
            flat = np.concatenate(arrays)
        else:
            arr = np.asarray(data)
            if arr.ndim == 2:
                flat = np.concatenate([arr[0].ravel(), arr[1].ravel()])
            else:
                flat = arr.ravel()
        resampled[key] = (flat > 0.5)
    return resampled


def fetch_language_masks(
    subject_fs: str,
    roi_spec: Optional[Dict[str, list]],
    transform: str,
    cache_path: Path,
):
    """Return ROI masks for a subject, served from an ``.npz`` cache when valid.

    A cache is reused only if it carries the expected metadata and was built for
    the same ROI selection; otherwise it is deleted and rebuilt. Masks are built
    from the pycortex database first and, if that lookup fails with a
    ``KeyError`` or no ROI entries are listed, from FreeSurfer annotations
    resampled through ``transform``.

    Args:
        subject_fs: subject name used for pycortex and FreeSurfer lookups.
        roi_spec: ``{hemi: [roi, ...]}`` or ``None``/empty for auto-selection.
        transform: pycortex transform name.
        cache_path: location of the ``.npz`` cache file.

    Returns:
        dict mapping ROI key to mask array.
    """
    import json  # function-scope import keeps this block self-contained

    # Fingerprint of the requested selection; None and empty specs both mean
    # "auto-select", so they share one token.
    spec_token = json.dumps(roi_spec if roi_spec else None, sort_keys=True, default=str)

    if cache_path.exists():
        # The cache holds only bool/int/str arrays, so unpickling is never
        # needed; refusing it avoids executing code from a tampered cache file.
        with np.load(cache_path, allow_pickle=False) as data:
            stored = set(data.files)
            if not {'__meta_version', '__meta_backend'} <= stored:
                print(f'ROI cache {cache_path} missing metadata; rebuilding.')
            elif (
                '__meta_roi_spec' not in stored
                or str(data['__meta_roi_spec'].item()) != spec_token
            ):
                # Without this check, changing the ROI filter would silently
                # keep returning masks built for the previous selection.
                print(f'ROI cache {cache_path} does not match the requested ROI selection; rebuilding.')
            else:
                masks = {key: data[key] for key in data.files if not key.startswith('__meta_')}
                print(f'Loaded cached ROI masks from {cache_path} (backend={data["__meta_backend"].item()})')
                return masks
        cache_path.unlink(missing_ok=True)

    available_by_type = list_available_pyctx_rois(subject_fs, transform)
    masks = None
    backend = 'pycortex'

    if available_by_type:
        try:
            masks = _build_masks_from_pycortex(subject_fs, roi_spec, transform, available_by_type)
        except KeyError as exc:
            print(f'pycortex ROI lookup failed: {exc}')
            masks = None

    if masks is None:
        print('Falling back to FreeSurfer annotations for ROI masks.')
        fs_masks = _build_masks_from_freesurfer(subject_fs, roi_spec)
        masks = _resample_fs_masks_to_transform(fs_masks, subject_fs, transform)
        backend = 'freesurfer->pycortex'

    np.savez_compressed(
        cache_path,
        __meta_version=np.array(3, dtype=int),
        __meta_backend=np.array(backend),
        __meta_roi_spec=np.array(spec_token),
        **masks,
    )
    print(f'Saved ROI masks to {cache_path} ({backend}).')
    return masks






In [None]:
roi_masks = fetch_language_masks(SUBJECT_FS, ROI_FILTER, TRANSFORM_ID, ROI_CACHE)
# One row per ROI with the number of set vertices, ordered by ROI key.
roi_summary = pd.DataFrame(
    [(name, int(mask.sum())) for name, mask in roi_masks.items()],
    columns=['roi', 'n_vertices'],
).sort_values('roi')
display(roi_summary)


In [None]:
def load_story_bold(subject_fs: str, story_id: str):
    """Read the full ``data`` dataset from ``PREPROC_ROOT/<subject_fs>/<story_id>.hf5``.

    Raises:
        FileNotFoundError: the HDF5 file does not exist.
    """
    story_file = PREPROC_ROOT / subject_fs / f'{story_id}.hf5'
    if story_file.exists():
        with h5py.File(story_file, 'r') as handle:
            return handle['data'][:]
    raise FileNotFoundError(f'Missing preprocessed file: {story_file}')


def roi_timeseries_from_masks(bold_matrix: np.ndarray, masks: Dict[str, np.ndarray], tr: float):
    """Average a ``(n_tr, n_vertices)`` matrix within each ROI mask.

    Args:
        bold_matrix: 2-D array; rows are samples and columns are vertices.
        masks: ROI name -> mask (array or list) with one entry per vertex.
        tr: multiplier applied to the sample index to build the ``Time`` column.

    Returns:
        DataFrame with a leading ``Time`` column followed by one column per ROI
        (in ``masks`` order). An ROI whose mask selects no vertex yields NaN.

    Raises:
        ValueError: a mask length differs from the number of vertices, or an
            ROI is named ``Time``.
    """
    n_tr, n_vertices = bold_matrix.shape
    if 'Time' in masks:
        raise ValueError('cannot insert Time, already exists')

    # Collect every column first and build the frame once: inserting columns one
    # at a time fragments the DataFrame when there are many ROIs.
    columns = {'Time': np.arange(n_tr) * tr}
    for roi, mask in masks.items():
        mask = np.asarray(mask).astype(bool)
        if mask.shape[0] != n_vertices:
            raise ValueError(f'ROI {roi} mask length {mask.shape[0]} != data vertices {n_vertices}')
        columns[roi] = bold_matrix[:, mask].mean(axis=1)
    return pd.DataFrame(columns, index=pd.RangeIndex(n_tr))


def zscore(arr: np.ndarray):
    """Center *arr* on its NaN-ignoring mean and scale by its NaN-ignoring std.

    When the standard deviation is zero or not finite, the centered values are
    returned without scaling.
    """
    values = np.asarray(arr, dtype=float)
    centered = values - np.nanmean(values)
    scale = np.nanstd(values)
    if np.isfinite(scale) and scale != 0:
        return centered / scale
    return centered


In [None]:
# List the subject's stories and attach the expected preprocessed-file path to each.
story_records = list_stories_for_subject(DATA_ROOT, SUBJECT_ID)
story_df = pd.DataFrame(story_records)
story_df['preproc_path'] = story_df['story_id'].apply(lambda sid: PREPROC_ROOT / SUBJECT_FS / f'{sid}.hf5')
# Keep only the stories named in STORY_IDS; stop if none of them is listed.
filtered_df = story_df[story_df['story_id'].isin(STORY_IDS)].copy()
if filtered_df.empty:
    raise RuntimeError('No matching stories found; update STORY_IDS.')
display(filtered_df[['story_id', 'preproc_path']])

# NOTE(review): the loop iterates STORY_IDS, not filtered_df, so an ID missing
# from the listing is still loaded (or raises FileNotFoundError in load_story_bold).
story_roi_timeseries: Dict[str, pd.DataFrame] = {}
for story_id in STORY_IDS:
    bold = load_story_bold(SUBJECT_FS, story_id)
    frame = roi_timeseries_from_masks(bold, roi_masks, TR)
    story_roi_timeseries[story_id] = frame
    # The frame has one `Time` column plus one column per ROI.
    print(f"Loaded {story_id}: {frame.shape[0]} TRs, {frame.shape[1] - 1} ROIs")


In [None]:
# Use the first loaded story to discover the ROI columns (everything but `Time`).
first_story_id, first_frame = next(iter(story_roi_timeseries.items()))
feature_cols = [col for col in first_frame.columns if col != 'Time']
display(pd.Series(feature_cols, name='available_features'))
# Plot at most the first eight ROIs to keep the figure readable.
selected_features = feature_cols[: min(8, len(feature_cols))]
print('Plotting features:', selected_features)

n_rows = len(selected_features)
fig, axes = plt.subplots(n_rows, 1, figsize=(10, 2.2 * n_rows), sharex=True)
if n_rows == 1:
    # With a single row plt.subplots returns a bare Axes; wrap it for uniform handling.
    axes = [axes]
for axis, feature in zip(axes, selected_features):
    for story_id, roi_frame in story_roi_timeseries.items():
        axis.plot(roi_frame['Time'], zscore(roi_frame[feature]), label=story_id)
    # The line break must be an escape sequence: a literal newline inside a
    # single-quoted f-string is a syntax error.
    axis.set_ylabel(f'{feature}\n(z-score)')
    axis.axhline(0, color='black', linewidth=0.7, alpha=0.5)
axes[-1].set_xlabel('Time (s)')
axes[0].legend(loc='upper right', ncol=len(STORY_IDS))
fig.suptitle('Language ROI dynamics (pycortex-aligned)')
plt.tight_layout()


In [None]:
def takens_embedding(series: pd.Series, E: int = 3, tau: int = 1):
    """Build a time-delay embedding of a 1-D series.

    Row ``i`` of the result is ``(x[i], x[i + tau], ..., x[i + (E - 1) * tau])``,
    i.e. the delays run forward in time.

    Args:
        series: 1-D sequence of samples (converted to a float array).
        E: embedding dimension, at least 1.
        tau: delay in samples, at least 1.

    Returns:
        Array of shape ``(len(series) - (E - 1) * tau, E)``.

    Raises:
        ValueError: ``E`` or ``tau`` is smaller than 1, or the series is too
            short for the requested embedding.
    """
    # A zero step would crash range() with an opaque message, and a negative one
    # would silently produce misaligned slices; reject both up front.
    if E < 1 or tau < 1:
        raise ValueError(f'E and tau must be positive integers: E={E}, tau={tau}')
    data = np.asarray(series, dtype=float)
    window = (E - 1) * tau
    if data.size <= window:
        raise ValueError(f'Not enough samples for embedding: len={data.size}, E={E}, tau={tau}')
    n_rows = data.size - window
    return np.stack(
        [data[delay:delay + n_rows] for delay in range(0, E * tau, tau)],
        axis=1,
    )


In [None]:
# Embed the first selected ROI; E must stay 3 for the 3-D plots below.
target_feature = selected_features[0]
E = 3
TAU = 1

# Overlay the attractors of all stories in one 3-D axes.
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection='3d')
for story_id, roi_frame in story_roi_timeseries.items():
    embedded = takens_embedding(zscore(roi_frame[target_feature]), E=E, tau=TAU)
    ax.plot(embedded[:, 0], embedded[:, 1], embedded[:, 2], label=story_id, alpha=0.8)
# takens_embedding stacks forward delays, so column k holds x(t + k*tau).
ax.set_xlabel('x(t)')
ax.set_ylabel('x(t+τ)')
ax.set_zlabel('x(t+2τ)')
ax.set_title(f'{target_feature} attractor (E={E}, τ={TAU})')
ax.legend(loc='upper right')
plt.tight_layout()


In [None]:
# One separate 3-D figure per story for the same target ROI.
for story_id, roi_frame in story_roi_timeseries.items():
    embedded = takens_embedding(zscore(roi_frame[target_feature]), E=E, tau=TAU)
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(111, projection='3d')
    ax.plot(embedded[:, 0], embedded[:, 1], embedded[:, 2], color='tab:blue', alpha=0.85)
    # takens_embedding stacks forward delays, so column k holds x(t + k*tau).
    ax.set_xlabel('x(t)')
    ax.set_ylabel('x(t+τ)')
    ax.set_zlabel('x(t+2τ)')
    ax.set_title(f'{story_id} — {target_feature} (E={E}, τ={TAU})')
    plt.tight_layout()


In [None]:
import itertools

# Per-story summary: mean and standard deviation of each embedding coordinate.
summary_rows = []
for story_id, roi_frame in story_roi_timeseries.items():
    embedded = takens_embedding(zscore(roi_frame[target_feature]), E=E, tau=TAU)
    centroid = embedded.mean(axis=0)
    spread = embedded.std(axis=0)
    summary_rows.append({
        'story_id': story_id,
        'feature': target_feature,
        'centroid_x': centroid[0],
        'centroid_y': centroid[1],
        'centroid_z': centroid[2],
        'spread_x': spread[0],
        'spread_y': spread[1],
        'spread_z': spread[2],
    })
summary_df = pd.DataFrame(summary_rows)
display(summary_df)

# Pairwise comparison of every two stories' embeddings.
pair_rows = []
for (story_a, frame_a), (story_b, frame_b) in itertools.combinations(story_roi_timeseries.items(), 2):
    emb_a = takens_embedding(zscore(frame_a[target_feature]), E=E, tau=TAU)
    emb_b = takens_embedding(zscore(frame_b[target_feature]), E=E, tau=TAU)
    # Stories differ in length: compare only the rows both embeddings have.
    k = min(len(emb_a), len(emb_b))
    diff = emb_a[:k] - emb_b[:k]
    # Root-mean-square over all elements of the row-by-row difference
    # (averaged across rows and coordinates together).
    rms = np.sqrt((diff ** 2).mean())
    pair_rows.append({
        'feature': target_feature,
        'story_a': story_a,
        'story_b': story_b,
        'rms_distance': rms,
    })
pair_df = pd.DataFrame(pair_rows)
display(pair_df)


### Next steps
- Iterate across `selected_features` to export attractors for every ROI.
- Feed these aligned ROI series into the EDM/CCM routines from Day 3.
- Build pairwise distance heatmaps to compare story- or subject-level geometry.
