# Day 2 — Subject-Level Driver Diagnostics (EDM/CCM Prep)

Focus subject: the extended-story participant named by `FOCUS_SUBJECT` in the configuration cell (default `sub-UTS01`); change that constant to analyse a different subject.

Checklist:
- [ ] Load story-level metadata and filter to usable audio+TextGrid pairs
- [ ] Build continuous driver summaries (envelope, word-rate)
- [ ] Save per-story statistics for EDM/CCM planning
- [ ] Visualize TR spans, envelope/word-rate dynamics, and autocorrelation scales
- [ ] Inspect cross-correlation peaks to guide Δ selection
- [ ] Outline embedding-dimension feasibility for subject library


In [1]:
import sys
from pathlib import Path

# Treat the parent of the current working directory as the repository root and
# put it at the front of the import path (once) so `src.*` imports resolve.
REPO_ROOT = Path.cwd().parent
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)


In [2]:
import json
import os
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

from src.io_ds003020 import list_stories_for_subject
from src.qc_viz import (
    HAVE_AUDIO,
    HAVE_TEXTGRID,
    ensure_dir,
    load_driver_series,
    normalize,
    pearsonr_safe,
    story_cross_correlation,
)
from src.edm_ccm import (
    load_subject_stories,
    aggregate_driver_summary,
    autocorrelation,
    multi_segment_library_lengths,
)

# --- Plot defaults ----------------------------------------------------------
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (8, 4)})

# --- Analysis configuration -------------------------------------------------
# Dataset root. The cluster path stays the default, so existing runs behave
# exactly as before; set the DS003020_ROOT environment variable to point the
# notebook at a copy of the dataset elsewhere without editing this cell.
DATA_ROOT = Path(os.environ.get('DS003020_ROOT', '/bucket/PaoU/seann/openneuro/ds003020'))
# Repetition time handed to load_driver_series as `tr` (the overlay plots label it in seconds).
TR = 2.0
FOCUS_SUBJECT = 'sub-UTS01'
# All figures/tables of this notebook are written below this per-subject directory.
OUTPUT_DIR = ensure_dir(REPO_ROOT / 'derivatives' / 'results' / f'day2_{FOCUS_SUBJECT}')
SUMMARY_PATH = OUTPUT_DIR / f'{FOCUS_SUBJECT}_driver_summary.csv'

print(f'AUDIO deps available: {HAVE_AUDIO}, TEXTGRID deps available: {HAVE_TEXTGRID}')


AUDIO deps available: True, TEXTGRID deps available: True


In [3]:
# Tabulate every story record returned for the focus subject.
subject_records = list_stories_for_subject(DATA_ROOT, FOCUS_SUBJECT)
story_df = pd.DataFrame(subject_records)

if story_df.empty:
    raise RuntimeError(f'No stories found for {FOCUS_SUBJECT} under {DATA_ROOT}')

# Flag rows whose `textgrid` entry is present (non-null).
story_df = story_df.assign(has_textgrid=story_df['textgrid'].notna())

n_total = len(story_df)
n_with_textgrid = story_df['has_textgrid'].sum()
print(f"Total stories: {n_total} | with TextGrid: {n_with_textgrid}")

preview_columns = ['subject', 'story_id', 'session', 'run', 'has_textgrid']
display(story_df[preview_columns].head())


Total stories: 93 | with TextGrid: 93


Unnamed: 0,subject,story_id,session,run,has_textgrid
0,sub-UTS01,shoppinginchina,ses-7,,True
1,sub-UTS01,treasureisland,ses-7,,True
2,sub-UTS01,penpal,ses-7,,True
3,sub-UTS01,wheretheressmoke,ses-7,6.0,True
4,sub-UTS01,onlyonewaytofindout,ses-7,,True


In [4]:
# Both optional dependency groups are required for the driver extraction below.
if not (HAVE_AUDIO and HAVE_TEXTGRID):
    raise RuntimeError('Audio/TextGrid dependencies missing; install librosa, soundfile, textgrid before continuing.')

# Keep only stories that have BOTH an audio path and a TextGrid path, as the
# message below states. Previously only the TextGrid was checked, so a record
# with a missing `wav` entry would have crashed at Path(rec['wav']) later on.
has_wav = story_df['wav'].notna()
usable_df = story_df[has_wav & story_df['has_textgrid']].copy()
print(f'Processing {len(usable_df)} stories with both WAV + TextGrid for {FOCUS_SUBJECT}.')


Processing 93 stories with both WAV + TextGrid for sub-UTS01.


In [5]:
# Load the driver series for every usable story. Loading is best-effort: a
# story that fails is recorded in `failed` and skipped so that one bad file
# does not abort the whole subject.
driver_cache = {}  # (subject, story_id) -> object returned by load_driver_series
failed = []        # (story_id, reason) for every story that could not be loaded

for rec in usable_df.to_dict('records'):
    try:
        # Path construction lives inside the try so that a malformed path entry
        # is reported as a per-story failure instead of stopping the loop.
        wav_path = Path(rec['wav'])
        tg_path = Path(rec['textgrid']) if rec.get('textgrid') else None
        drivers = load_driver_series(wav_path, tg_path, tr=TR)
    except Exception as exc:
        # Prefix the exception class: str(exc) alone can be an opaque repr of
        # the exception arguments that does not say what kind of error occurred.
        failed.append((rec['story_id'], f'{type(exc).__name__}: {exc}'))
        continue
    driver_cache[(rec['subject'], rec['story_id'])] = drivers

print(f"Loaded drivers for {len(driver_cache)} stories; failures: {len(failed)}")
if failed:
    display(pd.DataFrame(failed, columns=['story_id', 'reason']).head())


Loaded drivers for 79 stories; failures: 5


Unnamed: 0,story_id,reason
0,theshower,"(Interval(22.42154, 24.23742, sp), Interval(24..."
1,legacy,The file could not be parsed as a Praat text f...
2,exorcism,The file could not be parsed as a Praat text f...
3,food,"(Interval(618.22744, 622.78708, lg), Interval(..."
4,haveyoumethimyet,"(Interval(577.77937, 579.34581, lg), Interval(..."


In [6]:
# Pair each usable story record with its cached drivers, dropping stories that
# have no cached drivers or whose drivers report zero TRs.
story_objs = []
for rec in usable_df.to_dict('records'):
    cached = driver_cache.get((rec['subject'], rec['story_id']))
    if cached is not None and cached.n_tr != 0:
        entry = {field: rec[field] for field in ('subject', 'story_id', 'session', 'run')}
        entry['drivers'] = cached
        entry['wav'] = rec['wav']
        entry['textgrid'] = rec['textgrid']
        story_objs.append(entry)

print(f'Stories retained for summary: {len(story_objs)}')


Stories retained for summary: 88


In [7]:
from src.edm_ccm import StoryDriver, aggregate_driver_summary

# Wrap every retained story in a StoryDriver so it can be summarised.
story_driver_list: List[StoryDriver] = [
    StoryDriver(
        subject=entry['subject'],
        story_id=entry['story_id'],
        session=entry['session'],
        run=entry['run'],
        drivers=entry['drivers'],
        wav_path=Path(entry['wav']),
        textgrid_path=Path(entry['textgrid']) if entry['textgrid'] else None,
    )
    for entry in story_objs
]

# One summary row per story, ordered from the longest story (most TRs) down.
summary_rows = aggregate_driver_summary(story_driver_list)
summary_df = (
    pd.DataFrame(summary_rows)
    .sort_values('n_tr', ascending=False)
    .reset_index(drop=True)
)
summary_df.to_csv(SUMMARY_PATH, index=False)
print(f'Saved summary stats to {SUMMARY_PATH}')
display(summary_df.head())


Saved summary stats to /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_driver_summary.csv


Unnamed: 0,subject,story_id,session,run,n_tr,env_mean,env_sd,env_ac1,env_e_fold,wordrate_mean,wordrate_sd,word_count,wordrate_ac1,wordrate_e_fold
0,sub-UTS01,breakingupintheageofgoogle,ses-7,,532,0.02741,0.014466,0.442267,2.0,8.466165,2.654402,4504.0,0.20979,1.0
1,sub-UTS01,stumblinginthedark,ses-11,,500,0.04964,0.018842,0.145974,1.0,7.4,2.290851,3700.0,0.255198,1.0
2,sub-UTS01,mayorofthefreaks,ses-20,,486,0.040274,0.015429,0.075838,1.0,8.489712,2.836943,4126.0,0.296123,1.0
3,sub-UTS01,quietfire,ses-12,,466,0.073226,0.027809,0.398728,2.0,6.10515,1.805956,2845.0,0.286279,1.0
4,sub-UTS01,thepostmanalwayscalls,ses-8,,465,0.158579,0.062786,0.123037,1.0,6.815054,2.39448,3169.0,0.236156,1.0


In [8]:
# Histogram of story lengths in 25-TR wide bins starting at zero.
ntr_bin_width = 25
ntr_edges = np.arange(0, summary_df['n_tr'].max() + ntr_bin_width, ntr_bin_width)

fig, ax = plt.subplots()
ax.hist(summary_df['n_tr'], bins=ntr_edges, color='C0', edgecolor='black')
ax.set(
    title=f'{FOCUS_SUBJECT} — Story length distribution (TR)',
    xlabel='TR count per story',
    ylabel='Stories',
)

fig_path = OUTPUT_DIR / f'{FOCUS_SUBJECT}_ntr_hist.png'
fig.savefig(fig_path, dpi=150, bbox_inches='tight')
plt.close(fig)  # the figure is only written to disk, not shown inline
print(f'Saved {fig_path}')


Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_ntr_hist.png


In [9]:
# Only stories with both summary means can appear in the scatter.
valid = summary_df.dropna(subset=['env_mean', 'wordrate_mean'])
if not valid.empty:
    env_means = valid['env_mean']
    rate_means = valid['wordrate_mean']
    r_val = pearsonr_safe(env_means.values, rate_means.values)

    fig, ax = plt.subplots()
    ax.scatter(env_means, rate_means, alpha=0.7)
    ax.set(
        xlabel='Envelope mean (a.u.)',
        ylabel='Word-rate mean (words/TR)',
        title=f'{FOCUS_SUBJECT} — Mean envelope vs word-rate (r={r_val:.2f}, n={len(valid)})',
    )

    fig_path = OUTPUT_DIR / f'{FOCUS_SUBJECT}_env_vs_wordrate_scatter.png'
    fig.savefig(fig_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f'Saved {fig_path}')
else:
    print('No valid envelope/word-rate pairs for scatter.')


Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_env_vs_wordrate_scatter.png


In [10]:
# Distribution of the e-folding lags of both drivers across stories.
metrics = summary_df[['story_id', 'n_tr', 'env_e_fold', 'wordrate_e_fold']].copy()
env_lags = metrics['env_e_fold'].dropna()
rate_lags = metrics['wordrate_e_fold'].dropna()
all_lags = pd.concat([env_lags, rate_lags])

if all_lags.empty:
    # Nothing to bin (e.g. every e-fold value is NaN); avoid building edges from NaN.
    print('No finite e-folding lags available for histogram.')
else:
    # Unit-wide bins centred on the integers and shared by both series.
    # Edges of the form arange(0, max + 1) make the last bin [max - 1, max]
    # closed on both sides, which merges the two largest lags into one bar,
    # and per-series edges make the two histograms not directly comparable.
    lowest = np.floor(min(all_lags.min(), 0))
    highest = np.ceil(all_lags.max())
    lag_edges = np.arange(lowest - 0.5, highest + 1.5, 1)

    fig, ax = plt.subplots()
    ax.hist(env_lags, bins=lag_edges, alpha=0.7, label='Envelope')
    ax.hist(rate_lags, bins=lag_edges, alpha=0.7, label='Word-rate')
    ax.set_xlabel('Lag (TR) where ACF ≤ exp(-1)')
    ax.set_ylabel('Stories')
    ax.set_title(f'{FOCUS_SUBJECT} — E-folding lag distribution')
    ax.legend()
    fig_path = OUTPUT_DIR / f'{FOCUS_SUBJECT}_acf_efold_hist.png'
    fig.savefig(fig_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f'Saved {fig_path}')


Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_acf_efold_hist.png


In [11]:
# Envelope / word-rate overlays for the first three rows of summary_df
# (the longest stories, given that summary_df is sorted by n_tr descending
# when it is built).
selected = summary_df.head(3)
for _, row in selected.iterrows():
    drivers = driver_cache.get((row['subject'], row['story_id']))
    # An overlay needs both series; skip stories without cached drivers or word rate.
    if drivers is None or drivers.word_rate is None:
        continue
    env_norm = normalize(drivers.envelope)
    rate_norm = normalize(drivers.word_rate)
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.plot(env_norm, label='Envelope (norm)')
    ax.plot(rate_norm, label='Word rate (norm)')
    # Derive the label from the configured TR instead of repeating the literal,
    # so the axis stays correct if TR is changed in the configuration cell.
    ax.set_xlabel(f'TR ({TR} s)')
    ax.set_ylabel('Normalized amplitude')
    ax.set_title(f"{row['subject']} | {row['story_id']} | n_TR={int(row['n_tr'])}")
    ax.legend(loc='upper right')
    fig_path = OUTPUT_DIR / f"{row['subject']}_{row['story_id']}_overlay.png"
    fig.savefig(fig_path, dpi=150, bbox_inches='tight')
    plt.close(fig)  # release the figure; one is created per story
    print(f'Saved {fig_path}')


Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_breakingupintheageofgoogle_overlay.png
Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_stumblinginthedark_overlay.png
Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_mayorofthefreaks_overlay.png


In [12]:
# Per-story peak of the envelope/word-rate cross-correlation over non-negative lags.
MIN_TR_FOR_XCORR = 50  # stories shorter than this many TRs are skipped
MAX_XCORR_LAG_TR = 10  # `max_lag` handed to story_cross_correlation

lag_peaks: List[Tuple[str, int, float]] = []
for _, row in summary_df.iterrows():
    drivers = driver_cache.get((row['subject'], row['story_id']))
    if drivers is None or drivers.word_rate is None or drivers.n_tr < MIN_TR_FOR_XCORR:
        continue
    lags, corrs = story_cross_correlation(drivers.envelope, drivers.word_rate, max_lag=MAX_XCORR_LAG_TR)
    # Restrict the search to lags >= 0.
    pos_mask = lags >= 0
    if pos_mask.sum() == 0:
        continue
    pos_lags = lags[pos_mask]
    pos_corrs = corrs[pos_mask]
    # nanargmax raises on an all-NaN slice, so skip such stories explicitly.
    if np.all(np.isnan(pos_corrs)):
        continue
    best_idx = int(np.nanargmax(pos_corrs))
    lag_peaks.append((row['story_id'], int(pos_lags[best_idx]), float(pos_corrs[best_idx])))

lag_df = pd.DataFrame(lag_peaks, columns=['story_id', 'lag_tr', 'corr'])
lag_df = lag_df.sort_values('corr', ascending=False).reset_index(drop=True)
if lag_df.empty:
    print('No cross-correlation peaks computed.')
else:
    display(lag_df.head())
    fig, ax = plt.subplots()
    # Unit-wide bins centred on the integer lags.
    ax.hist(lag_df['lag_tr'], bins=np.arange(-0.5, lag_df['lag_tr'].max() + 1.5, 1), color='C2', edgecolor='black')
    # Both series are stimulus drivers (envelope and word rate); no brain signal
    # enters this analysis, so the label must not describe a brain lag.
    ax.set_xlabel('Non-negative lag (TR) with max envelope/word-rate correlation')
    ax.set_ylabel('Stories')
    ax.set_title(f'{FOCUS_SUBJECT} — CCM candidate Δ distribution (env → word-rate)')
    fig_path = OUTPUT_DIR / f'{FOCUS_SUBJECT}_lag_hist.png'
    fig.savefig(fig_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f'Saved {fig_path}')


Unnamed: 0,story_id,lag_tr,corr
0,itsabox,0,0.758393
1,jugglingandjesus,0,0.746766
2,theinterview,0,0.716445
3,superheroesjustforeachother,0,0.71531
4,reachingoutbetweenthebars,0,0.678592


Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_lag_hist.png


In [13]:
# Library depth left per story for each candidate embedding dimension E.
E_VALUES = [2, 3, 4, 5, 6]
usable_lengths = summary_df['n_tr'].values
lib_lengths = multi_segment_library_lengths(usable_lengths, exclusion=0)

rows = []
for E in E_VALUES:
    # Subtract E from every library length, flooring the result at zero.
    remaining = np.maximum(lib_lengths - E, 0)
    rows.append(
        dict(
            E=E,
            median_library=np.median(remaining),
            min_library=np.min(remaining),
            stories_ge_200=int((remaining >= 200).sum()),
        )
    )

embedding_df = pd.DataFrame(rows)
display(embedding_df)

fig, ax = plt.subplots()
curves = (
    ('median_library', 'o', 'Median usable TRs'),
    ('min_library', 's', 'Minimum usable TRs'),
)
for column, marker, label in curves:
    ax.plot(embedding_df['E'], embedding_df[column], marker=marker, label=label)
# Dashed reference line at the same 200-TR level used for `stories_ge_200`.
ax.axhline(200, color='black', linestyle='--', linewidth=1)
ax.set_xlabel('Embedding dimension E')
ax.set_ylabel('Usable TR count per story')
ax.set_title(f'{FOCUS_SUBJECT} — Library depth vs embedding dimension')
ax.legend()

fig_path = OUTPUT_DIR / f'{FOCUS_SUBJECT}_library_vs_E.png'
fig.savefig(fig_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f'Saved {fig_path}')


Unnamed: 0,E,median_library,min_library,stories_ge_200
0,2,335.0,166,83
1,3,334.0,165,83
2,4,333.0,164,83
3,5,332.0,163,82
4,6,331.0,162,82


Saved /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_library_vs_E.png


In [14]:
def _nan_to_none(value):
    """Return ``None`` for a NaN float (serialised as JSON ``null``); pass anything else through."""
    if isinstance(value, float) and np.isnan(value):
        return None
    return value


# Headline numbers for EDM/CCM planning. Undefined statistics are stored as
# None rather than NaN: json.dumps would otherwise emit a bare `NaN` token,
# which is not valid JSON and is rejected by strict parsers.
has_lags = not lag_df.empty
summary_notes = {
    'subject': FOCUS_SUBJECT,
    'stories_processed': len(summary_df),
    'median_n_tr': _nan_to_none(float(summary_df['n_tr'].median())),
    'recommended_tau_tr': _nan_to_none(float(np.nanmedian(summary_df['env_e_fold']))),
    'recommended_delta_tr': _nan_to_none(float(lag_df['lag_tr'].median())) if has_lags else None,
    'stories_with_lag1': int((lag_df['lag_tr'] == 1).sum()) if has_lags else 0,
}

notes_path = OUTPUT_DIR / f'{FOCUS_SUBJECT}_edm_ccm_notes.json'
# allow_nan=False makes any NaN/inf that slipped through fail loudly instead of
# silently producing an unparseable file; the encoding is pinned for portability.
notes_path.write_text(json.dumps(summary_notes, indent=2, allow_nan=False), encoding='utf-8')
print('Recommendations saved to', notes_path)
summary_notes


Recommendations saved to /flash/PaoU/seann/fmri-edm-ccm/derivatives/results/day2_sub-UTS01/sub-UTS01_edm_ccm_notes.json


{'subject': 'sub-UTS01',
 'stories_processed': 88,
 'median_n_tr': 337.0,
 'recommended_tau_tr': 1.0,
 'recommended_delta_tr': 0.0,
 'stories_with_lag1': 0}