# Day 24 – Subject-Wide Category & ROI Concatenation

This scaffold outlines the steps required to build combined time series across all stories for a single subject.

## 1. Notebook Setup
- Import standard libraries (`json`, `pathlib`, `pandas`, `numpy`) and plotting helpers if needed.
- Load the Day 24 configuration (reuse `configs/demo.yaml`).
- Configure display options and root paths (similar to prior notebooks).

In [None]:
import json
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd

from src.utils import load_yaml
from src.day24_subject_concat import (
    DEFAULT_FEATURES_ROOT,
    DEFAULT_OUTPUT_SUBDIR,
    build_story_inventory,
    concatenate_subject_timeseries,
    save_subject_concat,
    load_subject_concat_manifest,
    load_subject_boundaries,
    get_story_order_from_manifest,
)

# --- Paths and configuration -------------------------------------------------
# NOTE(review): PROJECT_ROOT is a machine-specific absolute path; adjust it
# when running this notebook on another system.
PROJECT_ROOT = Path('/flash/PaoU/seann/fmri-edm-ccm')
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'demo.yaml'

# If the loader returns None (as YAML parsers commonly do for an empty
# document), fall back to an empty mapping so the lookups below use their
# defaults instead of failing on `None.get`.
cfg = load_yaml(CONFIG_PATH) or {}
paths = (cfg.get('paths') or {}).copy()
paths.setdefault('project_root', str(PROJECT_ROOT))

SUBJECT = (cfg.get('subject') or 'UTS01').strip()

# `dict.get(key, default)` applies the default only when the key is absent.
# A key that is present but null would reach int() / the path join as None
# and raise TypeError, so an explicit null is treated like a missing key.
_n_parcels_cfg = cfg.get('n_parcels')
N_PARCELS = 400 if _n_parcels_cfg is None else int(_n_parcels_cfg)

_features_root_cfg = paths.get('features_root')
if _features_root_cfg is None:
    _features_root_cfg = str(DEFAULT_FEATURES_ROOT)
# pathlib note: joining an absolute path replaces PROJECT_ROOT entirely.
FEATURES_ROOT = PROJECT_ROOT / _features_root_cfg
OUTPUT_SUBDIR = DEFAULT_OUTPUT_SUBDIR

# Widen pandas' display limits for the tables shown by this notebook.
pd.set_option('display.max_columns', 80)
pd.set_option('display.width', 160)

print(f'Subject: {SUBJECT} | n_parcels: {N_PARCELS}')
print(f'Features root: {FEATURES_ROOT}')

## 2. Discover Available Story Segments
- Enumerate story directories for the subject inside `features_no_fallback/subjects/<subject>/`.
- Filter to only those with category exports (check for `category_timeseries_trimmed.csv`).
- Build a summary table with TR counts, file paths, and flags for missing assets.

In [None]:
# Build the per-story inventory table for the configured subject.
inventory_df = build_story_inventory(
    SUBJECT,
    features_root=FEATURES_ROOT,
    paths_cfg=paths,
    n_parcels=N_PARCELS,
)

display(inventory_df.head(12))

# Rows whose status is 'ready' are the candidates for concatenation.
_ready_rows = inventory_df['status'] == 'ready'
ready_stories = inventory_df.loc[_ready_rows, 'story'].tolist()
print(f'Ready stories: {len(ready_stories)} / {len(inventory_df)}')

existing_manifest = load_subject_concat_manifest(
    SUBJECT,
    features_root=FEATURES_ROOT,
    output_subdir=OUTPUT_SUBDIR,
)

# Prefer the story order recorded in an existing manifest. A missing manifest
# and an empty recorded order both collapse to None.
manifest_order = (
    get_story_order_from_manifest(existing_manifest) if existing_manifest else None
)
if manifest_order:
    print('Loaded previous story order from manifest.')
else:
    manifest_order = None

# NOTE(review): the manifest order is used as-is and is not cross-checked
# against `ready_stories` here — confirm the concatenation step copes with
# stories that were added or are no longer ready since the manifest was saved.
story_order = manifest_order if manifest_order else ready_stories
print(f'Story order length: {len(story_order)}')

## 3. Helper Functions
- Utilities to load a category segment with boundary metadata.
- Utilities to load an ROI segment aligned to the category segment.
- Optional QC helpers to verify schemas and lengths.

In [None]:
def summarize_inventory(inventory: pd.DataFrame) -> pd.DataFrame:
    """Tabulate story counts and summed usable length per inventory status.

    Args:
        inventory: Table with at least ``status``, ``story`` and
            ``usable_len`` columns.

    Returns:
        One row per distinct ``status`` with ``n_stories`` (count of non-null
        ``story`` values) and ``total_rows`` (sum of ``usable_len`` as int).
    """
    per_status = inventory.groupby('status').agg(
        n_stories=('story', 'count'),
        total_rows=('usable_len', 'sum'),
    )
    result = per_status.reset_index()
    # Missing sums are reported as 0 so the column can be cast to integers.
    filled_totals = result['total_rows'].fillna(0)
    result['total_rows'] = filled_totals.astype(int)
    return result


def compute_boundary_overview(boundaries: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``boundaries`` with a ``lag_reset_after`` column added.

    Args:
        boundaries: Boundary table; when non-empty it must contain an
            ``end_index`` column.

    Returns:
        A new DataFrame (the input is left untouched) whose
        ``lag_reset_after`` column equals ``end_index + 1``. For an empty
        input the column is added with no values.
    """
    overview = boundaries.copy()
    if overview.empty:
        # Nothing to compute; still expose the column so the schema matches.
        overview['lag_reset_after'] = []
        return overview
    overview['lag_reset_after'] = overview['end_index'] + 1
    return overview


def build_transition_mask(boundaries: pd.DataFrame, total_rows: int) -> np.ndarray:
    """Flag the ``end_index`` position of every boundary row except the last.

    Args:
        boundaries: Boundary table; when non-empty it must contain an
            ``end_index`` column.
        total_rows: Length of the boolean mask to create.

    Returns:
        Boolean array of length ``total_rows`` that is True at each
        ``end_index`` (converted with ``int``) of all but the final boundary
        row, and False elsewhere. An empty table yields an all-False mask.
    """
    transitions = np.zeros(total_rows, dtype=bool)
    if not boundaries.empty:
        # The final boundary row is skipped: nothing follows it.
        interior_ends = boundaries['end_index'].iloc[:-1]
        for position in map(int, interior_ends):
            transitions[position] = True
    return transitions


# Aggregate the story inventory by status and show the resulting table.
inventory_summary = summarize_inventory(inventory_df)
display(inventory_summary)

## 4. Assemble Category Time Series
- Loop through stories in a predefined order.
- Load each trimmed category DataFrame and append boundary metadata (`story`, `segment_index`, etc.).
- Concatenate into a single DataFrame; collect boundary indices for later lag resets.
- Display previews and summary stats.

In [None]:
# Fail fast: with no stories selected there is nothing to concatenate.
if not story_order:
    raise RuntimeError('No stories are ready for concatenation. Populate ROI caches or adjust filters.')

# Keyword arguments for the concatenation call, gathered in one place.
_concat_kwargs = {
    'story_order': story_order,
    'features_root': FEATURES_ROOT,
    'paths_cfg': paths,
    'n_parcels': N_PARCELS,
}
subject_concat = concatenate_subject_timeseries(SUBJECT, inventory_df, **_concat_kwargs)

# Expose the combined category table under a short name and preview it.
combined_categories = subject_concat.category_frame
print(f'Combined category rows: {len(combined_categories)}')
display(combined_categories.head())

## 5. Assemble ROI Time Series
- Load ROI matrices for the same story order and trim to match segment lengths.
- Attach the same boundary metadata and concatenate.
- Validate alignment with the combined category DataFrame.

In [None]:
combined_roi = subject_concat.roi_matrix
print(f'Combined ROI shape: {combined_roi.shape}')

# Preview at most the first five rows and the first five ROI columns.
preview_cols = min(5, combined_roi.shape[1])
_roi_corner = combined_roi[:5, :preview_cols]
_roi_labels = subject_concat.roi_columns[:preview_cols]
roi_preview = pd.DataFrame(_roi_corner, columns=_roi_labels)

# Lead the preview with the matching `global_index` values from the category
# frame, taking as many as the preview has rows.
_preview_index = combined_categories['global_index'].iloc[: len(roi_preview)].to_numpy()
roi_preview.insert(0, 'global_index', _preview_index)
display(roi_preview)

## 6. Boundary Metadata & Lag Safety
- Construct boundary markers (start/end indices) for each story in the combined timeline.
- Optionally create masks to zero out cross-story rows when generating lags.
- Summarize metadata in a table for downstream reference.

In [None]:
# Boundary table extended with the `lag_reset_after` column.
boundary_df = compute_boundary_overview(subject_concat.boundaries)
display(boundary_df)

# Boolean mask over the combined timeline, one entry per category row.
lag_transition_mask = build_transition_mask(
    subject_concat.boundaries,
    total_rows=len(combined_categories),
)
print(f'Story transitions marked at {lag_transition_mask.sum()} indices.')

# Show the flagged rows only when at least one transition was marked.
if lag_transition_mask.any():
    _transition_columns = ['story', 'global_index', 'story_row']
    display(combined_categories.loc[lag_transition_mask, _transition_columns])

## 7. Persistence of Combined Assets
- Choose an output directory (e.g., `features_no_fallback/subjects/<subject>/all_stories/`).
- Save concatenated category and ROI data, plus boundary metadata (CSV/JSON/NPY).
- Log any warnings (stories skipped, mismatched lengths).

In [None]:
# Write the concatenated assets and collect the reported output locations.
export_paths = save_subject_concat(
    subject_concat,
    features_root=FEATURES_ROOT,
    output_subdir=OUTPUT_SUBDIR,
)

print('Saved outputs:')
for label, path in export_paths.items():
    # Entries without a path are omitted from the listing.
    if path is not None:
        print(f' - {label}: {path}')

# Read the manifest back and show a truncated preview of its JSON form.
manifest = load_subject_concat_manifest(
    SUBJECT,
    features_root=FEATURES_ROOT,
    output_subdir=OUTPUT_SUBDIR,
)
if manifest:
    manifest_preview = json.dumps(manifest, indent=2)
    # Append an ellipsis only when the text is longer than the 500-char cut.
    _ellipsis = '...' if len(manifest_preview) > 500 else ''
    print(manifest_preview[:500] + _ellipsis)

## 8. Optional Diagnostics
- Quick plots showing category traces with story boundaries.
- Optional ROI trace previews.
- Final summary display confirming lengths and file paths.

In [None]:
import matplotlib.pyplot as plt

# Plot at most the first four category columns.
_n_plot = min(4, len(subject_concat.category_columns))
plot_cols = subject_concat.category_columns[:_n_plot]

fig, ax = plt.subplots(figsize=(12, 4.5))
_x_values = combined_categories['global_index']
for col in plot_cols:
    ax.plot(_x_values, combined_categories[col], label=col, linewidth=1.0)

# Dashed vertical line half a step after every boundary end except the last.
_interior_ends = subject_concat.boundaries['end_index'].iloc[:-1]
for end_idx in _interior_ends:
    ax.axvline(end_idx + 0.5, color='black', linestyle='--', alpha=0.25)

ax.set_title(f'{SUBJECT} combined category traces (first {len(plot_cols)} categories)')
ax.set_xlabel('global_index')
ax.grid(alpha=0.3)
ax.legend(loc='upper right', frameon=False)
plt.tight_layout()
plt.show()

print(f"Outputs stored under: {export_paths['output_dir']}")