# Day 17 – Semantic Category Featurization

Generate TR-aligned semantic category trajectories for each story, with optional concatenation and Day16 integration hooks.

In [None]:
import json
import math
import os
import sys
import warnings
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import yaml

# Seed NumPy's global RNG and raise the pandas column display limit for this session.
np.random.seed(42)
pd.options.display.max_columns = 50

# Hard-coded project location: created if missing, then made the working directory
# so that relative paths used later (e.g. 'configs/demo.yaml') resolve against it.
project_root = Path('/flash/PaoU/seann/fmri-edm-ccm')
project_root.mkdir(parents=True, exist_ok=True)
os.chdir(project_root)

# Make the project package and two sibling source trees importable.
sys.path.append(str(project_root))
sys.path.append('/flash/PaoU/seann/pyEDM/src')
sys.path.append('/flash/PaoU/seann/MDE-main/src')

# Optional notebook widgets: on any import failure `widgets` is None and
# `display` degrades to a plain print so later cells can still call it.
try:
    import ipywidgets as widgets
    from IPython.display import display
except Exception:
    widgets = None
    def display(obj):
        print(obj)

# Optional plotting backend: `plt` is None (with a warning) when matplotlib cannot be imported.
try:
    import matplotlib.pyplot as plt
except Exception as exc:
    plt = None
    warnings.warn(f'Matplotlib unavailable: {exc}')

EPS = 1e-12  # threshold below which a vector norm is treated as zero



In [None]:
from src.utils import load_yaml
from src.decoding import load_transcript_words
from src.edm_ccm import English1000Loader

# Read the demo configuration (path is relative to the current working directory).
cfg = load_yaml('configs/demo.yaml')
categories_cfg = cfg.get('categories')
# Template used verbatim when the config has no 'categories' block.
default_categories_cfg = {
    'embedding_source': 'english1000',
    'word2vec_path': '',
    'concat_all_stories': False,
    'category_score_method': 'similarity',
    'overlap_weighting': 'proportional',
    'allow_single_seed': False,
    'seconds_bin_width': 0.05,
    'expansion': {
        'enabled': True,
        'top_k': 2000,
        'min_sim': 0.35,
    },
    'sets': {
        'basic_semantics': {
            'inanimate': {'seeds': ['rock', 'table', 'cup', 'chair']},
            'animate': {'seeds': ['man', 'woman', 'dog', 'child']},
            'place': {'seeds': ['city', 'park', 'room', 'kitchen']},
            'tool': {'seeds': ['hammer', 'knife', 'saw', 'scissors']},
            'motion': {'seeds': ['run', 'walk', 'drive', 'fly']},
            'emotion_pos': {'seeds': ['happy', 'joy', 'delight', 'pleased']},
            'emotion_neg': {'seeds': ['sad', 'angry', 'fear', 'disgust']},
        },
    },
}
if categories_cfg is None:
    categories_cfg = default_categories_cfg
    print('No categories block found in config; using default template.')
else:
    # JSON round-trip gives an independent deep copy, so the edits below do not touch `cfg`.
    categories_cfg = json.loads(json.dumps(categories_cfg))


# NOTE(review): `categories_cfg` has just been assigned in both branches above, so the
# `(categories_cfg or {})` guards only matter for an empty 'categories' block — confirm.
cluster_csv_path = (categories_cfg or {}).get('cluster_csv_path', '')  # CSV of clusters
temporal_weighting = str((categories_cfg or {}).get('temporal_weighting', 'proportional')).lower()
assert temporal_weighting in {'proportional', 'none'}, "temporal_weighting must be 'proportional' or 'none'."
prototype_weight_power = float((categories_cfg or {}).get('prototype_weight_power', 1.0))  # weight shaping
SUBJECT = cfg.get('subject')
STORY = cfg.get('story')
paths = cfg.get('paths', {})
TR = float(cfg.get('TR', 2.0))

# Per-subject/story output directory lives under the features root ('features' by default).
features_root = Path(paths.get('features', 'features'))  # features root
features_root.mkdir(parents=True, exist_ok=True)
if not paths.get('features') and Path('figs').exists():
    warnings.warn("Using default 'features/' directory. Update configs/demo.yaml with paths.features.")  # config hint
# NOTE(review): SUBJECT/STORY come from cfg.get() and would be None if absent, which
# makes the path join below fail — presumably the config always provides them.
output_root = features_root / 'subjects' / SUBJECT / STORY / 'day17_categories'
output_root.mkdir(parents=True, exist_ok=True)
print(f"Writing outputs under {features_root}/subjects/{SUBJECT}/{STORY}/day17_categories")  # location notice
print(f'Output directory: {output_root}')

# Choose the category set: the configured 'category_set', else the alphabetically first set.
category_sets = categories_cfg.get('sets', {})
available_sets = sorted(category_sets.keys())
category_set_name = categories_cfg.get('category_set') or (available_sets[0] if available_sets else None)
if cluster_csv_path and not category_set_name:
    category_set_name = 'csv_clusters'  # default label for CSV clusters
    categories_cfg['category_set'] = category_set_name
# CSV mode overrides the config: similarity scoring, single-word prototypes allowed,
# neighbour expansion disabled.
if cluster_csv_path:
    categories_cfg.setdefault('sets', {})
    categories_cfg['sets'].setdefault(category_set_name, {})
    categories_cfg['category_score_method'] = 'similarity'
    categories_cfg['allow_single_seed'] = True
    categories_cfg['expansion'] = {'enabled': False}
# NOTE(review): this repeats the `if cluster_csv_path:` test directly above; the two could be merged.
if cluster_csv_path:
    # CSV-driven categories do not require seed sets
    selected_set_spec = category_sets.get(category_set_name, {})
    if category_set_name:
        print(f'Using category set: {category_set_name}')
else:
    if widgets is not None and available_sets:
        set_selector = widgets.Dropdown(options=available_sets, value=category_set_name, description='Category Set')
        display(set_selector)
        # Read right after display, so this is still the dropdown's initial value;
        # the prototype-building cell reads set_selector.value again.
        category_set_name = set_selector.value
    print(f'Using category set: {category_set_name}')
    assert category_set_name in category_sets, 'Selected category set not available in configuration.'
    selected_set_spec = category_sets[category_set_name]
category_score_method = str(categories_cfg.get('category_score_method', 'similarity')).lower()
# NOTE(review): in the visible code `overlap_mode` is only recorded in meta.json;
# token bucketing is driven by `temporal_weighting` instead.
overlap_mode = str(categories_cfg.get('overlap_weighting', 'proportional')).lower()
expansion_cfg = categories_cfg.get('expansion', {})
# Snapshot the effective settings next to the outputs.
config_used_path = output_root / 'config_used.yaml'
with config_used_path.open('w') as fh:
    yaml.safe_dump({
        **categories_cfg,
        'category_set': category_set_name,
        'cluster_csv_path': cluster_csv_path,
        'temporal_weighting': temporal_weighting,
        'prototype_weight_power': prototype_weight_power,
    }, fh, sort_keys=False)
print(f'Wrote configuration snapshot to {config_used_path}')




In [None]:
# Helper functions
def load_story_words(paths: Dict, subject: str, story: str) -> List[Tuple[str, float, float]]:
    """Return a story's transcript as cleaned ``(word, start, end)`` tuples.

    Delegates to ``load_transcript_words`` and then coerces every event:
    the word is cast to ``str`` and stripped, both times are cast to ``float``.

    Raises:
        ValueError: if the loader returns no events for this subject/story.
    """
    raw_events = load_transcript_words(paths, subject, story)
    if not raw_events:
        raise ValueError(f'No transcript events found for {subject} {story}.')
    cleaned: List[Tuple[str, float, float]] = []
    for token, onset, offset in raw_events:
        cleaned.append((str(token).strip(), float(onset), float(offset)))
    return cleaned


# === NEW: load cluster words from CSV (category, word, [weight]) ===
def load_clusters_from_csv(csv_path: str) -> Dict[str, Dict[str, List[Tuple[str, float]]]]:
    """Parse a cluster CSV into ``{category: {'words': [(word, weight), ...]}}``.

    The CSV must have ``category`` and ``word`` columns (header match is
    case-insensitive and ignores surrounding whitespace); a ``weight`` column is
    optional. Categories and words are stripped and lower-cased. Weights that are
    missing or non-numeric become 1.0, and negative weights are clipped to 0.0.
    When a word repeats inside a category, the last weight wins. Rows whose
    category or word cell is empty are skipped. Each category's pairs are sorted
    by word, and categories are emitted in sorted order.

    Args:
        csv_path: location of the CSV file.

    Returns:
        Mapping from category name to ``{'words': [(word, weight), ...]}``.

    Raises:
        FileNotFoundError: if ``csv_path`` is empty or does not exist.
        ValueError: if a required column is missing or no cluster could be parsed.
    """
    from pathlib import Path  # local import to avoid global dependency

    if not csv_path or not Path(csv_path).exists():
        raise FileNotFoundError(f'Cluster CSV not found at {csv_path}')

    # keep_default_na=False keeps cells such as "null", "NA" or "nan" as the words
    # they are, and leaves empty cells as '' instead of NaN (which astype(str)
    # would otherwise turn into the bogus token "nan").
    df = pd.read_csv(csv_path, keep_default_na=False)
    cols = {str(c).lower().strip(): c for c in df.columns}
    for needed in ('category', 'word'):
        if needed not in cols:
            raise ValueError(f"CSV must contain '{needed}' column.")

    categories = df[cols['category']].astype(str).str.strip().str.lower()
    words = df[cols['word']].astype(str).str.strip().str.lower()
    weight_col = cols.get('weight')
    if weight_col is None:
        weights = pd.Series(1.0, index=df.index)
    else:
        weights = pd.to_numeric(df[weight_col], errors='coerce').fillna(1.0).clip(lower=0.0)

    buckets: Dict[str, Dict[str, float]] = {}
    for cat, word, wt in zip(categories.tolist(), words.tolist(), weights.tolist()):
        if not cat or not word:
            continue  # incomplete row: nothing usable to register
        buckets.setdefault(cat, {})[word] = float(wt)

    clusters: Dict[str, Dict[str, List[Tuple[str, float]]]] = {
        cat: {'words': sorted(bucket.items())} for cat, bucket in sorted(buckets.items())
    }
    if not clusters:
        raise ValueError('No clusters parsed from CSV.')
    return clusters

# === NEW: build category prototypes from representative words (CSV) ===
def build_states_from_csv(
    clusters: Dict[str, Dict[str, List[Tuple[str, float]]]],
    primary_lookup: Dict[str, np.ndarray],
    fallback=None,
    weight_power: float = 1.0
) -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """Turn CSV clusters into per-category prototype state and definitions.

    For each category, every representative word is resolved with
    ``lookup_embedding``. The prototype is the weighted mean of the embeddings
    that were found, using ``max(0, weight) ** weight_power`` normalised to sum
    to one. A category with no resolvable words, or whose prototype norm is
    below ``EPS``, gets ``prototype=None`` and ``prototype_norm=None``.

    Args:
        clusters: ``{category: {'words': [(word, weight), ...]}}``.
        primary_lookup: word -> embedding mapping tried first.
        fallback: optional secondary embedding source handed to ``lookup_embedding``.
        weight_power: exponent applied to each clipped weight.

    Returns:
        ``(category_states, category_definitions)`` keyed by category name.
        The lexicon stored in both holds all listed words, including
        out-of-vocabulary ones.
    """
    states: Dict[str, Dict] = {}
    definitions: Dict[str, Dict] = {}
    oov_per_category: Dict[str, int] = {}

    for category, spec in clusters.items():
        word_weights = spec.get('words', [])
        embeddings: List[np.ndarray] = []
        shaped_weights: List[float] = []
        in_vocab: List[str] = []
        out_of_vocab: List[str] = []
        for term, raw_weight in word_weights:
            embedding = lookup_embedding(term, primary_lookup, fallback)
            if embedding is not None:
                embeddings.append(embedding.astype(float))
                shaped_weights.append(float(max(0.0, raw_weight)) ** float(weight_power))
                in_vocab.append(term)
            else:
                out_of_vocab.append(term)

        prototype = None
        prototype_norm = None
        if embeddings:
            shares = np.array(shaped_weights, dtype=float)
            shares = shares / (shares.sum() + 1e-12)
            stacked = np.stack(embeddings, axis=0)
            centroid = (shares[:, None] * stacked).sum(axis=0)
            centroid_norm = float(np.linalg.norm(centroid))
            # Keep the centroid unless its norm is (numerically) zero.
            if not (centroid_norm < EPS):
                prototype = centroid
                prototype_norm = centroid_norm
        else:
            warnings.warn(f"[{category}] no usable representative embeddings; prototype will be None.")  # centroid warning

        representative = {term: float(raw_weight) for term, raw_weight in word_weights}
        prototype_dim = int(prototype.shape[0]) if isinstance(prototype, np.ndarray) else 0

        states[category] = dict([
            ('name', category),
            ('seeds', []),  # seeds unused in CSV mode
            ('found_seeds', in_vocab),
            ('missing_seeds', out_of_vocab),
            ('prototype', prototype),
            ('prototype_norm', prototype_norm),
            ('lexicon', representative),  # metadata/debug only
            ('expanded_count', 0),
            ('expansion_params', {'enabled': False, 'top_k': 0, 'min_sim': 0.0}),
        ])
        definitions[category] = dict([
            ('from', 'csv'),
            ('seeds', []),
            ('found_seeds', in_vocab),
            ('missing_seeds', out_of_vocab),
            ('prototype_dim', prototype_dim),
            ('prototype_norm', prototype_norm),
            ('representative_words', representative),
            ('lexicon', representative),
            ('expanded_neighbors', {}),
        ])
        oov_per_category[category] = len(out_of_vocab)

    # One summary warning if any category lost words to the vocabulary.
    if any(oov_per_category.values()):
        warnings.warn(f"OOV representative words: {oov_per_category}")
    return states, definitions

def build_tr_edges(word_events: Sequence[Tuple[str, float, float]], tr_s: float) -> np.ndarray:
    """Compute TR edges covering the full duration of the transcript.

    Edges start at 0 and are spaced ``tr_s`` seconds apart. The number of TRs is
    ``max(1, ceil(max_end / tr_s))``, where ``max_end`` is the latest word end time.

    Args:
        word_events: ``(word, start, end)`` tuples; only ``end`` is used.
        tr_s: repetition time in seconds; must be positive.

    Returns:
        1-D array of edges with ``n_tr + 1`` entries. For an empty transcript the
        result is the single edge ``[0]`` (i.e. zero TRs).

    Raises:
        ValueError: if ``tr_s`` is not a positive number.
    """
    if not tr_s > 0:
        raise ValueError(f'tr_s must be positive, got {tr_s!r}.')
    if not word_events:
        return np.arange(0, tr_s, tr_s)
    max_end = max(end for _, _, end in word_events)
    n_tr = max(1, int(math.ceil(max_end / tr_s)))
    # Build edges from integer indices: a float-step np.arange can return one
    # element too many when (n_tr + 1) * tr_s rounds up (e.g. tr_s=0.1, n_tr=2),
    # which would silently add an extra TR.
    edges = np.arange(n_tr + 1, dtype=float) * tr_s
    if edges[-1] < max_end:
        # Rounding in max_end / tr_s left the last word uncovered; add one TR.
        edges = np.append(edges, edges[-1] + tr_s)
    return edges


def lookup_embedding(token: str, primary_lookup: Dict[str, np.ndarray], fallback=None) -> Optional[np.ndarray]:
    """Resolve a token to a float embedding, or ``None`` when it is unknown.

    The token is lower-cased and stripped before lookup. ``primary_lookup`` is
    consulted first; if it has no entry, ``fallback`` is tried, either through
    ``get_vector`` (when present) or plain item access. Any exception raised
    while querying the fallback is swallowed and reported as ``None``.
    """
    normalized = token.lower().strip()
    if normalized == '':
        return None

    if primary_lookup:
        hit = primary_lookup.get(normalized)
        if hit is not None:
            return np.asarray(hit, dtype=float)

    if fallback is None:
        return None

    # Best-effort: a misbehaving fallback must not abort featurization.
    try:
        if hasattr(fallback, 'get_vector') and normalized in fallback:
            return np.asarray(fallback.get_vector(normalized), dtype=float)
        if hasattr(fallback, '__contains__') and normalized in fallback:
            return np.asarray(fallback[normalized], dtype=float)
    except Exception:
        return None
    return None


def make_category_prototype(seeds: Sequence[str], primary_lookup: Dict[str, np.ndarray], fallback=None, allow_single: bool = False) -> Tuple[Optional[np.ndarray], List[str], List[str]]:
    """Average the embeddings of the seed words that can be resolved.

    Returns:
        ``(prototype, found_words, missing_words)``. ``prototype`` is ``None``
        when no seed has an embedding, or when only one does and
        ``allow_single`` is false (a warning is emitted in that case).
    """
    resolved = [(seed, lookup_embedding(seed, primary_lookup, fallback)) for seed in seeds]
    found_words = [seed for seed, vec in resolved if vec is not None]
    missing_words = [seed for seed, vec in resolved if vec is None]
    vectors = [vec for _, vec in resolved if vec is not None]

    if not vectors:
        return None, found_words, missing_words
    if len(vectors) < 2 and not allow_single:
        warnings.warn(f'Only {len(vectors)} usable seed(s); enable allow_single_seed to accept singleton prototypes.')
        return None, found_words, missing_words
    return np.mean(vectors, axis=0), found_words, missing_words


def expand_category(prototype: np.ndarray, vocab_embeddings: np.ndarray, vocab_words: Sequence[str], top_k: int, min_sim: float) -> Dict[str, float]:
    """Collect vocabulary words whose cosine similarity to ``prototype`` is high.

    Rows of ``vocab_embeddings`` with zero norm are assigned similarity -1. The
    ``top_k`` most similar rows are shortlisted (in no particular order) and
    those scoring below ``min_sim`` are dropped.

    Returns:
        ``{word: cosine}`` for the retained words; empty when any input is
        ``None``, the prototype has zero norm, or ``top_k`` is not positive.
    """
    if any(arg is None for arg in (prototype, vocab_embeddings, vocab_words)):
        return {}

    direction = np.asarray(prototype, dtype=float)
    length = np.linalg.norm(direction)
    if length == 0:
        return {}
    direction = direction / length

    row_norms = np.linalg.norm(vocab_embeddings, axis=1)
    usable = row_norms > 0
    cosines = np.full(vocab_embeddings.shape[0], -1.0, dtype=float)
    cosines[usable] = (vocab_embeddings[usable] @ direction) / row_norms[usable]

    keep = min(top_k, len(cosines))
    if keep <= 0:
        return {}

    # argpartition selects the best `keep` rows without fully sorting them.
    shortlist = np.argpartition(-cosines, keep - 1)[:keep]
    neighbours: Dict[str, float] = {}
    for row in shortlist:
        cosine = float(cosines[row])
        if not (cosine < min_sim):
            neighbours[vocab_words[row]] = cosine
    return neighbours


def tr_token_overlap(token_start: float, token_end: float, tr_start: float, tr_end: float, mode: str = 'proportional') -> float:
    """Weight of a token inside one time bin, in ``[0, 1]``.

    A token whose end does not exceed its start is given a 1 ms duration.
    With ``mode='midpoint'`` the result is 1.0 when the token midpoint lies in
    ``[tr_start, tr_end)`` and 0.0 otherwise. For any other mode the result is
    the fraction of the token's duration that falls inside the bin.
    """
    start = float(token_start)
    stop = float(token_end)
    if stop <= start:
        stop = start + 1e-3  # give instantaneous tokens a nominal duration

    if mode == 'midpoint':
        centre = 0.5 * (start + stop)
        inside = tr_start <= centre < tr_end
        return 1.0 if inside else 0.0

    span = stop - start
    shared = max(0.0, min(stop, tr_end) - max(start, tr_start))
    if span <= 0:
        return 1.0 if shared > 0 else 0.0
    fraction = shared / span
    return max(0.0, min(1.0, fraction))


def _score_tr_count(token_payload: Sequence[Dict], lexicon: Optional[Dict[str, float]]) -> float:
    """Sum ``lexicon weight * overlap`` over tokens found in the lexicon."""
    if not lexicon:
        return float('nan')
    running = 0.0
    for token in token_payload:
        weight = lexicon.get(token['word'].lower())
        if weight is not None:
            running += weight * token['overlap']
    return float(running)


def _score_tr_similarity(token_payload: Sequence[Dict], prototype: Optional[np.ndarray], prototype_norm: Optional[float]) -> float:
    """Overlap-weighted mean cosine similarity between tokens and the prototype."""
    if prototype is None or prototype_norm is None or prototype_norm < EPS:
        return float('nan')  # numeric guard
    weighted_sum = 0.0
    overlap_sum = 0.0
    for token in token_payload:
        embedding = token.get('embedding')
        if embedding is None:
            continue
        embedding_norm = token.get('embedding_norm')
        if embedding_norm is None or embedding_norm < EPS:
            continue  # numeric guard
        cosine = float(np.dot(embedding, prototype) / (embedding_norm * prototype_norm))
        weighted_sum += cosine * token['overlap']
        overlap_sum += token['overlap']
    if overlap_sum == 0:
        return float('nan')
    return float(np.clip(weighted_sum / overlap_sum, -1.0, 1.0))


def score_tr(token_payload: Sequence[Dict], method: str, *, lexicon: Optional[Dict[str, float]] = None, prototype: Optional[np.ndarray] = None, prototype_norm: Optional[float] = None) -> float:
    """Score one time bin for one category.

    Args:
        token_payload: token dicts for the bin; each carries ``'word'`` and
            ``'overlap'``, plus ``'embedding'``/``'embedding_norm'`` for the
            similarity method.
        method: ``'count'`` or ``'similarity'`` (case-insensitive).
        lexicon: word -> weight mapping used by ``'count'``.
        prototype: category vector used by ``'similarity'``.
        prototype_norm: norm of ``prototype``.

    Returns:
        NaN for an empty bin. For ``'count'``: the overlap-weighted sum of
        lexicon weights (NaN if the lexicon is empty). For ``'similarity'``: the
        overlap-weighted mean cosine clipped to ``[-1, 1]`` (NaN if the prototype
        is unusable or no token has a usable embedding).

    Raises:
        ValueError: for any other ``method`` on a non-empty bin.
    """
    if not token_payload:
        return float('nan')
    method = method.lower()
    if method == 'count':
        return _score_tr_count(token_payload, lexicon)
    if method == 'similarity':
        return _score_tr_similarity(token_payload, prototype, prototype_norm)
    raise ValueError(f'Unknown scoring method: {method}')


def ensure_serializable(obj):
    """Recursively convert NumPy containers and scalars into plain Python values.

    Arrays become (nested) lists; any NumPy scalar — including ``np.bool_``,
    which is neither ``np.integer`` nor ``np.floating`` — becomes its Python
    equivalent via ``.item()``. Dicts are rebuilt with converted values and with
    NumPy-scalar keys unwrapped; lists and tuples become lists. Anything else is
    returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): ensure_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [ensure_serializable(v) for v in obj]
    return obj


def discover_story_dirs(features_root: Path, subject: str) -> List[Path]:
    """List a subject's story directories that already hold a category time series.

    Looks at the immediate children of ``features_root/subjects/<subject>``,
    ignores non-directories and the ``ALL`` directory, and keeps those that
    contain ``day17_categories/category_timeseries.csv``. The result is sorted;
    it is empty when the subject directory does not exist.
    """
    subject_root = features_root / 'subjects' / subject  # subject root
    if not subject_root.exists():
        return []
    return sorted(
        entry
        for entry in subject_root.iterdir()
        if entry.is_dir()
        and entry.name != 'ALL'
        and (entry / 'day17_categories' / 'category_timeseries.csv').exists()
    )


def build_token_buckets(edges: np.ndarray, event_records: Sequence[Dict], mode: str = 'proportional') -> List[List[Dict]]:
    """Distribute token events over the bins defined by consecutive ``edges``.

    Args:
        edges: monotonically increasing bin edges; fewer than two yields ``[]``.
        event_records: dicts with ``'word'``, ``'start'``, ``'end'``,
            ``'embedding'`` and ``'embedding_norm'``.
        mode: ``'none'`` gives every overlapped bin a weight of 1.0; any other
            value uses the proportional weight from ``tr_token_overlap``.

    Returns:
        One list per bin. Each entry copies the token's word, embedding and
        embedding norm and adds its ``'overlap'`` weight for that bin. Tokens
        lying entirely outside the edge range are ignored.
    """
    if edges.size < 2:
        return []

    n_bins = len(edges) - 1
    buckets: List[List[Dict]] = [[] for _ in range(n_bins)]
    for record in event_records:
        onset = record['start']
        offset = record['end']
        if offset <= edges[0] or onset >= edges[-1]:
            continue
        # Candidate bin range; bins with zero weight are filtered out below.
        first_bin = max(0, int(np.searchsorted(edges, onset, side='right')) - 1)
        last_bin = min(max(0, int(np.searchsorted(edges, offset, side='left'))), n_bins - 1)
        for bin_idx in range(first_bin, last_bin + 1):
            left = edges[bin_idx]
            right = edges[bin_idx + 1]
            if mode == 'none':
                weight = 0.0 if (offset <= left or onset >= right) else 1.0
            else:
                weight = tr_token_overlap(onset, offset, left, right, 'proportional')
            if weight > 0:
                buckets[bin_idx].append({
                    'word': record['word'],
                    'overlap': weight,
                    'embedding': record['embedding'],
                    'embedding_norm': record['embedding_norm'],
                })
    return buckets

def score_time_series(edges: np.ndarray, buckets: Sequence[Sequence[Dict]], category_states: Dict[str, Dict], category_names: Sequence[str], category_columns: Sequence[str], method: str, index_name: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """Score every bin for every category and tabulate the result.

    Args:
        edges: bin edges; ``edges[:-1]``/``edges[1:]`` become ``start_sec``/``end_sec``.
        buckets: per-bin token payloads as built by ``build_token_buckets``.
        category_states: per-category dicts providing ``'lexicon'``,
            ``'prototype'`` and ``'prototype_norm'``.
        category_names: categories to score, in column order.
        category_columns: output column label for each category (same order).
        method: scoring method forwarded to ``score_tr``.
        index_name: name of the integer bin-index column.

    Returns:
        ``(df, score_matrix)`` where ``score_matrix`` has shape
        ``(n_bins, n_categories)`` and ``df`` holds the index, start/end times
        and one column per category.
    """
    n_bins = len(buckets)
    score_matrix = np.full((n_bins, len(category_names)), np.nan, dtype=float)
    for col_idx, cat_name in enumerate(category_names):
        state = category_states[cat_name]
        scoring_kwargs = {
            'lexicon': state['lexicon'],
            'prototype': state['prototype'],
            'prototype_norm': state['prototype_norm'],
        }
        score_matrix[:, col_idx] = [score_tr(bucket, method, **scoring_kwargs) for bucket in buckets]

    table = {
        index_name: np.arange(n_bins, dtype=int),
        'start_sec': edges[:-1],
        'end_sec': edges[1:],
        **{label: score_matrix[:, col_idx] for col_idx, label in enumerate(category_columns)},
    }
    return pd.DataFrame(table), score_matrix





In [None]:
# Load transcript and build TR grid
# load_story_words raises ValueError if the transcript has no events.
story_events = load_story_words(paths, SUBJECT, STORY)
print(f'Loaded {len(story_events)} word events for {SUBJECT} {STORY}.')
# Edges start at 0 and are spaced TR seconds apart; n_tr counts the intervals between them.
tr_edges = build_tr_edges(story_events, TR)
n_tr = len(tr_edges) - 1
print(f'TR edges: {len(tr_edges)} (n_tr={n_tr}) spanning {tr_edges[-1]:.2f} seconds.')
assert n_tr > 0, 'No TRs derived from transcript timing.'
assert np.all(np.diff(tr_edges) > 0), 'Non-monotone TR edges.'  # hard check



In [None]:
# Load embedding resources
# 'embedding_source' selects which resources are loaded: 'english1000', 'word2vec' or 'both'.
embedding_source = str(categories_cfg.get('embedding_source', 'english1000')).lower()
english_loader = None
english_lookup = {}
english_vocab = []
english_matrix = None
if embedding_source in {'english1000', 'both'}:
    # File is expected at <paths.data_root>/derivative/english1000sm.hf5; a missing file is fatal.
    english1000_path = Path(paths.get('data_root', '')) / 'derivative' / 'english1000sm.hf5'
    if english1000_path.exists():
        english_loader = English1000Loader(english1000_path)
        english_lookup = english_loader.lookup
        english_vocab = english_loader.vocab
        english_matrix = english_loader.embeddings
        print(f'Loaded English1000 embeddings from {english1000_path} (vocab={len(english_vocab)}).')
    else:
        raise FileNotFoundError(f'English1000 embeddings not found at {english1000_path}')
else:
    print('English1000 disabled by configuration.')

# Word2Vec is optional: every failure below only warns and leaves word2vec_model as None.
word2vec_model = None
if embedding_source in {'word2vec', 'both'}:
    w2v_path = categories_cfg.get('word2vec_path')
    if w2v_path:
        w2v_path = Path(w2v_path)
        if w2v_path.exists():
            try:
                from gensim.models import KeyedVectors
                # Format is guessed from the file suffix: '.bin' and '.gz' are read as binary.
                binary = w2v_path.suffix.lower() in {'.bin', '.gz'}
                word2vec_model = KeyedVectors.load_word2vec_format(w2v_path, binary=binary)
                print(f'Loaded Word2Vec fallback from {w2v_path}.')
            except Exception as exc:
                warnings.warn(f'Failed to load Word2Vec fallback: {exc}')
        else:
            warnings.warn(f'Word2Vec path does not exist: {w2v_path}')
    else:
        warnings.warn('Word2Vec fallback requested but no path provided.')
else:
    print('Word2Vec fallback disabled.')
# NOTE(review): english_matrix is only set when embedding_source is 'english1000' or
# 'both', so with embedding_source == 'word2vec' it is still None and this warning
# cannot fire as written.
if embedding_source == 'word2vec' and expansion_cfg.get('enabled', True) and english_matrix is not None:
    warnings.warn("Category expansion used English1000 while token embeddings use word2vec; cosine geometry may be inconsistent.")  # geometry notice

# English1000 is always the primary table (empty when disabled); word2vec_model is
# passed separately as the fallback to lookup_embedding.
primary_lookup = english_lookup



In [None]:
# Build category prototypes and lexicons
# Pick up the dropdown's current value if the selector widget was created earlier.
if 'set_selector' in globals() and hasattr(set_selector, 'value'):
    category_set_name = set_selector.value
selected_set_spec = category_sets.get(category_set_name, {})

allow_single = bool(categories_cfg.get('allow_single_seed', False))
exp_enabled = bool(expansion_cfg.get('enabled', True))
exp_top_k = int(expansion_cfg.get('top_k', 2000))
exp_min_sim = float(expansion_cfg.get('min_sim', 0.35))

if cluster_csv_path:
    # CSV mode: prototypes are weighted means of the listed representative words.
    category_score_method = 'similarity'
    category_states, category_definitions = build_states_from_csv(
        load_clusters_from_csv(cluster_csv_path),
        primary_lookup,
        word2vec_model,
        weight_power=prototype_weight_power,
    )
    category_names = sorted(category_states.keys())
    category_columns = [f'cat_{name}' for name in category_names]
    print(f"Loaded {len(category_names)} CSV-driven categories from {cluster_csv_path}: {category_names}")
    # NOTE(review): build_states_from_csv already sets the prototype to None whenever its
    # norm is below EPS, so this list appears to always be empty.
    zero_norm = [k for k, v in category_states.items() if v.get('prototype') is not None and (v.get('prototype_norm') or 0.0) < EPS]
    if zero_norm:
        warnings.warn(f"Zero-norm prototypes (check OOV/weights): {zero_norm}")
else:
    # Seed mode: one prototype per category from its seed words, plus a lexicon made of
    # explicit words, found seeds and (optionally) expanded neighbours.
    category_states: Dict[str, Dict] = {}
    category_definitions: Dict[str, Dict] = {}
    seed_oov_counter = Counter()
    for cat_name, cat_spec in selected_set_spec.items():
        seeds = cat_spec.get('seeds', [])
        explicit_words = cat_spec.get('words', [])
        prototype = None
        found_seeds: List[str] = []
        missing_seeds: List[str] = []
        if seeds:
            prototype, found_seeds, missing_seeds = make_category_prototype(seeds, primary_lookup, word2vec_model, allow_single)
            seed_oov_counter[cat_name] = len(missing_seeds)
            if prototype is None and category_score_method == 'similarity':
                warnings.warn(f"Category '{cat_name}' has no usable prototype; TR scores will be NaN.")  # warn missing proto
        elif category_score_method == 'similarity':
            warnings.warn(f'Category {cat_name} has no seeds; similarity method will yield NaNs.')
        # Explicit words and found seeds get weight 1.0; setdefault keeps the first weight seen.
        lexicon = {word.lower(): 1.0 for word in explicit_words}
        for seed in found_seeds:
            lexicon.setdefault(seed.lower(), 1.0)
        prototype_norm = None
        expanded_words = {}
        if prototype is not None:
            prototype_norm = float(np.linalg.norm(prototype))
            # Expansion always searches the English1000 matrix; neighbours enter the
            # lexicon with their cosine similarity as weight.
            if exp_enabled and english_matrix is not None:
                expanded_words = expand_category(prototype, english_matrix, english_vocab, exp_top_k, exp_min_sim)
                for word, weight in expanded_words.items():
                    lexicon.setdefault(word.lower(), float(weight))
        if not lexicon and category_score_method == 'count':
            warnings.warn(f'Category {cat_name} lexicon is empty; counts will be NaN.')
        # Runtime state consumed by score_time_series.
        category_states[cat_name] = {
            'name': cat_name,
            'seeds': seeds,
            'found_seeds': found_seeds,
            'missing_seeds': missing_seeds,
            'prototype': prototype,
            'prototype_norm': prototype_norm,
            'lexicon': lexicon,
            'expanded_count': len(expanded_words),
            'expansion_params': {
                'enabled': exp_enabled,
                'top_k': exp_top_k,
                'min_sim': exp_min_sim,
            },
        }
        # Summary written to category_definition.json (the prototype vector itself is not included).
        category_definitions[cat_name] = {
            'seeds': seeds,
            'found_seeds': found_seeds,
            'missing_seeds': missing_seeds,
            'prototype_dim': int(prototype.shape[0]) if isinstance(prototype, np.ndarray) else 0,
            'prototype_norm': prototype_norm,
            'expanded_neighbors': ensure_serializable(expanded_words),
            'lexicon': {word: float(weight) for word, weight in sorted(category_states[cat_name]['lexicon'].items())},
        }
    if exp_enabled:
        for _cn, _st in category_states.items():
            if _st.get('seeds') and _st.get('expanded_count', 0) == 0:
                warnings.warn(f"No neighbors met min_sim for '{_cn}' (min_sim={exp_min_sim}).")  # expansion check
    print('Category seeds missing counts:', dict(seed_oov_counter))
    category_names = sorted(category_states.keys())
    category_columns = [f'cat_{name}' for name in category_names]
    print(f'Prepared {len(category_names)} categories: {category_names}')

# Per-category vocabulary report; the log tag differs between CSV and seed runs.
tag_label = '[huth2016_12]' if cluster_csv_path else '[categories]'
for _cn, _st in category_states.items():
    if _st.get('prototype') is None or (_st.get('prototype_norm') or 0.0) < EPS:
        warnings.warn(f"{tag_label} Cluster '{_cn}' has no valid centroid (all seeds OOV?). Scores will be NaN.")
    print(f"{tag_label} {_cn}: {len(_st.get('found_seeds', []))} in-vocab / {len(_st.get('missing_seeds', []))} OOV")



In [None]:
# Prepare continuous (story-timebase) category series
seconds_bin_width = float(categories_cfg.get('seconds_bin_width', 0.05))
if seconds_bin_width <= 0:
    raise ValueError('seconds_bin_width must be positive.')

# Resolve each distinct (lower-cased) token once; None marks an out-of-vocabulary word.
embedding_cache: Dict[str, Optional[np.ndarray]] = {}
event_records: List[Dict] = []
tokens_with_embeddings = 0
for word, onset, offset in story_events:
    token = word.strip()
    if not token:
        continue
    key = token.lower()
    if key not in embedding_cache:
        embedding_cache[key] = lookup_embedding(token, primary_lookup, word2vec_model)
    emb = embedding_cache[key]
    emb_norm = float(np.linalg.norm(emb)) if emb is not None else None
    if emb is not None:
        tokens_with_embeddings += 1
    # OOV tokens are still recorded (embedding None) so count-based scoring can see them.
    event_records.append({
        'word': token,
        'start': float(onset),
        'end': float(offset),
        'embedding': emb,
        'embedding_norm': emb_norm,
    })

total_tokens = len(event_records)
print(f'Tokens with embeddings: {tokens_with_embeddings}/{total_tokens} (OOV rate={(total_tokens - tokens_with_embeddings) / max(total_tokens, 1):.2%}).')

if not event_records:
    raise ValueError('No token events available for category featurization.')
# Fixed-width bins from 0 to at least the last token's end time.
# NOTE(review): float-step np.arange can yield one more or one fewer edge than expected
# near the end point; the append below covers only the "too short" case.
max_end_time = max(rec['end'] for rec in event_records)
canonical_edges = np.arange(0.0, max_end_time + seconds_bin_width, seconds_bin_width, dtype=float)
if canonical_edges[-1] < max_end_time:
    canonical_edges = np.append(canonical_edges, canonical_edges[-1] + seconds_bin_width)
assert np.all(np.diff(canonical_edges) > 0), 'Non-monotone canonical edges.'  # hard check

canonical_buckets = build_token_buckets(canonical_edges, event_records, temporal_weighting)
empty_canonical = sum(1 for bucket in canonical_buckets if not bucket)
print(f'Canonical bins without tokens: {empty_canonical}/{len(canonical_buckets)}')

# Score every bin and store the result under features/stories/<story>/day17_categories.
canonical_df, canonical_matrix = score_time_series(canonical_edges, canonical_buckets, category_states, category_names, category_columns, category_score_method, index_name='bin_index')
canonical_root = features_root / 'stories' / STORY / 'day17_categories'  # story-level features
canonical_root.mkdir(parents=True, exist_ok=True)
canonical_csv_path = canonical_root / 'category_timeseries_seconds.csv'
canonical_df.to_csv(canonical_csv_path, index=False)
print(f'Saved canonical story time series to {canonical_csv_path}')

canonical_definition_path = canonical_root / 'category_definition.json'
with canonical_definition_path.open('w') as fh:
    json.dump(ensure_serializable(category_definitions), fh, indent=2)

# NOTE(review): unlike the per-subject snapshot, this one does not add cluster_csv_path,
# temporal_weighting or prototype_weight_power explicitly — confirm that is intended.
canonical_config_path = canonical_root / 'config_used.yaml'
with canonical_config_path.open('w') as fh:
    yaml.safe_dump({**categories_cfg, 'category_set': category_set_name}, fh, sort_keys=False)





In [None]:
# Score categories on the subject-specific TR grid.
# The token events are re-bucketed on tr_edges and scored directly; the canonical
# seconds-resolution series is not resampled here.
tr_buckets = build_token_buckets(tr_edges, event_records, temporal_weighting)
empty_tr = sum(1 for bucket in tr_buckets if not bucket)
print(f'TRs without tokens: {empty_tr}/{len(tr_buckets)}')

category_df, score_matrix = score_time_series(tr_edges, tr_buckets, category_states, category_names, category_columns, category_score_method, index_name='tr_index')
print(category_df.head())
# Fail fast if similarity scoring produced nothing finite for TRs that do contain tokens.
if category_score_method == 'similarity':
    nonempty_mask = np.array([len(b) > 0 for b in tr_buckets])
    if nonempty_mask.any():
        has_finite = np.isfinite(category_df.loc[nonempty_mask, category_columns].to_numpy()).any()
        if not has_finite:
            raise RuntimeError('All similarity scores are NaN despite non-empty TR buckets.')  # hard check
assert len(category_df) == len(tr_buckets), 'Category dataframe row mismatch.'
assert len(category_columns) == len(category_names), 'Category column mismatch.'
# Range checks: similarities within [-1, 1] (small tolerance); any other method is
# treated as counts and must be non-negative.
if category_score_method == 'similarity':
    finite_vals = category_df[category_columns].to_numpy().astype(float)
    finite_vals = finite_vals[np.isfinite(finite_vals)]
    if finite_vals.size:
        assert np.nanmin(finite_vals) >= -1.0001 and np.nanmax(finite_vals) <= 1.0001, 'Similarity scores out of bounds.'
else:
    assert (category_df[category_columns].fillna(0.0) >= -1e-9).all().all(), 'Count scores must be non-negative.'




In [None]:
# Save per-story outputs and metadata
category_csv_path = output_root / 'category_timeseries.csv'
category_df.to_csv(category_csv_path, index=False)
print(f'Saved category time series to {category_csv_path}')
definition_path = output_root / 'category_definition.json'
with definition_path.open('w') as fh:
    json.dump(ensure_serializable(category_definitions), fh, indent=2)
print(f'Saved category definitions to {definition_path}')
print(f'Canonical assets available at {canonical_root}')

# Decide how many leading TRs to drop so the category series lines up with Day16.
# If the Day16 trimmed file exists, the drop count is the length difference.
trimmed_path = Path(paths.get('figs', 'figs')) / SUBJECT / STORY / 'day16_decoding' / 'semantic_pcs_trimmed.csv'
max_lag_primary = 0
day16_trim = None
if trimmed_path.exists():
    day16_trim = pd.read_csv(trimmed_path)
    expected_len = len(day16_trim)
    if len(day16_trim) > len(category_df):
        raise ValueError('Day16 trimmed series longer than category series; regenerate Day16 or rerun Day17.')  # hard check
    max_lag_primary = max(0, len(category_df) - expected_len)
    print(f'Aligning with Day16 trim (max_lag_primary={max_lag_primary}).')
else:
    # Fallback estimate: max(tau_grid) * (E_cap - 1), capped at len(category_df) - 1.
    # NOTE(review): tau_grid and E_cap are hard-coded here; presumably they mirror the
    # Day16 settings — confirm.
    tau_grid = [1, 2]
    E_cap = 6
    max_tau = max(tau_grid)
    max_lag_primary = max_tau * (E_cap - 1)
    max_lag_primary = min(max_lag_primary, len(category_df) - 1)
    warnings.warn(f"Day16 trimmed PCs not found; approximating max_lag_primary={max_lag_primary}.")

# Drop the first max_lag_primary rows; 'trim_index' keeps each row's original tr_index.
trimmed_df = category_df.iloc[max_lag_primary:].reset_index(drop=True)
trimmed_out = trimmed_df[['tr_index']].copy()
trimmed_out.rename(columns={'tr_index': 'trim_index'}, inplace=True)
for col in category_columns:
    trimmed_out[col] = trimmed_df[col].values
trimmed_csv_path = output_root / 'category_timeseries_trimmed.csv'
trimmed_out.to_csv(trimmed_csv_path, index=False)
print(f'Saved trimmed category series to {trimmed_csv_path}')

# Per-category summary over the untrimmed series, ignoring NaNs.
# NOTE(review): an all-NaN column makes nanmean/nanstd return NaN, which json.dump
# writes as the non-standard token NaN.
category_stats = {}
for col in category_columns:
    values = category_df[col].to_numpy()
    category_stats[col] = {
        'mean': float(np.nanmean(values)),
        'std': float(np.nanstd(values)),
        'nan_fraction': float(np.mean(~np.isfinite(values)))
    }
meta = {
    'subject': SUBJECT,
    'story': STORY,
    'tr_seconds': TR,
    'n_tr': n_tr,
    'category_set': category_set_name,
    'category_score_method': category_score_method,
    'overlap_weighting': overlap_mode,
    'max_lag_primary': max_lag_primary,
    'categories': category_stats,
}
# MD5 of the key-sorted JSON dump of the effective category config, stored as a
# fingerprint of the settings used for this run.
import hashlib, json as _json
_cfg_hash = hashlib.md5(_json.dumps({**categories_cfg, 'category_set': category_set_name}, sort_keys=True).encode()).hexdigest()  # config hash
meta['config_hash'] = _cfg_hash
meta.update({
        'cluster_csv_path': cluster_csv_path,
        'temporal_weighting': temporal_weighting,
        'prototype_weight_power': prototype_weight_power,
    })
meta_path = output_root / 'meta.json'
with meta_path.open('w') as fh:
    json.dump(meta, fh, indent=2)
print(f'Meta statistics saved to {meta_path}')
# sanity check prints
# In order: Day16 length fits within the category series; all category columns are
# present in category_df; first 8 hex characters of the config hash.
# NOTE(review): the label strings look like they lost their original text — confirm.
print("  ,  ", (day16_trim is None) or (len(day16_trim) <= len(category_df)))
print("  ,  ", set(category_columns).issubset(set(category_df.columns)))
print("  ,  ", meta.get('config_hash', '')[:8])





In [None]:
# Optional concatenation across stories
# Stacks every per-story `category_timeseries.csv` for this subject into one
# table, tagging rows with the story directory name and a running TR index
# that continues across stories.
if not categories_cfg.get('concat_all_stories', False):
    print('Concatenation disabled by configuration.')
else:
    # Fall back to the current story's directory when discovery finds nothing.
    story_dirs = discover_story_dirs(features_root, SUBJECT) or [output_root.parent]
    frames = []
    next_global_tr = 0
    for story_dir in story_dirs:
        series_csv = story_dir / 'day17_categories' / 'category_timeseries.csv'
        if series_csv.exists():
            story_frame = pd.read_csv(series_csv)
            n_rows = len(story_frame)
            story_frame.insert(0, 'story_id', story_dir.name)
            story_frame['tr_index_global'] = np.arange(
                next_global_tr, next_global_tr + n_rows, dtype=int
            )
            next_global_tr += n_rows
            frames.append(story_frame)
    if not frames:
        print('No per-story category outputs found for concatenation.')
    else:
        concat_dir = features_root / 'subjects' / SUBJECT / 'ALL' / 'day17_categories'
        concat_dir.mkdir(parents=True, exist_ok=True)
        concat_path = concat_dir / 'category_timeseries_concat.csv'
        concat_df = pd.concat(frames, ignore_index=True)
        concat_df.to_csv(concat_path, index=False)
        print(f'Concatenated category series saved to {concat_path}')



In [None]:
# Quick sanity plots
# Draws up to the first four category trajectories against `start_sec`,
# one stacked panel per category with a shared time axis.
if plt is not None:
    sample_cols = category_columns[:4]
    if sample_cols:
        n_panels = len(sample_cols)
        # squeeze=False always returns a 2-D array of axes, so a single panel
        # needs no special-casing; take the only column.
        fig, axes_grid = plt.subplots(
            n_panels, 1, figsize=(12, 3 * n_panels), sharex=True, squeeze=False
        )
        axes = axes_grid[:, 0]
        for ax, col in zip(axes, sample_cols):
            ax.plot(category_df['start_sec'], category_df[col])
            ax.set_ylabel(col)
            ax.grid(True, alpha=0.3)
        # Only the bottom panel gets the x label because the axis is shared.
        axes[-1].set_xlabel('Time (s)')
        fig.suptitle('Sample category trajectories')
        plt.tight_layout()
        plt.show()
    else:
        print('No category columns to plot.')
else:
    print('Matplotlib unavailable; skipping plots.')


In [None]:
# Integration demo with Day16 outputs
# Left-joins the trimmed category series onto the Day16 `sem_pc1` series by
# `trim_index` and reports the correlation with the first category column.
trimmed_csv_path = output_root / 'category_timeseries_trimmed.csv'
day16_trim_path = Path(paths.get('figs', 'figs')).joinpath(
    SUBJECT, STORY, 'day16_decoding', 'semantic_pcs_trimmed.csv'
)
if not (trimmed_csv_path.exists() and day16_trim_path.exists()):
    print('Trimmed category or Day16 semantic PCs not available; skipping merge demo.')
else:
    categories_trim = pd.read_csv(trimmed_csv_path)
    sem_trim = pd.read_csv(day16_trim_path)
    # Keep every Day16 row; category values are NaN where no trim_index matches.
    sem_subset = sem_trim[['trim_index', 'sem_pc1']]
    merged = sem_subset.merge(categories_trim, on='trim_index', how='left')
    display(merged.head())
    first_cat = None
    if category_columns:
        first_cat = category_columns[0]
    if first_cat:
        corr = merged['sem_pc1'].corr(merged[first_cat])
        print(f'Correlation between sem_pc1 and {first_cat}: {corr:.3f}')


## README & Next Steps

- **Category definitions** combine user-provided seeds with optional embedding expansion (cosine top-k). Edit `configs/demo.yaml` under `categories` or adjust `config_used.yaml` for future runs.
- **Scoring mode** toggles via `category_score_method` (`similarity` for cosine averages, `count` for weighted lexicon counts).
- **Canonical story timeline**: `figs/stories/{story}/day17_categories/category_timeseries_seconds.csv` captures the stimulus in seconds; resample this per subject to their TR grid and apply subject/run-specific shifts before decoding.
- **Word2Vec fallback**: enable by setting `embedding_source` to `both` or `word2vec` and pointing `word2vec_path` to a compatible model. Missing models are skipped with warnings.
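- **Concatenation**: set `concat_all_stories: true` to produce `{features_root}/subjects/{subject}/ALL/day17_categories/category_timeseries_concat.csv`; rerun after generating the per-story outputs, since only stories that already have a `day17_categories/category_timeseries.csv` are included.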
- **Day16 integration**: use `category_timeseries_trimmed.csv` (aligned `trim_index`) to swap targets (`sem_pc1` → `cat_*`) or append as auxiliary predictors inside the Day16 pipeline.
- **Customization**: extend `categories.sets` with additional taxonomy groups (e.g., Gallant lab categories) and rerun to materialize new feature matrices.
- **Quality checks**: inspect `meta.json` for per-category means/stds, and consult `category_definition.json` for the exact lexicon used in scoring.