# Day 27 - Karaoke Category Video

This notebook builds smoothed category time series (based on Day 19) and renders a
karaoke-style MP4 with transcript text on top and a 4x3 grid of category series below.

Workflow:
1. Update the configuration cell.
2. Run the generation cell to create `result`.
3. Run the token prep cell.
4. Pick 12 categories for the grid.
5. Run the video cell (requires ffmpeg).


In [1]:
import json
import math
import os
import sys
import warnings
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG and widen the pandas column display limit.
np.random.seed(42)
pd.options.display.max_columns = 60

# Make the project directory the working directory so that relative paths used
# later in the notebook (e.g. 'configs/demo.yaml') resolve against it.
# NOTE(review): these are absolute, machine-specific paths; the directory is
# created if it does not exist, so a wrong path fails later rather than here.
project_root = Path('/flash/PaoU/seann/fmri-edm-ccm')
project_root.mkdir(parents=True, exist_ok=True)
os.chdir(project_root)

# Extend the import search path so the `src.*` imports below can be resolved.
sys.path.append(str(project_root))
sys.path.append('/flash/PaoU/seann/pyEDM/src')
sys.path.append('/flash/PaoU/seann/MDE-main/src')

# Optional notebook UI dependencies: when either import fails, `widgets` is set
# to None and `display` falls back to a plain `print`.
try:
    import ipywidgets as widgets
    from IPython.display import display
except Exception:
    widgets = None
    def display(obj):
        print(obj)

# Matplotlib is optional at import time; `plt` is None when it is unavailable.
try:
    import matplotlib.pyplot as plt
except Exception as exc:
    plt = None
    warnings.warn(f'Matplotlib unavailable: {exc}')

from src.utils import load_yaml
from src.decoding import load_transcript_words
from src.edm_ccm import English1000Loader

# Threshold below which a vector norm is treated as zero by the helper functions.
EPS = 1e-12


In [2]:
from pathlib import Path
from subprocess import run

# Locations used by `run_day19_for_subject` to build the batch CLI command.
# NOTE(review): PROJECT_ROOT repeats the same literal as `project_root` in the
# setup cell; keep the two in sync if the path changes.
PROJECT_ROOT = Path('/flash/PaoU/seann/fmri-edm-ccm')
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'demo.yaml'


def run_day19_for_subject(subject: str, stories=None, dry_run=False):
    """Batch regenerate Day19 category series via the CLI helper.

    Builds the command line for ``scripts/run_day19_batch.py`` (resolved under
    ``PROJECT_ROOT``) with ``--config CONFIG_PATH --subjects <subject>``, prints
    it, and executes it.

    Args:
        subject: Subject identifier forwarded after ``--subjects``.
        stories: Optional story name, or iterable of story names, forwarded after
            ``--stories``. A bare string is treated as ONE story (it is not split
            into characters). Blank entries are dropped. Falsy input omits the flag.
        dry_run: When true, ``--dry-run`` is appended to the command.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    import shlex
    import sys

    if isinstance(stories, str):
        requested = [stories]
    else:
        requested = list(stories or [])
    # Blank lines from a story-list file would otherwise become empty CLI arguments.
    story_args = [str(name) for name in requested if str(name).strip()]

    cmd = [
        # Run the script with the kernel's own interpreter so it sees the same
        # environment; a bare 'python' may resolve to a different installation.
        sys.executable or 'python',
        str(PROJECT_ROOT / 'scripts' / 'run_day19_batch.py'),
        '--config',
        str(CONFIG_PATH),
        '--subjects',
        subject,
    ]
    if story_args:
        cmd.extend(['--stories', *story_args])
    if dry_run:
        cmd.append('--dry-run')
    # shlex.join quotes arguments, so the echoed line can be pasted into a shell.
    print('Running:', shlex.join(cmd))
    run(cmd, check=True)


# Example usage (uncomment to preview the command with --dry-run appended):
# run_day19_for_subject('UTS01', dry_run=True)

# Load the default story list (one entry per line) if the file exists.
# The path is relative, so it resolves against the current working directory;
# `stories` is None when the file is missing.
stories = Path('misc/story_list.txt').read_text().splitlines() if Path('misc/story_list.txt').exists() else None
# Example usage passing the loaded story list explicitly:
# run_day19_for_subject('UTS01', stories=stories, dry_run=True)


In [3]:
# --- Configuration -------------------------------------------------------------------
# Defaults come from configs/demo.yaml (a relative path); `or {}` guards
# against a `categories` key that is present but null.
cfg = load_yaml('configs/demo.yaml')
categories_cfg = cfg.get('categories', {}) or {}
cluster_csv_path = categories_cfg.get('cluster_csv_path', '')
prototype_weight_power = float(categories_cfg.get('prototype_weight_power', 1.0))
seconds_bin_width_default = float(categories_cfg.get('seconds_bin_width', 0.05))
temporal_weighting_default = str(categories_cfg.get('temporal_weighting', 'proportional')).lower()

# NOTE(review): unlike `categories`, a null `paths` entry is not guarded here and
# would make the `.get` below fail.
paths = cfg.get('paths', {})
TR = float(cfg.get('TR', 2.0))
# The output root is read from the 'featurestest' key of `paths` and created eagerly.
features_root = Path(paths.get('featurestest', 'featurestest'))
features_root.mkdir(parents=True, exist_ok=True)

# Subject/story fall back to these defaults when the config leaves them empty.
SUBJECT = cfg.get('subject') or 'UTS01'
STORY = cfg.get('story') or 'wheretheressmoke'
TEMPORAL_WEIGHTING = temporal_weighting_default  # {'proportional', 'none'}
SECONDS_BIN_WIDTH = seconds_bin_width_default

# canonical smoothing controls (edit to taste)
SMOOTHING_SECONDS = 1.00            # shorter window preserves fast dynamics for forecasting
SMOOTHING_METHOD = 'moving_average'       # {'moving_average', 'gaussian'}
GAUSSIAN_SIGMA_SECONDS = 0.5 * SMOOTHING_SECONDS  # tie sigma to window length for EDM
SMOOTHING_PAD_MODE = 'reflect'      # {'edge', 'reflect'}

SAVE_OUTPUTS = True  # toggle off to skip writing CSVs

# video settings
# The videos directory is created under `features_root` as soon as this cell runs.
VIDEO_DIR = features_root / 'videos'
VIDEO_DIR.mkdir(parents=True, exist_ok=True)
KARAOKE_OUTPUT = str(VIDEO_DIR / f'karaoke_{SUBJECT}_{STORY}.mp4')
KARAOKE_USE_DOMAIN = 'tr'  # 'tr' or 'canonical'
KARAOKE_FPS = 15
KARAOKE_WINDOW_SEC = 30.0
KARAOKE_PAD_LEFT_SEC = 0.5
KARAOKE_PAD_RIGHT_SEC = 2.0
KARAOKE_BIN_WORDS_MAX = 30  # None to show all words in the bin
KARAOKE_PLAYBACK_SPEED = 7.0  # 1.0 real-time; <1 slower; >1 faster
KARAOKE_YLIM_PAD_FRAC = 0.05  # add 5% headroom to y-lims
KARAOKE_ZSCORE = True  # z-score each category series before plotting
KARAOKE_LOG_EVERY_FRAMES = None  # None -> log every ~5s of video time
KARAOKE_CATEGORY_COUNT = 12  # must be 12 for the 4x3 grid
KARAOKE_CATEGORY_COLUMNS = None  # set to a list of 12 column names if desired

# Echo the effective settings so the run is self-describing.
print(f'Subject/story: {SUBJECT} / {STORY}')
print(f'Cluster CSV: {cluster_csv_path or "<none>"}')
print(f'Temporal weighting: {TEMPORAL_WEIGHTING}')
print(f'Seconds bin width: {SECONDS_BIN_WIDTH}')
print(f'Smoothing: {SMOOTHING_METHOD} | window={SMOOTHING_SECONDS}s | sigma={GAUSSIAN_SIGMA_SECONDS}')
print(f'Video output: {KARAOKE_OUTPUT}')
print(f'Video domain: {KARAOKE_USE_DOMAIN} | fps={KARAOKE_FPS} | window={KARAOKE_WINDOW_SEC}s')


Subject/story: UTS01 / wheretheressmoke
Cluster CSV: configs/cluster_words.csv
Temporal weighting: proportional
Seconds bin width: 0.05
Smoothing: moving_average | window=1.0s | sigma=0.5
Video output: featuresqaemb/videos/karaoke_UTS01_wheretheressmoke.mp4
Video domain: tr | fps=1 | window=30.0s


In [4]:
# Helper functions

def load_story_words(paths: Dict, subject: str, story: str) -> List[Tuple[str, float, float]]:
    """Load the transcript for one subject/story as normalized word events.

    Args:
        paths: Path configuration forwarded unchanged to `load_transcript_words`.
        subject: Subject identifier.
        story: Story identifier.

    Returns:
        A list of ``(word, start, end)`` tuples where the word is stringified and
        stripped of surrounding whitespace and both times are floats.

    Raises:
        ValueError: If the loader returns no events.
    """
    raw_events = load_transcript_words(paths, subject, story)
    if not raw_events:
        raise ValueError(f'No transcript events found for {subject} {story}.')
    normalized: List[Tuple[str, float, float]] = []
    for text, onset, offset in raw_events:
        normalized.append((str(text).strip(), float(onset), float(offset)))
    return normalized


def load_clusters_from_csv(csv_path: str) -> Dict[str, Dict[str, List[Tuple[str, float]]]]:
    """Parse a category/word(/weight) CSV into per-category word lists.

    Column names are matched case-insensitively after stripping whitespace. The
    ``category`` and ``word`` columns are required; ``weight`` is optional and
    defaults to 1.0. Categories and words are stripped and lower-cased. Weights
    that are missing or non-numeric become 1.0 and negative weights are clipped
    to 0.0. When a word is repeated within a category the last weight wins.

    Rows whose category or word is blank are skipped. The file is read with
    ``keep_default_na=False`` so that blank cells and legitimate words such as
    "null", "na" or "nan" are not converted into missing values (which would
    otherwise surface as a bogus literal ``'nan'`` word or category).

    Args:
        csv_path: Path to the cluster CSV.

    Returns:
        ``{category: {'words': [(word, weight), ...]}}`` with the pairs sorted by word.

    Raises:
        FileNotFoundError: If ``csv_path`` is empty or does not exist.
        AssertionError: If a required column is missing.
        ValueError: If no usable (category, word) rows remain.
    """
    from pathlib import Path
    if not csv_path or not Path(csv_path).exists():
        raise FileNotFoundError(f'Cluster CSV not found at {csv_path}')
    # Keep every cell as written; pandas' default NA parsing would turn '', 'NA',
    # 'null', 'nan', ... into NaN and those would later be stringified to 'nan'.
    df = pd.read_csv(csv_path, keep_default_na=False)
    cols = {c.lower().strip(): c for c in df.columns}
    for needed in ('category', 'word'):
        assert needed in cols, f"CSV must contain '{needed}' column."
    cat_col = cols['category']
    word_col = cols['word']
    weight_col = cols.get('weight')
    if weight_col is None:
        df['_weight'] = 1.0
        weight_col = '_weight'
    df = df[[cat_col, word_col, weight_col]].copy()
    df[word_col] = df[word_col].astype(str).str.strip().str.lower()
    df[cat_col] = df[cat_col].astype(str).str.strip().str.lower()
    # Blank weights arrive as '' (object dtype); coercion maps them to NaN -> 1.0.
    df[weight_col] = pd.to_numeric(df[weight_col], errors='coerce').fillna(1.0).clip(lower=0.0)
    # Drop rows that cannot contribute a (category, word) pair.
    df = df[df[cat_col].ne('') & df[word_col].ne('')]
    clusters: Dict[str, Dict[str, List[Tuple[str, float]]]] = {}
    for cat, sub in df.groupby(cat_col):
        # Later duplicates overwrite earlier ones.
        bucket: Dict[str, float] = {
            w: float(wt) for w, wt in zip(sub[word_col].tolist(), sub[weight_col].tolist())
        }
        if bucket:
            clusters[cat] = {'words': sorted(bucket.items())}
    if not clusters:
        raise ValueError('No clusters parsed from CSV.')
    return clusters


def build_states_from_csv(
    clusters: Dict[str, Dict[str, List[Tuple[str, float]]]],
    primary_lookup: Dict[str, np.ndarray],
    fallback=None,
    weight_power: float = 1.0
) -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """Build per-category scoring state and definitions from CSV clusters.

    For every category, each representative word is resolved through
    `lookup_embedding`. The prototype is the average of the resolved vectors,
    weighted by ``max(0, weight) ** weight_power`` and normalized to sum to one.
    A category with no resolved words, or whose prototype norm is below ``EPS``,
    gets ``prototype=None`` and ``prototype_norm=None``.

    Args:
        clusters: ``{category: {'words': [(word, weight), ...]}}``.
        primary_lookup: Primary word -> vector mapping.
        fallback: Optional secondary embedding source passed to `lookup_embedding`.
        weight_power: Exponent applied to each clipped weight.

    Returns:
        ``(category_states, category_definitions)``, both keyed by category name.

    Warns:
        Once per category without usable embeddings, and once overall when any
        category has out-of-vocabulary representative words.
    """
    states: Dict[str, Dict] = {}
    definitions: Dict[str, Dict] = {}
    missing_per_category: Dict[str, int] = {}

    for cat, spec in clusters.items():
        word_weight_pairs = spec.get('words', [])
        embeddings: List[np.ndarray] = []
        raw_weights: List[float] = []
        resolved: List[str] = []
        unresolved: List[str] = []
        for word, wt in word_weight_pairs:
            vec = lookup_embedding(word, primary_lookup, fallback)
            if vec is not None:
                embeddings.append(vec.astype(float))
                raw_weights.append(float(max(0.0, wt)) ** float(weight_power))
                resolved.append(word)
            else:
                unresolved.append(word)

        prototype = None
        prototype_norm = None
        if embeddings:
            mixing = np.array(raw_weights, dtype=float)
            mixing = mixing / (mixing.sum() + 1e-12)
            stacked = np.stack(embeddings, axis=0)
            prototype = (mixing[:, None] * stacked).sum(axis=0)
            prototype_norm = float(np.linalg.norm(prototype))
            if prototype_norm < EPS:
                # A numerically zero prototype cannot be used for cosine similarity.
                prototype, prototype_norm = None, None
        else:
            warnings.warn(f"[{cat}] no usable representative embeddings; prototype will be None.")

        # The same lexicon dict and word lists are shared by both output records.
        rep_lex = dict((word, float(wt)) for word, wt in word_weight_pairs)
        prototype_dim = int(prototype.shape[0]) if isinstance(prototype, np.ndarray) else 0

        states[cat] = {
            'name': cat,
            'seeds': [],
            'found_seeds': resolved,
            'missing_seeds': unresolved,
            'prototype': prototype,
            'prototype_norm': prototype_norm,
            'lexicon': rep_lex,
            'expanded_count': 0,
            'expansion_params': {'enabled': False, 'top_k': 0, 'min_sim': 0.0},
        }
        definitions[cat] = {
            'from': 'csv',
            'seeds': [],
            'found_seeds': resolved,
            'missing_seeds': unresolved,
            'prototype_dim': prototype_dim,
            'prototype_norm': prototype_norm,
            'representative_words': rep_lex,
            'lexicon': rep_lex,
            'expanded_neighbors': {},
        }
        missing_per_category[cat] = len(unresolved)

    if any(missing_per_category.values()):
        warnings.warn(f"OOV representative words: {missing_per_category}")
    return states, definitions


def build_tr_edges(word_events: Sequence[Tuple[str, float, float]], tr_s: float) -> np.ndarray:
    """Return TR bin edges (seconds) that cover every word event.

    Edges start at 0.0 and are spaced ``tr_s`` apart; the last edge is at or
    beyond the latest event end time, and there is always at least one bin
    when events are present.

    Edges are built as integer multiples of ``tr_s``. ``np.arange`` with a
    floating-point stop/step decides its length from ``ceil((stop - start) / step)``,
    which can round up and emit a spurious extra edge for non-representable
    steps such as 0.1.

    Args:
        word_events: ``(word, start, end)`` tuples; only ``end`` is used.
        tr_s: Bin width in seconds; must be a positive finite number.

    Returns:
        A 1-D float array of edges. With no events a single edge ``[0.0]`` is
        returned (i.e. zero bins).

    Raises:
        ValueError: If ``tr_s`` is not a positive finite number.
    """
    tr_s = float(tr_s)
    if not math.isfinite(tr_s) or tr_s <= 0:
        raise ValueError(f'tr_s must be a positive finite number, got {tr_s!r}.')
    if not word_events:
        return np.array([0.0], dtype=float)
    max_end = max(end for _, _, end in word_events)
    n_tr = max(1, int(math.ceil(max_end / tr_s)))
    edges = tr_s * np.arange(n_tr + 1, dtype=float)
    # Rounding in n_tr * tr_s can leave the last edge a hair short of max_end.
    if edges[-1] < max_end:
        edges = np.append(edges, edges[-1] + tr_s)
    return edges


def lookup_embedding(token: str, primary_lookup: Dict[str, np.ndarray], fallback=None) -> Optional[np.ndarray]:
    """Resolve a token to a float embedding vector, or None when unavailable.

    The token is lower-cased and stripped before lookup. ``primary_lookup`` is
    consulted first; if it has no entry, ``fallback`` is tried, either through
    ``get_vector`` (when the object has that attribute) or by plain indexing.
    Any exception raised while querying the fallback is swallowed and treated
    as a miss.

    Args:
        token: Raw token text.
        primary_lookup: Mapping from normalized word to vector (may be empty or None).
        fallback: Optional secondary source supporting ``in`` membership tests.

    Returns:
        The vector as a float ndarray, or None for blank or unknown tokens.
    """
    normalized = token.lower().strip()
    if not normalized:
        return None

    primary_hit = primary_lookup.get(normalized) if primary_lookup else None
    if primary_hit is not None:
        return np.asarray(primary_hit, dtype=float)

    if fallback is None:
        return None
    try:
        if hasattr(fallback, 'get_vector') and normalized in fallback:
            return np.asarray(fallback.get_vector(normalized), dtype=float)
        if hasattr(fallback, '__contains__') and normalized in fallback:
            return np.asarray(fallback[normalized], dtype=float)
    except Exception:
        # Best-effort fallback: a failing secondary source counts as "not found".
        return None
    return None


def make_category_prototype(seeds: Sequence[str], primary_lookup: Dict[str, np.ndarray], fallback=None, allow_single: bool = False) -> Tuple[Optional[np.ndarray], List[str], List[str]]:
    """Average the embeddings of the seed words into a category prototype.

    Args:
        seeds: Seed words for the category.
        primary_lookup: Primary word -> vector mapping.
        fallback: Optional secondary embedding source passed to `lookup_embedding`.
        allow_single: Accept a prototype built from fewer than two resolved seeds.

    Returns:
        ``(prototype, found_words, missing_words)``. ``prototype`` is the
        unweighted mean of the resolved seed vectors, or None when no seed
        resolves or when fewer than two resolve and ``allow_single`` is false
        (a warning is emitted in the latter case).
    """
    resolved = [(seed, lookup_embedding(seed, primary_lookup, fallback)) for seed in seeds]
    found_words = [seed for seed, vec in resolved if vec is not None]
    found_vectors = [vec for _, vec in resolved if vec is not None]
    missing_words = [seed for seed, vec in resolved if vec is None]

    if not found_vectors:
        return None, found_words, missing_words

    too_few = len(found_vectors) < 2
    if too_few and not allow_single:
        warnings.warn(f'Only {len(found_vectors)} usable seed(s); enable allow_single_seed to accept singleton prototypes.')
        return None, found_words, missing_words

    return np.mean(found_vectors, axis=0), found_words, missing_words


def expand_category(prototype: np.ndarray, vocab_embeddings: np.ndarray, vocab_words: Sequence[str], top_k: int, min_sim: float) -> Dict[str, float]:
    """Select vocabulary words most similar to a prototype by cosine similarity.

    Rows of ``vocab_embeddings`` with zero norm are assigned a similarity of
    -1.0. The ``top_k`` highest-similarity rows are shortlisted first and
    entries scoring below ``min_sim`` are then discarded, so fewer than
    ``top_k`` words may be returned.

    Args:
        prototype: Category prototype vector.
        vocab_embeddings: 2-D array with one embedding per vocabulary word.
        vocab_words: Words aligned with the rows of ``vocab_embeddings``.
        top_k: Maximum number of candidates to shortlist.
        min_sim: Minimum cosine similarity to keep a candidate.

    Returns:
        ``{word: similarity}``; empty when any input is None, the prototype has
        zero norm, or the effective ``top_k`` is not positive.
    """
    if prototype is None or vocab_embeddings is None or vocab_words is None:
        return {}

    direction = np.asarray(prototype, dtype=float)
    magnitude = np.linalg.norm(direction)
    if magnitude == 0:
        return {}
    direction = direction / magnitude

    row_norms = np.linalg.norm(vocab_embeddings, axis=1)
    usable = row_norms > 0
    cosine = np.full(vocab_embeddings.shape[0], -1.0, dtype=float)
    cosine[usable] = (vocab_embeddings[usable] @ direction) / row_norms[usable]

    keep = min(top_k, len(cosine))
    if keep <= 0:
        return {}

    # argpartition places the `keep` largest similarities first (in no particular order).
    shortlist = np.argpartition(-cosine, keep - 1)[:keep]
    return {
        vocab_words[row]: float(cosine[row])
        for row in shortlist
        if not (float(cosine[row]) < min_sim)
    }


def tr_token_overlap(token_start: float, token_end: float, tr_start: float, tr_end: float, mode: str = 'proportional') -> float:
    """Weight of a token inside a time bin.

    Tokens whose end is not after their start are widened to last 1 ms.

    Args:
        token_start: Token onset in seconds.
        token_end: Token offset in seconds.
        tr_start: Bin start in seconds (inclusive).
        tr_end: Bin end in seconds (exclusive for the midpoint test).
        mode: ``'midpoint'`` returns 1.0 when the token midpoint lies in
            ``[tr_start, tr_end)`` and 0.0 otherwise; any other value returns
            the fraction of the token's duration that overlaps the bin.

    Returns:
        A weight in ``[0.0, 1.0]``.
    """
    begin = float(token_start)
    finish = float(token_end)
    if finish <= begin:
        finish = begin + 1e-3

    if mode == 'midpoint':
        center = 0.5 * (begin + finish)
        inside = tr_start <= center < tr_end
        return 1.0 if inside else 0.0

    shared = max(0.0, min(finish, tr_end) - max(begin, tr_start))
    span = finish - begin
    if span <= 0:
        # Only reachable when adding 1 ms is lost to floating-point rounding.
        return 1.0 if shared > 0 else 0.0
    fraction = shared / span
    return max(0.0, min(1.0, fraction))


def score_tr(token_payload: Sequence[Dict], method: str, *, lexicon: Optional[Dict[str, float]] = None, prototype: Optional[np.ndarray] = None, prototype_norm: Optional[float] = None) -> float:
    """Score one time bin for one category.

    Args:
        token_payload: Token entries for the bin; each needs ``'word'`` and
            ``'overlap'`` and, for similarity scoring, ``'embedding'`` and
            ``'embedding_norm'``.
        method: ``'count'`` or ``'similarity'`` (case-insensitive).
        lexicon: Word -> weight mapping used by ``'count'``.
        prototype: Category prototype vector used by ``'similarity'``.
        prototype_norm: Precomputed norm of ``prototype``.

    Returns:
        * ``'count'``: the sum of ``lexicon weight * overlap`` over tokens found
          in the lexicon (0.0 when none match).
        * ``'similarity'``: the overlap-weighted mean cosine similarity between
          token embeddings and the prototype, clipped to ``[-1, 1]``.
        * NaN when the bin is empty or the inputs required by the method are
          missing or unusable.

    Raises:
        ValueError: For an unknown method on a non-empty bin.
    """
    missing = float('nan')
    if not token_payload:
        return missing
    method = method.lower()

    if method == 'count':
        if not lexicon:
            return missing
        weighted_hits = 0.0
        for entry in token_payload:
            term_weight = lexicon.get(entry['word'].lower())
            if term_weight is not None:
                weighted_hits += term_weight * entry['overlap']
        return float(weighted_hits)

    if method != 'similarity':
        raise ValueError(f'Unknown scoring method: {method}')

    if prototype is None or prototype_norm is None or prototype_norm < EPS:
        return missing
    weighted_similarity = 0.0
    total_overlap = 0.0
    for entry in token_payload:
        vector = entry.get('embedding')
        vector_norm = entry.get('embedding_norm')
        # Tokens without a usable embedding do not contribute to either sum.
        if vector is None or vector_norm is None or vector_norm < EPS:
            continue
        cosine = float(np.dot(vector, prototype) / (vector_norm * prototype_norm))
        weighted_similarity += cosine * entry['overlap']
        total_overlap += entry['overlap']
    if total_overlap == 0:
        return missing
    return float(np.clip(weighted_similarity / total_overlap, -1.0, 1.0))


def ensure_serializable(obj):
    """Recursively convert NumPy containers and scalars into plain Python objects.

    Conversions:
        * ``np.ndarray`` -> nested lists (object-dtype arrays are converted
          element by element, since ``tolist`` leaves their items untouched).
        * Any NumPy scalar (``np.generic``: floats, ints, ``np.bool_``,
          ``np.str_``, ...) -> the matching Python scalar via ``.item()``.
        * ``dict`` -> dict with converted values; NumPy-scalar keys are also
          converted, because ``json`` rejects them as keys.
        * ``list`` / ``tuple`` -> list with converted items.

    Anything else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        as_list = obj.tolist()
        return ensure_serializable(as_list) if obj.dtype == object else as_list
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): ensure_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [ensure_serializable(item) for item in obj]
    return obj


def build_token_buckets(edges: np.ndarray, event_records: Sequence[Dict], mode: str = 'proportional') -> List[List[Dict]]:
    """Assign token events to the time bins they fall into.

    Args:
        edges: Monotonically increasing bin edges; ``len(edges) - 1`` bins.
        event_records: Dicts with keys ``'word'``, ``'start'``, ``'end'``,
            ``'embedding'`` and ``'embedding_norm'``.
        mode: Temporal weighting.
            * ``'none'``: weight 1.0 in every bin the token overlaps.
            * ``'midpoint'``: weight 1.0 only in the bin containing the token
              midpoint. (Previously this mode was silently scored as
              ``'proportional'`` because the mode was not forwarded.)
            * anything else: the fraction of the token's duration inside the bin.

    Returns:
        One list per bin; each entry copies the token's word, embedding and
        timing together with its ``'overlap'`` weight and the bin bounds.
        An empty list is returned when fewer than two edges are given.
    """
    if edges.size < 2:
        return []
    # Forward 'midpoint' to the overlap helper; every other non-'none' mode keeps
    # the proportional behaviour it had before.
    overlap_mode = 'midpoint' if mode == 'midpoint' else 'proportional'
    buckets: List[List[Dict]] = [[] for _ in range(len(edges) - 1)]
    for rec in event_records:
        start = rec['start']
        end = rec['end']
        # Skip tokens lying entirely outside the covered time range.
        if end <= edges[0] or start >= edges[-1]:
            continue
        # Candidate bins: from the bin containing `start` through the bin whose
        # upper edge is the first one at or after `end` (clamped to the last bin).
        start_idx = max(0, int(np.searchsorted(edges, start, side='right')) - 1)
        end_idx = max(0, int(np.searchsorted(edges, end, side='left')))
        end_idx = min(end_idx, len(buckets) - 1)
        for idx in range(start_idx, end_idx + 1):
            bucket_start = edges[idx]
            bucket_end = edges[idx + 1]
            if mode == 'none':
                overlap = 1.0 if not (end <= bucket_start or start >= bucket_end) else 0.0
            else:
                overlap = tr_token_overlap(start, end, bucket_start, bucket_end, overlap_mode)
            if overlap <= 0:
                continue
            buckets[idx].append({
                'word': rec['word'],
                'overlap': overlap,
                'embedding': rec['embedding'],
                'embedding_norm': rec['embedding_norm'],
                'token_start': rec['start'],
                'token_end': rec['end'],
                'bucket_start': bucket_start,
                'bucket_end': bucket_end,
            })
    return buckets


def score_time_series(edges: np.ndarray, buckets: Sequence[Sequence[Dict]], category_states: Dict[str, Dict], category_names: Sequence[str], category_columns: Sequence[str], method: str, index_name: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """Score every (bin, category) pair and tabulate the result.

    Args:
        edges: Bin edges; expected to have ``len(buckets) + 1`` entries.
        buckets: Per-bin token payloads as produced by `build_token_buckets`.
        category_states: Per-category state providing ``'lexicon'``,
            ``'prototype'`` and ``'prototype_norm'``.
        category_names: Category keys, in output column order.
        category_columns: Output column names aligned with ``category_names``.
        method: Scoring method forwarded to `score_tr`.
        index_name: Name of the integer bin-index column.

    Returns:
        ``(df, score_matrix)``: ``df`` holds the index column, ``start_sec``,
        ``end_sec`` and one column per category; ``score_matrix`` is the
        ``(n_bins, n_categories)`` array of scores.
    """
    n_bins = len(buckets)
    scores = np.full((n_bins, len(category_names)), np.nan, dtype=float)
    for column_pos, name in enumerate(category_names):
        state = category_states[name]
        scoring_inputs = {
            'lexicon': state.get('lexicon'),
            'prototype': state.get('prototype'),
            'prototype_norm': state.get('prototype_norm'),
        }
        for row_pos, bucket in enumerate(buckets):
            scores[row_pos, column_pos] = score_tr(bucket, method, **scoring_inputs)

    columns = {
        index_name: np.arange(n_bins, dtype=int),
        'start_sec': edges[:-1],
        'end_sec': edges[1:],
    }
    columns.update((label, scores[:, pos]) for pos, label in enumerate(category_columns))
    return pd.DataFrame(columns), scores


def build_smoothing_kernel(seconds_bin_width: float, smoothing_seconds: float, *, method: str = 'moving_average', gaussian_sigma_seconds: Optional[float] = None) -> np.ndarray:
    """Create a normalized 1-D smoothing kernel expressed in bins.

    Args:
        seconds_bin_width: Width of one bin in seconds.
        smoothing_seconds: Smoothing window length in seconds; values ``<= 0``
            yield the identity kernel ``[1.0]``.
        method: ``'moving_average'`` (box kernel, forced to an odd length) or
            ``'gaussian'`` (truncated at three sigma on each side). A falsy
            value means ``'moving_average'``.
        gaussian_sigma_seconds: Gaussian sigma in seconds. When None or ``''``
            it defaults to ``max(smoothing_seconds / 2, seconds_bin_width)``.

    Returns:
        A float array that sums to one.

    Raises:
        ValueError: For an unknown method (only checked when smoothing is enabled).
    """
    if smoothing_seconds <= 0:
        return np.array([1.0], dtype=float)

    method = str(method or 'moving_average').lower()
    if method == 'moving_average':
        n_taps = max(1, int(round(smoothing_seconds / seconds_bin_width)))
        # Keep the window odd so that it is centred on the current bin.
        n_taps += 1 - (n_taps % 2)
        taps = np.ones(n_taps, dtype=float)
    elif method == 'gaussian':
        if gaussian_sigma_seconds in (None, ''):
            sigma_seconds = max(smoothing_seconds / 2.0, seconds_bin_width)
        else:
            sigma_seconds = float(gaussian_sigma_seconds)
        sigma_bins = max(sigma_seconds / seconds_bin_width, 1e-6)
        reach = max(1, int(round(3.0 * sigma_bins)))
        offsets = np.arange(-reach, reach + 1, dtype=float)
        taps = np.exp(-0.5 * (offsets / sigma_bins) ** 2)
    else:
        raise ValueError(f"Unknown smoothing method: {method}")

    total = float(taps.sum())
    if total <= 0:
        return np.array([1.0], dtype=float)
    return taps / total


def apply_smoothing_kernel(values: np.ndarray, kernel: np.ndarray, *, pad_mode: str = 'edge', eps: float = 1e-8) -> np.ndarray:
    """Smooth each column of a 2-D array with a kernel, ignoring non-finite samples.

    The rows are padded by half the kernel length on both ends. Each output
    sample is the kernel-weighted sum of the finite inputs divided by the
    kernel weight that fell on finite inputs; where that weight is below
    ``eps`` the output is NaN.

    Args:
        values: 2-D array (rows are time bins, columns are series).
        kernel: 1-D kernel; a kernel of length <= 1 leaves the data unchanged.
        pad_mode: ``'edge'`` or ``'reflect'``; any other value falls back to ``'edge'``.
        eps: Minimum usable kernel weight.

    Returns:
        A new float array with the same shape as ``values`` (a plain copy when
        ``values`` is empty or the kernel is trivial).
    """
    if values.size == 0 or kernel.size <= 1:
        return values.copy()
    if pad_mode not in {'edge', 'reflect'}:
        pad_mode = 'edge'

    reach = kernel.size // 2
    extended = np.pad(values, ((reach, reach), (0, 0)), mode=pad_mode)
    finite_flags = np.isfinite(extended).astype(float)
    zero_filled = np.where(finite_flags, extended, 0.0)

    n_rows, n_cols = values.shape[0], values.shape[1]
    result = np.empty((n_rows, n_cols), dtype=float)
    for series in range(n_cols):
        weighted_sum = np.convolve(zero_filled[:, series], kernel, mode='valid')
        weight_total = np.convolve(finite_flags[:, series], kernel, mode='valid')
        with np.errstate(divide='ignore', invalid='ignore'):
            column = weighted_sum / np.maximum(weight_total, eps)
        # No finite sample under the kernel: the value is undefined.
        column[weight_total < eps] = np.nan
        result[:, series] = column
    return result


def aggregate_seconds_to_edges(canonical_edges: np.ndarray, canonical_values: np.ndarray, target_edges: np.ndarray) -> np.ndarray:
    """Average fine-grained bins into coarser target bins.

    Each canonical bin is assigned to the target bin containing its midpoint
    (assignments are clipped to the first/last target bin). For every target
    bin and column the result is the mean of the finite assigned values; it is
    NaN when no canonical bin maps to the target bin or all assigned values in
    that column are non-finite.

    Args:
        canonical_edges: Edges of the fine bins (``n_fine + 1`` entries).
        canonical_values: ``(n_fine, n_columns)`` array of values.
        target_edges: Edges of the coarse bins (``n_target + 1`` entries).

    Returns:
        An ``(n_target, n_columns)`` float array; ``(n_target, 0)`` when
        ``canonical_values`` is empty.
    """
    n_target = len(target_edges) - 1
    if canonical_values.size == 0:
        return np.empty((n_target, 0), dtype=float)

    centers = 0.5 * (canonical_edges[:-1] + canonical_edges[1:])
    assignment = np.digitize(centers, target_edges) - 1
    if assignment.size:
        assignment = np.clip(assignment, 0, n_target - 1)

    aggregated = np.full((n_target, canonical_values.shape[1]), np.nan, dtype=float)
    for target_idx in range(aggregated.shape[0]):
        members = assignment == target_idx
        if not np.any(members):
            continue
        block = canonical_values[members]
        if block.ndim == 1:
            block = block[:, None]
        has_data = np.isfinite(block).any(axis=0)
        if not has_data.any():
            continue
        # Only average columns that contain data, to avoid nanmean's empty-slice warning.
        row = np.full(block.shape[1], np.nan, dtype=float)
        row[has_data] = np.nanmean(block[:, has_data], axis=0)
        aggregated[target_idx] = row
    return aggregated


In [5]:
def generate_category_time_series(
    subject: str,
    story: str,
    *,
    cfg_base: Dict[str, Any],
    categories_cfg_base: Dict[str, Any],
    cluster_csv_path: str,
    temporal_weighting: str,
    prototype_weight_power: float,
    smoothing_seconds: float,
    smoothing_method: str,
    gaussian_sigma_seconds: Optional[float],
    smoothing_pad: str,
    seconds_bin_width: float,
    save_outputs: bool = True,
) -> Dict[str, Any]:
    if not subject or not story:
        raise ValueError('Subject and story must be provided.')
    print(f"=== Day19 category build for {subject} / {story} ===")

    categories_cfg = json.loads(json.dumps(categories_cfg_base or {}))
    categories_cfg['seconds_bin_width'] = float(seconds_bin_width)
    category_sets = categories_cfg.get('sets', {})
    available_sets = sorted(category_sets.keys())
    category_set_name = categories_cfg.get('category_set') or (available_sets[0] if available_sets else None)
    if cluster_csv_path:
        if not category_set_name:
            category_set_name = 'csv_clusters'
        categories_cfg['category_set'] = category_set_name
        categories_cfg['category_score_method'] = 'similarity'
        categories_cfg['allow_single_seed'] = True
        categories_cfg['expansion'] = {'enabled': False}
    category_score_method = str(categories_cfg.get('category_score_method', 'similarity')).lower()
    overlap_mode = str(categories_cfg.get('overlap_weighting', 'proportional')).lower()
    expansion_cfg = categories_cfg.get('expansion', {})
    allow_single = bool(categories_cfg.get('allow_single_seed', False))
    exp_enabled = bool(expansion_cfg.get('enabled', True))
    exp_top_k = int(expansion_cfg.get('top_k', 2000)) if exp_enabled else 0
    exp_min_sim = float(expansion_cfg.get('min_sim', 0.35)) if exp_enabled else 0.0

    selected_set_spec = category_sets.get(category_set_name, {}) if category_sets else {}

    output_root = features_root / 'subjects' / subject / story
    canonical_root = features_root / 'stories' / story
    if save_outputs:
        output_root.mkdir(parents=True, exist_ok=True)
        canonical_root.mkdir(parents=True, exist_ok=True)

    story_events = load_story_words(paths, subject, story)
    print(f'Loaded {len(story_events)} transcript events.')
    tr_edges = build_tr_edges(story_events, TR)
    n_tr = len(tr_edges) - 1
    print(f'TR edges: {len(tr_edges)} (n_tr={n_tr}) spanning {tr_edges[-1]:.2f} seconds.')

    embedding_source = str(categories_cfg.get('embedding_source', 'english1000')).lower()
    english_loader = None
    english_lookup: Dict[str, np.ndarray] = {}
    english_vocab: List[str] = []
    english_matrix = None
    if embedding_source in {'english1000', 'both'}:
        english1000_path = Path(paths.get('data_root', '')) / 'derivative' / 'english1000sm.hf5'
        if english1000_path.exists():
            english_loader = English1000Loader(english1000_path)
            english_lookup = english_loader.lookup
            english_vocab = english_loader.vocab
            english_matrix = english_loader.embeddings
            print(f'Loaded English1000 embeddings from {english1000_path} (vocab={len(english_vocab)}).')
        else:
            raise FileNotFoundError(f'English1000 embeddings not found at {english1000_path}')
    else:
        print('English1000 disabled by configuration.')

    word2vec_model = None
    if embedding_source in {'word2vec', 'both'}:
        w2v_path = categories_cfg.get('word2vec_path')
        if w2v_path:
            w2v_path = Path(w2v_path)
            if w2v_path.exists():
                try:
                    from gensim.models import KeyedVectors
                    binary = w2v_path.suffix.lower() in {'.bin', '.gz'}
                    word2vec_model = KeyedVectors.load_word2vec_format(w2v_path, binary=binary)
                    print(f'Loaded Word2Vec fallback from {w2v_path}.')
                except Exception as exc:
                    warnings.warn(f'Failed to load Word2Vec fallback: {exc}')
            else:
                warnings.warn(f'Word2Vec path does not exist: {w2v_path}')
        else:
            warnings.warn('Word2Vec fallback requested but no path provided.')
    else:
        print('Word2Vec fallback disabled.')

    if cluster_csv_path:
        csv_clusters = load_clusters_from_csv(cluster_csv_path)
        category_states, category_definitions = build_states_from_csv(
            csv_clusters,
            english_lookup,
            word2vec_model,
            weight_power=prototype_weight_power,
        )
        category_names = sorted(category_states.keys())
        category_columns = [f'cat_{name}' for name in category_names]
        print(f"Loaded {len(category_names)} CSV-driven categories from {cluster_csv_path}: {category_names}")
        zero_norm = [k for k, v in category_states.items() if v.get('prototype') is not None and (v.get('prototype_norm') or 0.0) < EPS]
        if zero_norm:
            warnings.warn(f"Zero-norm prototypes (check OOV/weights): {zero_norm}")
    else:
        category_states = {}
        category_definitions = {}
        seed_oov_counter = Counter()
        for cat_name, cat_spec in selected_set_spec.items():
            seeds = cat_spec.get('seeds', [])
            explicit_words = cat_spec.get('words', [])
            prototype = None
            found_seeds: List[str] = []
            missing_seeds: List[str] = []
            if seeds:
                prototype, found_seeds, missing_seeds = make_category_prototype(seeds, english_lookup, word2vec_model, allow_single)
                seed_oov_counter[cat_name] = len(missing_seeds)
                if prototype is None and category_score_method == 'similarity':
                    warnings.warn(f"Category '{cat_name}' has no usable prototype; TR scores will be NaN.")
            elif category_score_method == 'similarity':
                warnings.warn(f'Category {cat_name} has no seeds; similarity method will yield NaNs.')
            lexicon = {word.lower(): 1.0 for word in explicit_words}
            for seed in found_seeds:
                lexicon.setdefault(seed.lower(), 1.0)
            prototype_norm = None
            expanded_words = {}
            if prototype is not None:
                prototype_norm = float(np.linalg.norm(prototype))
                if exp_enabled and english_matrix is not None:
                    expanded_words = expand_category(prototype, english_matrix, english_vocab, exp_top_k, exp_min_sim)
                    for word, weight in expanded_words.items():
                        lexicon.setdefault(word.lower(), float(weight))
            if not lexicon and category_score_method == 'count':
                warnings.warn(f'Category {cat_name} lexicon is empty; counts will be NaN.')
            category_states[cat_name] = {
                'name': cat_name,
                'seeds': seeds,
                'found_seeds': found_seeds,
                'missing_seeds': missing_seeds,
                'prototype': prototype,
                'prototype_norm': prototype_norm,
                'lexicon': lexicon,
                'expanded_count': len(expanded_words),
                'expansion_params': {
                    'enabled': exp_enabled,
                    'top_k': exp_top_k,
                    'min_sim': exp_min_sim,
                },
            }
            category_definitions[cat_name] = {
                'seeds': seeds,
                'found_seeds': found_seeds,
                'missing_seeds': missing_seeds,
                'prototype_dim': int(prototype.shape[0]) if isinstance(prototype, np.ndarray) else 0,
                'prototype_norm': prototype_norm,
                'expanded_neighbors': ensure_serializable(expanded_words),
                'lexicon': {word: float(weight) for word, weight in sorted(category_states[cat_name]['lexicon'].items())},
            }
        print('Category seeds missing counts:', dict(seed_oov_counter))
        category_names = sorted(category_states.keys())
        category_columns = [f'cat_{name}' for name in category_names]
        print(f'Prepared {len(category_names)} categories: {category_names}')

    tw_mode = str(temporal_weighting or 'proportional').lower()
    if tw_mode not in {'proportional', 'none', 'midpoint'}:
        raise ValueError(f'Unsupported temporal weighting: {tw_mode}')

    seconds_bin_width = float(seconds_bin_width)
    if seconds_bin_width <= 0:
        raise ValueError('seconds_bin_width must be positive.')
    smoothing_method = str(smoothing_method or 'moving_average').lower()
    gaussian_sigma_seconds = gaussian_sigma_seconds if gaussian_sigma_seconds not in (None, '') else None
    smoothing_pad = str(smoothing_pad or 'edge').lower()
    if smoothing_pad not in {'edge', 'reflect'}:
        smoothing_pad = 'edge'

    embedding_cache: Dict[str, Optional[np.ndarray]] = {}
    event_records: List[Dict] = []
    tokens_with_embeddings = 0
    for word, onset, offset in story_events:
        token = word.strip()
        if not token:
            continue
        key = token.lower()
        if key not in embedding_cache:
            embedding_cache[key] = lookup_embedding(token, english_lookup, word2vec_model)
        emb = embedding_cache[key]
        emb_norm = float(np.linalg.norm(emb)) if emb is not None else None
        if emb is not None:
            tokens_with_embeddings += 1
        event_records.append({
            'word': token,
            'start': float(onset),
            'end': float(offset),
            'embedding': emb,
            'embedding_norm': emb_norm,
        })

    total_tokens = len(event_records)
    print(f'Tokens with embeddings: {tokens_with_embeddings}/{total_tokens} (OOV rate={(total_tokens - tokens_with_embeddings) / max(total_tokens, 1):.2%}).')
    if not event_records:
        raise ValueError('No token events available for category featurization.')

    max_end_time = max(rec['end'] for rec in event_records)
    canonical_edges = np.arange(0.0, max_end_time + seconds_bin_width, seconds_bin_width, dtype=float)
    if canonical_edges[-1] < max_end_time:
        canonical_edges = np.append(canonical_edges, canonical_edges[-1] + seconds_bin_width)
    if canonical_edges[-1] < max_end_time - 1e-9:
        canonical_edges = np.append(canonical_edges, canonical_edges[-1] + seconds_bin_width)
    assert np.all(np.diff(canonical_edges) > 0), 'Non-monotone canonical edges.'

    canonical_buckets = build_token_buckets(canonical_edges, event_records, tw_mode)
    empty_canonical = sum(1 for bucket in canonical_buckets if not bucket)
    print(f'Canonical bins without tokens: {empty_canonical}/{len(canonical_buckets)}')

    canonical_df_raw, canonical_matrix = score_time_series(
        canonical_edges,
        canonical_buckets,
        category_states,
        category_names,
        category_columns,
        category_score_method,
        index_name='bin_index',
    )
    canonical_values_raw = canonical_matrix.copy()
    smoothing_kernel = build_smoothing_kernel(
        seconds_bin_width,
        smoothing_seconds,
        method=smoothing_method,
        gaussian_sigma_seconds=gaussian_sigma_seconds,
    )
    smoothing_applied = smoothing_kernel.size > 1
    if canonical_values_raw.size and smoothing_applied:
        canonical_values_smoothed = apply_smoothing_kernel(canonical_values_raw, smoothing_kernel, pad_mode=smoothing_pad)
    else:
        canonical_values_smoothed = canonical_values_raw.copy()

    canonical_df_smoothed = canonical_df_raw.copy()
    if category_columns:
        canonical_df_smoothed.loc[:, category_columns] = canonical_values_smoothed
    canonical_df_selected = canonical_df_smoothed if smoothing_applied else canonical_df_raw

    if save_outputs:
        canonical_root.mkdir(parents=True, exist_ok=True)
        canonical_csv_path = canonical_root / 'category_timeseries_seconds.csv'
        canonical_df_selected.to_csv(canonical_csv_path, index=False)
        if smoothing_applied:
            canonical_df_raw.to_csv(canonical_root / 'category_timeseries_seconds_raw.csv', index=False)
        canonical_definition_path = canonical_root / 'category_definition.json'
        with canonical_definition_path.open('w') as fh:
            json.dump(ensure_serializable(category_definitions), fh, indent=2)
        print(f'Saved canonical story series to {canonical_csv_path}')

    tr_buckets = build_token_buckets(tr_edges, event_records, tw_mode)
    empty_tr = sum(1 for bucket in tr_buckets if not bucket)
    print(f'TRs without tokens: {empty_tr}/{len(tr_buckets)}')

    if category_columns:
        tr_values_raw = aggregate_seconds_to_edges(canonical_edges, canonical_values_raw, tr_edges)
        tr_values_smoothed = aggregate_seconds_to_edges(canonical_edges, canonical_values_smoothed, tr_edges)
    else:
        tr_values_raw = np.empty((len(tr_edges) - 1, 0), dtype=float)
        tr_values_smoothed = tr_values_raw

    base_index = np.arange(len(tr_edges) - 1, dtype=int)
    base_df = pd.DataFrame({'tr_index': base_index, 'start_sec': tr_edges[:-1], 'end_sec': tr_edges[1:]})
    category_df_raw = base_df.copy()
    category_df_smoothed = base_df.copy()
    if category_columns:
        category_df_raw.loc[:, category_columns] = tr_values_raw
        category_df_smoothed.loc[:, category_columns] = tr_values_smoothed
    category_df = category_df_smoothed if smoothing_applied else category_df_raw
    print(category_df.head())

    if category_score_method == 'similarity' and category_columns:
        finite_vals = category_df[category_columns].to_numpy(dtype=float)
        finite_vals = finite_vals[np.isfinite(finite_vals)]
        if finite_vals.size:
            assert np.nanmin(finite_vals) >= -1.0001 and np.nanmax(finite_vals) <= 1.0001, 'Similarity scores out of bounds.'
    else:
        if category_columns:
            assert (category_df[category_columns].fillna(0.0) >= -1e-9).all().all(), 'Count scores must be non-negative.'

    if save_outputs:
        output_root.mkdir(parents=True, exist_ok=True)
        category_csv_path = output_root / 'category_timeseries.csv'
        category_df.to_csv(category_csv_path, index=False)
        if smoothing_applied:
            category_df_raw.to_csv(output_root / 'category_timeseries_raw.csv', index=False)
        definition_path = output_root / 'category_definition.json'
        with definition_path.open('w') as fh:
            json.dump(ensure_serializable(category_definitions), fh, indent=2)
        print(f'Saved category time series to {category_csv_path}')

    trimmed_path = Path(paths.get('figs', 'figs')) / subject / story / 'day16_decoding' / 'semantic_pcs_trimmed.csv'
    max_lag_primary = 0
    trimmed_df = None
    if trimmed_path.exists():
        day16_trim = pd.read_csv(trimmed_path)
        expected_len = len(day16_trim)
        if len(day16_trim) > len(category_df):
            raise ValueError('Day16 trimmed series longer than category series; regenerate Day16 or rerun Day17.')
        max_lag_primary = max(0, len(category_df) - expected_len)
        trimmed_df = category_df.iloc[max_lag_primary:].reset_index(drop=True)
        if save_outputs:
            trimmed_out = trimmed_df.copy()
            trimmed_out.insert(0, 'trim_index', np.arange(len(trimmed_out), dtype=int))
            trimmed_out.drop(columns=['tr_index'], inplace=True, errors='ignore')
            trimmed_out.to_csv(output_root / 'category_timeseries_trimmed.csv', index=False)
            print(f'Saved trimmed category series to {output_root / "category_timeseries_trimmed.csv"}')
    else:
        warnings.warn('Day16 trimmed PCs not found; skipping auto-alignment.')

    smoothing_meta = {
        'applied': bool(smoothing_applied),
        'seconds': smoothing_seconds,
        'method': smoothing_method,
        'gaussian_sigma_seconds': float(gaussian_sigma_seconds) if gaussian_sigma_seconds is not None else None,
        'kernel_size': int(smoothing_kernel.size),
        'pad_mode': smoothing_pad,
        'bin_width_seconds': seconds_bin_width,
    }

    return {
        'subject': subject,
        'story': story,
        'temporal_weighting': tw_mode,
        'category_columns': category_columns,
        'category_states': category_states,
        'category_definitions': category_definitions,
        'category_score_method': category_score_method,
        'event_records': event_records,
        'canonical_buckets': canonical_buckets,
        'tr_buckets': tr_buckets,
        'canonical_df_raw': canonical_df_raw,
        'canonical_df_smoothed': canonical_df_smoothed,
        'canonical_df_selected': canonical_df_selected,
        'category_df_raw': category_df_raw,
        'category_df_smoothed': category_df_smoothed,
        'category_df_selected': category_df,
        'canonical_edges': canonical_edges,
        'tr_edges': tr_edges,
        'smoothing': smoothing_meta,
        'output_root': output_root,
        'canonical_root': canonical_root,
        'trimmed_df': trimmed_df,
        'max_lag_primary': max_lag_primary,
    }



In [6]:
# Build the category time series for the configured subject/story.
# Keyword options are gathered first so the call itself stays short.
_generation_options = {
    'cfg_base': cfg,
    'categories_cfg_base': categories_cfg,
    'cluster_csv_path': cluster_csv_path,
    'temporal_weighting': TEMPORAL_WEIGHTING,
    'prototype_weight_power': prototype_weight_power,
    'smoothing_seconds': SMOOTHING_SECONDS,
    'smoothing_method': SMOOTHING_METHOD,
    'gaussian_sigma_seconds': GAUSSIAN_SIGMA_SECONDS,
    'smoothing_pad': SMOOTHING_PAD_MODE,
    'seconds_bin_width': SECONDS_BIN_WIDTH,
    'save_outputs': SAVE_OUTPUTS,
}
result = generate_category_time_series(SUBJECT, STORY, **_generation_options)

# "Selected" frames are the smoothed series when smoothing was applied, else the raw ones.
canonical_df = result['canonical_df_selected']
tr_df = result['category_df_selected']

print()
print('Smoothing configuration:', result['smoothing'])

# Show the head of both time bases, each preceded by a blank line and a title.
for _preview_title, _preview_frame in (
    ('Canonical preview:', canonical_df),
    ('TR-aligned preview:', tr_df),
):
    print()
    print(_preview_title)
    display(_preview_frame.head())

# The trimmed frame only exists when the Day16 trimmed PCs were found on disk.
if result['trimmed_df'] is not None:
    print()
    print(f"Trimmed window length: {len(result['trimmed_df'])} (max_lag_primary={result['max_lag_primary']})")


=== Day19 category build for UTS01 / wheretheressmoke ===
Loaded 2308 transcript events.
TR edges: 302 (n_tr=301) spanning 602.00 seconds.
Loaded English1000 embeddings from /bucket/PaoU/seann/openneuro/ds003020/derivative/english1000sm.hf5 (vocab=10470).
Word2Vec fallback disabled.
Loaded 12 CSV-driven categories from configs/cluster_words.csv: ['abstract', 'communal', 'emotional', 'locational', 'mental', 'numeric', 'professional', 'social', 'tactile', 'temporal', 'violent', 'visual']
Tokens with embeddings: 1835/2308 (OOV rate=20.49%).
Canonical bins without tokens: 0/12039
Saved canonical story series to featuresqaemb/stories/wheretheressmoke/category_timeseries_seconds.csv
TRs without tokens: 0/301
   tr_index  start_sec  end_sec  cat_abstract  cat_communal  cat_emotional  \
0         0        0.0      2.0     -0.234710     -0.221588      -0.162214   
1         1        2.0      4.0     -0.118607     -0.186303       0.095760   
2         2        4.0      6.0     -0.212263     -0.3



Unnamed: 0,bin_index,start_sec,end_sec,cat_abstract,cat_communal,cat_emotional,cat_locational,cat_mental,cat_numeric,cat_professional,cat_social,cat_tactile,cat_temporal,cat_violent,cat_visual
0,0,0.0,0.05,-0.216557,-0.250232,-0.327978,0.025273,-0.158207,0.31243,-0.049614,-0.146466,0.182867,0.235947,0.005606,0.152835
1,1,0.05,0.1,-0.216704,-0.250805,-0.327851,0.024511,-0.157807,0.312208,-0.04984,-0.14655,0.182932,0.236174,0.006072,0.152925
2,2,0.1,0.15,-0.220967,-0.267412,-0.324147,0.002433,-0.146226,0.305769,-0.056401,-0.148991,0.184803,0.242755,0.019557,0.155537
3,3,0.15,0.2,-0.225229,-0.284018,-0.320444,-0.019644,-0.134644,0.29933,-0.062962,-0.151432,0.186674,0.249335,0.033043,0.15815
4,4,0.2,0.25,-0.229491,-0.300625,-0.316741,-0.041722,-0.123062,0.292892,-0.069524,-0.153874,0.188545,0.255916,0.046529,0.160762



TR-aligned preview:


Unnamed: 0,tr_index,start_sec,end_sec,cat_abstract,cat_communal,cat_emotional,cat_locational,cat_mental,cat_numeric,cat_professional,cat_social,cat_tactile,cat_temporal,cat_violent,cat_visual
0,0,0.0,2.0,-0.23471,-0.221588,-0.162214,-0.153373,-0.0245,0.133099,-0.027683,-0.016114,0.052948,0.200849,0.077543,0.088356
1,1,2.0,4.0,-0.118607,-0.186303,0.09576,-0.29771,0.190081,-0.08768,-0.039687,0.105637,-0.004579,0.141428,0.25306,-0.063777
2,2,4.0,6.0,-0.212263,-0.350677,-0.237075,-0.210927,-0.067724,0.175485,-0.137288,-0.094446,0.186172,0.192468,0.178671,0.208267
3,3,6.0,8.0,-0.088565,-0.428633,-0.301686,-0.10732,-0.111763,0.217664,-0.226084,-0.266789,0.35599,0.149258,0.138575,0.313628
4,4,8.0,10.0,-0.39403,-0.368272,-0.342657,-0.07614,-0.030331,0.223652,0.057729,-0.032521,0.109272,0.376442,0.144865,0.141703


In [7]:
# Build the per-token table and score every transcript token against each category
import numpy as np
import pandas as pd

if 'result' not in globals():
    raise RuntimeError('Run the generation cell first to populate `result`.')

_tokens_raw = result.get('event_records') or []
if not _tokens_raw:
    raise RuntimeError('No transcript events found - rerun upstream steps.')

_category_states = result.get('category_states') or {}
if not _category_states:
    raise RuntimeError('Category states missing from result; rerun the generation cell.')

# One row per token; derived timing columns are appended in a single assign().
_token_df = pd.DataFrame(_tokens_raw)[['word', 'start', 'end', 'embedding', 'embedding_norm']].copy()
_token_df = _token_df.assign(
    midpoint=0.5 * (_token_df['start'] + _token_df['end']),
    duration=_token_df['end'] - _token_df['start'],
    token_index=np.arange(len(_token_df)),
)

# Lightweight view without the embedding columns.
TOKEN_BASE_DF = _token_df[['token_index', 'word', 'start', 'end', 'midpoint', 'duration']].copy()

_score_method = str(result.get('category_score_method', 'similarity')).lower()
_use_lexicon_counts = _score_method == 'count'
_token_scores = {}
_abs_max = 0.0
for cat_name, state in _category_states.items():
    proto = state.get('prototype')
    proto_norm = state.get('prototype_norm') or 0.0
    lexicon = state.get('lexicon', {}) or {}

    per_token = []
    for rec in _tokens_raw:
        word = rec['word']
        value = np.nan  # default when the token cannot be scored
        if _use_lexicon_counts:
            # Lexicon lookup on the lower-cased word; NaN when the word is absent.
            value = lexicon.get(word.lower(), np.nan)
        else:
            emb = rec.get('embedding')
            emb_norm = rec.get('embedding_norm') or 0.0
            unusable = emb is None or proto is None or proto_norm <= 0 or emb_norm <= 0
            if not unusable:
                # Cosine similarity with the category prototype, clipped to [-1, 1].
                value = float(np.clip(np.dot(emb, proto) / (emb_norm * proto_norm), -1.0, 1.0))
        per_token.append(value)

    category_scores = np.array(per_token, dtype=float)
    _token_scores[cat_name] = category_scores

    # Track the largest finite magnitude seen across all categories.
    magnitudes = np.abs(category_scores[np.isfinite(category_scores)])
    if magnitudes.size:
        _abs_max = max(_abs_max, float(magnitudes.max()))

TOKEN_SCORE_CACHE = _token_scores
TOKEN_SCORE_METHOD = _score_method
TOKEN_SCORE_ABS_MAX = _abs_max if _abs_max > 0 else 1.0

# Remove the temporary names; the embedding-carrying frame is no longer needed
del _token_df, _tokens_raw


In [8]:
# Choose the 12 category columns shown in the 4x3 grid (edit as needed).
# An explicit KARAOKE_CATEGORY_COLUMNS wins; otherwise take the first
# KARAOKE_CATEGORY_COUNT columns produced by the generation cell.
category_cols_12 = (
    list(KARAOKE_CATEGORY_COLUMNS)
    if KARAOKE_CATEGORY_COLUMNS is not None
    else result['category_columns'][:KARAOKE_CATEGORY_COUNT]
)

# The grid layout is fixed, so anything other than 12 columns is an error.
if not len(category_cols_12) == 12:
    raise ValueError(f'Expected 12 categories, got {len(category_cols_12)}.')

print('Selected categories:', category_cols_12)


Selected categories: ['cat_abstract', 'cat_communal', 'cat_emotional', 'cat_locational', 'cat_mental', 'cat_numeric', 'cat_professional', 'cat_social', 'cat_tactile', 'cat_temporal', 'cat_violent', 'cat_visual']


In [9]:
import numpy as np
import pandas as pd
import matplotlib.animation as animation
import matplotlib.patches as mpatches
import time

# `plt` is set to None by the setup cell when `matplotlib.pyplot` fails to import;
# stop here with a clear message instead of failing later inside the video function.
if plt is None:
    raise RuntimeError('Matplotlib unavailable in this environment.')


def make_karaoke_category_video(
    *,
    result: dict,
    token_df: pd.DataFrame,
    category_cols: list,
    out_mp4: str = "karaoke_categories.mp4",
    use_domain: str = "tr",          # "tr" or "canonical"
    fps: int = 30,
    window_sec: float = 30.0,        # how many seconds visible at once
    pad_left_sec: float = 0.5,
    pad_right_sec: float = 2.0,
    log_every_frames: int | None = None,
    playback_speed: float = 1.0,
    ypad_frac: float = 0.05,
    bin_words_max: int | None = 30,
    zscore: bool = True,
):
    '''
    Creates an MP4 where the top shows words within the active bin and bottom is a 4x3 grid.
    Assumes token_df has columns: word,start,end,midpoint (like your TOKEN_BASE_DF).
    '''

    assert use_domain in {"tr", "canonical"}
    assert len(category_cols) == 12, "Pass exactly 12 categories for a 4x3 grid."

    if playback_speed <= 0:
        raise ValueError("playback_speed must be > 0.")
    if ypad_frac < 0:
        raise ValueError("ypad_frac must be >= 0.")
    if bin_words_max is not None and bin_words_max <= 0:
        raise ValueError("bin_words_max must be > 0 or None.")

    if not isinstance(zscore, bool):
        raise ValueError("zscore must be a boolean.")

    if log_every_frames is None:
        log_every_frames = max(1, int(fps * 5))

    # --- Choose time base and series ---
    if use_domain == "canonical":
        df = result["canonical_df_selected"]
        t = 0.5 * (result["canonical_edges"][:-1] + result["canonical_edges"][1:])
        edges = result["canonical_edges"]
    else:
        df = result["category_df_selected"]
        t = result["tr_edges"][:-1]
        edges = result["tr_edges"]

    t = np.asarray(t, dtype=float)
    edges = np.asarray(edges, dtype=float)

    # Determine video duration from transcript
    t_end = float(token_df["end"].max())
    transcript_duration = t_end + pad_right_sec
    duration = transcript_duration / playback_speed
    n_frames = int(np.ceil(duration * fps))

    # Pre-extract series arrays for speed
    Y = np.vstack([df[c].to_numpy(dtype=float) for c in category_cols])  # shape (12, T)

    if zscore:
        means = np.nanmean(Y, axis=1, keepdims=True)
        stds = np.nanstd(Y, axis=1, keepdims=True)
        stds = np.where(np.isfinite(stds) & (stds > 0), stds, 1.0)
        Y = (Y - means) / stds

    # Robust y-lims per subplot (full range with padding)
    ylims = []
    for i in range(12):
        vals = Y[i]
        finite = vals[np.isfinite(vals)]
        if finite.size == 0:
            ylims.append((-1, 1))
        else:
            lo = float(np.min(finite))
            hi = float(np.max(finite))
            if np.isclose(lo, hi):
                pad = 1.0 if lo == 0 else abs(lo) * 0.1
            else:
                pad = (hi - lo) * ypad_frac
            ylims.append((lo - pad, hi + pad))

    # --- Figure layout: top karaoke + 4x3 plots ---
    fig = plt.figure(figsize=(16, 9), dpi=150)
    gs = fig.add_gridspec(5, 3, height_ratios=[0.65, 1, 1, 1, 1], hspace=0.35, wspace=0.25)

    ax_text = fig.add_subplot(gs[0, :])
    ax_text.axis("off")

    axes = []
    for r in range(1, 5):
        for c in range(3):
            axes.append(fig.add_subplot(gs[r, c]))

    # Initialize lines
    lines = []
    cursors = []
    highlights = []
    for i, ax in enumerate(axes):
        ax.set_title(category_cols[i].replace("cat_", ""), fontsize=10)
        ax.set_xlim(0, window_sec)
        ax.set_ylim(*ylims[i])
        ax.grid(True, alpha=0.25)
        (ln,) = ax.plot([], [], linewidth=1.6)
        cursor = ax.axvline(0, linewidth=1.2, alpha=0.9)
        highlight = mpatches.Rectangle(
            (0, 0), 0, 1,
            transform=ax.get_xaxis_transform(),
            facecolor="#f4c542",
            alpha=0.25,
            zorder=0,
        )
        ax.add_patch(highlight)
        lines.append(ln)
        cursors.append(cursor)
        highlights.append(highlight)

    # Text artists
    txt_bin = ax_text.text(0.01, 0.72, "", fontsize=16, fontweight="bold", va="center", ha="left")
    txt_words = ax_text.text(0.01, 0.36, "", fontsize=14, va="center", ha="left")
    txt_sub  = ax_text.text(0.01, 0.10, "", fontsize=11, va="center", ha="left", alpha=0.8)

    words = token_df["word"].astype(str).tolist()
    starts = token_df["start"].to_numpy(dtype=float)
    ends = token_df["end"].to_numpy(dtype=float)

    start_time = None

    def _bin_index(t_now: float) -> int:
        idx = int(np.searchsorted(edges, t_now, side="right") - 1)
        return int(np.clip(idx, 0, len(edges) - 2))

    def _format_bin_words(bin_words: list[str]) -> str:
        if not bin_words:
            return "(no words in bin)"
        if bin_words_max is not None and len(bin_words) > bin_words_max:
            shown = bin_words[:bin_words_max]
            return " ".join(shown) + " ..."
        return " ".join(bin_words)

    def init():
        for ln in lines:
            ln.set_data([], [])
        txt_bin.set_text("")
        txt_words.set_text("")
        txt_sub.set_text("")
        return lines + cursors + highlights + [txt_bin, txt_words, txt_sub]

    def update(frame):
        nonlocal start_time
        if start_time is None:
            start_time = time.perf_counter()

        t_now = (frame / fps) * playback_speed

        # sliding window
        w0 = max(0.0, t_now - window_sec + pad_left_sec)
        w1 = w0 + window_sec

        # slice series in the window (by time array)
        mask = (t >= w0) & (t <= w1)
        if not np.any(mask):
            return lines + cursors + highlights + [txt_bin, txt_words, txt_sub]

        tw = t[mask] - w0  # shift to window coords

        bin_idx = _bin_index(t_now)
        bin_start = float(edges[bin_idx])
        bin_end = float(edges[bin_idx + 1])

        view_start = max(bin_start, w0)
        view_end = min(bin_end, w1)
        span_start = view_start - w0
        span_width = max(0.0, view_end - view_start)

        for i in range(12):
            yw = Y[i, mask]
            lines[i].set_data(tw, yw)
            cursors[i].set_xdata([t_now - w0, t_now - w0])
            axes[i].set_xlim(0, window_sec)
            highlights[i].set_x(span_start)
            highlights[i].set_width(span_width)

        bin_mask = (starts < bin_end) & (ends > bin_start)
        bin_words = [words[i] for i in np.where(bin_mask)[0]]
        words_text = _format_bin_words(bin_words)

        txt_bin.set_text(
            f"Bin {bin_start:6.2f}-{bin_end:6.2f}s | {len(bin_words)} words"
        )
        txt_words.set_text(words_text)
        txt_sub.set_text(
            f"t = {t_now:6.2f}s | bin {bin_idx + 1}/{len(edges) - 1} | domain: {use_domain}"
        )

        if frame % log_every_frames == 0 or frame == n_frames - 1:
            elapsed = time.perf_counter() - start_time
            progress = (frame + 1) / n_frames
            eta = elapsed / progress - elapsed if progress > 0 else float('inf')
            print(
                f"Frame {frame + 1}/{n_frames} "
                f"({progress * 100:.1f}%) | "
                f"elapsed {elapsed / 60:.1f}m | ETA {eta / 60:.1f}m"
            )

        return lines + cursors + highlights + [txt_bin, txt_words, txt_sub]

    anim = animation.FuncAnimation(
        fig, update, init_func=init,
        frames=n_frames, interval=1000/fps, blit=False
    )

    writer = animation.FFMpegWriter(
        fps=fps,
        codec="libx264",
        bitrate=5000,
        extra_args=[
            "-pix_fmt", "yuv420p",
            "-profile:v", "baseline",
            "-level", "3.0",
            "-movflags", "+faststart"
        ]
    )
    anim.save(out_mp4, writer=writer)
    plt.close(fig)
    print(f"Saved: {out_mp4}")


In [10]:
import shutil

# Rendering needs both matplotlib and an ffmpeg binary on PATH; check before drawing any frame.
if plt is None:
    raise RuntimeError('Matplotlib unavailable in this environment.')

_ffmpeg_path = shutil.which('ffmpeg')
if _ffmpeg_path is None:
    raise RuntimeError('ffmpeg not found on PATH. Install ffmpeg to write MP4 files.')

# All rendering options come from the KARAOKE_* configuration constants.
_karaoke_options = {
    'result': result,
    'token_df': TOKEN_BASE_DF,
    'category_cols': category_cols_12,
    'out_mp4': KARAOKE_OUTPUT,
    'use_domain': KARAOKE_USE_DOMAIN,
    'fps': KARAOKE_FPS,
    'window_sec': KARAOKE_WINDOW_SEC,
    'pad_left_sec': KARAOKE_PAD_LEFT_SEC,
    'pad_right_sec': KARAOKE_PAD_RIGHT_SEC,
    'log_every_frames': KARAOKE_LOG_EVERY_FRAMES,
    'playback_speed': KARAOKE_PLAYBACK_SPEED,
    'ypad_frac': KARAOKE_YLIM_PAD_FRAC,
    'bin_words_max': KARAOKE_BIN_WORDS_MAX,
    'zscore': KARAOKE_ZSCORE,
}
make_karaoke_category_video(**_karaoke_options)


Frame 1/302 (0.3%) | elapsed 0.0m | ETA 0.0m
Frame 6/302 (2.0%) | elapsed 0.1m | ETA 2.7m
Frame 11/302 (3.6%) | elapsed 0.1m | ETA 2.5m
Frame 16/302 (5.3%) | elapsed 0.1m | ETA 2.4m
Frame 21/302 (7.0%) | elapsed 0.2m | ETA 2.5m
Frame 26/302 (8.6%) | elapsed 0.2m | ETA 2.4m
Frame 31/302 (10.3%) | elapsed 0.3m | ETA 2.5m
Frame 36/302 (11.9%) | elapsed 0.3m | ETA 2.4m
Frame 41/302 (13.6%) | elapsed 0.4m | ETA 2.3m
Frame 46/302 (15.2%) | elapsed 0.4m | ETA 2.3m
Frame 51/302 (16.9%) | elapsed 0.5m | ETA 2.2m
Frame 56/302 (18.5%) | elapsed 0.5m | ETA 2.2m
Frame 61/302 (20.2%) | elapsed 0.5m | ETA 2.2m
Frame 66/302 (21.9%) | elapsed 0.6m | ETA 2.1m
Frame 71/302 (23.5%) | elapsed 0.6m | ETA 2.1m
Frame 76/302 (25.2%) | elapsed 0.7m | ETA 2.0m
Frame 81/302 (26.8%) | elapsed 0.7m | ETA 2.0m
Frame 86/302 (28.5%) | elapsed 0.8m | ETA 1.9m
Frame 91/302 (30.1%) | elapsed 0.8m | ETA 1.9m
Frame 96/302 (31.8%) | elapsed 0.9m | ETA 1.9m
Frame 101/302 (33.4%) | elapsed 0.9m | ETA 1.8m
Frame 106/302 (35.1%

## Next steps

- If you want different categories, set `KARAOKE_CATEGORY_COLUMNS` to an explicit list of 12.
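- For a faster preview, reduce `KARAOKE_FPS` or raise `KARAOKE_PLAYBACK_SPEED` (both lower the number of rendered frames); reducing `KARAOKE_WINDOW_SEC` only trims how much data is drawn per frame.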
- The output MP4 path is set by `KARAOKE_OUTPUT`.
