# QAEmb per-question trajectories (UTS01 / wheretheressmoke)

Load QAEmb outputs, optionally apply post-temporal normalization (z-score or robust z-score across time), and emit per-question PNGs plus a contact sheet.

In [1]:
from pathlib import Path
import json
import math
import numpy as np
import pandas as pd
import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

# --- Config ---
# Root of the QAEmb feature tree and the subject/story pair to visualize.
FEATURES_ROOT = Path("/flash/PaoU/seann/fmri-edm-ccm/features_qaemb/featuresqaemb/qaemb")
SUBJECT = "UTS01"
STORY = "wheretheressmoke"

# Which QAEmb time series to load:
#   True  => qaemb_timeseries_seconds.csv
#   False => qaemb_timeseries.csv (TR grid)
USE_SECONDS = True

# Post-temporal normalization for EDM/MDE; "none" keeps the raw, interpretable values.
NORMALIZATION = "none"  # one of: none, zscore_time, robust_zscore_time
NAN_FRAC_DROP = None  # e.g. 0.2 drops columns with >20% NaN after normalization; None keeps all
CLIP_SIGMA = None  # e.g. 5 clips z to [-5, 5]; None disables clipping
HANDLE_CONSTANT = "keep_zero"  # one of: keep_zero, drop, warn_keep

# --- Paths ---
# The seconds-grid CSV lives under stories/<story>; the TR-grid CSV lives under
# subjects/<subject>/<story>. Plots for both go under plots/<subject>/<story>.
_plots_root = FEATURES_ROOT / "plots" / SUBJECT / STORY
if USE_SECONDS:
    csv_filename = "qaemb_timeseries_seconds.csv"
    csv_path = FEATURES_ROOT / "stories" / STORY / csv_filename
    output_dir_raw = _plots_root / "per_question_seconds"
else:
    csv_filename = "qaemb_timeseries.csv"
    csv_path = FEATURES_ROOT / "subjects" / SUBJECT / STORY / csv_filename
    output_dir_raw = _plots_root / "per_question_tr"
questions_path = FEATURES_ROOT / "tokens" / SUBJECT / f"{STORY}_qaemb_questions.json"
# Normalized plots go to a sibling directory of the raw plots.
output_dir_z = output_dir_raw.with_name(output_dir_raw.name + "_zscored")

# The normalized output directory is only created (and reported) when needed.
_normalization_enabled = NORMALIZATION != "none"

output_dir_raw.mkdir(parents=True, exist_ok=True)
if _normalization_enabled:
    output_dir_z.mkdir(parents=True, exist_ok=True)

for _line in (
    f"CSV: {csv_path}",
    f"Questions: {questions_path}",
    f"Saving raw plots to: {output_dir_raw}",
):
    print(_line)
if _normalization_enabled:
    print(f"Saving z-scored plots to: {output_dir_z}")

CSV: /flash/PaoU/seann/fmri-edm-ccm/features_qaemb/featuresqaemb/qaemb/stories/wheretheressmoke/qaemb_timeseries_seconds.csv
Questions: /flash/PaoU/seann/fmri-edm-ccm/features_qaemb/featuresqaemb/qaemb/tokens/UTS01/wheretheressmoke_qaemb_questions.json
Saving raw plots to: /flash/PaoU/seann/fmri-edm-ccm/features_qaemb/featuresqaemb/qaemb/plots/UTS01/wheretheressmoke/per_question_seconds


## Normalization (post-temporal)
Each feature column is normalized independently across time, after the temporal series has been constructed (EDM/MDE-friendly).

In [None]:
def normalize_features(df: pd.DataFrame, method: str = "none", eps: float = 1e-8, clip_sigma: float | None = None, handle_constant: str = "keep_zero"):
    """Normalize features column-wise across time (post-temporal).

    method: none | zscore_time | robust_zscore_time
    handle_constant: keep_zero -> output zeros; drop -> drop column; warn_keep -> keep original values.
    eps avoids divide-by-zero; clip_sigma optionally clamps z-scores for stability.
    """
    method = (method or "none").lower()
    handle_constant = (handle_constant or "keep_zero").lower()
    if method not in {"none", "zscore_time", "robust_zscore_time"}:
        raise ValueError(f"Unsupported normalization method: {method}")
    if handle_constant not in {"keep_zero", "drop", "warn_keep"}:
        raise ValueError(f"Unsupported handle_constant: {handle_constant}")
    if method == "none":
        return df.copy(), {"dropped": [], "constant": []}

    data = df.replace([np.inf, -np.inf], np.nan)
    dropped = []
    constant = []
    out = pd.DataFrame(index=data.index)
    for col in data.columns:
        series = data[col]
        if method == "zscore_time":
            center = series.mean(skipna=True)
            scale = series.std(skipna=True)
        else:
            center = series.median(skipna=True)
            mad = (series - center).abs().median(skipna=True)
            scale = 1.4826 * mad
        if scale is None or scale < eps or np.isnan(scale):
            constant.append(col)
            if handle_constant == "drop":
                dropped.append(col)
                continue
            if handle_constant == "keep_zero":
                out[col] = 0.0
                continue
            out[col] = series  # warn_keep
            continue
        z = (series - center) / (scale + eps)
        if clip_sigma is not None:
            z = z.clip(lower=-clip_sigma, upper=clip_sigma)
        out[col] = z
    return out, {"dropped": dropped, "constant": constant}

In [None]:
# Decode the questions JSON as UTF-8 explicitly so the result does not depend
# on the platform's locale encoding.
questions = json.loads(questions_path.read_text(encoding="utf-8"))
df = pd.read_csv(csv_path)

# Choose the time axis: a known column name wins, then any column whose name
# contains a time-like keyword, and finally the first column of the CSV.
time_keywords = ("time", "second", "sec")
candidates = [c for c in df.columns if any(tok in c.lower() for tok in time_keywords)]
preferred = [c for c in ("time_s", "time", "seconds", "start_sec", "end_sec") if c in df.columns]
if preferred:
    time_col = preferred[0]
elif candidates:
    time_col = candidates[0]
else:
    time_col = df.columns[0]

# Everything that is not the time axis or a bin bookkeeping column is a QA feature.
qa_cols = [c for c in df.columns if c not in {time_col, "bin_index", "end_sec", "start_sec"}]

# The plots pair columns with questions by position, so the counts must agree.
if len(qa_cols) != len(questions):
    raise ValueError(f"QA column count ({len(qa_cols)}) does not match questions ({len(questions)})")

time_values = df[time_col].to_numpy()
print(f"Detected time column: {time_col}")
print(f"Loaded {len(questions)} questions and {len(df)} timepoints")

In [2]:
# Raw feature matrix (QA columns only); the normalized counterpart stays None
# unless a normalization mode is selected.
feature_df_raw = df[qa_cols].copy()
feature_df_norm = None
norm_meta = {"dropped": [], "constant": []}

if NORMALIZATION != "none":
    feature_df_norm, norm_meta = normalize_features(
        feature_df_raw,
        method=NORMALIZATION,
        eps=1e-8,
        clip_sigma=CLIP_SIGMA,
        handle_constant=HANDLE_CONSTANT,
    )
    if NAN_FRAC_DROP is not None:
        # Drop features whose fraction of NaN samples exceeds the threshold.
        nan_frac = feature_df_norm.isna().mean()
        drop_cols = nan_frac[nan_frac > NAN_FRAC_DROP].index.tolist()
        if drop_cols:
            feature_df_norm = feature_df_norm.drop(columns=drop_cols)
            norm_meta["dropped"] = sorted(set(norm_meta.get("dropped", [])) | set(drop_cols))

print("Normalization summary:")
print(f"  mode: {NORMALIZATION}")
print(f"  features raw: {feature_df_raw.shape[1]}")
if feature_df_norm is not None:
    print(f"  features normalized: {feature_df_norm.shape[1]}")
    print(f"  constant cols: {norm_meta.get('constant', [])}")
    print(f"  dropped cols: {norm_meta.get('dropped', [])}")
    # Spot-check the first few surviving features before/after normalization.
    for col in qa_cols[:3]:
        if col in feature_df_norm:
            before_mean, before_std = feature_df_raw[col].mean(skipna=True), feature_df_raw[col].std(skipna=True)
            after_mean, after_std = feature_df_norm[col].mean(skipna=True), feature_df_norm[col].std(skipna=True)
            print(f"  {col}: mean/std before=({before_mean:.4f}, {before_std:.4f}) after=({after_mean:.4f}, {after_std:.4f})")

if NORMALIZATION != "none" and feature_df_norm is not None:
    # Features removed during normalization must not stay in the saved file
    # with their raw values, otherwise the CSV mixes raw and normalized columns.
    removed_cols = [c for c in qa_cols if c not in feature_df_norm.columns]
    df_norm_out = df.drop(columns=removed_cols)
    for col in feature_df_norm.columns:
        df_norm_out[col] = feature_df_norm[col]
    suffix = NORMALIZATION
    # Build the name from stem + suffix so only the extension position is touched.
    norm_csv_name = f"{csv_path.stem}_{suffix}{csv_path.suffix}"
    norm_csv_path = csv_path.with_name(norm_csv_name)
    df_norm_out.to_csv(norm_csv_path, index=False)
    print(f"Saved normalized CSV: {norm_csv_path}")

NameError: name 'df' is not defined

In [None]:
def plot_series(series_df: pd.DataFrame, out_dir: Path, label: str):
    """Write one PNG per question showing its time series.

    Walks the module-level ``qa_cols`` / ``questions`` pairs in order, skips
    any column missing from ``series_df``, and saves each figure as
    ``q<idx>.png`` in ``out_dir`` (created if needed). ``label`` is appended
    to every plot title. The x-axis uses the module-level ``time_values``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    present = series_df.columns
    for q_index, (feature, question_text) in enumerate(zip(qa_cols, questions)):
        if feature not in present:
            continue
        # The same zero-padded number is used in the title and the file name.
        number = f"{q_index:02d}"
        values = series_df[feature].to_numpy()

        figure, axis = plt.subplots(figsize=(10, 3))
        axis.plot(time_values, values, linewidth=1.4)
        axis.set_title(f"Q{number}: {question_text} ({label})")
        axis.set_xlabel("Time (s)")
        axis.set_ylabel("QAEmb value")
        axis.grid(True, alpha=0.3)
        figure.tight_layout()

        figure.savefig(out_dir / f"q{number}.png", dpi=180)
        # Close explicitly so figures do not accumulate across the loop.
        plt.close(figure)
    print(f"Saved per-question plots: {out_dir}")


def plot_contact(series_df: pd.DataFrame, out_dir: Path, label: str, max_plots: int = 12):
    """Save a grid ("contact sheet") of the first few question time series.

    Args:
        series_df: Feature frame; only columns also listed in the module-level
            ``qa_cols`` are plotted, in ``qa_cols`` order.
        out_dir: Directory for ``contact_sheet.png`` (created if needed).
        label: Text appended to every panel title.
        max_plots: Upper bound on the number of panels; negative values are
            treated as zero.

    Panels are laid out three per row with a shared x-axis taken from the
    module-level ``time_values``; unused grid cells are hidden.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # Remember each column's position in qa_cols so a panel's "Qnn" title keeps
    # naming the same question even when some columns are absent from series_df.
    indexed_cols = [(idx, c) for idx, c in enumerate(qa_cols) if c in series_df.columns]
    max_plots = max(0, min(max_plots, len(indexed_cols)))
    n_cols = 3
    # Always keep at least one row so plt.subplots has a valid grid.
    n_rows = math.ceil(max_plots / n_cols) if max_plots else 1
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), sharex=True)
    axes = np.array(axes).reshape(-1)

    for i, ax in enumerate(axes):
        if i >= max_plots:
            ax.axis("off")
            continue
        q_idx, col = indexed_cols[i]

        ax.plot(time_values, series_df[col].to_numpy(), linewidth=1.0)
        ax.set_title(f"Q{q_idx:02d} ({label})")
        # Only the bottom row carries the x-axis label.
        if i // n_cols == n_rows - 1:
            ax.set_xlabel("Time (s)")
        ax.set_ylabel("QAEmb")
        ax.grid(True, alpha=0.2)

    fig.tight_layout()
    contact_path = out_dir / "contact_sheet.png"
    fig.savefig(contact_path, dpi=180)
    plt.close(fig)
    print(f"Saved contact sheet: {contact_path}")


# Raw plots are always produced; the normalized set is added only when a
# normalization mode is active and produced a frame.
render_jobs = [(feature_df_raw, output_dir_raw, "raw")]
if NORMALIZATION != "none" and feature_df_norm is not None:
    render_jobs.append((feature_df_norm, output_dir_z, NORMALIZATION))

# For each job: per-question PNGs first, then the contact sheet.
for frame, target_dir, tag in render_jobs:
    plot_series(frame, target_dir, label=tag)
    plot_contact(frame, target_dir, label=tag)