In [None]:
# cell 0.0 — ENVIRONMENT LOCK (install once, then RESTART RUNTIME)
import os, sys, json, subprocess, platform, time, textwrap

# Pinned, Py3.12-safe wheels (avoid source builds and hotfix drift)
REQ = {
    "numpy": "1.26.4",
    "pandas": "2.2.2",
    "scipy": "1.13.1",
    "scikit-learn": "1.4.2",
    "pyarrow": "15.0.2",
    "numba": "0.60.0",
    "llvmlite": "0.43.0",
    "umap-learn": "0.5.6",
    "hdbscan": "0.8.36",
    "bertopic": "0.16.3",
    "sentence-transformers": "3.0.1",
    "rapidfuzz": "3.9.6",
    "nltk": "3.8.1",
    "matplotlib": "3.8.4",
    "tqdm": "4.66.4",
}

# (Optional) keep HF cache predictable; installs won’t use pip cache to save disk
os.environ.setdefault("TRANSFORMERS_CACHE", "/content/.cache/hf")
os.makedirs("outputs/_env", exist_ok=True)

pkgs = [f"{k}=={v}" for k, v in REQ.items()]
print("Installing pinned stack … (no pip cache, wheels only)")
cmd = [sys.executable, "-m", "pip", "install",
       "--upgrade", "--quiet", "--no-input",
       "--no-cache-dir", "--only-binary=:all:"] + pkgs

# Capture pip's output instead of letting the child process write to the kernel's
# own file descriptors, so the reason for a failure can be echoed into the cell output.
proc = subprocess.run(cmd, capture_output=True, text=True)
rc = proc.returncode
if rc != 0:
    # Show only the tail: pip's actionable error is at the end of its log.
    pip_log = (proc.stderr or proc.stdout or "").strip()
    if pip_log:
        print("\n".join(pip_log.splitlines()[-25:]), file=sys.stderr)
    raise SystemExit(
        f"pip install failed with exit code {rc}. "
        "Check the pip output above, your internet connection and free disk space, "
        "then run this cell again."
    )

# Record what was requested (not what pip resolved) together with the interpreter/platform.
lock = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "packages": REQ,
    "ts": int(time.time()),
}
with open("outputs/_env/lock.json", "w") as f:
    json.dump(lock, f, indent=2)

print("✓ Wrote outputs/_env/lock.json")
print(textwrap.dedent("""
    IMPORTANT — ACTION REQUIRED
    1) Restart the runtime now (e.g., Runtime ▸ Restart runtime).
    2) Then run 0.1 and 0.2 before any module cells.
""").strip())

# Hard stop to enforce restart
raise SystemExit(0)


Installing pinned stack … (no pip cache, wheels only)


SystemExit: pip install failed with exit code 1. Run this cell again after checking internet space/availability.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# cell 0.1 — RUNTIME GUARD + THREAD/SEED SETUP (run after restart)
import os, json, random
from pathlib import Path

assert Path("outputs/_env/lock.json").exists(), "Missing outputs/_env/lock.json — run 0.0 first."

# Thread caps & quiet tokenizers (stability/determinism).
# These must be in the environment BEFORE numpy (and its BLAS/OpenMP backend) is first
# imported; the native thread pools read them when the library loads. If numpy was already
# imported earlier in this kernel session the caps may not take effect until a restart.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_MAX_THREADS", "1")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import numpy as np  # deliberately imported after the thread caps above

# Determinism
SEED = int(os.environ.get("LSA_SEED", "42"))
random.seed(SEED); np.random.seed(SEED)

# If PyTorch is installed later modules will use it — seed it too (no-op if not available)
try:
    import torch
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
except Exception:
    # Best-effort: torch is optional here, and a broken install should not block the guard.
    pass

# Non-interactive plotting & Parquet behavior
import matplotlib
matplotlib.use("Agg")
os.environ.setdefault("PYARROW_IGNORE_TIMEZONE", "1")

# Keep HF cache in a predictable place
os.environ.setdefault("TRANSFORMERS_CACHE", "/content/.cache/hf")

print("Guard OK. SEED =", SEED)


AssertionError: Missing outputs/_env/lock.json — run 0.0 first.

In [None]:
# cell 0.2 — HEAVY LIBS IMPORT CHECK (must pass before Module 6+)
import importlib, json
from importlib import metadata as importlib_metadata

# (import name, pip distribution name) pairs that must be importable before Module 6+.
to_check = [
    ("numpy", "numpy"),
    ("pandas", "pandas"),
    ("pyarrow", "pyarrow"),
    ("sklearn", "scikit-learn"),
    ("numba", "numba"),
    ("llvmlite", "llvmlite"),
    ("umap", "umap-learn"),
    ("hdbscan", "hdbscan"),
    ("bertopic", "bertopic"),
    ("rapidfuzz", "rapidfuzz"),
    ("nltk", "nltk"),
    ("matplotlib", "matplotlib"),
]

present, missing = {}, {}

for mod_name, pkg_label in to_check:
    try:
        mod = importlib.import_module(mod_name)
    except Exception as e:
        missing[mod_name] = f"{pkg_label} (import name '{mod_name}') — {type(e).__name__}: {e}"
        continue
    # Prefer the installed distribution's metadata: not every package exposes
    # __version__ (hdbscan reported "n/a" with the attribute-based lookup).
    try:
        present[mod_name] = importlib_metadata.version(pkg_label)
    except importlib_metadata.PackageNotFoundError:
        present[mod_name] = getattr(mod, "__version__", "n/a")

print("Resolved versions:")
print(json.dumps(present, indent=2, sort_keys=True))

if missing:
    print("\nMissing or failed imports:")
    for k, v in missing.items():
        print(f" - {v}")

# light compatibility hint only (no hard fail)
# numba and llvmlite are versioned independently: the pinned pair in cell 0.0 is
# numba 0.60 with llvmlite 0.43, so comparing for *equal* minor versions always warns.
# NOTE(review): the fixed offset of 17 between the two minor numbers matches recent
# releases; confirm against numba's version-support table when bumping the pins.
if "numba" in present and "llvmlite" in present:
    try:
        nb_minor = int(present["numba"].split(".")[1])
        ll_minor = int(present["llvmlite"].split(".")[1])
    except (IndexError, ValueError):
        nb_minor = ll_minor = None  # unparsable version string: skip the hint
    if nb_minor is not None and ll_minor != nb_minor - 17:
        print(f"\n[warn] numba ({present['numba']}) and llvmlite ({present['llvmlite']}) do not look like "
              f"a matched release pair; if UMAP/HDBSCAN complain, reinstall as a matched pair.")

assert not missing, (
    "Some required libraries are missing. "
    "Run the module-specific install cells first (e.g., 6.1 for BERTopic stack, 7.1 for rapidfuzz)."
)
print("Heavy libs check: OK")


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


Resolved versions:
{
  "hdbscan": "n/a",
  "llvmlite": "0.43.0",
  "matplotlib": "3.10.0",
  "nltk": "3.9.1",
  "numba": "0.60.0",
  "numpy": "2.0.2",
  "pandas": "2.2.2",
  "pyarrow": "18.1.0",
  "sklearn": "1.6.1",
  "umap": "0.5.9.post2"
}

Missing or failed imports:
 - bertopic (import name 'bertopic') — ModuleNotFoundError: No module named 'bertopic'
 - rapidfuzz (import name 'rapidfuzz') — ModuleNotFoundError: No module named 'rapidfuzz'

[warn] numba (0.60.0) vs llvmlite (0.43.0) minor versions differ; if UMAP/HDBSCAN complain, reinstall as a matched pair.


AssertionError: Some required libraries are missing. Run the module-specific install cells first (e.g., 6.1 for BERTopic stack, 7.1 for rapidfuzz).

In [None]:
# cell 0.3 — OPTIONAL: PURGE BUILD/WHEEL CACHES (saves disk)
import os, sys, shutil, subprocess, json
from pathlib import Path

def _dir_size(p: Path) -> int:
    try:
        return sum(f.stat().st_size for f in p.rglob("*") if f.is_file())
    except Exception:
        return 0

# Directories that will be measured and then deleted; filled by the two discovery steps below.
targets = []

# 1) pip cache (newer pip has 'pip cache purge'; else delete the dir)
# Ask pip where its cache lives; if that subcommand fails, assume ~/.cache/pip.
try:
    out = subprocess.check_output([sys.executable, "-m", "pip", "cache", "dir"], text=True).strip()
    pip_cache_dir = Path(out)
except Exception:
    pip_cache_dir = Path.home() / ".cache" / "pip"

if pip_cache_dir.exists():
    targets.append(pip_cache_dir)

# 2) transient pip build/metadata dirs
for p in ["/tmp/pip-req-build", "/tmp/pip-modern-metadata", "/tmp/pip-unpack"]:
    if os.path.isdir(p):
        targets.append(Path(p))

# Measure before
# Sizes are taken before anything is removed; the reported figure is therefore an
# estimate of what was freed, not a measured before/after difference.
before = {str(p): _dir_size(p) for p in targets}
total_before = sum(before.values())

# Try the official purge first (no failure if unsupported)
try:
    subprocess.run([sys.executable, "-m", "pip", "cache", "purge"], check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception:
    pass

# Hard delete any remaining target dirs (idempotent)
# ignore_errors=True: a partially failed delete is not reported and still counts in the estimate.
for p in targets:
    if p.exists():
        shutil.rmtree(p, ignore_errors=True)

# Report
freed_mb = total_before / (1024**2)
# Free space is read for the filesystem mounted at "/".
disk_total, disk_used, disk_free = shutil.disk_usage("/")
print(json.dumps({
    "purged_paths": list(before.keys()),
    "freed_mb_estimate": round(freed_mb, 2),
    "disk_free_mb": round(disk_free/(1024**2), 2),
    "note": "Installed packages remain intact. HF model cache left untouched."
}, indent=2))


In [None]:
# cell 0.4 — foundations: status logging & self-report
from __future__ import annotations
import datetime, json
from pathlib import Path
from typing import Dict, Any, Optional

# Resolve outputs/ base even if "paths" isn't defined yet
# Uses paths.out_dir when a global named "paths" exists and has that attribute,
# otherwise falls back to ./outputs.
# NOTE(review): .mkdir below assumes paths.out_dir is a pathlib.Path, not a str — confirm.
_out_base = getattr(globals().get("paths", None), "out_dir", Path("outputs"))
_out_base.mkdir(parents=True, exist_ok=True)

# In-memory ledger: module name -> latest status record written by report_status().
MODULE_STATUS: Dict[str, Dict[str, Any]] = {}

def _jsonify(obj: Any) -> Any:
    """Ensure values are JSON-serializable; fallback to string."""
    try:
        json.dumps(obj)
        return obj
    except Exception:
        return str(obj)

def report_status(module: str, ok: bool, note: str = "", extra: Optional[Dict[str, Any]] = None):
    """Record a status entry for ``module`` and echo it as one JSON line.

    Args:
        module: Key under which the record is stored in ``MODULE_STATUS``
            (a later call with the same key replaces the earlier record).
        ok: Success flag; coerced with ``bool``.
        note: Free-text note; coerced with ``str``.
        extra: Optional mapping of additional values. Each value is passed through
            ``_jsonify`` so non-serialisable values are stored as strings.

    The record's ``ts`` is the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    # datetime.utcnow() is deprecated as of Python 3.12; an aware "now" formatted
    # explicitly yields the same second-resolution, Z-suffixed string.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    rec = {
        "ok": bool(ok),
        "note": str(note),
        "extra": {k: _jsonify(v) for k, v in (extra or {}).items()},
        "ts": now_utc.strftime("%Y-%m-%dT%H:%M:%S") + "Z",
    }
    MODULE_STATUS[module] = rec
    print(json.dumps({"module": module, **rec}))

def dump_status_json(out_path: Optional[Path] = None):
    """Write ``MODULE_STATUS`` as indented JSON and print a one-line confirmation.

    Args:
        out_path: Destination file (``Path`` or ``str``). When omitted or empty,
            ``<outputs base>/module_status.json`` is used.
    """
    # Accept plain strings as well as Path objects; falsy values select the default.
    out_path = Path(out_path) if out_path else (_out_base / "module_status.json")
    # The target directory may not exist yet when a custom path is supplied.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(MODULE_STATUS, indent=2), encoding="utf-8")
    print(json.dumps({"wrote": str(out_path), "entries": len(MODULE_STATUS)}))

# cell-level self-report (per rule #9)
print(json.dumps({"cell_id": "0.4 foundations: status utilities", "status": "pass"}))


In [None]:
# cell 0.5 — foundations: offset-preserving normalization (CRLF fix + maps)
from typing import NamedTuple, List
import unicodedata

class NormResult(NamedTuple):
    """Normalized text plus index maps between normalized and original strings."""
    text: str                 # normalized string
    norm_to_orig: List[int]   # one entry per normalized char: index of its source char
    orig_to_norm: List[int]   # one entry per original char: index of a normalized char

def normalize_with_offsets(s: str) -> NormResult:
    """
    Normalize text while preserving a bidirectional char index map:
      - CRLF -> LF, lone CR -> LF
      - Unicode NFKC, applied one code point at a time so offsets stay exact
      - Unicode line/para separators -> LF
      - Collapse runs of whitespace to a single space (but keep LF)
    Returns:
      text: normalized string
      norm_to_orig[i] = original index for normalized char i
      orig_to_norm[j] = nearest normalized index corresponding to original j (left-fill)

    A code point whose NFKC form expands to several characters (e.g. a ligature)
    contributes one ``norm_to_orig`` entry per produced character, all pointing at the
    same original index, so ``len(norm_to_orig) == len(text)`` always holds.
    For such a code point ``orig_to_norm`` points at the first produced character.
    """
    norm_to_orig: List[int] = []
    out_chars: List[str] = []
    prev_space = False
    j = 0
    L = len(s)

    while j < L:
        ch = s[j]
        if ch == "\r":
            # CRLF is consumed as one unit; a lone CR is consumed alone. Both become LF,
            # mapped to the CR position.
            expanded = "\n"
            step = 2 if (j + 1 < L and s[j + 1] == "\n") else 1
        else:
            expanded = unicodedata.normalize("NFKC", ch)
            step = 1

        # Emit each produced character separately so text and map stay the same length.
        for ch_n in expanded:
            if ch_n in ("\u2028", "\u2029"):
                ch_n = "\n"
            if ch_n != "\n" and ch_n.isspace():
                if prev_space:
                    continue  # drop: already inside a whitespace run
                ch_n = " "
                prev_space = True
            else:
                prev_space = False
            out_chars.append(ch_n)
            norm_to_orig.append(j)
        j += step

    text = "".join(out_chars)

    # Build orig->norm with left fill for unmapped positions
    orig_to_norm: List[int] = [-1] * L
    for norm_i, orig_j in enumerate(norm_to_orig):
        # Keep the first normalized index when one original char produced several.
        if 0 <= orig_j < L and orig_to_norm[orig_j] == -1:
            orig_to_norm[orig_j] = norm_i
    last = 0
    for k in range(L):
        if orig_to_norm[k] == -1:
            orig_to_norm[k] = last
        else:
            last = orig_to_norm[k]

    return NormResult(text=text, norm_to_orig=norm_to_orig, orig_to_norm=orig_to_norm)

# sanity self-report
# Runs the normalizer on a string mixing double spaces, NBSP, CRLF and a lone CR, and
# checks that no CR survives. Any exception (including one raised inside
# normalize_with_offsets) is caught and reported as ok=False instead of stopping the cell.
# Uses report_status when it is defined in the notebook globals, else a plain dict print.
try:
    demo = "A  test\u00A0string\r\nwith\rmixed  spaces.\nNext line."
    res = normalize_with_offsets(demo)
    assert "\r" not in res.text and "\r\n" not in res.text, "CR/CRLF not fully normalized"
    if "report_status" in globals():
        report_status("0.foundation.normalize", True, "Normalization OK", {"len": len(res.text)})
    else:
        print({"module": "0.foundation.normalize", "ok": True, "note": "Normalization OK", "len": len(res.text)})
except Exception as e:
    if "report_status" in globals():
        report_status("0.foundation.normalize", False, f"Init error: {e}")
    else:
        print({"module": "0.foundation.normalize", "ok": False, "note": f"Init error: {e}"})


In [None]:
# cell 0.6 — foundations: sentence segmentation & windowing (regex heuristic)

import re
from typing import List, Tuple, Dict, Any

# Sentence boundary = the whitespace run that follows . ! ? and precedes a capital/digit,
# but NOT after single initials (A.), multi-initials (U.S. — covered by the same
# single-initial rule, since the last unit is "S."), or common abbreviations.
# Python's `re` only accepts fixed-width look-behinds, so each exclusion is expressed as
# its own fixed-width negative look-behind instead of one variable-width alternation.
_SENT_ABBREVS = (
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St",
    "vs", "etc", "e.g", "i.e", "Inc", "Ltd",
)

SENT_SPLIT_RE = re.compile(
    r"(?<=[.!?])"            # an end mark sits immediately before the gap
    r"(?<!\b[A-Z]\.)"        # ...but it is not the period of an initial
    + "".join(rf"(?<!\b{re.escape(abbr)}\.)" for abbr in _SENT_ABBREVS)  # ...nor of an abbreviation
    + r"\s+(?=[A-Z0-9])"     # the gap itself, followed by a capital/digit
)

def split_sentences(text: str) -> List[Tuple[str, Tuple[int, int]]]:
    """
    Return [(sentence_text, (char_start, char_end)), ...]
    Uses a lightweight regex heuristic; newline handling is left to upstream normalization.

    ``char_end`` is exclusive, so ``text[char_start:char_end] == sentence_text``.
    The whitespace between sentences belongs to neither neighbour. Text with no
    boundary is returned as a single sentence; empty text returns an empty list.
    """
    spans: List[Tuple[int, int]] = []
    start = 0
    for m in SENT_SPLIT_RE.finditer(text):
        # The match covers only the inter-sentence whitespace, so the sentence
        # (end mark included) stops where the match begins.
        end = m.start()
        if end > start:
            spans.append((start, end))
        start = m.end()
    if start < len(text):
        spans.append((start, len(text)))
    return [(text[a:b], (a, b)) for a, b in spans if b > a]

def window_sentences(
    sents: List[Tuple[str, Tuple[int, int]]],
    win: int,
    stride: int
) -> List[Dict[str, Any]]:
    """
    Slide a fixed-size window of ``win`` sentences over ``sents`` in steps of ``stride``.

    Each emitted dict holds:
      - sent_start_idx / sent_end_idx: sentence index range (end-exclusive)
      - char_span: [start of first sentence, end of last sentence]
      - text: the window's non-empty sentences, stripped and joined by single spaces
    Only full windows are produced; a trailing partial window is dropped.
    Raises ValueError when ``win`` or ``stride`` is not positive.
    """
    if win <= 0 or stride <= 0:
        raise ValueError("win and stride must be positive integers")

    total = len(sents)
    result: List[Dict[str, Any]] = []
    first = 0
    # A window starting at `first` is complete exactly when first + win <= total.
    while first + win <= total:
        group = sents[first:first + win]
        pieces = [sentence.strip() for sentence, _span in group if sentence]
        (_, (span_start, _)) = group[0]
        (_, (_, span_end)) = group[-1]
        result.append({
            "sent_start_idx": first,
            "sent_end_idx": first + win,
            "char_span": [span_start, span_end],
            "text": " ".join(pieces),
        })
        first += stride
    return result

# sanity self-report
# Splits a sample containing abbreviations/initials, windows it (win=4, stride=2) and
# reports the sentence and window counts. Exceptions are caught and reported as ok=False
# rather than raised. Note: `sample`, `sents`, `wins` and `payload` become notebook globals.
try:
    sample = "Dr. A. Smith wrote this. Second sentence! Third sentence? U.S. officials agreed. Fourth one. Fifth here."
    sents = split_sentences(sample)
    wins = window_sentences(sents, 4, 2)
    payload = {"sents": len(sents), "windows": len(wins)}
    if "report_status" in globals():
        report_status("0.foundation.segmentation", True, "Splitter/windowing OK", payload)
    else:
        print({"module": "0.foundation.segmentation", "ok": True, "note": "Splitter/windowing OK", **payload})
except Exception as e:
    if "report_status" in globals():
        report_status("0.foundation.segmentation", False, f"Error: {e}")
    else:
        print({"module": "0.foundation.segmentation", "ok": False, "note": f"Error: {e}"})


In [None]:
# cell 0.7 — foundations: visualization smoke test (matplotlib only)
import json, datetime
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt  # local import by design

# Render a tiny line plot to a timestamped PNG and report the result via report_status (0.4).
out_dir = Path("outputs/diagnostics"); out_dir.mkdir(parents=True, exist_ok=True)
# datetime.utcnow() is deprecated as of Python 3.12; an aware UTC "now" formats identically.
ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")
fp = out_dir / f"viz_smoke_{ts}.png"

try:
    xs = np.arange(0, 10)
    ys = np.sqrt(xs)

    plt.figure(figsize=(4, 3), dpi=120)
    plt.plot(xs, ys, marker="o")
    plt.title("Matplotlib render check (0.7)")
    plt.xlabel("x"); plt.ylabel("sqrt(x)")
    plt.tight_layout()
    plt.savefig(fp)
    plt.close("all")

    assert fp.exists() and fp.stat().st_size > 0, "Smoke image was not written"
    # status hook from 0.4
    report_status("0.foundation.viz", True, "Matplotlib rendering OK", {"file": str(fp), "bytes": fp.stat().st_size})
    print(json.dumps({"cell_id": "0.7 foundations: viz smoke", "file": str(fp)}))
except Exception as e:
    # Release figures, record the failure, then re-raise so the cell visibly fails.
    plt.close("all")
    report_status("0.foundation.viz", False, f"Matplotlib failed: {e}")
    raise


In [None]:
# cell 0.8 — FS scaffold + disk sanity (one-time)
import os, shutil, json, datetime
from pathlib import Path

# Create a predictable outputs layout for all modules up-front
# module-1 .. module-12 each get data/ and plots/; bundles/ and diagnostics/ have no subdirs.
BASE = Path("outputs")
LAYOUT = {f"module-{i}": ["data", "plots"] for i in range(1, 13)}
LAYOUT.update({"bundles": [], "diagnostics": []})

for mod, subs in LAYOUT.items():
    if subs:
        for s in subs:
            (BASE / mod / s).mkdir(parents=True, exist_ok=True)
    else:
        # An empty sub-list means "create the directory itself only".
        (BASE / mod).mkdir(parents=True, exist_ok=True)

# Light disk report at the current working dir mount
root = Path(".").resolve()
total, used, free = shutil.disk_usage(str(root))
def _fmt(b):
    for u in ["B","KB","MB","GB","TB"]:
        if b < 1024: return f"{b:,.1f} {u}"
        b /= 1024
    return f"{b:,.1f} PB"

# datetime.utcnow() is deprecated as of Python 3.12. Taking an aware UTC "now" and
# dropping tzinfo keeps the exact previous string shape (naive ISO text + "Z").
stamp = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
summary = {
    "cwd": str(root),
    "created_dirs": [str((BASE / k)) for k in LAYOUT.keys()],
    "disk": {"total": _fmt(total), "used": _fmt(used), "free": _fmt(free)},
    "ts": stamp,
}
print(json.dumps(summary, indent=2))

# status hook (from 0.4)
report_status("0.foundation.fs", True, "FS scaffold ready", summary["disk"])


In [None]:
# cell 0.9 — Compact Parquet I/O helpers (space + schema guards)
import json
from pathlib import Path
import numpy as np
import pandas as pd

# Optional: use pyarrow if present for zstd compression
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    _HAVE_PA = True
except Exception:
    _HAVE_PA = False

def _downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numerics to save disk without changing semantics."""
    out = df.copy()
    for c in out.columns:
        s = out[c]
        if pd.api.types.is_float_dtype(s):
            out[c] = s.astype("float32")
        elif pd.api.types.is_integer_dtype(s):
            # Nullable Int32 to preserve NaNs if present
            out[c] = s.astype("Int32" if s.isna().any() else "int32")
        elif pd.api.types.is_bool_dtype(s):
            out[c] = s.astype("bool")
        # strings/categoricals left as-is; caller may choose to cast
    return out

def assert_has_cols(df: pd.DataFrame, required: set, where: str = ""):
    """Assert that every name in ``required`` is a column of ``df``.

    Raises AssertionError listing the absent columns in sorted order; ``where``,
    when given, is included in the message to identify the call site.
    """
    absent = set(required).difference(df.columns)
    location = f" in {where}" if where else ""
    assert not absent, f"Missing columns{location}: {sorted(absent)}"

def write_parquet(df: pd.DataFrame, path: str | Path, compression: str = "zstd") -> dict:
    """Downcast ``df`` and write it to ``path`` as Parquet, creating parent directories.

    With pyarrow available the table is written without the index using ``compression``.
    Without pyarrow the pandas writer is used with its default engine and settings
    (``compression`` is not forwarded on that path).

    Returns a dict with the row count, the column names as strings, and the file size in bytes.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    compact = _downcast_numeric(df)
    if not _HAVE_PA:
        compact.to_parquet(target, index=False)  # engine="auto" fallback
    else:
        arrow_table = pa.Table.from_pandas(compact, preserve_index=False)
        pq.write_table(arrow_table, target, compression=compression)
    return {
        "rows": int(len(compact)),
        "cols": [str(col) for col in compact.columns],
        "bytes": int(target.stat().st_size),
    }

def read_parquet(path: str | Path) -> pd.DataFrame:
    """Read parquet via pandas (auto-engine).

    Thin wrapper over ``pd.read_parquet`` with default arguments; dtypes come back as
    stored on disk (write_parquet in this cell downcasts numerics before writing).
    """
    return pd.read_parquet(path)

# Cell self-report: plain JSON line plus a ledger entry via report_status (defined in 0.4).
print(json.dumps({"cell_id": "0.9 foundations: parquet_io", "pyarrow": _HAVE_PA}))
report_status("0.foundation.io", True, "Parquet helpers ready", {"pyarrow": _HAVE_PA})


In [None]:
# cell 0.10 — Resilience toggles + resource guard
import os, json, shutil, datetime, re
from pathlib import Path

# ---------- Defaults (overridable via env before running the notebook) ----------
def _setdef(k, v):
    if os.environ.get(k) is None:
        os.environ[k] = str(v)

# Each default below applies only when the variable is not already present in the
# environment; values are stored as strings and parsed back with int()/float() further down.
_setdef("LSA_FULL_STACK",              0)   # heavy deps only when needed
_setdef("LSA_MIN_FREE_GB",             3)   # degrade if disk below this
_setdef("LSA_MIN_RAM_GB",              4)   # degrade if free RAM below this
_setdef("LSA_SAVE_EMB",                0)   # M6: don't write *.npy by default
_setdef("LSA_INLINE_PLOTS",            0)   # don't display inline by default
_setdef("LSA_SAVE_PLOTS",              1)   # write PNGs unless degraded
_setdef("SAFE_MODE",                   1)   # M7: skip heavy sentence tables
_setdef("LSA_RF_MAX_CANDS_PER_WIN",  300)  # M7: cap candidate count per window
_setdef("LSA_ALLOW_SOURCE",            0)   # 0 = wheel-only installs in 0.11

# If inline plots are off, force a non-interactive backend early
# Note: this overwrites any MPLBACKEND value already present in the environment.
if os.environ.get("LSA_INLINE_PLOTS", "0") == "0":
    os.environ["MPLBACKEND"] = "Agg"

# ---------- Resource probes ----------
def _disk_free_gb(path="."):
    total, used, free = shutil.disk_usage(str(Path(path).resolve()))
    return round(free / (1024**3), 2)

def _free_ram_gb():
    try:
        import psutil
        return round(psutil.virtual_memory().available / (1024**3), 2)
    except Exception:
        try:
            txt = Path("/proc/meminfo").read_text()
            m = re.search(r"MemAvailable:\s+(\d+)\s+kB", txt)
            if m: return round(int(m.group(1)) / (1024**2), 2)
        except Exception:
            pass
    return 0.0

# Probe current resources and compare against the (string-valued) env thresholds.
free_gb  = _disk_free_gb(".")
ram_gb   = _free_ram_gb()
need_gb  = float(os.environ["LSA_MIN_FREE_GB"])
need_ram = float(os.environ["LSA_MIN_RAM_GB"])

# Degraded when either disk or RAM is below its threshold; exported for later cells.
degraded = int(free_gb < need_gb or ram_gb < need_ram)
os.environ["LSA_DEGRADED"] = str(degraded)

# If disk is tight, auto-disable plot saving to conserve space
if degraded and free_gb < need_gb:
    os.environ["LSA_SAVE_PLOTS"] = "0"

status = {
    "cell_id": "0.10_resilience_guard",
    "thresholds": {"min_free_gb": need_gb, "min_ram_gb": need_ram},
    "observed": {"free_gb": free_gb, "free_ram_gb": ram_gb},
    "flags": {
        "LSA_DEGRADED": degraded,
        "LSA_FULL_STACK": int(os.environ["LSA_FULL_STACK"]),
        "LSA_SAVE_EMB": int(os.environ["LSA_SAVE_EMB"]),
        "LSA_INLINE_PLOTS": int(os.environ["LSA_INLINE_PLOTS"]),
        "LSA_SAVE_PLOTS": int(os.environ["LSA_SAVE_PLOTS"]),
        "SAFE_MODE": int(os.environ["SAFE_MODE"]),
        "LSA_RF_MAX_CANDS_PER_WIN": int(os.environ["LSA_RF_MAX_CANDS_PER_WIN"]),
        "LSA_ALLOW_SOURCE": int(os.environ["LSA_ALLOW_SOURCE"]),
    },
    # datetime.utcnow() is deprecated as of Python 3.12; dropping tzinfo from an aware
    # UTC "now" keeps the previous string shape (naive ISO text + "Z").
    "ts": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()+"Z",
}
print(json.dumps(status, indent=2))

# Persist a run-env snapshot for post-hoc debugging
env_dir = Path("outputs/_env"); env_dir.mkdir(parents=True, exist_ok=True)
(Path(env_dir) / "_run_env.json").write_text(json.dumps(status, indent=2), encoding="utf-8")

# Soft-report to the notebook ledger if present
try:
    report_status("0.foundation.resilience", True, "Resilience toggles applied", status)
except NameError:
    # report_status (cell 0.4) has not been defined in this session; skip the ledger entry.
    pass


In [None]:
# cell 0.11 — Lazy install helper (wheel-first, retry once, proper import mapping)
import os, re, importlib, subprocess, sys, json

# Map pip names → import module names
NAME_MAP = {
    "scikit-learn": "sklearn",
    "sentence-transformers": "sentence_transformers",
    "umap-learn": "umap",
    # common cases that already match kept implicit: numpy, pandas, pyarrow, hdbscan, rapidfuzz, nltk, matplotlib, scipy
}

def _import_name_from_spec(spec: str) -> str:
    base = spec.split("[")[0]            # strip extras
    base = re.split(r"[<>=]", base)[0]   # strip version ops
    base = base.strip()
    return NAME_MAP.get(base, base.replace("-", "_"))

def ensure_installed(reqs, quiet=True, no_cache=True, wheel_only=None):
    """
    Install only if missing. Returns {"installed":[...], "skipped":[...], "errors":[...]}.

    reqs: iterable of pip requirement strings (a single string is treated as one requirement).
    quiet / no_cache: add '-q' / '--no-cache-dir' to the pip command.
    wheel_only:
      - None → infer from env LSA_ALLOW_SOURCE (0 = wheel-only, 1 = allow source)
      - True → force '--only-binary=:all:'
      - False → allow source builds

    "Missing" means the mapped module fails to import; an importable module is skipped
    even if its version does not satisfy the requirement string. Each install is tried
    at most twice; a spec that fails both times is recorded in "errors" with pip's
    return code and does not raise.
    """
    if wheel_only is None:
        wheel_only = (os.environ.get("LSA_ALLOW_SOURCE", "0") != "1")
    if isinstance(reqs, str):
        # Guard against iterating a bare string character by character.
        reqs = [reqs]

    installed, skipped, errors = [], [], []

    for spec in reqs:
        modname = _import_name_from_spec(spec)
        try:
            importlib.import_module(modname)
            skipped.append(spec)
            continue
        except Exception:
            pass  # not importable (or broken): fall through to install

        cmd = [sys.executable, "-m", "pip", "install", spec]
        if quiet: cmd.append("-q")
        if no_cache: cmd.append("--no-cache-dir")
        cmd.append("--prefer-binary")
        if wheel_only: cmd.append("--only-binary=:all:")

        env = os.environ.copy()
        print(json.dumps({"pip": "install", "spec": spec, "cmd": " ".join(cmd)}))

        # try once, then one retry on failure
        last_rc = None
        for attempt in (1, 2):
            if attempt == 2:
                print(json.dumps({"pip": "retry", "spec": spec}))
            try:
                subprocess.check_call(cmd, env=env)
                last_rc = 0
                break
            except subprocess.CalledProcessError as exc:
                last_rc = exc.returncode

        if last_rc == 0:
            # Packages installed while the interpreter is running are only guaranteed
            # to be importable after the import system's finder caches are refreshed.
            importlib.invalidate_caches()
            installed.append(spec)
        else:
            errors.append({"spec": spec, "returncode": last_rc})

    out = {"installed": installed, "skipped": skipped, "errors": errors}
    print(json.dumps(out, indent=2))
    try:
        report_status("0.foundation.installer", not errors, "ensure_installed completed", out)
    except NameError:
        # report_status (cell 0.4) is not defined in this session; skip the ledger entry.
        pass
    return out

print(json.dumps({"cell_id": "0.11_lazy_install_helper", "usage": "ensure_installed([...]) in module install cells"}))


In [None]:
# cell 1.0A: set SOURCE_DIR to your article folder (expects 01-*.md .. 04-*.md)
from pathlib import Path
# Folder scanned by cells 1.0B and 1.2 for files named 01-*.md .. 04-*.md.
# Hard-coded absolute path; edit it here when the articles live elsewhere.
SOURCE_DIR = Path("/content")  # adjust if needed
# Echo the setting and whether it currently resolves to an existing directory.
print({"cell_id": "cell 1.0A: set SOURCE_DIR",
       "SOURCE_DIR": str(SOURCE_DIR),
       "exists": SOURCE_DIR.exists(),
       "is_dir": SOURCE_DIR.is_dir()})

In [None]:
# cell 1.0B: filename sanity (previews matches and version coverage by slug)
import re, json
from pathlib import Path

# Accepts "<01..04>-<slug>.md"; the two-digit prefix is the version, the rest is the article slug.
FNAME_RE = re.compile(r"^(?P<prefix>0[1-4])-(?P<slug>.+)\.md$")
if not SOURCE_DIR.exists() or not SOURCE_DIR.is_dir():
    raise ValueError(f"SOURCE_DIR not found or not a directory: {SOURCE_DIR}")

# iterdir() yields entries in arbitrary order; sort so the preview (and everything
# derived from `files`) is stable from run to run.
files = sorted(p for p in SOURCE_DIR.iterdir() if p.is_file() and FNAME_RE.match(p.name))
preview = [p.name for p in files[:10]]

# slug -> list of version numbers found for that article
groups = {}
for p in files:
    m = FNAME_RE.match(p.name)
    slug = m.group("slug")
    vid = int(m.group("prefix"))
    groups.setdefault(slug, []).append(vid)

# NOTE: rebinds the notebook-global name `summary`, which cell 0.8 also assigns.
summary = {slug: sorted(vs) for slug, vs in groups.items()}
print(json.dumps({
    "cell_id": "cell 1.0B: filename sanity",
    "SOURCE_DIR": str(SOURCE_DIR),
    "matches": len(files),
    "preview": preview,
    "articles": len(summary),
    "versions_by_article": summary
}, indent=2))


In [None]:
# cell 1.1: textstat|wordfreq: install (module-only; no base upgrades; pinned, wheels-only)
# Ensure pyphen is installed before textstat imports.
import importlib, sys, subprocess

def _ensure_pkg(import_name: str, pip_name: str = None, version: str = None):
    try:
        return importlib.import_module(import_name)
    except ModuleNotFoundError:
        pkg_spec = pip_name or import_name
        if version:
            pkg_spec = f"{pkg_spec}{version}"
        print(f"[1.1 hotfix] Installing missing dependency: {pkg_spec} …")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg_spec])
        return importlib.import_module(import_name)

# pyphen goes in first (see the note at the top of this cell), via the helper above.
_ensure_pkg("pyphen", "pyphen", ">=0.14,<0.15")

# Install main packages and their dependencies
# IPython %pip magic (not plain Python): installs into the running kernel's environment,
# wheels only, with range pins rather than exact versions.
# NOTE(review): "pyarrow>=14,<18" can change an already-installed pyarrow; the recorded
# 0.2 output shows 18.1.0 — confirm this is compatible with "no base upgrades".
%pip install -q --only-binary=:all: \
  "textstat>=0.7,<0.8" \
  "wordfreq>=3,<4" \
  "regex>=2023.10,<2026.0" \
  "pyarrow>=14,<18" \
  "ftfy>=6,<7" \
  "langcodes>=3,<4"

from pathlib import Path, PurePosixPath
import json

# Output locations for this module; created eagerly so later cells can write without checks.
BASE_OUT = Path("outputs/textstat_lex")
PLOTS_OUT = BASE_OUT / "plots"
BASE_OUT.mkdir(parents=True, exist_ok=True)
PLOTS_OUT.mkdir(parents=True, exist_ok=True)

# Self-report; paths are rendered with forward slashes via PurePosixPath.
print(json.dumps({
    "cell_id": "cell 1.1: textstat|wordfreq: install",
    "status": "pass",
    "dirs": {"base": str(PurePosixPath(BASE_OUT)), "plots": str(PurePosixPath(PLOTS_OUT))}
}, indent=2))

In [None]:
# NOTE(review): lazy_import_ml is not defined in any cell visible above this one; on a
# fresh kernel (Restart & Run All) this call would raise NameError unless it is defined elsewhere.
lazy_import_ml()


In [None]:
# cell 1.2: textstat|wordfreq: imports & sanity checks + filename-driven versioning
import json, datetime, math, random, re
from pathlib import Path

# Environment & Imports (hotfix: ensure pyphen for textstat)
import importlib, sys, subprocess

def _ensure_pkg(import_name: str, pip_name: str = None, version: str = None):
    try:
        return importlib.import_module(import_name)
    except ModuleNotFoundError:
        pkg_spec = pip_name or import_name
        if version:
            pkg_spec = f"{pkg_spec}=={version}"
        print(f"[1.2 hotfix] Installing missing dependency: {pkg_spec} …")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg_spec])
        return importlib.import_module(import_name)

# textstat requires 'pyphen' at import time for hyphenation/syllables.
_ensure_pkg("pyphen", "pyphen", "0.14.0")

# Now import textstat (works if pyphen is present)
import textstat
from importlib.metadata import version as _pkg_version

print("[1.2] textstat:", _pkg_version("textstat"), "| pyphen:", _pkg_version("pyphen"))

# Assert foundations (avoid base re-imports here)
# np/pd/plt are expected to be notebook globals already: cell 0.7 imports np and plt,
# cell 0.9 imports np and pd (cell 0.2 only checks importability and defines none of them).
_missing = [name for name in ("np","pd","plt") if name not in globals()]
if _missing:
    raise RuntimeError(
        f"Foundations not loaded (missing {_missing}). "
        "Run foundation cells 0.7 (np, plt) and 0.9 (np, pd) before Module 1."
    )

# ---- Versions & determinism ----
VERS = {"textstat": _pkg_version("textstat"), "wordfreq": _pkg_version("wordfreq")}

# ---- Inputs handshake ----
# DOCS / TEXT may be pre-populated by the caller as notebook globals; both default to None.
g = globals()
DOCS = g.get("DOCS"); TEXT = g.get("TEXT")

# If neither DOCS nor TEXT is provided, load from SOURCE_DIR using strict filename rule.
# NOTE(review): the fallback here is the relative path "content", while cell 1.0A sets
# the absolute "/content" — confirm which one is intended when 1.0A has not been run.
SOURCE_DIR = Path(g.get("SOURCE_DIR", "content"))

FNAME_RE = re.compile(r"^(?P<prefix>0[1-4])-(?P<slug>.+)\.md$")
NOTES = []

def _clean_markdown(s: str) -> str:
    """Regex-only light cleaner for readability/lexical metrics."""
    if not s: return ""
    s = re.sub(r"```[\s\S]*?```", " ", s)                            # fenced code blocks
    s = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", s)                  # images -> alt
    s = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", s)                   # links -> text
    s = re.sub(r"`([^`]+)`", r"\1", s)                               # inline code
    s = re.sub(r"^\s*#{1,6}\s+", "", s, flags=re.MULTILINE)          # headings
    s = re.sub(r"[ \t]+", " ", s)                                    # spaces
    s = re.sub(r"\s*\n\s*", "\n", s).strip()                         # newlines
    return s

# Resolve the working document set. Three cases:
#   1) only TEXT is set -> wrap it as one in-memory document (stored as-is, not passed
#      through _clean_markdown);
#   2) neither is set   -> discover "<01..04>-<slug>.md" files directly inside SOURCE_DIR;
#   3) DOCS is set      -> only validate its shape.
if DOCS is None and TEXT is not None:
    DOCS = [{"doc_id": "doc_0001", "text": TEXT, "article_id": "doc_0001", "version_id": 1,
             "version_tag": "v1", "filename": "<in-memory>", "path": "<in-memory>"}]
elif DOCS is None and TEXT is None:
    # Filename-driven discovery
    if not SOURCE_DIR.exists() or not SOURCE_DIR.is_dir():
        raise ValueError(f"Module 1: SOURCE_DIR does not exist or is not a directory: {SOURCE_DIR}")
    # Non-recursive: only regular files whose name matches FNAME_RE are considered.
    files = [p for p in SOURCE_DIR.iterdir() if p.is_file() and FNAME_RE.match(p.name)]
    if not files:
        raise ValueError("Module 1: No files found matching ^(0[1-4])-(.+)\\.md$ in SOURCE_DIR.")
    # groups: slug -> {version_id (int from the filename prefix) -> Path}
    groups = {}
    for p in files:
        m = FNAME_RE.match(p.name)
        prefix, slug = m.group("prefix"), m.group("slug")
        vid = int(prefix)
        grp = groups.setdefault(slug, {})
        if vid in grp:
            raise ValueError(f"Duplicate version prefix for article '{slug}': {prefix}")
        grp[vid] = p
    DOCS = []
    # Emit documents ordered by slug, then by ascending version id.
    for slug, versions in sorted(groups.items()):
        present = sorted(versions.keys())
        if len(present) < 4:
            # Incomplete articles are kept; the gap is only recorded as a note.
            NOTES.append(f"Article '{slug}' missing versions: {sorted(set(range(1,5)) - set(present))}")
        for vid in present:
            path = versions[vid]
            # utf-8-sig drops a leading BOM if present; undecodable bytes are replaced, not fatal.
            text_raw = path.read_text(encoding="utf-8-sig", errors="replace")
            text_clean = _clean_markdown(text_raw)
            DOCS.append({
                "article_id": slug,
                "version_id": vid,
                "version_tag": f"v{vid}",
                "filename": path.name,
                "path": str(path),
                "doc_id": f"{slug}::v{vid}",
                "text_raw": text_raw,
                "text": text_clean,   # downstream cells use 'text'
            })
else:
    # DOCS provided by user: validate shape
    if not (isinstance(DOCS, list) and all(isinstance(d, dict) for d in DOCS)):
        raise ValueError("DOCS must be a list of dicts.")
    if not all(("doc_id" in d and "text" in d) for d in DOCS):
        raise ValueError("DOCS dicts must include 'doc_id' and 'text'.")

# ---- Metadata ----
# Output folders for Module 1 artifacts (created on demand, so the cell is safe to re-run).
BASE_OUT = Path("outputs/textstat_lex"); PLOTS_OUT = BASE_OUT / "plots"
BASE_OUT.mkdir(parents=True, exist_ok=True); PLOTS_OUT.mkdir(parents=True, exist_ok=True)
metadata_path = BASE_OUT / "metadata.json"

# Map each article (falling back to doc_id when article_id is absent) to the sorted,
# de-duplicated list of version ids seen for it.
articles = {}
for d in DOCS:
    slug = d.get("article_id", d.get("doc_id", "<doc>"))
    vid = d.get("version_id")
    seen_versions = articles.setdefault(slug, [])
    if vid is not None and vid not in seen_versions:
        seen_versions.append(vid)
for slug in articles:
    articles[slug] = sorted(articles[slug])

metadata = {
    "module": "module_1_textstat_lex_v1",
    # datetime.utcnow() is deprecated as of Python 3.12; an aware UTC "now" formatted
    # explicitly yields the same "YYYY-MM-DDTHH:MM:SSZ" string.
    "timestamp_utc": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "seed": 7,
    "library_versions": VERS,
    "input_count": len(DOCS),
    # Only reported when TEXT was not supplied and the first document carries a "path" key.
    "source_dir": (
        str(SOURCE_DIR.resolve())
        if (TEXT is None and isinstance(DOCS, list) and len(DOCS) and isinstance(DOCS[0], dict) and "path" in DOCS[0])
        else None
    ),
    "articles": len(articles) if articles else None,
    "versions_per_article_min": min((len(v) for v in articles.values()), default=None),
    "versions_per_article_max": max((len(v) for v in articles.values()), default=None),
    "expected_versions": 4,
    "version_order_source": "filename_prefix",
    "notes": NOTES,
}
metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

# Machine-readable cell status line.
print(json.dumps({
    "cell_id": "cell 1.2: textstat|wordfreq: imports & sanity checks",
    "status": "pass",
    "inputs": len(DOCS),
    "articles": len(articles) if articles else 1,
    "metadata": str(metadata_path),
    "versions": VERS
}, indent=2))


In [None]:
# cell 1.3: textstat: readability metrics (document-level on text_clean)
problems = []
rows = []

# naive sentence split for eligibility check (not for final stats)
SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')

def _eligible(text: str) -> bool:
    if text is None: return False
    if len(text) < 200: return False
    sents = [s for s in SENT_SPLIT.split(text.strip()) if s]
    return len(sents) >= 3

# Readability formulas to evaluate, keyed by output column name. Built once:
# the mapping does not depend on the document, so it is hoisted out of the loop.
metrics = {
    "flesch_reading_ease": textstat.flesch_reading_ease,
    "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
    "gunning_fog": textstat.gunning_fog,
    "smog_index": textstat.smog_index,
    "coleman_liau_index": textstat.coleman_liau_index,
    "automated_readability_index": textstat.automated_readability_index,
    "dale_chall_readability_score": textstat.dale_chall_readability_score,
    "linsear_write_formula": textstat.linsear_write_formula,
}

for d in DOCS:
    doc_id = str(d.get("doc_id", d.get("article_id", "doc")) )
    text = d.get("text") or ""
    # Identifier columns are copied through; absent keys become None.
    row = {
        "doc_id": doc_id,
        "article_id": d.get("article_id", None),
        "version_id": d.get("version_id", None),
        "version_tag": d.get("version_tag", None),
        "filename": d.get("filename", None),
        "path": d.get("path", None),
        "n_chars": int(len(text)),
    }
    heavy_ok = _eligible(text)

    if heavy_ok:
        # A failing formula yields NaN for that column and is logged in `problems`
        # instead of aborting the whole document.
        for k, fn in metrics.items():
            try:
                row[k] = float(fn(text))
            except Exception as e:
                row[k] = np.nan
                problems.append({"doc_id": doc_id, "metric": k, "error": str(e)})
        try:
            row["text_standard"] = str(textstat.text_standard(text))
        except Exception as e:
            row["text_standard"] = ""
            problems.append({"doc_id": doc_id, "metric": "text_standard", "error": str(e)})
    else:
        # Too short / too few sentences: emit the columns with NaN / empty values.
        for k in metrics.keys(): row[k] = np.nan
        row["text_standard"] = ""

    rows.append(row)

df_lex = pd.DataFrame(rows)

print(json.dumps({
    "cell_id": "cell 1.3: textstat: readability metrics",
    "status": "pass", "docs": int(len(df_lex)), "problems": int(len(problems))
}, indent=2))


In [None]:
# cell 1.4: wordfreq: Zipf frequency features (Unicode-aware, regex-only tokenization)
import regex as rxx
from wordfreq import zipf_frequency

# Word-like tokens: runs of Unicode letters, marks and digits, allowing straight or
# curly apostrophes and hyphens inside the run.
TOKEN_RE = rxx.compile(r"\b[\p{L}\p{M}\p{N}’'-]+\b", flags=rxx.UNICODE)

def tokenize(text: str):
    """Return every TOKEN_RE match in `text` as a list of strings (empty for falsy input)."""
    # The pattern has no capture groups, so findall yields the full matches.
    return TOKEN_RE.findall(text or "")

def token_zipf_stats(tokens):
    """Summarize English Zipf frequencies for a token list.

    Tokens are lower-cased and looked up with ``zipf_frequency(..., 'en', wordlist='best')``.
    Lookups that return ``None`` or a value <= 0 are excluded from every statistic,
    including the rare/mid/common rates.

    Returns:
        ``(stats, values)`` where ``stats`` maps the eight ``zipf_*`` / ``*_rate`` keys to
        floats (all NaN when no token has a positive frequency) and ``values`` is a 1-D
        float ndarray of the retained frequencies. The array is empty rather than a list
        when nothing was retained, so callers always receive the same type.
    """
    stat_keys = ("zipf_mean", "zipf_std", "zipf_p25", "zipf_p50", "zipf_p75",
                 "rare_rate", "mid_rate", "common_rate")
    lowers = [t.lower() for t in tokens]
    freqs = [zipf_frequency(t, 'en', wordlist='best') for t in lowers]
    arr = np.asarray([f for f in freqs if f is not None and f > 0], dtype=float)
    if arr.size == 0:
        return dict.fromkeys(stat_keys, np.nan), arr
    stats = {
        "zipf_mean": float(np.mean(arr)),
        "zipf_std": float(np.std(arr, ddof=0)),   # population standard deviation
        "zipf_p25": float(np.percentile(arr, 25)),
        "zipf_p50": float(np.percentile(arr, 50)),
        "zipf_p75": float(np.percentile(arr, 75)),
        # Frequency bands: rare < 3.0 <= mid < 5.0 <= common.
        "rare_rate": float(np.mean(arr < 3.0)),
        "mid_rate":  float(np.mean((arr >= 3.0) & (arr < 5.0))),
        "common_rate": float(np.mean(arr >= 5.0)),
    }
    return stats, arr

def latin_alpha_ratio(text: str) -> float:
    """Fraction of alphabetic characters in `text` that are unaccented ASCII letters.

    Returns 0.0 for empty input or input without any alphabetic character.
    Accented Latin letters (e.g. "é") count as alphabetic but not as A-Z/a-z.
    """
    if not text:
        return 0.0
    alpha_total = 0
    ascii_letters = 0
    for ch in text:
        if not ch.isalpha():
            continue
        alpha_total += 1
        if "A" <= ch <= "Z" or "a" <= ch <= "z":
            ascii_letters += 1
    if alpha_total == 0:
        return 0.0
    return ascii_letters / alpha_total

# Per-document Zipf statistics, plus the pooled frequency values for the global histogram.
ZIPF_ALL = []
zipf_rows = []
for idx, d in enumerate(DOCS):
    doc_id, text = str(d.get("doc_id", f"doc_{idx:04d}")), (d.get("text") or "")
    toks = tokenize(text)
    stats, freqs = token_zipf_stats(toks)
    zipf_rows.append({"doc_id": doc_id, **stats})
    if len(freqs): ZIPF_ALL.append(np.asarray(freqs, dtype=float))

df_zipf = pd.DataFrame(zipf_rows)
# Make the cell re-runnable: drop Zipf columns left by a previous run before merging,
# otherwise pandas would produce suffixed "_x"/"_y" duplicates.
_stale_zipf_cols = [c for c in df_zipf.columns if c != "doc_id" and c in df_lex.columns]
df_lex = df_lex.drop(columns=_stale_zipf_cols).merge(df_zipf, on="doc_id", how="left")

# language heuristic update (non-blocking): flag the corpus when more than half of the
# documents have under 50% ASCII letters among their alphabetic characters.
meta_path = Path("outputs/textstat_lex/metadata.json")
try:
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    ratios = [latin_alpha_ratio(d.get("text") or "") for d in DOCS]
    if len(ratios) and float(np.mean([r < 0.5 for r in ratios])) > 0.5:
        meta["lang_guess"] = "non_en_possible"
    meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
except Exception as e:
    # Still best-effort, but report the failure instead of hiding it.
    print(json.dumps({"cell_id": "cell 1.4", "warning": "metadata lang_guess update skipped",
                      "error": str(e)}))

# Flatten the per-document arrays into one 1-D array (empty when nothing was collected).
ZIPF_ALL = np.concatenate(ZIPF_ALL, axis=0) if len(ZIPF_ALL) else np.array([], dtype=float)

print(json.dumps({
    "cell_id": "cell 1.4: wordfreq: Zipf frequency features",
    "status": "pass", "docs": int(len(df_zipf)), "zipf_values": int(ZIPF_ALL.size)
}, indent=2))


In [None]:
# cell 1.5: lexicons: function-word profile, TTR, naive sentence stats + dtype enforcement
from pathlib import Path

# Reuse TOKEN_RE & tokenize from 1.4

# Function-word lexicon: prefer a user-supplied file (one word per line; entries are
# stripped and lower-cased, blank lines ignored), otherwise use the built-in English list.
LEX_PATH = Path("lexicons/function_words_en.txt")
if LEX_PATH.exists():
    FUNC_WORDS = {w.strip().lower() for w in LEX_PATH.read_text(encoding="utf-8").splitlines() if w.strip()}
else:
    # Built-in fallback set of common English function words.
    FUNC_WORDS = {
        "a","an","the","and","or","but","if","because","as","until","while",
        "of","at","by","for","with","about","against","between","into","through",
        "during","before","after","above","below","to","from","up","down","in","out",
        "on","off","over","under","again","further","then","once","here","there",
        "when","where","why","how","all","any","both","each","few","more","most",
        "other","some","such","no","nor","not","only","own","same","so","than","too","very",
        "can","will","just","don","should","now","i","you","he","she","it","we","they",
        "me","him","her","us","them","my","your","his","its","our","their",
        "is","am","are","was","were","be","been","being","have","has","had","do","does","did"
    }

# Characters counted as punctuation for the density feature.
PUNCT_CHARS = set(".,;:!?—–-()[]'\"“”‘’")
# Naive sentence boundary: whitespace after . ! ? that is followed by an ASCII capital.
SENT_SPLIT_FOR_STATS = rxx.compile(r"(?<=[.!?])\s+(?=[A-Z])")

SENT_LEN_ALL = []   # pooled per-sentence token counts across all documents
rows = []
for idx, d in enumerate(DOCS):
    doc_id, text = str(d.get("doc_id", f"doc_{idx:04d}")), (d.get("text") or "")
    toks = tokenize(text)
    n_tokens = int(len(toks))
    lowers = [t.lower() for t in toks]
    n_unique = len(set(lowers))
    # Ratios are NaN (not 0) for documents without tokens.
    ttr = float(n_unique / n_tokens) if n_tokens > 0 else float("nan")
    fw_rate = float(sum(1 for t in lowers if t in FUNC_WORDS) / n_tokens) if n_tokens > 0 else float("nan")
    tok_lens = np.array([len(t) for t in toks], dtype=float) if n_tokens > 0 else np.array([], dtype=float)
    token_len_mean = float(tok_lens.mean()) if tok_lens.size else float("nan")
    token_len_std  = float(tok_lens.std(ddof=0)) if tok_lens.size else float("nan")
    # Punctuation characters per 100 regex tokens.
    pden = (sum(1 for ch in (text or "") if ch in PUNCT_CHARS) / n_tokens * 100.0) if n_tokens > 0 else float("nan")

    sents = [s for s in SENT_SPLIT_FOR_STATS.split(text.strip()) if s]
    sent_lens = [len(tokenize(s)) for s in sents] if sents else []
    if sent_lens: SENT_LEN_ALL.extend(sent_lens)
    naive_sentence_count = int(len(sents))
    naive_sentence_len_mean = float(np.mean(sent_lens)) if sent_lens else float("nan")
    naive_sentence_len_std  = float(np.std(sent_lens, ddof=0)) if sent_lens else float("nan")

    rows.append({
        "doc_id": doc_id,
        "n_tokens_regex": n_tokens,
        "ttr": ttr,
        "function_word_rate": fw_rate,
        "punctuation_density_per_100toks": pden,
        "token_len_mean": token_len_mean,
        "token_len_std": token_len_std,
        "naive_sentence_count": naive_sentence_count,
        "naive_sentence_len_mean": naive_sentence_len_mean,
        "naive_sentence_len_std": naive_sentence_len_std,
    })

df_lex2 = pd.DataFrame(rows)
# Make the cell re-runnable: without dropping columns from a previous run the merge
# would emit "_x"/"_y" duplicates, and the required-column fill further down in this
# cell would then silently create all-NaN columns under the unsuffixed names.
_stale_lex_cols = [c for c in df_lex2.columns if c != "doc_id" and c in df_lex.columns]
df_lex = df_lex.drop(columns=_stale_lex_cols).merge(df_lex2, on="doc_id", how="left")

# ---- dtype & required columns (includes filename-driven identifiers) ----
required_cols = [
    "doc_id","article_id","version_id","version_tag","filename","path",
    "n_chars","n_tokens_regex","ttr","function_word_rate","punctuation_density_per_100toks",
    "token_len_mean","token_len_std","zipf_mean","zipf_std","zipf_p25","zipf_p50","zipf_p75",
    "rare_rate","mid_rate","common_rate","flesch_reading_ease","flesch_kincaid_grade","gunning_fog",
    "smog_index","coleman_liau_index","automated_readability_index","dale_chall_readability_score",
    "linsear_write_formula","text_standard","naive_sentence_count","naive_sentence_len_mean",
    "naive_sentence_len_std"
]
# Any required column that is absent is created and filled with NaN.
for c in required_cols:
    if c in df_lex.columns:
        continue
    df_lex[c] = np.nan

# Identifier columns stored as pandas nullable strings.
for c in ("doc_id", "article_id", "version_tag", "filename", "path"):
    df_lex[c] = df_lex[c].astype("string")

# Integer columns: anything non-numeric or missing becomes 0.
for c in ("version_id", "n_chars", "n_tokens_regex", "naive_sentence_count"):
    df_lex[c] = pd.to_numeric(df_lex[c], errors="coerce").fillna(0).astype("int64")

# The only free-text feature.
df_lex["text_standard"] = df_lex["text_standard"].astype("string")

# Everything else in required_cols is a float feature (missing stays NaN).
_non_float_cols = {
    "doc_id","article_id","version_id","version_tag","filename","path",
    "n_chars","n_tokens_regex","naive_sentence_count","text_standard",
}
float_cols = [name for name in required_cols if name not in _non_float_cols]
for c in float_cols:
    df_lex[c] = pd.to_numeric(df_lex[c], errors="coerce").astype("float64")

print(json.dumps({
    "cell_id": "cell 1.5: lexicons: function-word profile & TTR",
    "status": "pass", "docs": int(len(df_lex)), "have_sent_len": int(len(SENT_LEN_ALL))
}, indent=2))


In [None]:
# cell 1.6: textstat|wordfreq: visuals — baseline distributions + per-article trends + deltas
from pathlib import Path
import json

# Re-declare the Module 1 output folders so this cell does not depend on cell 1.2's globals.
BASE_OUT = Path("outputs/textstat_lex"); PLOTS_OUT = BASE_OUT / "plots"
PLOTS_OUT.mkdir(parents=True, exist_ok=True)

# Assert foundations / prior cells present (no base re-imports here)
_missing = [name for name in ("np","pd","plt","df_lex") if name not in globals()]
if _missing:
    raise RuntimeError(f"Prereqs missing in cell 1.6: {_missing}. Run 0.2 and cells 1.3–1.5 first.")

# Fallback tokenizer if 1.4 wasn't run
# (same pattern as cell 1.4's TOKEN_RE; only compiled when the name is undefined)
try:
    TOKEN_RE  # noqa: F821
except NameError:
    import regex as rxx
    TOKEN_RE = rxx.compile(r"\b[\p{L}\p{M}\p{N}’'-]+\b", flags=rxx.UNICODE)

def _placeholder_plot(title, note, outfile):
    """Render an axis-free "No data available" figure, save it to `outfile`, show and close it.

    `note` is drawn as a smaller second line under the main message.
    """
    plt.figure(dpi=120)
    # (y position, message, font size) for the two centred text lines.
    for y_pos, message, size in ((0.6, "No data available", 12), (0.4, note, 9)):
        plt.text(0.5, y_pos, message, ha="center", va="center", fontsize=size)
    plt.title(title)
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(outfile)
    plt.show()
    plt.close()

# ---- Global histograms ----
# Sentence length histogram (pooled over all documents; one bin per integer token count).
# Falls back to a placeholder image when cell 1.5 produced no sentence lengths.
try:
    sent_ok = False
    if 'SENT_LEN_ALL' in globals() and len(SENT_LEN_ALL):
        plt.figure(dpi=120)
        bins = np.arange(0, max(SENT_LEN_ALL)+2)
        plt.hist(SENT_LEN_ALL, bins=bins, edgecolor='black', linewidth=0.5)
        plt.title("Sentence Lengths (regex tokens) — Module 1 baselines")
        plt.xlabel("Tokens per sentence"); plt.ylabel("Count")
        plt.figtext(0.01, 0.01, "Module 1 regex-based baselines; not syntactic tokenization",
                    ha="left", fontsize=9)
        out1 = PLOTS_OUT / "sentence_length_hist.png"
        # Close after showing, as the placeholder path does, so the figure is released.
        plt.tight_layout(); plt.savefig(out1); plt.show(); plt.close()
        sent_ok = True
    else:
        out1 = PLOTS_OUT / "sentence_length_hist.png"
        _placeholder_plot("Sentence Lengths — Module 1 baselines",
                          "Regex tokens; produced as placeholder due to empty input.",
                          out1)
except Exception as e:
    # Plot failures are reported but never abort the cell.
    print(json.dumps({"plot": "sentence_length_hist", "error": str(e)}))

# Zipf frequency histogram (pooled token values from cell 1.4).
# Falls back to a placeholder image when no Zipf values are available.
try:
    zipf_ok = False
    if 'ZIPF_ALL' in globals() and getattr(ZIPF_ALL, "size", 0):
        plt.figure(dpi=120)
        # Default range is 1.0–7.0 in 0.25 steps, widened to the data when needed: the most
        # frequent English words score above 7 on the Zipf scale and would otherwise fall
        # outside the last bin edge and vanish from the histogram.
        _step = 0.25
        _lo = min(1.0, float(np.floor(np.min(ZIPF_ALL) / _step) * _step))
        _hi = max(7.0, float(np.ceil(np.max(ZIPF_ALL) / _step) * _step))
        bins = np.arange(_lo, _hi + 0.01, _step)
        plt.hist(ZIPF_ALL, bins=bins, edgecolor='black', linewidth=0.5)
        plt.title("Token Zipf Frequencies — Module 1 baselines")
        plt.xlabel("Zipf frequency (best list)"); plt.ylabel("Count")
        plt.figtext(0.01, 0.01, "Module 1 regex-based baselines; not syntactic tokenization",
                    ha="left", fontsize=9)
        out2 = PLOTS_OUT / "zipf_hist.png"
        # Close after showing, as the placeholder path does, so the figure is released.
        plt.tight_layout(); plt.savefig(out2); plt.show(); plt.close()
        zipf_ok = True
    else:
        out2 = PLOTS_OUT / "zipf_hist.png"
        _placeholder_plot("Token Zipf Frequencies — Module 1 baselines",
                          "English word list; produced as placeholder due to empty input.",
                          out2)
except Exception as e:
    # Plot failures are reported but never abort the cell.
    print(json.dumps({"plot": "zipf_hist", "error": str(e)}))

# ---- Per-article visuals (up to first 10 slugs alphabetically) ----
# Two PNGs per article are saved (not shown): the Flesch Reading Ease trend and the
# stacked rare/mid/common Zipf shares per version. Any failure flips per_article_ok
# to False and is printed, but never aborts the cell.
per_article_ok = True
try:
    if "article_id" in df_lex.columns and df_lex["article_id"].notna().any():
        for slug in sorted(df_lex["article_id"].dropna().unique())[:10]:
            sub = df_lex[df_lex["article_id"] == slug].copy()
            if "version_id" in sub.columns:
                sub = sub.sort_values("version_id")
            # Trend: Flesch Reading Ease across versions
            try:
                plt.figure(dpi=120)
                plt.plot(sub["version_id"], sub["flesch_reading_ease"], marker="o")
                plt.title(f"Flesch Reading Ease — {slug} (v1..v4)")
                plt.xlabel("Version (from filename prefix)"); plt.ylabel("Flesch Reading Ease")
                plt.xticks(sub["version_id"])
                outp = PLOTS_OUT / f"trend_flesch_reading_ease_{slug}.png"
                plt.tight_layout(); plt.savefig(outp)
            except Exception as e:
                per_article_ok = False
                print(json.dumps({"plot": "trend_flesch", "slug": slug, "error": str(e)}))
            finally:
                # Release the figure even when plotting or saving failed.
                plt.close()
            # Stacked bars: Zipf bins per version
            try:
                plt.figure(dpi=120)
                idx = sub["version_id"].astype(int).to_numpy()
                # Missing rates are drawn as zero-height segments.
                rare = sub["rare_rate"].fillna(0).to_numpy()
                mid  = sub["mid_rate"].fillna(0).to_numpy()
                com  = sub["common_rate"].fillna(0).to_numpy()
                plt.bar(idx, rare, label="rare <3.0")
                plt.bar(idx, mid, bottom=rare, label="mid 3–<5")
                plt.bar(idx, com, bottom=rare+mid, label="common ≥5")
                plt.title(f"Zipf Bins — {slug} (v1..v4)")
                plt.xlabel("Version (from filename prefix)"); plt.ylabel("Fraction")
                plt.xticks(idx); plt.ylim(0, 1); plt.legend(frameon=False)
                outp = PLOTS_OUT / f"zipf_bins_stacked_{slug}.png"
                plt.tight_layout(); plt.savefig(outp)
            except Exception as e:
                per_article_ok = False
                print(json.dumps({"plot": "zipf_bins_stacked", "slug": slug, "error": str(e)}))
            finally:
                # Release the figure even when plotting or saving failed.
                plt.close()
except Exception as e:
    per_article_ok = False
    print(json.dumps({"plot": "per_article", "error": str(e)}))

# ---- Write Parquet artifact (document-level) ----
# One row per document from df_lex; the DataFrame index is not written.
parquet_path = BASE_OUT / "lexical_features.parquet"
df_lex.to_parquet(parquet_path, index=False, engine="pyarrow")

# ---- Deltas between consecutive versions within each article ----
def _token_set(text: str) -> set:
    """Distinct lower-cased TOKEN_RE matches in `text` (empty set for falsy input)."""
    return {match.group(0).lower() for match in TOKEN_RE.finditer(text or "")}

def token_jaccard(a_text: str, b_text: str) -> float:
    """Jaccard similarity of the two texts' token sets; NaN when both sets are empty."""
    left = _token_set(a_text)
    right = _token_set(b_text)
    if not (left or right):
        return float("nan")
    union_size = len(left | right)
    if not union_size:
        return float("nan")
    return len(left & right) / union_size

def js_divergence(p, q, eps=1e-8):
    """Jensen–Shannon divergence (natural log) between two discrete distributions.

    `eps` is added to every entry before each input is renormalized to sum to 1,
    which keeps the logarithms finite when an entry is zero.
    """
    def _smoothed(values):
        shifted = np.asarray(values, dtype=float) + eps
        return shifted / shifted.sum()

    def _kl(x, y):
        return np.sum(x * np.log(x / y))

    p_dist = _smoothed(p)
    q_dist = _smoothed(q)
    mixture = 0.5 * (p_dist + q_dist)
    return float(0.5 * _kl(p_dist, mixture) + 0.5 * _kl(q_dist, mixture))

# Build one delta row per adjacent version pair (v -> v+1) within each article.
delta_rows = []
if "article_id" in df_lex.columns and df_lex["article_id"].notna().any():
    # need access to original cleaned text for jaccard; rebuild map from DOCS.
    # Keys are stringified exactly as the feature cells stringify doc_id, so that
    # non-string ids in user-provided DOCS still resolve to their text.
    text_map = {str(d.get("doc_id", f"doc_{i:04d}")): d.get("text","") for i,d in enumerate(DOCS)}
    for slug in sorted(df_lex["article_id"].dropna().unique()):
        sub = df_lex[df_lex["article_id"] == slug].copy().sort_values("version_id")
        vids = sub["version_id"].astype(int).tolist()
        docs_order = sub["doc_id"].tolist()
        for i in range(len(vids)-1):
            v_from, v_to = vids[i], vids[i+1]
            if v_to != v_from + 1:  # only adjacent pairs (1->2, 2->3, 3->4)
                continue
            docA, docB = docs_order[i], docs_order[i+1]
            # rare/mid/common shares of each version. NaN is truthy, so `or 0` leaves a
            # NaN rate untouched and zipf_jsd is then NaN for that pair.
            p = [float(sub.iloc[i]["rare_rate"] or 0), float(sub.iloc[i]["mid_rate"] or 0), float(sub.iloc[i]["common_rate"] or 0)]
            q = [float(sub.iloc[i+1]["rare_rate"] or 0), float(sub.iloc[i+1]["mid_rate"] or 0), float(sub.iloc[i+1]["common_rate"] or 0)]
            delta_rows.append({
                "article_id": slug,
                "from_version": int(v_from),
                "to_version": int(v_to),
                "token_jaccard": token_jaccard(text_map.get(str(docA),""), text_map.get(str(docB),"")),
                "zipf_jsd": js_divergence(p, q),
            })

# Always write the artifact, with the expected columns even when there are no pairs.
df_deltas = pd.DataFrame(delta_rows) if delta_rows else pd.DataFrame(
    columns=["article_id","from_version","to_version","token_jaccard","zipf_jsd"])
deltas_path = BASE_OUT / "lexical_deltas.parquet"
df_deltas.to_parquet(deltas_path, index=False, engine="pyarrow")

# Machine-readable cell status line.
print(json.dumps({
    "cell_id": "cell 1.6: textstat|wordfreq: visuals — baseline distributions (+per-article & deltas)",
    "status": "pass",
    "plots": {"sentence_length_hist": bool('sent_ok' in locals() and sent_ok),
              "zipf_hist": bool('zipf_ok' in locals() and zipf_ok),
              "per_article": bool(per_article_ok)},
    "artifacts": {"features_parquet": str(parquet_path), "deltas_parquet": str(deltas_path)}
}, indent=2))


In [None]:
# cell 1.Y: Module 1 validation
import pandas as pd, json, pathlib

# Re-load the Module 1 artifacts from disk and print basic schema / range checks.
base = pathlib.Path("outputs/textstat_lex")  # or the folder where you placed them
# The notebook writes metadata.json as UTF-8, so read it back with the same encoding
# rather than the platform default.
meta = json.loads((base / "metadata.json").read_text(encoding="utf-8"))
feat = pd.read_parquet(base / "lexical_features.parquet")
delt = pd.read_parquet(base / "lexical_deltas.parquet")

print("version_order_source:", meta.get("version_order_source"))

# Features: required keys
req_feat = {"article_id","version_id","version_tag","doc_id"}
print("features missing keys:", sorted(req_feat - set(feat.columns)))

# Deltas: adjacency & ranges (NaN values are ignored by the range checks)
req_delta = {"article_id","from_version","to_version","token_jaccard","zipf_jsd"}
print("deltas missing keys:", sorted(req_delta - set(delt.columns)))
print("adjacent only:", ((delt["to_version"] - delt["from_version"]) == 1).all())
print("token_jaccard in [0,1]:", delt["token_jaccard"].dropna().between(0,1).all())
print("zipf_jsd in [0,1]:", delt["zipf_jsd"].dropna().between(0,1).all())


In [None]:
# cell 2.1: NLTK — install & corpora download (module-only; base-safe)
# Wheels-only, quiet; no base upgrades. Install pyarrow for Parquet writes used in 2.3.
%pip install -q --only-binary=:all: --no-deps "nltk>=3.8,<3.10" "pyarrow>=14,<18"

import json, sys
import nltk
from nltk.data import find

# NLTK resource name -> locator understood by nltk.data.find
needed = {"punkt": "tokenizers/punkt", "stopwords": "corpora/stopwords"}

def _corpus_present(locator: str) -> bool:
    """Return True when nltk.data.find can resolve `locator`."""
    try:
        find(locator)
    except LookupError:
        return False
    return True

# Download only what is missing, then re-check so `present` reflects the final state.
present = {}
for pkg, loc in needed.items():
    available = _corpus_present(loc)
    if not available:
        nltk.download(pkg, quiet=True, raise_on_error=False)
        available = _corpus_present(loc)
    present[pkg] = available

report = {
    "cell_id": "2.1",
    "python": sys.version.split()[0],
    "nltk": nltk.__version__,
    "corpora": present,
}
print(json.dumps(report, indent=2))

# Fail-fast if required corpora are unavailable after attempted download
missing = [k for k, ok in present.items() if not ok]
if missing:
    raise RuntimeError(f"Module 2 requires NLTK corpora {missing}; download failed. Aborting Module 2.")


In [None]:
# 2.2a — NLTK data hotfix (punkt, punkt_tab, stopwords)
from pathlib import Path
import os, nltk

# Keep NLTK data in a working-directory folder (reproducible, no root needed).
# An NLTK_DATA environment variable, when set, takes precedence over that default.
_nltk_root = BASE_DIR if 'BASE_DIR' in globals() else Path.cwd()
NLTK_DIR = Path(os.environ.get("NLTK_DATA", _nltk_root / "nltk_data"))
NLTK_DIR.mkdir(parents=True, exist_ok=True)
if str(NLTK_DIR) not in nltk.data.path:
    # Search this folder before any other NLTK data location.
    nltk.data.path.insert(0, str(NLTK_DIR))

def _ensure(resource: str, locator: str):
    """Make sure `locator` resolves, downloading `resource` into NLTK_DIR when it does not.

    Raises LookupError if the resource is still missing after the download attempt.
    """
    try:
        nltk.data.find(locator)
        return
    except LookupError:
        pass
    nltk.download(resource, download_dir=str(NLTK_DIR), quiet=True)
    nltk.data.find(locator)  # fail fast if still missing

# Sentence tokenization & stopwords used in 2.x are mandatory.
_ensure("punkt",      "tokenizers/punkt")
# punkt_tab is needed in newer NLTK; any failure to obtain it is ignored here.
try:
    _ensure("punkt_tab",  "tokenizers/punkt_tab")
except Exception:
    pass
_ensure("stopwords",  "corpora/stopwords")

print(f"✔ NLTK data ready at {NLTK_DIR}")


In [None]:
# cell 2.2: NLTK — imports & tokenizer init (filename-prefix discovery compatible)
# Singletons for sentence/word tokenization. English stopwords used as function-word proxy.
from __future__ import annotations

import re, math, statistics
from dataclasses import dataclass
from typing import List, Dict, Any
import nltk
from nltk.corpus import stopwords
from nltk.data import load as nltk_load
from nltk.tokenize import TreebankWordTokenizer

LANG = "english"  # If Module 1 later exposes lang detection, wire it here.
STOPWORDS = set(stopwords.words(LANG))

# Singletons
# Newer NLTK releases (3.9+) ship Punkt as the "punkt_tab" resource behind
# PunktTokenizer and no longer load the pickled model; cell 2.1 allows such versions.
# Prefer that loader when it exists and fall back to the legacy pickle otherwise.
try:
    from nltk.tokenize.punkt import PunktTokenizer
    PUNKT = PunktTokenizer(LANG)
except (ImportError, LookupError):
    PUNKT = nltk_load("tokenizers/punkt/english.pickle")
TB = TreebankWordTokenizer()

@dataclass
class SentenceSpan:
    # Character offsets into the source text ([start, end)) and the covered slice.
    start: int
    end: int
    text: str

def sent_spans(text: str) -> List[SentenceSpan]:
    """Split `text` with the Punkt singleton and return one SentenceSpan per sentence."""
    spans = []
    for begin, finish in PUNKT.span_tokenize(text):
        spans.append(SentenceSpan(begin, finish, text[begin:finish]))
    return spans

@dataclass
class TokenSpan:
    # Character offsets into the tokenized string ([start, end)), the covered slice,
    # and whether its lower-cased form is in STOPWORDS.
    start: int
    end: int
    text: str
    is_stop: bool

def token_spans(text: str) -> List[TokenSpan]:
    """Tokenize `text` with the Treebank singleton and return one TokenSpan per token."""
    return [
        TokenSpan(begin, finish, text[begin:finish], text[begin:finish].lower() in STOPWORDS)
        for (begin, finish) in TB.span_tokenize(text)
    ]

print({"cell_id": "2.2", "status": "ready", "stopwords": len(STOPWORDS)})


In [None]:
# cell 2.3: NLTK — function-word & burstiness (doc-level) + sliding windows
from __future__ import annotations

import json, math, statistics
from dataclasses import dataclass
from typing import Dict, Any, List
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np

# ---- Output dirs
# Module 2 artifact folders; created on demand so the cell is safe to re-run.
OUTDIR = Path("outputs/nltk"); OUTDIR.mkdir(parents=True, exist_ok=True)
PLOTS  = OUTDIR / "plots";    PLOTS.mkdir(parents=True, exist_ok=True)

def _cv(vals: List[int|float]) -> float:
    vals = [float(v) for v in vals if v is not None]
    if not vals: return float("nan")
    mu = statistics.mean(vals)
    if mu == 0 or len(vals) < 2: return float("nan")
    return statistics.pstdev(vals) / mu

# ---- DOC-LEVEL FEATURES ------------------------------------------------------
def doc_sentence_token_stats(text: str) -> Dict[str, Any]:
    """Compute document-level sentence, token and stopword statistics.

    The text is split into sentences first and every sentence is tokenized on its own;
    document totals are the sums over sentences. The Treebank tokenizer expects
    sentence-segmented input (it only separates a period at the very end of the string),
    so tokenizing the whole document in one call would leave tokens such as "it." intact,
    miss them as stopwords, and disagree with the per-sentence counts.

    Args:
        text: Document text. Anything that is not a ``str`` (e.g. None/NaN) is treated as empty.

    Returns:
        Dict with ``n_sent``, ``n_tokens``, ``stopword_ratio``, ``sent_len_mean``,
        ``sent_len_median``, ``sent_len_max``, ``burstiness_token_cv``,
        ``burstiness_stopword_cv`` and ``function_word_top50`` (the 50 most frequent
        stopwords mapped to their share of all stopword occurrences). Ratios and
        sentence-length statistics are NaN when there is nothing to measure.
    """
    if not isinstance(text, str):
        text = ""

    sents = sent_spans(text)
    sent_token_counts = []
    sent_stop_counts  = []
    stop_counts = Counter()
    for s in sents:
        toks = token_spans(s.text)
        stops = [t.text.lower() for t in toks if t.is_stop]
        sent_token_counts.append(len(toks))
        sent_stop_counts.append(len(stops))
        stop_counts.update(stops)

    # Document totals are derived from the same per-sentence tokenization.
    n_tokens = sum(sent_token_counts)
    n_stop   = sum(sent_stop_counts)

    top_fw = stop_counts.most_common(50)
    total_stop = max(1, sum(stop_counts.values()))   # avoid division by zero
    # Normalize function-word profile by total stopwords (reviewer request)
    top_fw_norm = {k: v / total_stop for k, v in top_fw}

    return {
        "n_sent": len(sents),
        "n_tokens": n_tokens,
        "stopword_ratio": (n_stop / n_tokens) if n_tokens else float("nan"),
        "sent_len_mean": statistics.mean(sent_token_counts) if sent_token_counts else float("nan"),
        "sent_len_median": statistics.median(sent_token_counts) if sent_token_counts else float("nan"),
        "sent_len_max": max(sent_token_counts) if sent_token_counts else float("nan"),
        "burstiness_token_cv": _cv(sent_token_counts),
        "burstiness_stopword_cv": _cv(sent_stop_counts),
        "function_word_top50": top_fw_norm,
    }

# ---- INPUT DISCOVERY (df_docs or DOCS) --------------------------------------
def _gather_docs() -> pd.DataFrame:
    # Accept df_docs (Module 1) or DOCS (list of dicts). Expect keys:
    # article_id, version_id, version_tag, and one of text_norm/text_clean/text
    cols_pref = ["text_norm", "text_clean", "text"]
    try:
        df = df_docs.copy()
    except NameError:
        try:
            df = pd.DataFrame(DOCS)
        except NameError:
            raise RuntimeError("Module 2.3: No input found. Expect df_docs or DOCS.")

    if "article_id" not in df.columns:
        if "slug" in df.columns:
            df["article_id"] = df["slug"]
        else:
            df["article_id"] = df.index.astype(str)
    if "version_id" not in df.columns:
        df["version_id"] = 1
    if "version_tag" not in df.columns:
        df["version_tag"] = "v" + df["version_id"].astype(str)

    for c in cols_pref:
        if c in df.columns:
            df["text_basis"] = df[c]
            df["span_basis"] = c
            break
    else:
        raise RuntimeError("Module 2.3: No text column found among text_norm/text_clean/text.")

    return df[["article_id","version_id","version_tag","text_basis","span_basis"]].copy()

# Normalized Module 2 input: article_id, version_id, version_tag, text_basis, span_basis.
df2_input = _gather_docs()

# ---- Build doc-level table df_nltk ------------------------------------------
# One stats dict per input row, tagged with its identifiers.
rows = []
for art, ver, vtag, txt, _span in df2_input.itertuples(index=False):
    st = doc_sentence_token_stats(txt)
    st["article_id"]  = str(art)
    st["version_id"]  = int(ver)
    st["version_tag"] = str(vtag)
    rows.append(st)

df_nltk = pd.DataFrame(rows)
# Expand function-word profile to wide columns
# (one "fw::<word>" column per stopword seen in any document; absent words become 0.0)
fw_df = pd.DataFrame.from_records(
    ({**{"article_id": r["article_id"], "version_id": r["version_id"]},
      **{f"fw::{k}": v for k, v in r["function_word_top50"].items()}} for r in rows)
).fillna(0.0)
# Replace the dict-valued column with the wide columns, joined on (article_id, version_id).
df_nltk = df_nltk.drop(columns=["function_word_top50"]).merge(fw_df, on=["article_id","version_id"], how="left")

# Dtypes for stability
_df_types = {
    "article_id": "string", "version_id": "int64", "version_tag": "string",
    "n_sent": "int64", "n_tokens": "int64",
    "stopword_ratio": "float64", "sent_len_mean": "float64", "sent_len_median": "float64",
    "sent_len_max": "float64", "burstiness_token_cv": "float64", "burstiness_stopword_cv": "float64",
}
# Only columns that actually exist are cast.
for k, t in _df_types.items():
    if k in df_nltk.columns:
        df_nltk[k] = df_nltk[k].astype(t)

# ---- WINDOW-LEVEL BUILDER (expert spec) -------------------------------------
def build_windows(
    docs: pd.DataFrame,
    window_sents: int = 3,
    stride_sents: int = 1,
    keep_tail: bool = False,
    span_on: str = "auto",  # kept for signature parity; span is already chosen in df2_input
) -> pd.DataFrame:
    """Slide a sentence window over every document and compute per-window stats.

    Sentences come from ``PUNKT.span_tokenize`` and words from ``TB.tokenize``
    (module-level tokenizers); stopwords are looked up in ``STOPWORDS``.

    Parameters
    ----------
    docs : frame with columns ``article_id``, ``version_id``, ``version_tag``,
        ``text_basis`` and ``span_basis``.
    window_sents : number of sentences per window (must be >= 1).
    stride_sents : number of sentences the window start advances (must be >= 1).
    keep_tail : when False, scanning of a document stops at the first start
        index whose window would run past the last sentence. When True, every
        such start index still yields a window truncated at the last sentence
        and flagged ``is_partial_tail``.
    span_on : unused; accepted only for signature compatibility.

    Returns
    -------
    One row per window, sorted by (article_id, version_id, win_id), with the
    fixed column set and dtypes listed in ``dtypes`` below. The frame keeps
    that schema even when no window is produced.

    Raises
    ------
    ValueError
        If ``window_sents`` or ``stride_sents`` is smaller than 1.
    """
    if window_sents < 1 or stride_sents < 1:
        raise ValueError(
            "build_windows: window_sents and stride_sents must be >= 1 "
            f"(got window_sents={window_sents}, stride_sents={stride_sents})"
        )

    nan = float("nan")
    top_k = 20  # how many of the most frequent content tokens feed the dispersion score

    # Output schema. Declared up front so an empty result still has its columns.
    dtypes = {
        "article_id":"string","version_id":"int64","version_tag":"string","doc_id":"string",
        "win_id":"int64","win_label":"string","span_basis":"string",
        "char_start":"int64","char_end":"int64","sent_start_index":"int64","sent_end_index":"int64",
        "is_partial_tail":"boolean",
        "n_sents_win":"int64","n_tokens_win":"int64",
        "mean_sent_len_tok_win":"float64","std_sent_len_tok_win":"float64",
        "stopword_rate_win":"float64","content_rate_win":"float64","hapax_rate_win":"float64",
        "function_word_rate_nltk_win":"float64",
        "burstiness_token_cv_win":"float64","burstiness_topk_mean_cv_win":"float64",
    }

    recs = []
    doc_cols = ["article_id","version_id","version_tag","text_basis","span_basis"]
    for art, ver, vtag, txt, span_basis in docs[doc_cols].itertuples(index=False):
        s_spans = list(PUNKT.span_tokenize(txt))
        n_s = len(s_spans)

        win_id = 1
        for i in range(0, n_s, stride_sents):
            j = i + window_sents
            partial = j > n_s
            if partial:
                if not keep_tail:
                    break
                j = n_s

            # Single tokenization pass per sentence in the window.
            sent_token_counts = []
            per_sent_counts = []   # content-token Counter per sentence
            tokens_alpha = []      # lowercased alphabetic tokens (stopwords included)
            stop_count = 0
            for (ss, ee) in s_spans[i:j]:
                toks = list(TB.tokenize(txt[ss:ee]))
                lowered = [w.lower() for w in toks]
                sent_token_counts.append(len(toks))
                stop_count += sum(1 for lw in lowered if lw in STOPWORDS)
                tokens_alpha.extend(lw for w, lw in zip(toks, lowered) if w.isalpha())
                per_sent_counts.append(
                    Counter(lw for lw in lowered if lw.isalpha() and lw not in STOPWORDS)
                )

            n_sents_win = j - i
            n_tokens_win = sum(sent_token_counts)
            mean_len = statistics.mean(sent_token_counts)
            std_len = statistics.pstdev(sent_token_counts) if n_sents_win > 1 else nan

            type_counts = Counter(tokens_alpha)
            types_total = len(type_counts)
            hapax = sum(1 for c in type_counts.values() if c == 1)
            hapax_rate = (hapax / types_total) if types_total > 0 else nan

            stop_rate = (stop_count / n_tokens_win) if n_tokens_win > 0 else nan
            content_rate = 1.0 - stop_rate  # NaN propagates when stop_rate is NaN

            # Burstiness is only reported for windows with >= 2 sentences and >= 10 tokens.
            burst_ok = n_sents_win >= 2 and n_tokens_win >= 10
            burst_cv = _cv(sent_token_counts) if burst_ok else nan

            # Dispersion of the top-K content tokens across the window's sentences.
            global_counts = Counter()
            for cnt in per_sent_counts:
                global_counts.update(cnt)
            topk = [w for (w, _) in global_counts.most_common(top_k)]
            cvs = (
                [_cv([cnt.get(w, 0) for cnt in per_sent_counts]) for w in topk]
                if burst_ok else []
            )
            finite_cvs = [v for v in cvs if not math.isnan(v)]
            # Guard on the filtered list so an all-NaN set yields NaN instead of raising.
            burst_topk_mean_cv = statistics.mean(finite_cvs) if finite_cvs else nan

            recs.append({
                "article_id": str(art),
                "version_id": int(ver),
                "version_tag": str(vtag),
                "doc_id": f"{art}-v{ver}",
                "win_id": int(win_id),
                "win_label": f"v{ver}-w{win_id}",
                "span_basis": span_basis,
                "char_start": int(s_spans[i][0]),
                "char_end": int(s_spans[j-1][1]),
                "sent_start_index": int(i),
                "sent_end_index": int(j-1),
                "is_partial_tail": bool(partial),
                "n_sents_win": int(n_sents_win),
                "n_tokens_win": int(n_tokens_win),
                "mean_sent_len_tok_win": float(mean_len),
                "std_sent_len_tok_win": float(std_len),
                "stopword_rate_win": float(stop_rate),
                "content_rate_win": float(content_rate),
                "hapax_rate_win": float(hapax_rate),
                "function_word_rate_nltk_win": float(stop_rate),
                "burstiness_token_cv_win": float(burst_cv),
                "burstiness_topk_mean_cv_win": float(burst_topk_mean_cv),
            })
            win_id += 1

    # Explicit columns keep the schema (and the sort keys) present for zero windows.
    dfw = pd.DataFrame(recs, columns=list(dtypes)).astype(dtypes)
    dfw = dfw.sort_values(["article_id","version_id","win_id"], kind="stable").reset_index(drop=True)
    return dfw

# Build windows per expert spec and persist
# The same parameter dict drives the builder and is recorded in the metadata.
_win_params = {"window_sents": 3, "stride_sents": 1, "keep_tail": False}
df_nltk_win = build_windows(df2_input, **_win_params)

# Save window artifact
df_nltk_win.to_parquet(OUTDIR / "fw_burstiness_windows.parquet", index=False)

# Update metadata (merge into whatever is already on disk)
meta_path = OUTDIR / "metadata.json"
meta_update = {
    "module": "2",
    **_win_params,
    "tokenizer_word": "NLTK Treebank",
    "tokenizer_sent": "NLTK Punkt (english)",
}
try:
    meta = {}
    if meta_path.exists():
        with open(meta_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
    meta.update(meta_update)
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
except Exception as e:
    # Best effort: a metadata failure is reported but does not abort the cell.
    print(json.dumps({"metadata_write_error": str(e)}))

_report_23 = {
    "cell_id":"2.3",
    "status":"pass",
    "n_docs": int(df_nltk.shape[0]) if 'df_nltk' in globals() else None,
    "n_windows": int(df_nltk_win.shape[0]),
    "artifacts": {
        "windows_parquet": "outputs/nltk/fw_burstiness_windows.parquet",
        "metadata": "outputs/nltk/metadata.json"
    }
}
print(json.dumps(_report_23, indent=2))


In [None]:
# cell 2.4: NLTK — visuals (robust, no duplicates, per-version overlays)
import os, json, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# --- Preconditions
# This cell only plots; it needs the df_nltk table to exist already.
if "df_nltk" not in globals():
    raise RuntimeError("Module 2.4: df_nltk not found. Run 2.3 first.")

# Columns read by the plots below.
need = {"article_id","version_id","stopword_ratio","sent_len_mean","burstiness_token_cv"}
missing = need - set(df_nltk.columns)
if missing:
    raise RuntimeError(f"Module 2.4: df_nltk is missing columns: {sorted(missing)}")

# enforce numeric version_id for correct ordering
try:
    df_nltk["version_id"] = pd.to_numeric(df_nltk["version_id"], errors="raise").astype("int64")
except Exception as e:
    # Chain the original error so the traceback still shows the failing conversion.
    raise RuntimeError(f"Module 2.4: version_id must be numeric. {e}") from e

PLOTS = Path("outputs/nltk/plots")
PLOTS.mkdir(parents=True, exist_ok=True)
# Upper bound on how many articles get per-article figures (env: LSA_MAX_ARTICLES).
MAX_ARTICLES = int(os.environ.get("LSA_MAX_ARTICLES", "10"))
# Paths of the figures saved by this cell; written to plots_index.json at the end.
manifest = []

def _safe_slug(s: str) -> str:
    # keep only [-_.a-zA-Z0-9]; map others to '_'
    import re
    return re.sub(r"[^-_.a-zA-Z0-9]", "_", str(s))

def _finite_percentiles(series: pd.Series, lo=5, hi=95):
    vals = series.replace([np.inf, -np.inf], np.nan).astype(float).dropna().to_numpy()
    if vals.size == 0:
        return None
    lo_v, hi_v = np.percentile(vals, [lo, hi])
    if not np.isfinite(hi_v) or hi_v == lo_v:
        # widen tiny ranges
        hi_v = lo_v + 1e-9
    return (lo_v, hi_v)

def _save_show(fig, path: Path):
    fig.savefig(path, bbox_inches="tight")
    plt.show()
    plt.close(fig)
    manifest.append(str(path))

# --- 1) Global violin: burstiness_token_cv (guard empty)
# Only finite values are plotted; a placeholder figure is saved when none remain.
vals = (
    df_nltk["burstiness_token_cv"]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .to_numpy()
)
fig, ax = plt.subplots(dpi=120)
if vals.size > 0:
    ax.violinplot(vals, showmeans=True, showextrema=True, showmedians=True)
    ax.set_title("Global burstiness (token CV across sentences)")
    ax.set_ylabel("CV")
    out = PLOTS / "global_burstiness_violin.png"
else:
    ax.text(0.5, 0.5, "No burstiness data to plot", ha="center", va="center")
    ax.set_axis_off()
    out = PLOTS / "global_burstiness_violin_EMPTY.png"
_save_show(fig, out)
global_ok = vals.size > 0

# --- 2) Per-article radar with version overlays (v1..v4…), normalized by global 5–95th pct
cols = ["stopword_ratio", "sent_len_mean", "burstiness_token_cv"]
# Per-metric (lo, hi) bounds over the whole table; an entry is None when the
# metric has no finite data.
norms = {c: _finite_percentiles(df_nltk[c]) for c in cols}

def _radar_overlay(df_art: pd.DataFrame, art_label: str):
    """Draw one polar chart for an article, with one closed polygon per version.

    Every metric named in the module-level ``cols`` list is rescaled to [0, 1]
    with that metric's (lo, hi) pair from the module-level ``norms``. If the
    global pair is None, the article's own percentiles are used, and
    (0.0, 1.0) if those are unavailable too. Non-finite values are drawn at 0.0.

    Parameters
    ----------
    df_art : rows for a single article; must contain ``version_id`` and every
        column listed in ``cols``.
    art_label : text appended to the figure title.

    Returns
    -------
    The matplotlib figure; saving and closing it is left to the caller. If
    drawing fails, the figure is closed here before the exception propagates.
    """
    labels = cols
    # Global norms first, then per-article percentiles, then a unit range.
    local_norms = {
        c: norms[c] or _finite_percentiles(df_art[c]) or (0.0, 1.0) for c in cols
    }

    # One angle per metric; the first angle is repeated to close the polygon.
    angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig = plt.figure(dpi=120)
    try:
        # Create the axes on this figure explicitly rather than on pyplot's "current" one.
        ax = fig.add_subplot(111, polar=True)
        for _, row in df_art.sort_values("version_id").iterrows():
            vec = []
            for c in labels:
                lo_v, hi_v = local_norms[c]
                x = float(row[c])
                if not np.isfinite(x):
                    vec.append(0.0)
                else:
                    vec.append(np.clip((x - lo_v) / (hi_v - lo_v), 0.0, 1.0))
            vec += vec[:1]
            ax.plot(angles, vec, marker="o", label=f"v{int(row['version_id'])}")
            ax.fill(angles, vec, alpha=0.05)

        ax.set_xticks(angles[:-1]); ax.set_xticklabels(labels)
        ax.set_title(f"Lex profile (overlay by version) — {art_label}", pad=12)
        ax.set_rlabel_position(0)
        ax.grid(True, alpha=0.3)
        ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.10))
    except Exception:
        # The caller catches errors and continues; do not leave the figure open.
        plt.close(fig)
        raise
    return fig

# Render at most MAX_ARTICLES radar overlays; any failure is reported as JSON
# and flips radar_ok instead of aborting the cell.
radar_ok = True
shown = 0
try:
    for art, g in df_nltk.groupby("article_id", sort=False):
        if not shown < MAX_ARTICLES:
            break
        fig = _radar_overlay(g, art_label=str(art))
        out = PLOTS / f"radar_overlay_{_safe_slug(art)}.png"
        _save_show(fig, out)
        shown += 1
except Exception as e:
    radar_ok = False
    print(json.dumps({"plot":"per-article_radar_overlay","error":str(e)}))

# --- 3) Per-article trends across versions (stopword_ratio & burstiness)
# (metric column, title prefix, file-name stem) for each trend figure, in save order.
_trend_specs = (
    ("stopword_ratio", "Stopword ratio across versions", "trend_stopword_ratio_"),
    ("burstiness_token_cv", "Burstiness (token CV) across versions", "trend_burstiness_cv_"),
)
trends_ok = True
shown = 0
try:
    for art, g in df_nltk.groupby("article_id", sort=False):
        if not shown < MAX_ARTICLES:
            break
        gg = g.sort_values("version_id")
        for metric, heading, stem in _trend_specs:
            fig, ax = plt.subplots(dpi=120)
            ax.plot(gg["version_id"], gg[metric], marker="o")
            ax.set_title(f"{heading} — {art}")
            ax.set_xlabel("version_id")
            ax.set_ylabel(metric)
            ax.set_xticks(gg["version_id"])
            _save_show(fig, PLOTS / f"{stem}{_safe_slug(art)}.png")
        shown += 1
except Exception as e:
    trends_ok = False
    print(json.dumps({"plot":"per-article_trends","error":str(e)}))

# --- 4) Manifest of generated plot files
_index_json = json.dumps({"files": manifest}, indent=2)
(PLOTS / "plots_index.json").write_text(_index_json, encoding="utf-8")

# Cell summary: one flag per plot family plus the number of files written.
_plot_flags = {
    "global_burstiness_violin": bool(global_ok),
    "per_article_radar_overlay": bool(radar_ok),
    "per_article_trends": bool(trends_ok),
}
_report_24 = {
    "cell_id":"2.4",
    "status":"pass",
    "plots": _plot_flags,
    "plots_dir": str(PLOTS),
    "manifest_count": len(manifest)
}
print(json.dumps(_report_24, indent=2))


In [None]:
# cell 2.4add — fill the gaps only (doc parquet, deltas, minimal metadata, radar filename copy)
from __future__ import annotations
import json, math, statistics, shutil, re
from collections import Counter
from pathlib import Path
from typing import Dict, Any, List
import numpy as np
import pandas as pd

# Output locations for this cell (created if absent).
OUTDIR = Path("outputs/nltk")
OUTDIR.mkdir(parents=True, exist_ok=True)
PLOTS = OUTDIR / "plots"
PLOTS.mkdir(parents=True, exist_ok=True)

doc_path, deltas_path, meta_path = (
    OUTDIR / "fw_burstiness.parquet",
    OUTDIR / "fw_burstiness_deltas.parquet",
    OUTDIR / "metadata.json",
)

# --- reuse tokenizers from 2.2; load if not in memory ---
# All three objects are (re)loaded as soon as any one of them is missing.
if not {"STOPWORDS", "PUNKT", "TB"} <= set(globals()):
    import nltk
    from nltk.corpus import stopwords
    from nltk.data import load as nltk_load
    from nltk.tokenize import TreebankWordTokenizer
    STOPWORDS = set(stopwords.words("english"))
    PUNKT = nltk_load("tokenizers/punkt/english.pickle")
    TB = TreebankWordTokenizer()

def _cv(vals: List[int|float]) -> float:
    vals = [float(v) for v in vals if v is not None]
    if not vals: return float("nan")
    mu = statistics.mean(vals)
    if mu == 0 or len(vals) < 2: return float("nan")
    return statistics.pstdev(vals) / mu

def _gather_docs() -> pd.DataFrame:
    # prefer df2_input from 2.3; otherwise use df_docs/DOCS
    if "df2_input" in globals():
        base = df2_input.copy()
        return base[["article_id","version_id","version_tag","text_basis"]]
    try:
        base = df_docs.copy()
    except NameError:
        base = pd.DataFrame(DOCS)
    if "article_id" not in base.columns:
        base["article_id"] = (base["slug"] if "slug" in base.columns else base.index).astype(str)
    if "version_id" not in base.columns:
        base["version_id"] = 1
    if "version_tag" not in base.columns:
        base["version_tag"] = "v" + base["version_id"].astype(str)
    for c in ("text_norm","text_clean","text"):
        if c in base.columns:
            base["text_basis"] = base[c]; break
    else:
        raise RuntimeError("2.4add: no text column found (text_norm/text_clean/text).")
    return base[["article_id","version_id","version_tag","text_basis"]]

def _doc_features(text: str) -> Dict[str, Any]:
    s_spans = list(PUNKT.span_tokenize(text))
    n_sents = len(s_spans)
    sent_token_counts, per_sent_content = [], []
    stop_count, total_tokens = 0, 0
    tokens_alpha = []
    for (s, e) in s_spans:
        s_txt = text[s:e]
        toks = [w for w in TB.tokenize(s_txt)]
        total_tokens += len(toks)
        sent_token_counts.append(len(toks))
        stop_count += sum(1 for w in toks if w.lower() in STOPWORDS)
        alpha = [w.lower() for w in toks if w.isalpha()]
        tokens_alpha.extend(alpha)
        per_sent_content.append(Counter([w for w in alpha if w not in STOPWORDS]))
    type_counts = Counter(tokens_alpha)
    types_total = len(type_counts)
    hapax = sum(1 for _, c in type_counts.items() if c == 1)
    hapax_rate = (hapax / types_total) if types_total > 0 else float("nan")
    mean_len = statistics.mean(sent_token_counts) if n_sents > 0 else float("nan")
    std_len  = statistics.pstdev(sent_token_counts) if n_sents > 1 else float("nan")
    burst_cv = _cv(sent_token_counts) if (n_sents >= 2 and total_tokens >= 10) else float("nan")
    # top-K content token dispersion
    K = 20
    global_counts = Counter([w for lst in ([list(c.elements()) for c in per_sent_content]) for w in lst])
    topk = [w for (w, _) in global_counts.most_common(min(K, len(global_counts)))]
    cvs = []
    if (n_sents >= 2 and total_tokens >= 10) and topk:
        for w in topk:
            vec = [c.get(w, 0) for c in per_sent_content]
            cvs.append(_cv(vec))
    burst_topk_mean_cv = (statistics.mean([v for v in cvs if not math.isnan(v)]) if cvs else float("nan"))
    stop_rate = (stop_count / total_tokens) if total_tokens > 0 else float("nan")
    content_rate = 1.0 - stop_rate if not math.isnan(stop_rate) else float("nan")
    return {
        "n_tokens_nltk": int(total_tokens),
        "n_sents_nltk": int(n_sents),
        "mean_sent_len_tok_nltk": float(mean_len) if not math.isnan(mean_len) else float("nan"),
        "std_sent_len_tok_nltk": float(std_len) if not math.isnan(std_len) else float("nan"),
        "stopword_rate": float(stop_rate) if not math.isnan(stop_rate) else float("nan"),
        "content_rate": float(content_rate) if not math.isnan(content_rate) else float("nan"),
        "hapax_rate": float(hapax_rate) if not math.isnan(hapax_rate) else float("nan"),
        "function_word_rate_nltk": float(stop_rate) if not math.isnan(stop_rate) else float("nan"),
        "burstiness_token_cv": float(burst_cv) if not math.isnan(burst_cv) else float("nan"),
        "burstiness_topk_mean_cv": float(burst_topk_mean_cv) if not math.isnan(burst_topk_mean_cv) else float("nan"),
    }

# Tracks which artifacts this cell actually produced.
made = {"doc": False, "deltas": False, "radar_copied": 0, "metadata_updated": False}

# --- (1) doc parquet: only if missing ---
if doc_path.exists():
    df_doc = pd.read_parquet(doc_path)
else:
    docs = _gather_docs()
    rows = [
        {
            **_doc_features(txt),
            "article_id": str(art),
            "version_id": int(ver),
            "version_tag": str(vtag),
            "doc_id": f"{art}-v{ver}",
        }
        for art, ver, vtag, txt in docs.itertuples(index=False)
    ]
    df_doc = pd.DataFrame(rows).sort_values(["article_id","version_id"])
    # dtypes (applied only to columns that are present)
    dtypes = {
        "doc_id":"string","article_id":"string","version_id":"int64","version_tag":"string",
        "n_tokens_nltk":"int64","n_sents_nltk":"int64",
        "mean_sent_len_tok_nltk":"float64","std_sent_len_tok_nltk":"float64",
        "stopword_rate":"float64","content_rate":"float64","hapax_rate":"float64","function_word_rate_nltk":"float64",
        "burstiness_token_cv":"float64","burstiness_topk_mean_cv":"float64",
    }
    df_doc = df_doc.astype({c: t for c, t in dtypes.items() if c in df_doc.columns})
    df_doc.to_parquet(doc_path, index=False)
    made["doc"] = True

# --- (2) deltas parquet: only if missing (uses doc parquet just built/loaded) ---
if not deltas_path.exists():
    num_cols = [
        "n_tokens_nltk","n_sents_nltk","mean_sent_len_tok_nltk","std_sent_len_tok_nltk",
        "stopword_rate","content_rate","hapax_rate","function_word_rate_nltk",
        "burstiness_token_cv","burstiness_topk_mean_cv",
    ]
    id_types = {"article_id":"string","from_version":"int64","to_version":"int64"}
    # Explicit schema: when no article has two versions the row list is empty,
    # and a column-less frame would make astype() fail on the missing keys.
    delta_cols = list(id_types) + [f"delta_{c}" for c in num_cols]
    delta_rows = []
    for art, g in df_doc.groupby("article_id", sort=False):
        g = g.sort_values("version_id")
        # Compare each version with its immediate successor.
        for i in range(len(g)-1):
            a, b = g.iloc[i], g.iloc[i+1]
            if int(b["version_id"]) - int(a["version_id"]) != 1:
                raise AssertionError(f"Non-adjacent versions for {art}: {a['version_id']}→{b['version_id']}")
            rec = {"article_id": str(art), "from_version": int(a["version_id"]), "to_version": int(b["version_id"])}
            for c in num_cols:
                # A delta is NaN as soon as either side is missing.
                rec[f"delta_{c}"] = float(b[c]) - float(a[c]) if pd.notna(b[c]) and pd.notna(a[c]) else float("nan")
            delta_rows.append(rec)
    (
        pd.DataFrame(delta_rows, columns=delta_cols)
        .astype({**id_types, **{f"delta_{c}": "float64" for c in num_cols}})
        .to_parquet(deltas_path, index=False)
    )
    made["deltas"] = True

# --- (3) metadata: merge a few essentials (non-destructive) ---
try:
    if meta_path.exists():
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    else:
        meta = {}
except Exception:
    # Unreadable or malformed metadata is replaced with an empty dict.
    meta = {}

per_counts = df_doc.groupby("article_id")["version_id"].nunique()
_has_articles = len(per_counts) > 0
_meta_summary = {
    "module": "2",
    # Values already recorded for these two keys win over the defaults.
    "version_order_source": meta.get("version_order_source", "filename_prefix"),
    "articles": sorted(df_doc["article_id"].astype(str).unique().tolist()),
    "n_articles": int(df_doc["article_id"].nunique()),
    "versions_per_article_min": int(per_counts.min()) if _has_articles else 0,
    "versions_per_article_max": int(per_counts.max()) if _has_articles else 0,
    "expected_versions": meta.get("expected_versions", 4),
}
meta.update(_meta_summary)
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
made["metadata_updated"] = True

# --- (4) filenames: copy radar_overlay_* → stopword_radar_* if needed (no re-render) ---
for src in PLOTS.glob("radar_overlay_*.png"):
    dst = PLOTS / src.name.replace("radar_overlay_", "stopword_radar_")
    if dst.exists():
        continue  # never overwrite an existing target
    shutil.copy2(src, dst)
    made["radar_copied"] += 1

_report_24add = {"cell_id":"2.4add","status":"pass","made": made}
print(json.dumps(_report_24add, indent=2))


In [None]:
# cell 2.3b-min — doc-level table + adjacent deltas (+ light metadata)
from __future__ import annotations
import json, math, statistics
from collections import Counter
from pathlib import Path
from typing import Dict, Any, List
import numpy as np, pandas as pd

OUTDIR = Path("outputs/nltk")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Reuse tokenizers if present; otherwise load them.
# A single missing name triggers a reload of all three (from 2.2).
if any(_name not in globals() for _name in ("STOPWORDS", "PUNKT", "TB")):
    import nltk
    from nltk.corpus import stopwords
    from nltk.data import load as nltk_load
    from nltk.tokenize import TreebankWordTokenizer
    STOPWORDS = set(stopwords.words("english"))
    PUNKT = nltk_load("tokenizers/punkt/english.pickle")
    TB = TreebankWordTokenizer()

def _cv(vals: List[int|float]) -> float:
    vals = [float(v) for v in vals if v is not None]
    if not vals: return float("nan")
    mu = statistics.mean(vals)
    if mu == 0 or len(vals) < 2: return float("nan")
    return statistics.pstdev(vals) / mu

def _gather_docs_for_doclevel() -> pd.DataFrame:
    # Prefer df2_input built in 2.3; else rebuild from df_docs/DOCS.
    if "df2_input" in globals():
        return df2_input[["article_id","version_id","version_tag","text_basis"]].copy()
    try:
        base = df_docs.copy()
    except NameError:
        base = pd.DataFrame(DOCS)
    if "article_id" not in base.columns:
        base["article_id"] = (base["slug"] if "slug" in base.columns else base.index).astype(str)
    if "version_id" not in base.columns:
        base["version_id"] = 1
    if "version_tag" not in base.columns:
        base["version_tag"] = "v" + base["version_id"].astype(str)
    for c in ("text_norm","text_clean","text"):
        if c in base.columns:
            base["text_basis"] = base[c]; break
    else:
        raise RuntimeError("2.3b-min: no text column among text_norm/text_clean/text.")
    return base[["article_id","version_id","version_tag","text_basis"]].copy()

def _doc_features(text: str) -> Dict[str, Any]:
    s_spans = list(PUNKT.span_tokenize(text))
    n_sents = len(s_spans)
    sent_token_counts, per_sent_content = [], []
    stop_count, total_tokens = 0, 0
    tokens_alpha = []

    for (s, e) in s_spans:
        s_txt = text[s:e]
        toks = [w for w in TB.tokenize(s_txt)]
        total_tokens += len(toks)
        sent_token_counts.append(len(toks))
        stop_count += sum(1 for w in toks if w.lower() in STOPWORDS)
        alpha = [w.lower() for w in toks if w.isalpha()]
        tokens_alpha.extend(alpha)
        per_sent_content.append(Counter([w for w in alpha if w not in STOPWORDS]))

    type_counts = Counter(tokens_alpha)
    types_total = len(type_counts)
    hapax = sum(1 for _, c in type_counts.items() if c == 1)
    hapax_rate = (hapax / types_total) if types_total > 0 else float("nan")

    mean_len = statistics.mean(sent_token_counts) if n_sents > 0 else float("nan")
    std_len  = statistics.pstdev(sent_token_counts) if n_sents > 1 else float("nan")

    burst_cv = _cv(sent_token_counts) if (n_sents >= 2 and total_tokens >= 10) else float("nan")

    # CV of per-sentence counts for top-K content tokens (K=20)
    K = 20
    global_counts = Counter([w for lst in ([list(c.elements()) for c in per_sent_content]) for w in lst])
    topk = [w for (w, _) in global_counts.most_common(min(K, len(global_counts)))]
    cvs = []
    if (n_sents >= 2 and total_tokens >= 10) and topk:
        for w in topk:
            vec = [c.get(w, 0) for c in per_sent_content]
            cvs.append(_cv(vec))
    burst_topk_mean_cv = (statistics.mean([v for v in cvs if not math.isnan(v)]) if cvs else float("nan"))

    stop_rate = (stop_count / total_tokens) if total_tokens > 0 else float("nan")
    content_rate = 1.0 - stop_rate if not math.isnan(stop_rate) else float("nan")

    return {
        "n_tokens_nltk": int(total_tokens),
        "n_sents_nltk": int(n_sents),
        "mean_sent_len_tok_nltk": float(mean_len) if not math.isnan(mean_len) else float("nan"),
        "std_sent_len_tok_nltk": float(std_len) if not math.isnan(std_len) else float("nan"),
        "stopword_rate": float(stop_rate) if not math.isnan(stop_rate) else float("nan"),
        "content_rate": float(content_rate) if not math.isnan(content_rate) else float("nan"),
        "hapax_rate": float(hapax_rate) if not math.isnan(hapax_rate) else float("nan"),
        "function_word_rate_nltk": float(stop_rate) if not math.isnan(stop_rate) else float("nan"),
        "burstiness_token_cv": float(burst_cv) if not math.isnan(burst_cv) else float("nan"),
        "burstiness_topk_mean_cv": float(burst_topk_mean_cv) if not math.isnan(burst_topk_mean_cv) else float("nan"),
    }

# Featurize every document and tag the record with its identifiers.
docs_df = _gather_docs_for_doclevel()
doc_rows = []
for rec in docs_df.itertuples(index=False):
    art, ver, vtag, txt = rec
    feats = _doc_features(txt)
    feats.update(article_id=str(art), version_id=int(ver), version_tag=str(vtag))
    feats["doc_id"] = f"{art}-v{ver}"
    doc_rows.append(feats)

df_doc = pd.DataFrame(doc_rows).sort_values(["article_id","version_id"], kind="stable")
# dtypes (only for columns that exist in the frame)
doc_types = {
    "doc_id":"string","article_id":"string","version_id":"int64","version_tag":"string",
    "n_tokens_nltk":"int64","n_sents_nltk":"int64",
    "mean_sent_len_tok_nltk":"float64","std_sent_len_tok_nltk":"float64",
    "stopword_rate":"float64","content_rate":"float64","hapax_rate":"float64","function_word_rate_nltk":"float64",
    "burstiness_token_cv":"float64","burstiness_topk_mean_cv":"float64",
}
df_doc = df_doc.astype({c: t for c, t in doc_types.items() if c in df_doc.columns})
df_doc.to_parquet(OUTDIR / "fw_burstiness.parquet", index=False)

# adjacent deltas
num_cols = [
    "n_tokens_nltk","n_sents_nltk","mean_sent_len_tok_nltk","std_sent_len_tok_nltk",
    "stopword_rate","content_rate","hapax_rate","function_word_rate_nltk",
    "burstiness_token_cv","burstiness_topk_mean_cv",
]
_delta_id_types = {"article_id":"string","from_version":"int64","to_version":"int64"}
# Explicit schema: when every article has a single version no delta row exists,
# and a column-less frame would make astype() fail on the missing keys.
_delta_cols = list(_delta_id_types) + [f"delta_{c}" for c in num_cols]
delta_rows = []
for art, g in df_doc.groupby("article_id", sort=False):
    g = g.sort_values("version_id")
    # Compare each version with its immediate successor.
    for i in range(len(g)-1):
        a, b = g.iloc[i], g.iloc[i+1]
        if int(b["version_id"]) - int(a["version_id"]) != 1:
            raise AssertionError(f"Non-adjacent versions for {art}: {a['version_id']}→{b['version_id']}")
        rec = {"article_id": str(art), "from_version": int(a["version_id"]), "to_version": int(b["version_id"])}
        for c in num_cols:
            # A delta is NaN as soon as either side is missing.
            rec[f"delta_{c}"] = float(b[c]) - float(a[c]) if pd.notna(b[c]) and pd.notna(a[c]) else float("nan")
        delta_rows.append(rec)

df_deltas = pd.DataFrame(delta_rows, columns=_delta_cols).astype(
    {**_delta_id_types, **{f"delta_{c}": "float64" for c in num_cols}}
)
df_deltas.to_parquet(OUTDIR / "fw_burstiness_deltas.parquet", index=False)

# minimal metadata enrichment (merge)
meta_path = OUTDIR / "metadata.json"
try:
    if meta_path.exists():
        base = json.loads(meta_path.read_text(encoding="utf-8"))
    else:
        base = {}
except Exception:
    # Unreadable or malformed metadata is replaced with an empty dict.
    base = {}

per_counts = df_doc.groupby("article_id")["version_id"].nunique()
_n_groups = len(per_counts)
base.update({
    "module": "2",
    "version_order_source": "filename_prefix",
    "articles": sorted(df_doc["article_id"].astype(str).unique().tolist()),
    "n_articles": int(df_doc["article_id"].nunique()),
    "versions_per_article_min": int(per_counts.min()) if _n_groups else 0,
    "versions_per_article_max": int(per_counts.max()) if _n_groups else 0,
    "expected_versions": 4,
})
meta_path.write_text(json.dumps(base, indent=2), encoding="utf-8")

_report_23b = {
    "cell_id":"2.3b-min",
    "status":"pass",
    "doc_rows": int(df_doc.shape[0]),
    "delta_rows": int(df_deltas.shape[0]),
    "artifacts": ["fw_burstiness.parquet","fw_burstiness_deltas.parquet","metadata.json"]
}
print(_report_23b)


In [None]:
# cell 2.4fix — only backfill missing per-article plots; do nothing if already present
import json, re
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

# The backfill reads df_nltk; stop early when it has not been built.
_have_df_nltk = "df_nltk" in globals()
if not _have_df_nltk:
    raise RuntimeError("2.4fix: df_nltk not found. Run 2.3/2.4 first.")

PLOTS = Path("outputs/nltk/plots")
PLOTS.mkdir(parents=True, exist_ok=True)

def _safe_slug(s: str) -> str:
    return re.sub(r"[^-_.a-zA-Z0-9]", "_", str(s))

# global norms for radar (stopword_ratio, sent_len_mean, burstiness_token_cv)
cols = ["stopword_ratio", "sent_len_mean", "burstiness_token_cv"]

def _norm_bounds(series):
    """5th/95th percentile pair of the finite numeric values; (0.0, 1.0) if there are none."""
    finite = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return (0.0, 1.0)
    low, high = np.percentile(finite.to_numpy(), [5, 95])
    if high == low:
        high = low + 1e-9  # avoid a zero-width range
    return (float(low), float(high))

norms = {c: _norm_bounds(df_nltk[c]) for c in cols}

# Counters of figures rendered by this cell, per plot family.
made = {"radar": 0, "trend_sr": 0, "trend_cv": 0}
# (metric column, counter key in `made`, title prefix, file-name stem)
_trend_specs = (
    ("stopword_ratio", "trend_sr", "Stopword ratio across versions", "trend_stopword_ratio_"),
    ("burstiness_token_cv", "trend_cv", "Burstiness (token CV) across versions", "trend_burstiness_cv_"),
)
for art, g in df_nltk.groupby("article_id", sort=False):
    gg = g.sort_values("version_id")
    s = _safe_slug(art)

    # radar overlay per article (spec filename); rendered only when the file is absent
    rp = PLOTS / f"stopword_radar_{s}.png"
    if not rp.exists():
        labels = cols
        angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
        angles = angles + angles[:1]
        fig = plt.figure(dpi=120)
        ax = plt.subplot(111, polar=True)
        for _, row in gg.iterrows():
            vec = []
            for c in labels:
                lo, hi = norms[c]
                x = float(row[c]) if pd.notna(row[c]) else np.nan
                # Non-finite values are drawn at 0.0; the rest are scaled and clipped to [0, 1].
                vec.append(np.clip((x - lo)/(hi - lo), 0.0, 1.0) if np.isfinite(x) else 0.0)
            vec = vec + vec[:1]
            ax.plot(angles, vec, marker="o", label=f"v{int(row['version_id'])}")
            ax.fill(angles, vec, alpha=0.05)
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels)
        ax.set_title(f"Lex profile (overlay by version) — {art}", pad=12)
        ax.set_rlabel_position(0)
        ax.grid(True, alpha=0.3)
        ax.legend(loc="upper right", bbox_to_anchor=(1.25,1.10))
        fig.savefig(rp, bbox_inches="tight")
        plt.close(fig)
        made["radar"] += 1

    # trends: stopword ratio, then burstiness CV (each only when its file is absent)
    for metric, counter_key, heading, stem in _trend_specs:
        target = PLOTS / f"{stem}{s}.png"
        if target.exists():
            continue
        fig, ax = plt.subplots(dpi=120)
        ax.plot(gg["version_id"], gg[metric], marker="o")
        ax.set_title(f"{heading} — {art}")
        ax.set_xlabel("version_id")
        ax.set_ylabel(metric)
        ax.set_xticks(gg["version_id"])
        fig.savefig(target, bbox_inches="tight")
        plt.close(fig)
        made[counter_key] += 1

# refresh index: list every PNG currently in the plots directory, sorted by name
_png_names = sorted(p.name for p in PLOTS.glob("*.png"))
_index_path = PLOTS / "plots_index.json"
_index_path.write_text(json.dumps({"files": _png_names}, indent=2), encoding="utf-8")

_report_24fix = {"cell_id":"2.4fix","status":"pass","made": made, "plots_dir": str(PLOTS)}
print(_report_24fix)


In [None]:
# cell 3.1 — spaCy setup: install, model load, and environment report (CPU-only)
# Safe, wheels-first install; fail-fast if model cannot be loaded.

import sys, subprocess, json, os
from pathlib import Path

def _pip(*args: str):
    """Run ``<current interpreter> -m pip <args...>``.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip"]
    command.extend(args)
    subprocess.check_call(command)

# Install spacy if missing
# The first import attempt decides whether pip is invoked at all; any import
# failure (not only a missing package) triggers the wheels-only install below.
try:
    import spacy  # type: ignore
except Exception:
    # Pinned to the 3.7 series; --only-binary avoids source builds.
    _pip("install", "-q", "--no-cache-dir", "--only-binary=:all:", "spacy>=3.7,<3.8")
    # Second import runs outside the try, so a failure here propagates.
    import spacy  # type: ignore

# Try to load the small English pipeline; download if missing
def _ensure_en_sm():
    """Return the ``en_core_web_sm`` pipeline with NER excluded, downloading it if needed.

    Returns:
        The loaded spaCy ``Language`` object.

    Raises:
        RuntimeError: if the model can be neither loaded nor downloaded. The
            underlying failure is chained as ``__cause__``.
    """
    import importlib

    try:
        return spacy.load("en_core_web_sm", exclude=["ner"])
    except Exception:
        try:
            from spacy.cli import download
            download("en_core_web_sm")
            # The model package was installed after interpreter start; refresh the
            # import system's finder caches so the retry can see it.
            importlib.invalidate_caches()
            return spacy.load("en_core_web_sm", exclude=["ner"])
        except (Exception, SystemExit) as e:
            # spaCy's CLI helpers can terminate via sys.exit() on failure; SystemExit
            # is not an Exception subclass, so catch it explicitly to keep the
            # fail-fast message below reachable.
            raise RuntimeError(
                "Module 3: Could not load/download 'en_core_web_sm'. "
                "Please ensure internet access and rerun this cell."
            ) from e

# Load the pipeline once; raise the length guard so long documents are accepted.
nlp = _ensure_en_sm()
nlp.max_length = int(os.getenv("LSA_SPACY_MAXLEN", "2000000"))

# Process count and batch size for nlp.pipe, overridable via environment variables.
NPROC = int(os.getenv("LSA_SPACY_PROCS", "1"))
BATCH = int(os.getenv("LSA_SPACY_BATCH", "16"))

# Module 3 output directories.
SP_OUT = Path("outputs/spacy")
SP_OUT.joinpath("plots").mkdir(parents=True, exist_ok=True)
SP_OUT.joinpath("bundles").mkdir(parents=True, exist_ok=True)

# Environment report for this cell.
print(json.dumps(
    dict(
        cell_id="3.1",
        status="pass",
        spacy_version=spacy.__version__,
        model="en_core_web_sm",
        components_enabled=[name for name in nlp.pipe_names if name != "ner"],
        nlp_max_length=nlp.max_length,
        n_process=NPROC,
        batch_size=BATCH,
    ),
    indent=2,
))


In [None]:
# cell 3.1b — ensure pyarrow present for Parquet IO (safe no-op if already installed)
import sys, subprocess
# Report the installed pyarrow version, or install a binary wheel in the
# 14.x-15.x range when the import fails for any reason.
try:
    import pyarrow # noqa
    print({"cell_id":"3.1b","status":"pass","pyarrow":pyarrow.__version__})
except Exception:
    # check_call raises CalledProcessError if pip fails, stopping the cell here.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--only-binary=:all:", "pyarrow>=14,<16"])
    import pyarrow
    print({"cell_id":"3.1b","status":"installed","pyarrow":pyarrow.__version__})


In [None]:
# cell 3.2 — Inputs & ordering: gather documents and Module-2 windows (if present)
# Rules:
# - Version order comes from numeric version_id (derived from filename prefix 01–04 in earlier modules).
# - For windows, prefer the authoritative M2 parquet: outputs/nltk/fw_burstiness_windows.parquet

import json
from pathlib import Path
import pandas as pd

# Try to reuse df2_input from Module 2; else fall back to df_docs/DOCS (as in Modules 1–2).
def _gather_docs():
    # Expect columns: article_id, version_id, version_tag, text, text_clean (optional)
    if "df2_input" in globals():
        df = df2_input.copy()
        # df2_input guaranteed to have article_id, version_id, version_tag and text_basis/span_basis used in M2; recover raw text columns if present
        # If raw text columns are not present, assume text_basis is the preferred doc-level string.
        if "text" not in df.columns and "text_clean" not in df.columns:
            df["text"] = df["text_basis"]
        return df[["article_id","version_id","version_tag","text","text_clean"]].copy() if "text_clean" in df.columns else df[["article_id","version_id","version_tag","text"]].copy()
    try:
        base = df_docs.copy()
    except NameError:
        try:
            base = pd.DataFrame(DOCS)
        except NameError:
            raise RuntimeError("Module 3: Could not find df2_input / df_docs / DOCS. Please run Modules 1–2 or provide documents.")
    if "article_id" not in base.columns:
        base["article_id"] = (base["slug"] if "slug" in base.columns else base.index).astype(str)
    if "version_id" not in base.columns:
        base["version_id"] = 1
    if "version_tag" not in base.columns:
        base["version_tag"] = "v" + base["version_id"].astype(str)
    # Ensure at least 'text'
    for c in ("text","text_clean","text_basis"):
        if c in base.columns:
            break
    else:
        raise RuntimeError("Module 3: No text-like column found (text/text_clean/text_basis).")
    # Normalize columns
    if "text" not in base.columns and "text_basis" in base.columns:
        base["text"] = base["text_basis"]
    cols = ["article_id","version_id","version_tag","text"] + (["text_clean"] if "text_clean" in base.columns else [])
    return base[cols].copy()

# Documents: normalise key dtypes, then order by (article_id, numeric version_id).
DOCS_DF = (
    _gather_docs()
    .astype({"article_id": "string", "version_tag": "string"})
    .assign(version_id=lambda d: pd.to_numeric(d["version_id"], errors="raise").astype("int64"))
    .sort_values(["article_id", "version_id"], kind="stable")
    .reset_index(drop=True)
)

# Try to load M2 windows (recommended path)
M2_WIN_PATH = Path("outputs/nltk/fw_burstiness_windows.parquet")
DFW = None
if M2_WIN_PATH.exists():
    DFW = pd.read_parquet(M2_WIN_PATH)
    # Columns required for alignment; anything else is dropped.
    need = [
        "article_id", "version_id", "version_tag", "doc_id", "win_id", "win_label",
        "span_basis", "char_start", "char_end", "sent_start_index", "sent_end_index",
        "is_partial_tail",
    ]
    missing = [c for c in need if c not in DFW.columns]
    if missing:
        raise RuntimeError(f"Module 3: windows parquet missing required columns: {missing}")
    DFW = DFW[need].copy()
    # Normalise dtypes: string key, int64 for every numeric key/offset column.
    DFW["article_id"] = DFW["article_id"].astype("string")
    for c in ("version_id", "win_id", "char_start", "char_end", "sent_start_index", "sent_end_index"):
        DFW[c] = pd.to_numeric(DFW[c], errors="raise").astype("int64")

print(json.dumps(
    dict(
        cell_id="3.2",
        status="pass",
        docs_rows=int(len(DOCS_DF)),
        windows_detected=DFW is not None,
        windows_rows=0 if DFW is None else int(len(DFW)),
        version_order_source="filename_prefix (carried via version_id)",
    ),
    indent=2,
))


In [None]:
# cell 3.3 — Discourse markers: ensure lexicon exists and load (greedy multiword-first)
# Path: lexicons/discourse_markers_en.txt (one per line). We create a default if missing.

import json, re
from pathlib import Path

# Lexicon location; the directory is created on first run.
LEX_DIR = Path("lexicons"); LEX_DIR.mkdir(parents=True, exist_ok=True)
LEX_PATH = LEX_DIR / "discourse_markers_en.txt"

# Default lexicon, written only when no lexicon file exists yet.
# Each marker is listed once ("as a result" was previously listed twice).
_DEFAULT_MARKERS = [
    # multiwords first (we'll sort by token count at load time anyway)
    "on the other hand", "as a result", "for example", "for instance", "in contrast",
    "in particular", "in summary", "in fact", "by contrast", "in addition",
    "as a consequence", "to the contrary", "on the contrary",
    "at the same time", "in other words", "for that reason",
    # single words
    "however", "moreover", "therefore", "meanwhile", "nonetheless",
    "furthermore", "consequently", "overall", "specifically", "additionally"
]

# Never overwrite an existing (possibly user-edited) lexicon.
if not LEX_PATH.exists():
    LEX_PATH.write_text("\n".join(_DEFAULT_MARKERS) + "\n", encoding="utf-8")

def load_markers(path: Path):
    """Load the discourse-marker lexicon and compile one regex per marker.

    Blank lines and lines starting with ``#`` are skipped; entries are
    stripped, lower-cased and de-duplicated.

    Args:
        path: UTF-8 text file with one marker per line.

    Returns:
        ``(items, patterns)``: ``items`` is the ordered marker list and
        ``patterns`` is a parallel list of ``(marker, compiled_regex)`` pairs.
        Order is token count descending, then character length descending,
        then alphabetical, so longer markers are tried first and the order is
        the same on every run.
    """
    unique = set()
    for raw in path.read_text(encoding="utf-8").splitlines():
        s = raw.strip().lower()
        if s and not s.startswith("#"):
            unique.add(s)
    # The marker itself is the last sort key: without it, ties would keep the
    # set's iteration order, which varies between interpreter runs.
    items = sorted(unique, key=lambda m: (-len(m.split()), -len(m), m))
    # precompile regex with word boundaries, case-insensitive
    patterns = [(m, re.compile(rf"\b{re.escape(m)}\b", flags=re.IGNORECASE)) for m in items]
    return items, patterns

# Load the lexicon once; later cells read DM_MARKERS / DM_PATTERNS.
DM_MARKERS, DM_PATTERNS = load_markers(LEX_PATH)

print(json.dumps(
    dict(
        cell_id="3.3",
        status="pass",
        lexicon_path=str(LEX_PATH),
        lexicon_size=len(DM_MARKERS),
        sample=DM_MARKERS[:8],
    ),
    indent=2,
))


In [None]:
# cell 3.4 — Parsing & sentence-level features (parse each version once, per needed basis)
# We parse doc-level basis (text_clean if present else text) AND any span_basis required by M2 windows.

import json, math
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd
import spacy

# ---- basis planning: which strings do we need to parse per (article_id, version_id)?
def _doc_basis_for_doclevel(row) -> str:
    return "text_clean" if "text_clean" in row.index and isinstance(row["text_clean"], str) and row["text_clean"] else "text"

# Bases ("text" / "text_clean") that must be parsed somewhere in the corpus.
NEEDED_BASIS = set()
# Always include the doc-level basis of every document row
for _, r in DOCS_DF.iterrows():
    NEEDED_BASIS.add(_doc_basis_for_doclevel(r))
# Add span_basis values used by M2 windows, restricted to the two known bases
if DFW is not None:
    for b in DFW["span_basis"].dropna().astype(str).unique().tolist():
        if b in ("text","text_clean"):
            NEEDED_BASIS.add(b)

# With no documents at all, report the default basis.
if not NEEDED_BASIS:
    NEEDED_BASIS = {"text"}

# ---- helper: finite predicate detection (English heuristic)
FINITE_TAGS = {"VBD","VBP","VBZ"}  # common finite verb tags in UD/ptb mapping
def is_finite_pred(tok) -> bool:
    """Return True when ``tok`` is a finite main verb.

    Only tokens with POS ``VERB`` qualify; auxiliaries (POS ``AUX``) are never
    counted by this version. A verb is finite when its morphology lists
    ``VerbForm=Fin`` or its fine-grained tag is in ``FINITE_TAGS``.

    Args:
        tok: token-like object exposing ``pos_``, ``tag_`` and ``morph.get``.
    """
    # The former `pos_ == "AUX"` check was unreachable behind this guard.
    if tok.pos_ != "VERB":
        return False
    return ("Fin" in tok.morph.get("VerbForm")) or (tok.tag_ in FINITE_TAGS)

# Dependency labels counted as subordination / coordination by _sentence_stats
# (labels are compared after lower-casing the token's dep_).
SUBORD_SET = {"mark","advcl","ccomp","xcomp","acl","relcl","csubj","csubjpass","csubj:pass"}
COORD_SET  = {"cc","conj","parataxis"}

@dataclass
class SentStats:
    """Per-sentence measurements produced by ``_sentence_stats``."""
    start_char: int  # sentence start offset in the parsed string (sent.start_char)
    end_char: int    # sentence end offset in the parsed string (sent.end_char)
    n_tokens: int    # number of non-space tokens in the sentence
    depth: int       # largest number of head hops from any token to the sentence root
    clauses: int     # number of tokens accepted by is_finite_pred
    coord: int       # number of tokens whose dependency label is in COORD_SET
    subord: int      # subordination count derived from SUBORD_SET

@dataclass
class ParsedCache:
    """Parse results for one (article_id, version_id, basis) string."""
    article_id: str
    version_id: int
    basis: str  # "text" or "text_clean"
    n_tokens: int  # total non-space tokens across all sentences
    sent_stats: List[SentStats]  # one entry per sentence, in document order
    text_lower: str  # for discourse marker matching

# Compute sentence-level stats for a spaCy Doc
def _sentence_stats(doc) -> Tuple[List[SentStats], int]:
    """Summarise every sentence of a parsed document.

    Args:
        doc: parsed document exposing ``sents``; each sentence exposes ``root``,
            ``start_char``, ``end_char`` and iterates over tokens.

    Returns:
        ``(stats, total_tokens)``: one ``SentStats`` per sentence in document
        order, and the number of non-space tokens over the whole document.
    """
    stats: List[SentStats] = []
    total_tokens = 0
    for sent in doc.sents:
        toks = [t for t in sent if not t.is_space]
        total_tokens += len(toks)

        # Depth: head hops from each token up to the sentence root; the hop
        # limit guards against a malformed head chain looping forever.
        root = sent.root
        max_depth = 0
        for t in toks:
            d = 0
            cur = t
            while cur != root and d <= 80:
                cur = cur.head
                d += 1
            max_depth = max(max_depth, d)

        # clause count (finite predicates)
        clauses = sum(1 for t in toks if is_finite_pred(t))

        # Each token contributes at most once to subord. The previous version
        # added an "acl" token with Relcl=Yes twice: once in the special case
        # and again because "acl" is itself in SUBORD_SET.
        subord = 0
        coord = 0
        for t in toks:
            dep = t.dep_.lower()
            if dep in SUBORD_SET or (dep == "acl" and "relcl=yes" in str(t.morph).lower()):
                subord += 1
            if dep in COORD_SET:
                coord += 1

        stats.append(SentStats(
            start_char=sent.start_char,
            end_char=sent.end_char,
            n_tokens=len(toks),
            depth=max_depth,
            clauses=clauses,
            coord=coord,
            subord=subord
        ))
    return stats, total_tokens

# Build parse tasks for required bases: one (article_id, version_id, basis, text)
# tuple per string that has to go through the pipeline.
parse_tasks = []
for (art, ver), g in DOCS_DF.groupby(["article_id","version_id"], sort=False):
    row = g.iloc[0]
    # doc-level basis
    b0 = _doc_basis_for_doclevel(row)
    parse_tasks.append((art, int(ver), b0, str(row.get(b0) if b0 in row.index else row["text"])))
    # any span-basis required by windows
    if DFW is not None:
        bases_needed = set(DFW[(DFW["article_id"]==art) & (DFW["version_id"]==int(ver))]["span_basis"].unique().tolist())
        # Filter before sorting: a null span_basis (None/NaN) mixed with strings
        # would make sorted() raise TypeError.
        for b in sorted(x for x in bases_needed if isinstance(x, str) and x in ("text","text_clean")):
            if b != b0:
                parse_tasks.append((art, int(ver), b, str(row.get(b) if b in row.index else row["text"])))

# Deduplicate identical (art,ver,basis), keeping the first occurrence
seen = set(); unique_tasks = []
for art, ver, basis, text in parse_tasks:
    key = (art, ver, basis)
    if key in seen:
        continue
    seen.add(key)
    unique_tasks.append((art, ver, basis, text))

# Single pass of nlp.pipe over every task text; results are paired back with
# their task by position.
docs_stream = (task[3] for task in unique_tasks)
parsed = []
for doc, (art, ver, basis, text) in zip(
    nlp.pipe(docs_stream, batch_size=BATCH, n_process=NPROC),
    unique_tasks,
):
    s_stats, ntoks = _sentence_stats(doc)
    parsed.append(
        ParsedCache(str(art), int(ver), basis, int(ntoks), s_stats, text.lower())
    )

# Cache lookup: (article_id, version_id, basis) -> ParsedCache
PARSE_CACHE: Dict[Tuple[str,int,str], ParsedCache] = dict(
    ((entry.article_id, entry.version_id, entry.basis), entry) for entry in parsed
)

print(json.dumps(
    dict(
        cell_id="3.4",
        status="pass",
        parsed_docs=len(parsed),
        bases=sorted(NEEDED_BASIS),
    ),
    indent=2,
))


In [None]:
# cell 3.4b — refine clause detector (count finite AUX when no finite VERB), rebuild parse cache
import spacy, numpy as np
from dataclasses import dataclass
from typing import Dict, Tuple, List
import statistics

# ------------- reuse globals from earlier cells -------------
# expects: nlp, DOCS_DF, DFW (or None), DM_PATTERNS, BATCH, NPROC

# New clause detector
FINITE_TAGS = {"VBD","VBP","VBZ"}
def is_finite_pred(tok, sent_has_finite_main: bool = True) -> bool:
    """Decide whether ``tok`` counts as a finite predicate.

    VERB tokens are always eligible; AUX tokens are eligible only when the
    caller reports that the sentence has no finite main verb. An eligible token
    counts when its morphology lists ``VerbForm=Fin`` or its tag is in
    ``FINITE_TAGS``.
    """
    pos = tok.pos_
    eligible = pos == "VERB" or (pos == "AUX" and not sent_has_finite_main)
    if not eligible:
        return False
    return ("Fin" in tok.morph.get("VerbForm")) or (tok.tag_ in FINITE_TAGS)

# NOTE(review): these definitions repeat the ones in cell 3.4 field for field;
# running this cell rebinds the names to new class objects.
@dataclass
class SentStats:
    """Per-sentence measurements produced by ``_sentence_stats``."""
    start_char: int  # sentence start offset in the parsed string
    end_char: int    # sentence end offset in the parsed string
    n_tokens: int    # non-space tokens in the sentence
    depth: int       # largest number of head hops from a token to the sentence root
    clauses: int     # tokens accepted by is_finite_pred
    coord: int       # tokens whose dependency label is in COORD_SET
    subord: int      # tokens counted as subordination

@dataclass
class ParsedCache:
    """Parse results for one (article_id, version_id, basis) string."""
    article_id: str
    version_id: int
    basis: str       # "text" or "text_clean"
    n_tokens: int    # total non-space tokens across all sentences
    sent_stats: List[SentStats]  # one entry per sentence, in document order
    text_lower: str  # lower-cased copy of the parsed string

# Dependency labels (compared lower-cased) counted as subordination / coordination.
SUBORD_SET = {"mark","advcl","ccomp","xcomp","acl","relcl","csubj","csubjpass","csubj:pass"}
COORD_SET  = {"cc","conj","parataxis"}

def _sentence_stats(doc) -> Tuple[List[SentStats], int]:
    """Summarise every sentence of a parsed document.

    Returns ``(stats, total_tokens)``: one ``SentStats`` per sentence in
    document order, and the number of non-space tokens in the whole document.
    """
    def _hops_to_root(tok, root):
        # Head hops from tok to root; the cap guards against endless head chains.
        steps = 0
        node = tok
        while node != root and steps <= 80:
            node = node.head
            steps += 1
        return steps

    collected = []
    n_total = 0
    for sent in doc.sents:
        words = [w for w in sent if not w.is_space]
        n_total += len(words)

        sent_root = sent.root
        deepest = max((_hops_to_root(w, sent_root) for w in words), default=0)

        # Does the sentence contain a finite main VERB? (decides whether AUX counts)
        has_finite_main = False
        for w in words:
            if w.pos_ == "VERB" and (("Fin" in w.morph.get("VerbForm")) or (w.tag_ in FINITE_TAGS)):
                has_finite_main = True
                break
        n_clauses = len([w for w in words if is_finite_pred(w, has_finite_main)])

        # One pass over the dependency labels for both tallies.
        n_subord = 0
        n_coord = 0
        for w in words:
            label = w.dep_.lower()
            if label in SUBORD_SET or (label == "acl" and "relcl=yes" in str(w.morph).lower()):
                n_subord += 1
            if label in COORD_SET:
                n_coord += 1

        collected.append(SentStats(
            start_char=sent.start_char,
            end_char=sent.end_char,
            n_tokens=len(words),
            depth=deepest,
            clauses=n_clauses,
            coord=n_coord,
            subord=n_subord,
        ))
    return collected, n_total

# Rebuild parse tasks as in 3.4
def _doc_basis_for_doclevel(row) -> str:
    return "text_clean" if ("text_clean" in row.index and isinstance(row["text_clean"], str) and row["text_clean"]) else "text"

# One (article_id, version_id, basis, text) task per string that must be parsed.
parse_tasks = []
for (art, ver), g in DOCS_DF.groupby(["article_id","version_id"], sort=False):
    row = g.iloc[0]
    b0 = _doc_basis_for_doclevel(row)
    parse_tasks.append((art, int(ver), b0, str(row.get(b0) if b0 in row.index else row["text"])))
    if DFW is not None:
        bases_needed = set(DFW[(DFW["article_id"]==art) & (DFW["version_id"]==int(ver))]["span_basis"].unique().tolist())
        # Keep only the known string bases *before* sorting; a null span_basis
        # (None/NaN) next to strings would make sorted() raise TypeError.
        extra_bases = sorted(
            x for x in bases_needed
            if isinstance(x, str) and x in ("text","text_clean") and x != b0
        )
        for b in extra_bases:
            parse_tasks.append((art, int(ver), b, str(row.get(b) if b in row.index else row["text"])))

# De-dup on (art, ver, basis), keeping the first occurrence
seen=set(); unique_tasks=[]
for art, ver, basis, text in parse_tasks:
    key=(art,ver,basis)
    if key in seen: continue
    seen.add(key); unique_tasks.append((art,ver,basis,text))

# Parse and rebuild cache (replaces PARSE_CACHE with the refined clause counts)
docs_stream = (task[3] for task in unique_tasks)
parsed = []
for doc, (art, ver, basis, text) in zip(
    nlp.pipe(docs_stream, batch_size=BATCH, n_process=NPROC),
    unique_tasks,
):
    s_stats, ntoks = _sentence_stats(doc)
    parsed.append(ParsedCache(
        article_id=str(art),
        version_id=int(ver),
        basis=basis,
        n_tokens=int(ntoks),
        sent_stats=s_stats,
        text_lower=text.lower(),
    ))
PARSE_CACHE = dict(((entry.article_id, entry.version_id, entry.basis), entry) for entry in parsed)

print(dict(cell_id="3.4b", status="pass", parsed_docs=len(parsed)))


In [None]:
# cell 3.5 — Document-level metrics (syntax + discourse) and write parquet
# Output: outputs/spacy/syntax_discourse.parquet

import json, math, statistics
from pathlib import Path
from typing import Dict, Any, List
import numpy as np
import pandas as pd

def _doc_dm_counts(text_lower: str) -> Dict[str, Any]:
    # Greedy non-overlapping match using precompiled patterns (multiword-first)
    used = [False] * len(text_lower)
    total = 0
    types = set()
    for marker, pat in DM_PATTERNS:
        for m in pat.finditer(text_lower):
            a, b = m.span()
            if any(used[a:b]):  # skip overlaps
                continue
            # mark range used
            for i in range(a, b):
                used[i] = True
            total += 1
            types.add(marker)
    return {"dm_total": total, "dm_types": len(types)}

def _safe_cv(mean_val: float, std_val: float) -> float:
    if mean_val is None or std_val is None:
        return float("nan")
    if not np.isfinite(mean_val) or mean_val <= 0:
        return float("nan")
    return float(std_val) / float(mean_val)

# One output row per (article_id, version_id), built from the cached parse.
rows = []
for (art, ver), g in DOCS_DF.groupby(["article_id","version_id"], sort=False):
    row = g.iloc[0]
    # choose doc-basis preference: text_clean if available else text
    basis = "text_clean" if ("text_clean" in row.index and isinstance(row["text_clean"], str) and row["text_clean"]) else "text"
    cache = PARSE_CACHE.get((str(art), int(ver), basis))
    if cache is None:
        # Fall back to any parsed basis
        cache = PARSE_CACHE.get((str(art), int(ver), "text")) or PARSE_CACHE.get((str(art), int(ver), "text_clean"))
        basis = cache.basis if cache else basis

    if cache is None:
        raise RuntimeError(f"Module 3: Missing parsed cache for ({art}, v{ver}).")

    sents = cache.sent_stats
    n_sents = len(sents)
    sent_lengths = [s.n_tokens for s in sents]
    depths = [s.depth for s in sents]

    # Means need at least one sentence; population std-devs are reported only
    # for two or more sentences and are NaN otherwise.
    mean_len = statistics.mean(sent_lengths) if n_sents > 0 else float("nan")
    std_len  = statistics.pstdev(sent_lengths) if n_sents > 1 else float("nan")

    depth_mean = statistics.mean(depths) if n_sents > 0 else float("nan")
    depth_std  = statistics.pstdev(depths) if n_sents > 1 else float("nan")
    depth_med  = (float(np.median(depths)) if n_sents > 0 else float("nan"))
    depth_cv   = _safe_cv(depth_mean, depth_std)
    depth_p90  = (float(np.percentile(depths, 90)) if n_sents > 0 else float("nan"))
    depth_max  = (float(np.max(depths)) if n_sents > 0 else float("nan"))

    clauses_ps = [s.clauses for s in sents]
    clauses_mean = statistics.mean(clauses_ps) if n_sents > 0 else float("nan")
    clauses_med  = (float(np.median(clauses_ps)) if n_sents > 0 else float("nan"))
    clauses_max  = (float(np.max(clauses_ps)) if n_sents > 0 else float("nan"))

    coord_sum = sum(s.coord for s in sents)
    subord_sum = sum(s.subord for s in sents)
    # Token denominator is clamped to 1 so an empty document yields rates of 0.
    doc_tokens = cache.n_tokens if cache.n_tokens > 0 else 1
    coord_rate = float(coord_sum) / float(doc_tokens)
    subord_rate = float(subord_sum) / float(doc_tokens)
    # eps keeps the ratio finite when there is no subordination at all.
    eps = 1e-8
    coord_subord_ratio = coord_rate / max(subord_rate, eps)

    dm = _doc_dm_counts(cache.text_lower)
    dm_total = dm["dm_total"]
    dm_types = dm["dm_types"]
    # Density is per 100 tokens; the type/token ratio is NaN without any marker hit.
    dm_density = 100.0 * (dm_total / float(doc_tokens)) if doc_tokens > 0 else float("nan")
    dm_ttr = (float(dm_types) / float(dm_total)) if dm_total > 0 else float("nan")

    # Non-finite values are stored as NaN.
    rows.append({
        "article_id": str(art),
        "version_id": int(ver),
        "version_tag": str(g.iloc[0]["version_tag"]),
        "doc_id": f"{art}-v{ver}",
        "n_sents_spacy": int(n_sents),
        "mean_sent_len_tok_spacy": float(mean_len) if np.isfinite(mean_len) else float("nan"),
        "std_sent_len_tok_spacy": float(std_len) if np.isfinite(std_len) else float("nan"),
        "depth_mean": float(depth_mean) if np.isfinite(depth_mean) else float("nan"),
        "depth_median": float(depth_med) if np.isfinite(depth_med) else float("nan"),
        "depth_std": float(depth_std) if np.isfinite(depth_std) else float("nan"),
        "depth_cv": float(depth_cv) if np.isfinite(depth_cv) else float("nan"),
        "depth_p90": float(depth_p90) if np.isfinite(depth_p90) else float("nan"),
        "depth_max": float(depth_max) if np.isfinite(depth_max) else float("nan"),
        "clauses_per_sent_mean": float(clauses_mean) if np.isfinite(clauses_mean) else float("nan"),
        "clauses_per_sent_median": float(clauses_med) if np.isfinite(clauses_med) else float("nan"),
        "clauses_per_sent_max": float(clauses_max) if np.isfinite(clauses_max) else float("nan"),
        "coord_rate": float(coord_rate),
        "subord_rate": float(subord_rate),
        "coord_subord_ratio": float(coord_subord_ratio),
        "dm_density_per_100toks": float(dm_density) if np.isfinite(dm_density) else float("nan"),
        "dm_types": int(dm_types),
        "dm_type_token_ratio": float(dm_ttr) if np.isfinite(dm_ttr) else float("nan"),
    })

# Output schema: column order and dtype for syntax_discourse.parquet.
types = {
    "article_id":"string","version_id":"int64","version_tag":"string","doc_id":"string",
    "n_sents_spacy":"int64","mean_sent_len_tok_spacy":"float64","std_sent_len_tok_spacy":"float64",
    "depth_mean":"float64","depth_median":"float64","depth_std":"float64","depth_cv":"float64","depth_p90":"float64","depth_max":"float64",
    "clauses_per_sent_mean":"float64","clauses_per_sent_median":"float64","clauses_per_sent_max":"float64",
    "coord_rate":"float64","subord_rate":"float64","coord_subord_ratio":"float64",
    "dm_density_per_100toks":"float64","dm_types":"int64","dm_type_token_ratio":"float64",
}

# Passing the column list explicitly means an empty `rows` list still gives a
# frame that has the sort keys; without it sort_values raises KeyError.
DF_DOC3 = (
    pd.DataFrame(rows, columns=list(types))
    .sort_values(["article_id","version_id"], kind="stable")
    .astype(types)  # enforce dtypes
)

out_path = SP_OUT / "syntax_discourse.parquet"
DF_DOC3.to_parquet(out_path, index=False)

print(json.dumps({
    "cell_id": "3.5",
    "status": "pass",
    "rows": int(DF_DOC3.shape[0]),
    "artifact": str(out_path)
}, indent=2))


In [None]:
# cell 3.6 — Window-level metrics aligned to Module 2 windows (if available)
# Output: outputs/spacy/syntax_discourse_windows.parquet
# Policy: include sentences fully contained in [char_start:char_end). Record policy in metadata later.

import json, math, statistics
from typing import List, Dict, Any
import numpy as np
import pandas as pd

if DFW is None:
    # No Module 2 windows: nothing to align, report and move on.
    print(json.dumps({
        "cell_id": "3.6",
        "status": "skip",
        "reason": "Module 2 windows not found; only document-level metrics written."
    }, indent=2))
else:
    win_rows: List[Dict[str,Any]] = []
    for (art, ver), g in DFW.groupby(["article_id","version_id"], sort=False):
        # Choose basis strings we must use per-window
        for _, w in g.sort_values("win_id").iterrows():
            basis = str(w["span_basis"]) if str(w["span_basis"]) in ("text","text_clean") else "text"
            cache = PARSE_CACHE.get((str(art), int(ver), basis))
            if cache is None:
                # fall back to any available basis (shouldn't happen if 3.4 ran)
                cache = PARSE_CACHE.get((str(art), int(ver), "text")) or PARSE_CACHE.get((str(art), int(ver), "text_clean"))
                basis = cache.basis if cache else basis
            if cache is None:
                raise RuntimeError(f"Module 3: Missing parsed cache for window ({art}, v{ver}, basis={basis}).")

            a, b = int(w["char_start"]), int(w["char_end"])
            # sentence selection: full containment in [a, b]
            s_in = [s for s in cache.sent_stats if s.start_char >= a and s.end_char <= b]
            n_s = len(s_in)
            n_toks = sum(s.n_tokens for s in s_in)

            # Same NaN conventions as the document level: mean needs one
            # sentence, population std-dev needs two.
            depths = [s.depth for s in s_in]
            depth_mean = statistics.mean(depths) if n_s > 0 else float("nan")
            depth_std  = statistics.pstdev(depths) if n_s > 1 else float("nan")
            depth_max  = float(np.max(depths)) if n_s > 0 else float("nan")

            clauses_ps = [s.clauses for s in s_in]
            clauses_mean = statistics.mean(clauses_ps) if n_s > 0 else float("nan")
            clauses_max  = float(np.max(clauses_ps)) if n_s > 0 else float("nan")

            coord_sum = sum(s.coord for s in s_in)
            subord_sum = sum(s.subord for s in s_in)
            denom = max(n_toks, 1)  # clamp so windows without sentences give rates of 0
            coord_rate = float(coord_sum) / float(denom)
            subord_rate = float(subord_sum) / float(denom)
            eps = 1e-8
            ratio = coord_rate / max(subord_rate, eps)

            # discourse markers in the window substring (same basis string as parsing);
            # an out-of-range span yields an empty substring and therefore no hits.
            text_lower = cache.text_lower
            win_text_lower = text_lower[a:b] if 0 <= a <= b <= len(text_lower) else ""
            # Reuse the document-level greedy counter from cell 3.5 instead of a
            # second copy of the same matching loop.
            dm = _doc_dm_counts(win_text_lower)
            dm_total = dm["dm_total"]
            dm_types = dm["dm_types"]
            dm_density = 100.0 * (dm_total / float(denom))
            dm_ttr = float(dm_types) / float(dm_total) if dm_total > 0 else float("nan")

            win_rows.append({
                # keys & spans (mirror M2)
                "article_id": str(art),
                "version_id": int(ver),
                "version_tag": str(w["version_tag"]),
                "doc_id": str(w["doc_id"]),
                "win_id": int(w["win_id"]),
                "win_label": str(w["win_label"]),
                "span_basis": basis,
                "char_start": int(w["char_start"]),
                "char_end": int(w["char_end"]),
                "sent_start_index": int(w["sent_start_index"]),
                "sent_end_index": int(w["sent_end_index"]),
                "is_partial_tail": bool(w["is_partial_tail"]),
                # aggregates
                "n_sents_win": int(n_s),
                "n_tokens_win": int(n_toks),
                "depth_mean_win": float(depth_mean) if np.isfinite(depth_mean) else float("nan"),
                "depth_std_win": float(depth_std) if np.isfinite(depth_std) else float("nan"),
                "depth_max_win": float(depth_max) if np.isfinite(depth_max) else float("nan"),
                "clauses_per_sent_mean_win": float(clauses_mean) if np.isfinite(clauses_mean) else float("nan"),
                "clauses_per_sent_max_win": float(clauses_max) if np.isfinite(clauses_max) else float("nan"),
                "coord_rate_win": float(coord_rate),
                "subord_rate_win": float(subord_rate),
                "coord_subord_ratio_win": float(ratio),
                "dm_density_per_100toks_win": float(dm_density) if np.isfinite(dm_density) else float("nan"),
                "dm_types_win": int(dm_types),
                "dm_type_token_ratio_win": float(dm_ttr) if np.isfinite(dm_ttr) else float("nan"),
            })

    # Output schema: column order and dtype for the windows parquet.
    types = {
        "article_id":"string","version_id":"int64","version_tag":"string","doc_id":"string",
        "win_id":"int64","win_label":"string","span_basis":"string",
        "char_start":"int64","char_end":"int64","sent_start_index":"int64","sent_end_index":"int64",
        "is_partial_tail":"boolean",
        "n_sents_win":"int64","n_tokens_win":"int64",
        "depth_mean_win":"float64","depth_std_win":"float64","depth_max_win":"float64",
        "clauses_per_sent_mean_win":"float64","clauses_per_sent_max_win":"float64",
        "coord_rate_win":"float64","subord_rate_win":"float64","coord_subord_ratio_win":"float64",
        "dm_density_per_100toks_win":"float64","dm_types_win":"int64","dm_type_token_ratio_win":"float64",
    }
    # Explicit columns keep the sort keys present even when the windows table
    # has zero rows (otherwise sort_values raises KeyError).
    DF_WIN3 = (
        pd.DataFrame(win_rows, columns=list(types))
        .sort_values(["article_id","version_id","win_id"], kind="stable")
        .astype(types)  # enforce dtypes
    )
    out_path_w = SP_OUT / "syntax_discourse_windows.parquet"
    DF_WIN3.to_parquet(out_path_w, index=False)

    print(json.dumps({
        "cell_id": "3.6",
        "status": "pass",
        "rows": int(DF_WIN3.shape[0]),
        "artifact": str(out_path_w)
    }, indent=2))


In [None]:
# cell 3.7 — Visuals: (a) length vs depth scatter, (b) coord vs subord stacked bars (per article)
# Files: outputs/spacy/plots/len_vs_depth_<slug>.png, coord_subord_stack_<slug>.png
# Cap at 10 slugs alphabetically.

import json, re
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Plot output directory for Module 3 (created if absent).
PLOTS = SP_OUT / "plots"; PLOTS.mkdir(parents=True, exist_ok=True)

def _safe_slug(s: str) -> str:
    return re.sub(r"[^-_.a-zA-Z0-9]", "_", str(s))

# Build a helper map for per-sentence scatter (use parsed doc cache)
def _sent_pairs_for(art: str, ver: int, basis: str):
    c = PARSE_CACHE.get((art, ver, basis))
    if c is None:
        # try any basis parsed for this doc
        c = PARSE_CACHE.get((art, ver, "text")) or PARSE_CACHE.get((art, ver, "text_clean"))
    if c is None:
        return []
    return [(s.n_tokens, s.depth) for s in c.sent_stats]

# Decide per-article basis for visuals: same doc-level basis we used in 3.5
def _basis_for_visual(row) -> str:
    return "text_clean" if ("text_clean" in DOCS_DF.columns and isinstance(row.get("text_clean"), str) and row.get("text_clean")) else "text"

# Pick up to 10 slugs alphabetically
slugs = sorted(DF_DOC3["article_id"].astype(str).unique().tolist())[:10]

for art in slugs:
    gg = DF_DOC3[DF_DOC3["article_id"]==art].sort_values("version_id")

    # (a) length vs depth scatter: one point cloud per version
    fig, ax = plt.subplots(dpi=120)
    for _, r in gg.iterrows():
        doc_row = DOCS_DF[(DOCS_DF["article_id"]==art) & (DOCS_DF["version_id"]==r["version_id"])].iloc[0]
        basis = _basis_for_visual(doc_row)
        pairs = _sent_pairs_for(str(art), int(r["version_id"]), basis)
        if not pairs:
            continue  # nothing parsed for this version
        xs, ys = zip(*pairs)
        ax.scatter(xs, ys, alpha=0.35, label=f"v{int(r['version_id'])}")
    ax.set_title(f"Sentence length vs depth — {art}")
    ax.set_xlabel("Sentence length (tokens, spaCy)")
    ax.set_ylabel("Sentence depth (max head distance)")
    ax.legend()
    out1 = PLOTS / f"len_vs_depth_{_safe_slug(art)}.png"
    fig.savefig(out1, bbox_inches="tight")
    plt.close(fig)

    # (b) coord vs subord stacked bars (per version): subord at the base, coord on top
    fig, ax = plt.subplots(dpi=120)
    xs = [f"v{int(v)}" for v in gg["version_id"].tolist()]
    coord = gg["coord_rate"].to_numpy()
    subord = gg["subord_rate"].to_numpy()
    ax.bar(xs, subord, label="subord_rate")
    ax.bar(xs, coord, bottom=subord, label="coord_rate")
    ax.set_title(f"Coordination vs Subordination — {art}")
    ax.set_ylabel("rate per token")
    ax.legend()
    out2 = PLOTS / f"coord_subord_stack_{_safe_slug(art)}.png"
    fig.savefig(out2, bbox_inches="tight")
    plt.close(fig)

# Update plots index: sorted names of every PNG currently in the plots directory
PLOTS.joinpath("plots_index.json").write_text(
    json.dumps({"files": sorted(png.name for png in PLOTS.glob("*.png"))}, indent=2),
    encoding="utf-8",
)

print(json.dumps(
    dict(
        cell_id="3.7",
        status="pass",
        plotted_articles=len(slugs),
        plots_dir=str(PLOTS),
    ),
    indent=2,
))


In [None]:
# cell 3.8 — Metadata (Module 3) and write to outputs/spacy/metadata.json
# Records pipeline, versioning, window alignment policy, lexicon info, and basic counts.

import json, sys
from pathlib import Path
import spacy
import pandas as pd

# Module 3 metadata file; cells 3.8a and 3.8b merge further keys into the same file.
META_PATH = SP_OUT / "metadata.json"

# Number of distinct version_id values per article (used for the min/max fields below).
per_counts = DF_DOC3.groupby("article_id")["version_id"].nunique()
meta_update = {
    "module": "3",
    "version_order_source": "filename_prefix",
    "spacy_version": spacy.__version__,
    "model": "en_core_web_sm",
    # Pipe names as reported by `nlp`, with any component named "ner" filtered out.
    "components_enabled": [p for p in nlp.pipe_names if p != "ner"],
    "articles": sorted(DF_DOC3["article_id"].astype(str).unique().tolist()),
    "n_articles": int(DF_DOC3["article_id"].nunique()),
    "versions_per_article_min": int(per_counts.min()) if len(per_counts) else 0,
    "versions_per_article_max": int(per_counts.max()) if len(per_counts) else 0,
    # Hard-coded expectation; it is recorded here but not validated in this cell.
    "expected_versions": 4,
    "window_alignment_policy": "from_module_2_spans" if DFW is not None else "not_applicable",
    "alignment_rule": "full_containment",  # sentences fully within [char_start:char_end)
    "epsilon_ratio": 1e-8,
    "lexicon_path": str(LEX_PATH),
    "lexicon_size": len(DM_MARKERS),
    # True when there are more than 10 articles (cell 3.7 plots at most 10 slugs).
    "plotted_articles_capped": (DF_DOC3['article_id'].nunique() > 10),
}

# Merge (do not overwrite other module metadata if present)
# Keys in meta_update replace same-named keys already in the file; all other keys are kept.
base = {}
if META_PATH.exists():
    try:
        base = json.loads(META_PATH.read_text(encoding="utf-8"))
    except Exception:
        # An unreadable/invalid metadata file is discarded silently and rebuilt from scratch.
        base = {}
base.update(meta_update)
META_PATH.write_text(json.dumps(base, indent=2), encoding="utf-8")

print(json.dumps({
    "cell_id": "3.8",
    "status": "pass",
    "metadata_path": str(META_PATH)
}, indent=2))


In [None]:
# cell 3.8a — enrich metadata with windows_available, basis map, and lexicon SHA256
import json, hashlib
from pathlib import Path
import pandas as pd

def _basis_for_doc(row):
    return "text_clean" if ("text_clean" in row.index and isinstance(row["text_clean"], str) and row["text_clean"]) else "text"

# Map str(article_id) -> basis name for the matching DOCS_DF row.
# NOTE(review): the comprehension iterates (article, version) pairs but keys only on the
# article, so for an article with several versions the last pair visited wins. Confirm that
# a single per-article value is intended.
# .iloc[0] raises IndexError if DOCS_DF has no row for a pair present in DF_DOC3.
basis_map = {
    str(art): _basis_for_doc(DOCS_DF[(DOCS_DF["article_id"]==art) & (DOCS_DF["version_id"]==ver)].iloc[0])
    for (art, ver) in DF_DOC3[["article_id","version_id"]].itertuples(index=False, name=None)
}
# windows flag
windows_available = DFW is not None

# lexicon hash
# The file is read in 64 KiB chunks so it is never loaded into memory at once.
h = hashlib.sha256()
with open(LEX_PATH, "rb") as f:
    for chunk in iter(lambda: f.read(65536), b""):
        h.update(chunk)
lex_sha = h.hexdigest()

# Load the existing metadata (if readable), merge the three new keys, and write it back.
META_PATH = SP_OUT / "metadata.json"
base = {}
if META_PATH.exists():
    try:
        base = json.loads(META_PATH.read_text(encoding="utf-8"))
    except Exception:
        # Invalid/unreadable metadata is dropped silently; only the keys below are then written.
        base = {}
base.update({
    "windows_available": bool(windows_available),
    "document_basis_by_article": basis_map,
    "lexicon_sha256": lex_sha,
})
META_PATH.write_text(json.dumps(base, indent=2), encoding="utf-8")
# Summary prints a Python dict repr (not JSON) with only the first 12 hex digits of the hash.
print({"cell_id":"3.8a","status":"pass","windows_available":windows_available,"lexicon_sha256":lex_sha[:12]+"..."})


In [None]:
# cell 3.8b — metadata add-ons (label sets, DM match policy, sentence-boundary disagreement diagnostic)
# Place AFTER 3.8. Safe to re-run. Requires: SP_OUT, DFW (or None), PARSE_CACHE, LEX_PATH, DM_PATTERNS.

import json
from pathlib import Path

# Load the current Module 3 metadata so the add-ons below are merged rather than replacing it.
META_PATH = SP_OUT / "metadata.json"
base = {}
if META_PATH.exists():
    try:
        base = json.loads(META_PATH.read_text(encoding="utf-8"))
    except Exception:
        # Invalid/unreadable metadata is dropped silently; the merge then starts from {}.
        base = {}

# 1) Document the exact label sets and DM matching policy
# These are descriptive constants written to metadata; this cell does not use them to compute
# anything. NOTE(review): presumably they mirror the labels used by the feature cells — confirm
# they stay in sync.
coord_labels_used  = ["cc", "conj", "parataxis"]
subord_labels_used = ["mark", "advcl", "ccomp", "xcomp", "acl", "relcl", "csubj", "csubj:pass"]
dm_match_policy    = "lowercase, multiword-first, word-boundary, non-overlapping"

# 2) Sentence-boundary disagreement diagnostic (window-level)
#    We count a window as 'disagreement' if ANY spaCy sentence partially overlaps the window span.
def _has_partial_overlap(sent_span, win_span):
    (sa, sb), (wa, wb) = sent_span, win_span
    if sa >= wb or sb <= wa:  # no overlap
        return False
    fully = (sa >= wa) and (sb <= wb)
    return not fully  # partial overlap → True

# Counters for the window-level diagnostic.
disagree_windows = 0    # windows straddled by at least one spaCy sentence
total_windows = 0       # every window visited, assessable or not
unassessed_windows = 0  # windows for which no parse cache entry exists under either basis

if DFW is not None and len(DFW):
    for (_, _ver), g in DFW.groupby(["article_id", "version_id"], sort=False):
        for _, w in g.iterrows():
            # Prefer the basis recorded on the window; anything unexpected falls back to "text".
            basis = str(w["span_basis"]) if str(w["span_basis"]) in ("text","text_clean") else "text"
            cache = PARSE_CACHE.get((str(w["article_id"]), int(w["version_id"]), basis))
            if cache is None:
                # Fall back to whichever basis was actually parsed for this document.
                cache = PARSE_CACHE.get((str(w["article_id"]), int(w["version_id"]), "text")) or \
                        PARSE_CACHE.get((str(w["article_id"]), int(w["version_id"]), "text_clean"))
            total_windows += 1
            if cache is None:
                # Cannot assess; treat as neutral (not disagreement) and record it so the
                # reported rate can be interpreted (these windows stay in the denominator).
                unassessed_windows += 1
                continue
            wa, wb = int(w["char_start"]), int(w["char_end"])
            any_partial = any(_has_partial_overlap((s.start_char, s.end_char), (wa, wb)) for s in cache.sent_stats)
            if any_partial:
                disagree_windows += 1

# Rate is over ALL windows (including unassessed ones); None when there are no windows.
disagree_rate = (disagree_windows / total_windows) if total_windows else None

# 3) Merge into metadata
base.update({
    "coord_labels_used": coord_labels_used,
    "subord_labels_used": subord_labels_used,
    "dm_match": dm_match_policy,
    "sent_boundary_disagreement_policy": "count window if any spaCy sentence partially overlaps",
    "sent_boundary_disagreement_rate_windows": (float(disagree_rate) if disagree_rate is not None else None),
    "sent_boundary_unassessed_windows": int(unassessed_windows),
})

META_PATH.write_text(json.dumps(base, indent=2), encoding="utf-8")

print({
    "cell_id": "3.8b",
    "status": "pass",
    "windows_checked": int(total_windows),
    "windows_with_partial_sentence": int(disagree_windows),
    "windows_unassessed": int(unassessed_windows),
    "disagreement_rate": None if disagree_rate is None else round(disagree_rate, 4),
    "metadata_path": str(META_PATH),
})


In [None]:
# cell 3.9 — acceptance & schema audit (writes outputs/spacy/audit.json)
# Place BEFORE 3.Z. Safe to re-run. Validates required columns and basic window invariants.

from pathlib import Path
import json
import math
import numpy as np
import pandas as pd

# Audit report destination (JSON) inside the Module 3 output directory.
AUDIT_PATH = SP_OUT / "audit.json"

# Report skeleton; individual check results are stored under "checks".
results = {"cell_id": "3.9", "status": "pass", "checks": {}}

# --- Load artifacts
doc_path = SP_OUT / "syntax_discourse.parquet"
win_path = SP_OUT / "syntax_discourse_windows.parquet"

# Each frame is None when its parquet file does not exist.
df_doc = pd.read_parquet(doc_path) if doc_path.exists() else None
df_win = pd.read_parquet(win_path) if win_path.exists() else None

# --- Expected schemas per spec
# Column names the document-level artifact is expected to contain.
exp_doc_cols = [
    "article_id","version_id","version_tag","doc_id",
    "n_sents_spacy","mean_sent_len_tok_spacy","std_sent_len_tok_spacy",
    "depth_mean","depth_median","depth_std","depth_cv","depth_p90","depth_max",
    "clauses_per_sent_mean","clauses_per_sent_median","clauses_per_sent_max",
    "coord_rate","subord_rate","coord_subord_ratio",
    "dm_density_per_100toks","dm_types","dm_type_token_ratio",
]
# Column names the window-level artifact is expected to contain.
exp_win_cols = [
    "article_id","version_id","version_tag","doc_id","win_id","win_label",
    "span_basis","char_start","char_end","sent_start_index","sent_end_index","is_partial_tail",
    "n_sents_win","n_tokens_win",
    "depth_mean_win","depth_std_win","depth_max_win",
    "clauses_per_sent_mean_win","clauses_per_sent_max_win",
    "coord_rate_win","subord_rate_win","coord_subord_ratio_win",
    "dm_density_per_100toks_win","dm_types_win","dm_type_token_ratio_win",
]

def _schema_report(df, expected):
    if df is None:
        return {"present": False, "missing": expected, "extra": []}
    cols = list(map(str, df.columns))
    missing = [c for c in expected if c not in cols]
    extra   = [c for c in cols if c not in expected]
    return {"present": True, "missing": missing, "extra": extra}

results["checks"]["doc_schema"] = _schema_report(df_doc, exp_doc_cols)
results["checks"]["win_schema"] = _schema_report(df_win, exp_win_cols)

# Every failed acceptance condition is collected here so the final "status" reflects the
# checks instead of always reporting "pass".
failures = []
if df_doc is None:
    # The document-level artifact is the primary Module 3 output; its absence fails the audit.
    # The window artifact is optional, so its absence alone is not a failure.
    failures.append("doc_schema: artifact not found")
for _check_name in ("doc_schema", "win_schema"):
    _report = results["checks"][_check_name]
    if _report["present"] and _report["missing"]:
        failures.append(f"{_check_name}: missing columns {_report['missing']}")

# --- Doc-level numeric sanity
if df_doc is not None and len(df_doc):
    # Skip (and report) instead of raising KeyError when a column needed below is absent.
    _absent = [c for c in ("depth_mean", "depth_max", "depth_cv", "coord_subord_ratio") if c not in df_doc.columns]
    if _absent:
        results["checks"]["doc_values"] = {
            "skipped_missing_columns": _absent,
            "rows": int(df_doc.shape[0]),
        }
    else:
        # NaN depths are treated as 0 for the sign check.
        depth_nonneg = bool((df_doc["depth_mean"].fillna(0) >= 0).all() and (df_doc["depth_max"].fillna(0) >= 0).all())
        cv_ok = bool(((df_doc["depth_cv"].isna()) | (np.isfinite(df_doc["depth_cv"]))).all())
        # NaN counts as non-finite here, so a missing ratio fails this check.
        ratio_ok = bool(np.isfinite(df_doc["coord_subord_ratio"].to_numpy(dtype=float)).all())
        results["checks"]["doc_values"] = {
            "depth_nonnegative": depth_nonneg,
            "depth_cv_finite_or_nan": cv_ok,
            "coord_subord_ratio_finite": ratio_ok,
            "rows": int(df_doc.shape[0]),
        }
        for _label, _ok in (("depth_nonnegative", depth_nonneg),
                            ("depth_cv_finite_or_nan", cv_ok),
                            ("coord_subord_ratio_finite", ratio_ok)):
            if not _ok:
                failures.append(f"doc_values: {_label} is False")

# --- Window invariants & monotonicity
if df_win is not None and len(df_win):
    _absent = [c for c in ("article_id", "version_id", "win_id", "char_start", "char_end") if c not in df_win.columns]
    if _absent:
        results["checks"]["windows_invariants"] = {"skipped_missing_columns": _absent}
    else:
        inv = {"groups_checked": 0, "win_id_contiguous": True, "char_spans_monotonic": True}
        for (art, ver), g in df_win.groupby(["article_id","version_id"], sort=False):
            inv["groups_checked"] += 1
            g = g.sort_values("win_id")
            # contiguous win_id starting at 1
            expected_seq = list(range(1, len(g) + 1))
            got_seq = g["win_id"].tolist()
            if got_seq != expected_seq:
                inv["win_id_contiguous"] = False
            # monotonic char_start/char_end (non-decreasing) and non-negative span length
            cs = g["char_start"].to_numpy()
            ce = g["char_end"].to_numpy()
            if not (np.all(cs[:-1] <= cs[1:]) and np.all(ce[:-1] <= ce[1:]) and np.all(ce - cs >= 0)):
                inv["char_spans_monotonic"] = False
        results["checks"]["windows_invariants"] = inv
        for _label in ("win_id_contiguous", "char_spans_monotonic"):
            if not inv[_label]:
                failures.append(f"windows_invariants: {_label} is False")

# Overall verdict: "pass" only when no condition above failed.
results["failures"] = failures
results["status"] = "fail" if failures else "pass"

# --- Write audit & print summary
AUDIT_PATH.write_text(json.dumps(results, indent=2), encoding="utf-8")
print(json.dumps(results, indent=2))
print({"audit_path": str(AUDIT_PATH)})


In [None]:
# 4.1 — transformers|torch: install (CPU by default)
# Safe, wheels-first installs. CPU torch so this runs anywhere; GPU will be picked up in 4.2 if available.
import sys, subprocess, pkgutil

def _pip(args):
    """Run ``<current python> -m pip install <args...>``.

    Returns 0 on success; ``subprocess.check_call`` raises
    ``CalledProcessError`` when pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(args)
    return subprocess.check_call(command)

# Install torch (CPU wheel), transformers 4.x, accelerate
# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated since Python 3.12.
import importlib
import importlib.util

# torch is installed in its own pip call: `--index-url` applies to the WHOLE pip command, so
# combining it with transformers/accelerate would make pip look for those on the PyTorch
# wheel index (where they are not published) and fail.
need_torch = importlib.util.find_spec("torch") is None
need = []
if importlib.util.find_spec("transformers") is None:
    need += ["transformers>=4.40,<5"]
if importlib.util.find_spec("accelerate") is None:
    need += ["accelerate>=0.30,<1"]

if need_torch:
    torch_args = ["torch==2.*", "--index-url", "https://download.pytorch.org/whl/cpu"]
    print("Installing:", torch_args)
    _pip(torch_args)
if need:
    print("Installing:", need)
    _pip(need)
if need_torch or need:
    # Make packages installed into the running kernel discoverable by later imports.
    importlib.invalidate_caches()
else:
    print("✓ torch/transformers/accelerate already available")

# NLTK punkt may be used for sentence splits to stay aligned with Module 2
# Install the package only when it cannot be imported; download punkt only when it is missing.
try:
    import nltk
except ImportError:
    _pip(["nltk>=3.8,<4"])
    importlib.invalidate_caches()
    import nltk
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)

print("✓ 4.1 complete")


In [None]:
# 4.2 — transformers: load tokenizer/model (distilgpt2) [CPU/GPU auto]
from pathlib import Path
import os, json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Respect earlier foundations; remain portable if not set
# Output locations: LSA_BASE_DIR (or the current working directory) / outputs / transformers.
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
OUT_DIR  = (BASE_DIR / "outputs" / "transformers").resolve()
PLOTS_DIR = OUT_DIR / "plots"
for p in (OUT_DIR, PLOTS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Model name and device are overridable via environment variables; CUDA is used only when it
# is available AND LSA_ALLOW_CUDA is "1" (the default).
MODEL_NAME = os.environ.get("LSA_PPL_MODEL", "distilgpt2")
DEVICE = "cuda" if torch.cuda.is_available() and os.environ.get("LSA_ALLOW_CUDA","1")=="1" else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# causal LM needs a pad token; use EOS for left padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval().to(DEVICE)

meta = {
    "version_order_source": "filename_prefix",
    "model_name": MODEL_NAME,
    "torch_version": torch.__version__,
    "transformers_version": __import__("transformers").__version__,
    "device": DEVICE,
}
# NOTE: mode "w" replaces metadata.json entirely (it is not merged with existing content), so
# re-running this cell drops any keys a later cell added to the file.
with open(OUT_DIR / "metadata.json", "w") as f:
    json.dump(meta, f, indent=2)
print("✓ loaded", MODEL_NAME, "on", DEVICE, "→ metadata.json updated")


In [None]:
# 4.2a — Module 4 health check (confirms model runs and reports a tiny PPL)
import os, math, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Same environment-driven model/device selection as cell 4.2, but this cell loads its own
# tokenizer/model instances (tok, mdl) rather than reusing the ones created there.
model_name = os.environ.get("LSA_PPL_MODEL", "distilgpt2")
device = "cuda" if torch.cuda.is_available() and os.environ.get("LSA_ALLOW_CUDA","1")=="1" else "cpu"

tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tok.pad_token = tok.eos_token
tok.padding_side = "left"
mdl = AutoModelForCausalLM.from_pretrained(model_name).eval().to(device)

# Single short probe sentence; no padding is requested for this one-item input.
txt = "This is a quick check."
enc = tok(txt, return_tensors="pt")
ids, attn = enc["input_ids"], enc["attention_mask"]
# Labels are the input ids, with positions where the attention mask is 0 set to -100.
labels = ids.clone(); labels[attn==0] = -100

with torch.no_grad():
    out = mdl(ids.to(device), attention_mask=attn.to(device), labels=labels.to(device))
# out.loss is used as the sample NLL; perplexity is exp(NLL).
nll = float(out.loss.detach().cpu())
ppl = float(math.exp(nll))

print(json.dumps({
  "device": device,
  "torch_version": torch.__version__,
  "model": model_name,
  "sample_nll": round(nll, 4),
  "sample_ppl": round(ppl, 2)
}, indent=2))


In [None]:
# 4.2a — quick health check: confirm model runs on a tiny string
import os, math, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): this cell repeats the preceding 4.2a health check statement for statement
# (only the layout and the "torch" output key differ); it reloads the tokenizer and model a
# second time. Consider keeping a single copy.
model_name = os.environ.get("LSA_PPL_MODEL", "distilgpt2")
device = "cuda" if torch.cuda.is_available() and os.environ.get("LSA_ALLOW_CUDA","1")=="1" else "cpu"

tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tok.pad_token = tok.eos_token; tok.padding_side = "left"
mdl = AutoModelForCausalLM.from_pretrained(model_name).eval().to(device)

# Single short probe sentence scored with labels equal to its own input ids.
txt = "This is a quick check."
enc = tok(txt, return_tensors="pt")
ids, attn = enc["input_ids"], enc["attention_mask"]
labels = ids.clone(); labels[attn==0] = -100
with torch.no_grad():
    out = mdl(ids.to(device), attention_mask=attn.to(device), labels=labels.to(device))
# out.loss is used as the sample NLL; perplexity is exp(NLL).
nll = float(out.loss.detach().cpu()); ppl = float(math.exp(nll))
print(json.dumps({"device": device, "torch": torch.__version__, "model": model_name,
                  "sample_nll": round(nll,4), "sample_ppl": round(ppl,2)}, indent=2))


In [None]:
# 4.3 — transformers: sentence pseudo-perplexity (doc + Module-2 windows)
import os, math, json
from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import nltk

# ---------- Foundations / inputs ----------
# Recomputed here (same expressions as cell 4.2) so this cell does not depend on 4.2's globals.
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
OUT_DIR  = (BASE_DIR / "outputs" / "transformers").resolve()
PLOTS_DIR = OUT_DIR / "plots"
# Create both output directories if they do not exist yet.
for p in (OUT_DIR, PLOTS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Try to reuse in-memory DOCS from previous modules; else fall back to df_docs; last resort scan input_docs
def _discover_docs() -> List[Dict]:
    """Locate the document records to score.

    Lookup order:
      1. a module-level ``DOCS`` that is a list (returned as-is);
      2. a module-level ``df_docs`` (converted with ``to_dict("records")``);
      3. a scan of ``LSA_SOURCE_DIR`` (default ``BASE_DIR / "input_docs"``):
         every file whose stem starts with two characters parseable as an int
         becomes one record; the int is the version_id, and the article_id is
         the stem after a "-" or "_" separator in the third position (or the
         whole stem when there is no such separator).

    Raises:
        FileNotFoundError: no in-memory documents and the source dir is missing.
        RuntimeError: the scan produced no records.
    """
    scope = globals()
    if isinstance(scope.get("DOCS"), list):
        return scope["DOCS"]
    if "df_docs" in scope:
        return scope["df_docs"].to_dict("records")

    # Fallback: scan SOURCE_DIR for 01..04 files (basic)
    SOURCE_DIR = Path(os.environ.get("LSA_SOURCE_DIR", BASE_DIR / "input_docs")).resolve()
    if not SOURCE_DIR.exists():
        raise FileNotFoundError(f"No DOCS in memory and SOURCE_DIR missing: {SOURCE_DIR}")

    docs = []
    for path in sorted(entry for entry in SOURCE_DIR.iterdir() if entry.is_file()):
        stem = path.stem
        # crude parse "01-title..." → version_id=1; files without a numeric prefix are skipped
        try:
            version = int(stem[:2])
        except Exception:
            continue
        has_separator = len(stem) > 3 and stem[2] in ("-","_")
        article = stem[3:] if has_separator else stem
        body = path.read_text(encoding="utf-8", errors="ignore")
        record = {
            "article_id": article,
            "version_id": version,
            "version_tag": f"v{version}",
            "doc_id": f"{article}__v{version}",
            "text": body
        }
        docs.append(record)
    if not docs:
        raise RuntimeError("Could not discover any documents.")
    return docs

DOCS = _discover_docs()

# Enforce filename-prefix ordering (01..04)
# sorted() builds a new list ordered by (str(article_id), int(version_id)); records without
# those keys sort as ("", 0). int() raises if a version_id is not integer-like.
DOCS = sorted(DOCS, key=lambda d: (str(d.get("article_id","")), int(d.get("version_id", 0))))

# Load Module-2 windows (optional but preferred)
M2_WIN_PATH = BASE_DIR / "outputs" / "nltk" / "fw_burstiness_windows.parquet"
DF_WINS = None
if M2_WIN_PATH.exists():
    DF_WINS = pd.read_parquet(M2_WIN_PATH)
    # basic sanity
    # If any required column is missing, the windows are discarded (DF_WINS = None) and the
    # rest of the cell runs document-level only.
    for c in ["article_id","version_id","win_id","span_basis","char_start","char_end","win_label"]:
        if c not in DF_WINS.columns:
            DF_WINS = None
            break

# ---------- Sentence splitting (NLTK Punkt to match Module 2) ----------
# Pre-trained English Punkt sentence tokenizer, loaded once and shared by sent_spans().
_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
def sent_spans(text: str) -> List[Tuple[int,int]]:
    """Return the sentence spans produced by the module-level Punkt tokenizer for ``text``."""
    spans = [span for span in _tokenizer.span_tokenize(text)]
    return spans

def split_sents_by_spans(text: str, spans: List[Tuple[int,int]]) -> List[str]:
    """Slice ``text`` into one substring per ``(start, end)`` span, in span order."""
    pieces = []
    for start, end in spans:
        pieces.append(text[start:end])
    return pieces

# ---------- PPL / NLL per sentence ----------
# Model/device selection mirrors cell 4.2; the tokenizer and model are loaded again here and
# rebind the module-level names `tokenizer` and `model`.
MODEL_NAME = os.environ.get("LSA_PPL_MODEL", "distilgpt2")
DEVICE = "cuda" if torch.cuda.is_available() and os.environ.get("LSA_ALLOW_CUDA","1")=="1" else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# EOS doubles as the pad token; batches are padded on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval().to(DEVICE)

# MAX_BPE: per-sentence truncation length passed to the tokenizer as max_length.
# BATCH: number of sentences tokenized and scored per forward pass.
MAX_BPE = int(os.environ.get("LSA_PPL_MAX_BPE", "128"))
BATCH = int(os.environ.get("LSA_PPL_BATCH", "8"))

def ppl_on_sentences(sents: List[str]) -> pd.DataFrame:
    """Score sentences with the module-level causal LM.

    Sentences are tokenized in batches of ``BATCH`` (padded, truncated to
    ``MAX_BPE`` tokens) and each one gets its mean token negative
    log-likelihood and the corresponding perplexity.

    A target token is scored only when both it and the token immediately to
    its left are real (non-pad) tokens. This keeps a sentence's score and
    token count independent of how much padding its batch mates force on it:
    previously, padded samples also scored their first token from a pad
    position's logits, while unpadded samples did not.

    Args:
        sents: sentences to score; may be empty.

    Returns:
        DataFrame with one row per input sentence and columns
        ``sent`` (the original string), ``nll`` (mean NLL per scored token),
        ``ppl`` (``exp(nll)``) and ``bpe_len`` (number of scored tokens).
        Sentences with fewer than 3 scored tokens get NaN for nll/ppl.
    """
    columns = ["sent", "nll", "ppl", "bpe_len"]
    rows = []
    if not sents:
        return pd.DataFrame(rows, columns=columns)
    # chunked batches
    for i in range(0, len(sents), BATCH):
        batch = sents[i:i+BATCH]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_BPE
        )
        input_ids = enc["input_ids"]
        attn = enc["attention_mask"]
        # Only the logits are needed; the batch-averaged loss is not, so no labels are passed.
        # NOTE(review): with left padding, models that derive positions from the raw index
        # presumably see shifted position ids for padded samples — confirm whether explicit
        # position_ids (or right padding) is wanted for scoring.
        with torch.no_grad():
            out = model(input_ids.to(DEVICE), attention_mask=attn.to(DEVICE))
        logits = out.logits.detach().cpu()
        # Per-sample exact NLL/token
        for b in range(input_ids.size(0)):
            ids = input_ids[b].cpu()
            mask = attn[b].cpu()
            # logits[b, :-1] predicts ids[1:] (causal shift)
            log_probs = torch.nn.functional.log_softmax(logits[b, :-1], dim=-1)
            tgt = ids[1:]
            # Score position t only if the target (t+1) and its context token (t) are both real.
            scored = (mask[1:] == 1) & (mask[:-1] == 1)
            valid = scored.nonzero(as_tuple=False).squeeze(-1)
            used = int(valid.numel())
            if used < 3:  # skip ultra-short sentences
                nll = math.nan
                ppl = math.nan
            else:
                token_log_probs = log_probs[valid, tgt[valid]]
                nll = float(-token_log_probs.mean().item())
                ppl = float(math.exp(nll))
            rows.append({
                "sent": batch[b],
                "nll": nll,
                "ppl": ppl,
                "bpe_len": used
            })
        # free per-batch tensors before the next batch
        del logits, out, enc, input_ids, attn
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
    return pd.DataFrame(rows, columns=columns)

# ---------- Aggregations ----------
def agg_stats(vals: pd.Series, prefix: str) -> Dict[str, float]:
    """Summarise a numeric series as a dict of ``{prefix}_<stat>`` values.

    NaN entries are dropped first. The stats, in key order, are: mean, median,
    p25, p75, p90, max, std (population, ddof=0) and cv (std / mean).

    Args:
        vals: values to summarise.
        prefix: key prefix, e.g. "ppl" gives "ppl_mean", "ppl_median", ...

    Returns:
        Dict with the eight keys above. All values are NaN when no non-NaN
        values remain; ``cv`` is NaN when the mean is zero or not finite.
    """
    stat_names = ("mean", "median", "p25", "p75", "p90", "max", "std", "cv")
    a = np.array(vals.dropna().tolist(), dtype=float)
    if a.size == 0:
        return {f"{prefix}_{name}": np.nan for name in stat_names}
    p25, p75, p90 = np.percentile(a, [25, 75, 90])
    mean = float(np.mean(a))
    std = float(np.std(a, ddof=0))
    # Explicit guard: membership tests such as `x not in (0.0, np.nan)` never match a computed
    # NaN (NaN != NaN), so check finiteness and zero directly.
    cv = float(std / mean) if (np.isfinite(mean) and mean != 0.0) else np.nan
    return {
        f"{prefix}_mean": mean,
        f"{prefix}_median": float(np.median(a)),
        f"{prefix}_p25": float(p25),
        f"{prefix}_p75": float(p75),
        f"{prefix}_p90": float(p90),
        f"{prefix}_max": float(np.max(a)),
        f"{prefix}_std": std,
        f"{prefix}_cv": cv,
    }

# Collect per-version rows
# ver_rows: one dict per document version; win_rows: one dict per Module-2 window.
ver_rows = []
win_rows = []

for doc in DOCS:
    art = str(doc["article_id"])
    vid = int(doc["version_id"])
    # version_tag / doc_id fall back to values derived from the ids when absent.
    vtag = str(doc.get("version_tag", f"v{vid}"))
    did  = str(doc.get("doc_id", f"{art}__v{vid}"))

    # Score on text_clean when it is a non-empty string, otherwise on the raw text.
    basis = "text_clean" if "text_clean" in doc and isinstance(doc["text_clean"], str) and len(doc["text_clean"])>0 else "text"
    text = doc[basis]

    # Sentence list (full doc)
    spans = sent_spans(text)
    sents = split_sents_by_spans(text, spans)
    df_sent = ppl_on_sentences(sents)
    # guard: drop very short
    df_sent = df_sent[df_sent["bpe_len"] >= 3]

    # Version-level aggregates
    ver_info = {
        "article_id": art, "version_id": vid, "version_tag": vtag, "doc_id": did,
        "n_sents_ver": int(df_sent.shape[0]),
        "n_bpe_tokens_sum_ver": int(df_sent["bpe_len"].dropna().sum()) if not df_sent.empty else 0,
        "truncation_max_bpe": int(MAX_BPE),
    }
    ver_info.update(agg_stats(df_sent["ppl"], "ppl"))
    # rename ppl_* to ppl_*_ver and nll_* to nll_*_ver
    # (the .replace calls map a prefix to itself and change nothing; only the "_ver" suffix
    # is added, to keys starting with "ppl_" or "nll_"; all other keys pass through)
    ver_info = { (k.replace("ppl_", "ppl_") + "_ver" if k.startswith("ppl_") else
                   k.replace("nll_", "nll_") + "_ver" if k.startswith("nll_") else k): v
                for k,v in {**ver_info, **agg_stats(df_sent["nll"], "nll")}.items() }
    # Fix keys for counts after rename
    # (these id keys are not touched by the rename above; the assignments re-set the same values)
    ver_info["article_id"]=art; ver_info["version_id"]=vid; ver_info["version_tag"]=vtag; ver_info["doc_id"]=did
    ver_rows.append(ver_info)

    # Window-level (if DF_WINS present)
    if DF_WINS is not None:
        sub = DF_WINS[(DF_WINS.article_id==art) & (DF_WINS.version_id==vid)].copy()
        if not sub.empty:
            for _, w in sub.sort_values("win_id").iterrows():
                # Use the basis recorded on the window; if the document has no such field,
                # fall back to the text chosen for the document above.
                basis_w = w.get("span_basis", basis)
                src_text = doc.get(basis_w, text)
                a, b = int(w.char_start), int(w.char_end)
                # Out-of-range or inverted spans yield an empty window text (no sentences).
                win_txt = src_text[a:b] if 0 <= a <= b <= len(src_text) else ""
                # split with NLTK to match Module 2 policy inside the window
                w_spans = sent_spans(win_txt)
                w_sents = split_sents_by_spans(win_txt, w_spans)
                dfw = ppl_on_sentences(w_sents)
                dfw = dfw[dfw["bpe_len"] >= 3]
                row = {
                    "article_id": art, "version_id": vid, "version_tag": vtag, "doc_id": did,
                    "win_id": int(w.win_id), "win_label": str(w.win_label),
                    "span_basis": str(basis_w),
                    "char_start": int(w.char_start), "char_end": int(w.char_end),
                    # Optional Module-2 columns: -1 / False are used when the column is absent.
                    "sent_start_index": int(w.sent_start_index) if "sent_start_index" in w else -1,
                    "sent_end_index": int(w.sent_end_index) if "sent_end_index" in w else -1,
                    "is_partial_tail": bool(w.is_partial_tail) if "is_partial_tail" in w else False,
                    "n_sents_win": int(dfw.shape[0]),
                    "n_bpe_tokens_sum_win": int(dfw["bpe_len"].dropna().sum()) if not dfw.empty else 0,
                }
                row.update({k+"_win": v for k,v in agg_stats(dfw["ppl"], "ppl").items()})
                row.update({k+"_win": v for k,v in agg_stats(dfw["nll"], "nll").items()})
                # prune duplicate suffixes (agg_stats names already include _mean etc.)
                # NOTE(review): agg_stats keys end in the stat name, so after the single "_win"
                # suffix above no key contains "_win_win"; these replacements look like no-ops.
                clean = {}
                for k,v in row.items():
                    k = k.replace("ppl_p25_win_win","ppl_p25_win").replace("ppl_p75_win_win","ppl_p75_win").replace("ppl_p90_win_win","ppl_p90_win")
                    k = k.replace("ppl_max_win_win","ppl_max_win").replace("ppl_median_win_win","ppl_median_win").replace("ppl_mean_win_win","ppl_mean_win").replace("ppl_std_win_win","ppl_std_win").replace("ppl_cv_win_win","ppl_cv_win")
                    k = k.replace("nll_mean_win_win","nll_mean_win").replace("nll_std_win_win","nll_std_win").replace("nll_cv_win_win","nll_cv_win")
                    clean[k]=v
                win_rows.append(clean)

# ---------- Save artifacts ----------
# Version-level table: one row per document version.
df_ver = pd.DataFrame(ver_rows)
# enforce dtypes
if not df_ver.empty:
    df_ver["version_id"] = df_ver["version_id"].astype("int64")
    for c in ["n_sents_ver","n_bpe_tokens_sum_ver","truncation_max_bpe"]:
        df_ver[c] = df_ver[c].astype("int64")
df_ver.to_parquet(OUT_DIR / "perplexity.parquet", index=False)

# Window-level table is written only when at least one window row was produced.
if win_rows:
    df_win = pd.DataFrame(win_rows)
    if not df_win.empty:
        df_win["version_id"] = df_win["version_id"].astype("int64")
        df_win["win_id"] = df_win["win_id"].astype("int64")
        for c in ["char_start","char_end","sent_start_index","sent_end_index","n_sents_win","n_bpe_tokens_sum_win"]:
            if c in df_win.columns:
                df_win[c] = df_win[c].astype("int64")
        df_win.to_parquet(OUT_DIR / "perplexity_windows.parquet", index=False)
else:
    df_win = pd.DataFrame()

# Number of distinct versions per article. The previous code stored the min/max version_id
# under "versions_per_article_*", which is not a count of versions.
versions_by_article = {}
for d in DOCS:
    versions_by_article.setdefault(d["article_id"], set()).add(d["version_id"])
versions_per_article = [len(v) for v in versions_by_article.values()]

# Update metadata with counts & config
meta_path = OUT_DIR / "metadata.json"
meta = {}
if meta_path.exists():
    meta = json.loads(meta_path.read_text())
meta.update({
    "batch_size_used": int(os.environ.get("LSA_PPL_BATCH", "8")),
    "truncation_max_bpe": int(MAX_BPE),
    "articles": int(len(versions_by_article)),
    "versions_per_article_min": int(min(versions_per_article or [0])),
    "versions_per_article_max": int(max(versions_per_article or [0])),
    "windows_source": "module_2" if DF_WINS is not None else "absent",
    # NOTE(review): despite its name this counts WINDOWS left with zero scored sentences, not
    # individual sentences that were dropped for being too short.
    "skipped_short_sentences": int(((df_win.get("n_sents_win", pd.Series(dtype=int))==0).sum()) if not df_win.empty else 0),
})
meta_path.write_text(json.dumps(meta, indent=2))

print(f"✓ wrote {OUT_DIR/'perplexity.parquet'} ({len(df_ver)} rows)")
print(f"✓ windows: {'present → ' + str(len(df_win)) + ' rows' if not df_win.empty else 'absent or empty'}")


In [None]:
# 4.4 — transformers: visuals — PPL distributions & trends
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Same output locations as cells 4.2/4.3, recomputed so this cell can run on its own.
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
OUT_DIR  = (BASE_DIR / "outputs" / "transformers").resolve()
PLOTS_DIR = OUT_DIR / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# The version-level parquet is required (read_parquet raises if it is missing); the
# window-level parquet is optional and df_win stays None without it.
df_ver = pd.read_parquet(OUT_DIR / "perplexity.parquet")
df_win = None
pwin = OUT_DIR / "perplexity_windows.parquet"
if pwin.exists():
    df_win = pd.read_parquet(pwin)

# ---- 1) Global histogram of per-sentence PPL (from window rows if present; else approximate from version means) ----
def _hist_data():
    """Return the finite values to plot in the global PPL histogram.

    Uses the per-window means (``ppl_mean_win``) when a non-empty window table
    with that column is loaded; otherwise falls back to the coarser per-version
    means (``ppl_mean_ver``). Infinite values are converted to NaN and all NaN
    values are dropped before returning the underlying array.
    """
    have_windows = df_win is not None and not df_win.empty and "ppl_mean_win" in df_win.columns
    source = df_win["ppl_mean_win"] if have_windows else df_ver["ppl_mean_ver"]
    return source.replace([np.inf, -np.inf], np.nan).dropna().values

vals = _hist_data()
plt.figure(figsize=(8,4.5))
if vals.size:
    # Clip to [0, 99th percentile] so a few extreme values do not stretch the x-axis.
    vmax = np.nanpercentile(vals, 99)
    vals = np.clip(vals, 0, vmax)
    plt.hist(vals, bins=40)
    # NOTE(review): the values come from _hist_data(), i.e. per-window or per-version MEANS,
    # while the axis label and title below say "per-sentence" — confirm the intended wording.
    plt.xlabel("Per-sentence pseudo-perplexity (clipped at 99th pct)")
    plt.ylabel("Count")
    plt.title("Global distribution of sentence PPL")
else:
    # Still produce a figure (with a placeholder message) when there is nothing to plot.
    plt.text(0.5,0.5,"No data", ha="center", va="center", transform=plt.gca().transAxes)
plt.tight_layout()
plt.savefig(PLOTS_DIR / "ppl_hist_global.png", dpi=150)
plt.show(); plt.close()

# ---- 2) Per-article version trend (mean PPL per version) ----
def _safe_slug(s: str) -> str:
    import re
    return re.sub(r"[^A-Za-z0-9._-]+", "_", str(s))

# One line chart per article: mean PPL of each version, ordered by version_id.
for art, gg in df_ver.groupby("article_id"):
    g = gg.sort_values("version_id")
    plt.figure(figsize=(7.5,4.2))
    plt.plot(g["version_id"], g["ppl_mean_ver"], marker="o")
    # Ticks are placed at the version ids present for this article and labelled "v<id>".
    plt.xticks(g["version_id"], [f"v{v}" for v in g["version_id"]])
    plt.xlabel("version")
    plt.ylabel("mean PPL")
    plt.title(f"PPL by version — {art}")
    plt.grid(alpha=0.3)
    fname = PLOTS_DIR / f"ppl_trend_version_{_safe_slug(art)}.png"
    plt.tight_layout(); plt.savefig(fname, dpi=150); plt.show(); plt.close()

# ---- 3) Per-article window trend (within versions) ----
# Skipped entirely when no window-level table was loaded or it is empty.
if df_win is not None and not df_win.empty:
    for art, gA in df_win.groupby("article_id"):
        plt.figure(figsize=(9,4.8))
        # One line per version, plotted against the window id within that version.
        for vid, gV in gA.groupby("version_id"):
            gV = gV.sort_values("win_id")
            plt.plot(gV["win_id"], gV["ppl_mean_win"], marker=".", alpha=0.8, label=f"v{vid}")
        plt.xlabel("window id")
        plt.ylabel("mean PPL (window)")
        plt.title(f"PPL across windows — {art}")
        plt.legend()
        fname = PLOTS_DIR / f"ppl_trend_windows_{_safe_slug(art)}.png"
        plt.tight_layout(); plt.savefig(fname, dpi=150); plt.show(); plt.close()

print("✓ 4.4 visuals written to", PLOTS_DIR)


In [None]:
# 5.1 — sentence-transformers: install (CPU-friendly)
import sys, subprocess, pkgutil

def _pip(args):
    """Invoke ``pip install`` with ``args`` using the running interpreter.

    Returns 0 on success; ``subprocess.check_call`` raises
    ``CalledProcessError`` when pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(args)
    return subprocess.check_call(command)

# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated since Python 3.12.
import importlib
import importlib.util

need = []
# Ensure torch/transformers exist (from Module 4 ideally); install CPU wheel if missing
# torch goes in its own pip call: `--index-url` applies to the WHOLE pip command, so mixing it
# with the PyPI-only packages below would make pip resolve them against the PyTorch wheel
# index and fail.
need_torch = importlib.util.find_spec("torch") is None
if importlib.util.find_spec("transformers") is None:
    need += ["transformers>=4.40,<5"]
if importlib.util.find_spec("sentence_transformers") is None:
    need += ["sentence-transformers==3.*"]
if importlib.util.find_spec("sklearn") is None:
    need += ["scikit-learn==1.*"]
if importlib.util.find_spec("nltk") is None:
    need += ["nltk>=3.8,<4"]

if need_torch:
    torch_args = ["torch==2.*", "--index-url", "https://download.pytorch.org/whl/cpu"]
    print("Installing:", torch_args)
    _pip(torch_args)
if need:
    print("Installing:", need)
    _pip(need)
if need_torch or need:
    # Make packages installed into the running kernel discoverable by later imports.
    importlib.invalidate_caches()
else:
    print("✓ Dependencies already installed")

# NLTK punkt for sentence spans (aligns with Module 2)
import nltk
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # nltk.data.find raises LookupError when the resource is not installed.
    nltk.download("punkt", quiet=True)

print("✓ 5.1 complete")


In [None]:
# 5.2 — sentence-transformers: imports & model init (MiniLM)
from pathlib import Path
import os, json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

# Dirs
# Module 5 outputs live under <base>/outputs/semantic (base = LSA_BASE_DIR or the cwd).
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
SEM_OUT  = (BASE_DIR / "outputs" / "semantic").resolve()
PLOTS_DIR = SEM_OUT / "plots"
for p in (SEM_OUT, PLOTS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Model config
# Model name and encode batch size are overridable via environment variables.
MODEL_NAME = os.environ.get("LSA_SEM_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
EMB_BATCH  = int(os.environ.get("LSA_SEM_BATCH", "64"))
NORMALIZE  = True

# CUDA is used only when it is available AND LSA_ALLOW_CUDA is "1" (the default).
DEVICE = "cuda" if torch.cuda.is_available() and os.environ.get("LSA_ALLOW_CUDA","1")=="1" else "cpu"
# NOTE: this rebinds the module-level name `model` (cells 4.2/4.3 bind it to the causal LM).
model = SentenceTransformer(MODEL_NAME, device=DEVICE)
# Normalization makes cosine == dot
# Keyword arguments forwarded to model.encode() by embed().
model_kwargs = dict(normalize_embeddings=NORMALIZE, convert_to_numpy=True, batch_size=EMB_BATCH)

def embed(texts):
    """Encode ``texts`` with the module-level sentence-transformer.

    Returns a float32 array of shape ``(n, d)``; the encoder is called with
    ``model_kwargs`` (which requests L2 normalization when ``NORMALIZE`` is
    set). An empty/falsy ``texts`` yields a ``(0, d)`` float32 array without
    calling the encoder.
    """
    if texts:
        vectors = model.encode(texts, **model_kwargs)
        # Ensure float32
        return vectors.astype(np.float32)
    dim = model.get_sentence_embedding_dimension()
    return np.zeros((0, dim), dtype=np.float32)

# Run configuration recorded for Module 5.
meta = {
    "version_order_source": "filename_prefix",
    "model_name": MODEL_NAME,
    "model_dim": int(model.get_sentence_embedding_dimension()),
    "normalize_embeddings": bool(NORMALIZE),
    "batch_size": int(EMB_BATCH),
    "device": DEVICE,
}
# write_text replaces any existing outputs/semantic/metadata.json (no merge with prior content).
(SEM_OUT / "metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("✓ 5.2 ready →", MODEL_NAME, "on", DEVICE, "dim", meta["model_dim"])


In [None]:
# 5.3 — embeddings → coherence, redundancy, drift (doc + windows)
import os, math, json
from pathlib import Path
from typing import List, Tuple, Dict
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import nltk

# --- Dirs & inputs ---
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
SEM_OUT = BASE_DIR.joinpath("outputs", "semantic").resolve()
PLOTS_DIR = SEM_OUT.joinpath("plots")
for out_dir in (SEM_OUT, PLOTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Discover DOCS: prefer in-memory, then df_docs, else scan input_docs (01..04)
def _discover_docs() -> List[Dict]:
    if "DOCS" in globals() and isinstance(globals()["DOCS"], list):
        return globals()["DOCS"]
    if "df_docs" in globals():
        return globals()["df_docs"].to_dict("records")
    SOURCE_DIR = Path(os.environ.get("LSA_SOURCE_DIR", BASE_DIR / "input_docs"))
    docs=[]
    if SOURCE_DIR.exists():
        for p in sorted(SOURCE_DIR.iterdir()):
            if not p.is_file(): continue
            stem = p.stem
            try:
                vid = int(stem[:2])
            except Exception:
                continue
            art = stem[3:] if len(stem)>3 and stem[2] in ("-","_") else stem
            txt = p.read_text(encoding="utf-8", errors="ignore")
            docs.append({"article_id": art, "version_id": vid, "version_tag": f"v{vid}", "doc_id": f"{art}__v{vid}", "text": txt})
    if not docs:
        raise RuntimeError("Module 5: No documents found. Provide DOCS/df_docs or set LSA_SOURCE_DIR.")
    return docs

DOCS = _discover_docs()
# Stable processing order: article id (as text), then numeric version
DOCS = sorted(
    DOCS,
    key=lambda d: (str(d.get("article_id", "")), int(d.get("version_id", 0))),
)

# Module-2 windows (authoritative)
M2_WIN = BASE_DIR.joinpath("outputs", "nltk", "fw_burstiness_windows.parquet")
if M2_WIN.exists():
    DFW = pd.read_parquet(M2_WIN)
else:
    DFW = None
# Windows are used only if the file exists AND carries every column read below
uses_m2 = bool(
    DFW is not None
    and {"article_id", "version_id", "win_id", "span_basis",
         "char_start", "char_end", "win_label"}.issubset(DFW.columns)
)

# Sentence spans via NLTK Punkt (align with Module 2)
_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
def sent_spans(text: str) -> List[Tuple[int,int]]:
    """Return the (start, end) character offsets the module-level ``_tokenizer`` yields for ``text``."""
    return [span for span in _tokenizer.span_tokenize(text)]

def _safe_slug(s: str) -> str:
    import re
    return re.sub(r"[^A-Za-z0-9._-]+", "_", str(s))

# --- Core measures ---
def _centroid(u: np.ndarray) -> np.ndarray:
    if u.size == 0: return np.zeros((model.get_sentence_embedding_dimension(),), dtype=np.float32)
    c = u.mean(axis=0)
    # Normalize to unit (if zero-norm, keep zeros)
    n = np.linalg.norm(c)
    return c / n if n > 0 else c

def coherence(embeds: np.ndarray) -> float:
    """Mean dot product between each row of ``embeds`` and their unit centroid.

    For unit-length rows the dot product is the cosine similarity. Returns NaN
    when there are no rows or when the centroid is all zeros.
    """
    if embeds.shape[0] == 0:
        return np.nan
    center = _centroid(embeds)
    if np.any(center):
        return float(np.nanmean(embeds @ center))
    return np.nan

def redundancy_from_pairwise(embeds: np.ndarray, topk: int = 1) -> float:
    """Average over rows of the mean of each row's ``topk`` highest similarities to other rows.

    Similarities are dot products (cosine for unit-length rows) and a row's
    similarity to itself is excluded. ``topk`` is capped at ``n - 1``; with
    ``topk=1`` this is the mean nearest-neighbour similarity. Fewer than two
    rows gives NaN.
    """
    count = embeds.shape[0]
    if count <= 1:
        return np.nan
    sims = embeds @ embeds.T
    # A row must never be counted as its own neighbour
    np.fill_diagonal(sims, -np.inf)
    if topk == 1:
        per_row = sims.max(axis=1)
    else:
        k = min(topk, count - 1)
        # The last k columns after partitioning hold each row's k largest values
        per_row = np.partition(sims, -k, axis=1)[:, -k:].mean(axis=1)
    return float(np.nanmean(per_row))

def pca_sem_var(embeds: np.ndarray, n_components: int = 8) -> float:
    """Summed explained-variance ratio of the leading principal components.

    Fits a randomized-solver PCA (``random_state=7``) with
    ``min(n_components, n_rows, n_cols)`` components and sums
    ``explained_variance_ratio_``. Returns NaN for fewer than two rows or when
    the fit raises.

    NOTE(review): when the row count does not exceed ``n_components``, the
    component count equals the row count, so the sum is presumably close to 1
    regardless of content. Confirm this is the intended metric for short windows.
    """
    if embeds.shape[0] < 2:
        return np.nan
    try:
        rank_cap = min(embeds.shape[0], embeds.shape[1])
        reducer = PCA(
            n_components=min(n_components, rank_cap),
            svd_solver="randomized",
            random_state=7,
        )
        ratios = reducer.fit(embeds).explained_variance_ratio_
        return float(ratios.sum())
    except Exception:
        return np.nan

# --- Compute metrics ---
doc_rows = []
win_rows = []
# (article_id, version_id) -> unit centroid of the document's sentence embeddings.
# Filled during the single embedding pass below and reused for drift, so no
# document is embedded a second time.
centroids = {}

DOC_COLUMNS = ["article_id", "version_id", "version_tag", "doc_id",
               "n_sents_sem", "coherence_doc", "dispersion_doc", "redundancy_doc"]
NAN_WINDOW_METRICS = {"coherence_win": np.nan, "neighbor_sim_win": np.nan,
                      "sem_var_win": np.nan, "redundancy_win": np.nan}


def _doc_basis_text(doc):
    """Return ``(basis, text)``: ``"text_clean"`` when it is a non-empty string, else ``"text"``."""
    basis = "text_clean" if ("text_clean" in doc and isinstance(doc["text_clean"], str) and doc["text_clean"]) else "text"
    text = str(doc.get(basis) if basis in doc else doc.get("text", ""))
    return basis, text


def _window_row(ids, w, basis_w, wa, wb, n_sents, n_tokens_default, metrics):
    """Build one output row for Module-2 window ``w``.

    ``ids`` holds the document identity columns and ``metrics`` the four window
    metrics. Optional Module-2 columns fall back to -1 / False, and
    ``n_tokens_default`` is used when ``n_tokens_win`` is absent.
    """
    row = dict(ids)
    row.update({
        "win_id": int(w.win_id), "win_label": str(w.win_label),
        "span_basis": basis_w, "char_start": wa, "char_end": wb,
        "sent_start_index": int(w.sent_start_index) if "sent_start_index" in w else -1,
        "sent_end_index": int(w.sent_end_index) if "sent_end_index" in w else -1,
        "is_partial_tail": bool(w.is_partial_tail) if "is_partial_tail" in w else False,
        "n_sents_win": n_sents,
        "n_tokens_win": int(w.n_tokens_win) if "n_tokens_win" in w else n_tokens_default,
    })
    row.update(metrics)
    return row


for doc in DOCS:
    art = str(doc["article_id"])
    vid = int(doc["version_id"])
    vtag = str(doc.get("version_tag", f"v{vid}"))
    did = str(doc.get("doc_id", f"{art}__v{vid}"))
    ids = {"article_id": art, "version_id": vid, "version_tag": vtag, "doc_id": did}

    basis, text = _doc_basis_text(doc)
    if not text:
        # No text at all: emit a placeholder row. No centroid is stored, so
        # drift involving this version is NaN.
        doc_rows.append({**ids, "n_sents_sem": 0, "coherence_doc": np.nan,
                         "dispersion_doc": np.nan, "redundancy_doc": np.nan})
        continue

    spans = sent_spans(text)
    sents = [text[a:b] for (a, b) in spans]
    E = embed(sents)  # (n, d)
    centroids[(art, vid)] = _centroid(E)

    coh = coherence(E)
    doc_rows.append({
        **ids,
        "n_sents_sem": int(E.shape[0]),
        "coherence_doc": coh,
        "dispersion_doc": (1.0 - coh) if not math.isnan(coh) else np.nan,
        "redundancy_doc": redundancy_from_pairwise(E, topk=1),  # neighbor_sim over whole doc
    })

    # Window-level using M2 spans (full containment policy)
    if not uses_m2:
        continue
    sub = DFW[(DFW.article_id == art) & (DFW.version_id == vid)]
    if sub.empty:
        continue

    # Sentence spans / texts / embeddings per text basis. The document's own
    # basis is already available; an alternate basis is tokenized and embedded
    # once per document on first use, not once per window.
    per_basis = {basis: (spans, sents, E)}
    for _, w in sub.sort_values("win_id").iterrows():
        basis_w = str(w["span_basis"]) if str(w["span_basis"]) in ("text", "text_clean") else basis
        if basis_w not in per_basis:
            src_text = str(doc.get(basis_w, text))
            alt_spans = sent_spans(src_text)
            alt_sents = [src_text[a:b] for (a, b) in alt_spans]
            per_basis[basis_w] = (alt_spans, alt_sents, embed(alt_sents))
        sspans, stexts, E_basis = per_basis[basis_w]

        wa, wb = int(w["char_start"]), int(w["char_end"])
        # Keep sentences whose spans lie fully inside the window (same basis)
        idxs = [i for i, (a, b) in enumerate(sspans) if (a >= wa and b <= wb)]
        if not idxs:
            # No full sentence inside the window → emit NaNs
            win_rows.append(_window_row(ids, w, basis_w, wa, wb, 0, 0, NAN_WINDOW_METRICS))
            continue

        Ew = E_basis[idxs]
        # Whitespace token count, used only when Module 2 did not supply one
        n_tokens_default = int(sum(len(stexts[i].split()) for i in idxs)) if "n_tokens_win" not in w else 0
        win_rows.append(_window_row(ids, w, basis_w, wa, wb, int(Ew.shape[0]), n_tokens_default, {
            "coherence_win": coherence(Ew),
            "neighbor_sim_win": redundancy_from_pairwise(Ew, topk=1),
            "sem_var_win": pca_sem_var(Ew, n_components=8),
            "redundancy_win": redundancy_from_pairwise(Ew, topk=3),
        }))

# --- Doc-level drift & deltas (adjacent only) ---
# Passing explicit columns keeps the sort valid even when doc_rows is empty.
df_doc = (pd.DataFrame(doc_rows, columns=DOC_COLUMNS)
          .sort_values(["article_id", "version_id"])
          .reset_index(drop=True))

# Drift = 1 - cosine between this version's centroid and the previous version's
# (within the same article). NaN for the first version or when either centroid
# is missing or all zeros. Rows are visited in df_doc order, so `drifts` lines
# up with the frame.
drifts = []
for art, g in df_doc.groupby("article_id", sort=False):
    prev = None
    for vid in g.sort_values("version_id")["version_id"]:
        c = centroids.get((art, int(vid)))
        if prev is None or c is None or not np.any(c) or not np.any(prev):
            drifts.append(np.nan)
        else:
            drifts.append(float(1.0 - float(np.dot(prev, c))))
        prev = c
df_doc["semantic_drift_from_prev"] = drifts

# Deltas table: one row per pair of consecutive version numbers within an article
delta_rows = []
for art, grp in df_doc.groupby("article_id", sort=False):
    ordered = grp.sort_values("version_id")
    for pos in range(1, len(ordered)):
        older, newer = ordered.iloc[pos - 1], ordered.iloc[pos]
        v_from, v_to = int(older["version_id"]), int(newer["version_id"])
        if v_to - v_from != 1:
            # A gap in version numbers produces no delta row
            continue
        record = {"article_id": art, "from_version": v_from, "to_version": v_to}
        for out_key, col in (("d_coherence_doc", "coherence_doc"),
                             ("d_redundancy_doc", "redundancy_doc")):
            if pd.notna(older[col]) and pd.notna(newer[col]):
                record[out_key] = float(newer[col] - older[col])
            else:
                record[out_key] = np.nan
        record["semantic_drift"] = float(df_doc.loc[newer.name, "semantic_drift_from_prev"])
        delta_rows.append(record)
df_delta = pd.DataFrame(delta_rows)

# Window DF
df_win = pd.DataFrame(win_rows)
if not df_win.empty:
    # Sort only when rows exist: with no Module-2 windows the frame has no
    # columns, and sorting by these keys would raise KeyError.
    df_win = df_win.sort_values(["article_id","version_id","win_id"]).reset_index(drop=True)

# --- Enforce dtypes & save ---
if not df_doc.empty:
    df_doc["version_id"] = df_doc["version_id"].astype("int64")
    df_doc["n_sents_sem"] = df_doc["n_sents_sem"].astype("int64")
# The doc-level table is always written, even when empty
df_doc.to_parquet(SEM_OUT / "semantic_metrics.parquet", index=False)

# Window and delta tables are written only when they have rows
if not df_win.empty:
    for c in ("version_id","win_id","char_start","char_end","sent_start_index","sent_end_index","n_sents_win","n_tokens_win"):
        if c in df_win.columns:
            df_win[c] = df_win[c].astype("int64")
    df_win.to_parquet(SEM_OUT / "semantic_windows.parquet", index=False)

if not df_delta.empty:
    for c in ("from_version","to_version"):
        df_delta[c] = df_delta[c].astype("int64")
    df_delta.to_parquet(SEM_OUT / "semantic_deltas.parquet", index=False)

# Update metadata (merge into whatever 5.2 already wrote)
meta_path = SEM_OUT / "metadata.json"
meta = json.loads(meta_path.read_text()) if meta_path.exists() else {}
meta.update({
    "uses_m2_windows": bool(uses_m2),
    "window_alignment_policy": "from_module_2_spans/full_containment" if uses_m2 else "none",
    "articles": int(len({d["article_id"] for d in DOCS})),
    # NOTE(review): these two hold the smallest / largest version_id across all
    # docs, not a per-article version count — confirm the intended meaning.
    "versions_per_article_min": int(min([d["version_id"] for d in DOCS] or [0])),
    "versions_per_article_max": int(max([d["version_id"] for d in DOCS] or [0])),
    "expected_versions": 4,
    "enable_nli": False,
})
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

# Summary: list only the files actually written by this run
saved_paths = [str(SEM_OUT / "semantic_metrics.parquet")]
if not df_win.empty:
    saved_paths.append(str(SEM_OUT / "semantic_windows.parquet"))
if not df_delta.empty:
    saved_paths.append(str(SEM_OUT / "semantic_deltas.parquet"))
print({
    "doc_rows": int(df_doc.shape[0]),
    "win_rows": int(df_win.shape[0]) if not df_win.empty else 0,
    "delta_rows": int(df_delta.shape[0]) if not df_delta.empty else 0,
    "saved": saved_paths,
})


In [None]:
# 5.3b — optional sampled NLI (adjacent versions). Default: skip. Set ENABLE_NLI=1 to run.
import os, json, random, math
import numpy as np
import pandas as pd

ENABLE_NLI = os.environ.get("ENABLE_NLI","0") == "1"
NLI_MODEL = os.environ.get("LSA_NLI_MODEL","cross-encoder/nli-deberta-v3-base")
MAX_PAIRS = int(os.environ.get("LSA_NLI_MAX_PAIRS","64"))

if ENABLE_NLI:
    from sentence_transformers import CrossEncoder
    from nltk import data as _ndata
    _tokenizer = _ndata.load("tokenizers/punkt/english.pickle")

    def _sents(txt: str):
        return [txt[a:b] for (a,b) in _tokenizer.span_tokenize(txt or "")]
    # Expect DOCS & df_doc from 5.3
    model = CrossEncoder(NLI_MODEL, automodel_args={"torch_dtype":"auto"})
    results = []
    for art, g in pd.DataFrame(DOCS).groupby("article_id", sort=False):
        g = g.sort_values("version_id")
        for i in range(len(g)-1):
            A, B = g.iloc[i], g.iloc[i+1]
            sA, sB = _sents(A.get("text_clean", A["text"])), _sents(B.get("text_clean", B["text"]))
            # Align by index up to min len; sample pairs
            pairs = [(sA[j], sB[j]) for j in range(min(len(sA), len(sB)))]
            random.Random(7).shuffle(pairs)
            pairs = pairs[:MAX_PAIRS]
            if not pairs:
                results.append({"article_id": art, "from_version": int(A["version_id"]), "to_version": int(B["version_id"]),
                                "nli_pairs_sampled": 0, "nli_p_contra_sampled": math.nan, "nli_p_entail_sampled": math.nan})
                continue
            probs = model.predict(pairs, apply_softmax=True, batch_size=16)  # returns [p_contra, p_neutral, p_entail]
            probs = np.asarray(probs)
            results.append({
                "article_id": art, "from_version": int(A["version_id"]), "to_version": int(B["version_id"]),
                "nli_pairs_sampled": int(len(pairs)),
                "nli_p_contra_sampled": float(np.mean(probs[:,0])),
                "nli_p_entail_sampled": float(np.mean(probs[:,2])),
            })
    df_nli = pd.DataFrame(results)

    # Merge into doc-level table for convenience
    from pathlib import Path
    BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
    SEM_OUT  = (BASE_DIR / "outputs" / "semantic").resolve()
    doc_path = SEM_OUT / "semantic_metrics.parquet"
    df_doc = pd.read_parquet(doc_path)
    # Attach to 'to_version' rows
    m = df_doc.merge(df_nli, left_on=["article_id","version_id"], right_on=["article_id","to_version"], how="left")
    m = m.drop(columns=["from_version","to_version"])
    m.to_parquet(doc_path, index=False)

    # update metadata
    meta_path = SEM_OUT / "metadata.json"
    meta = json.loads(meta_path.read_text()) if meta_path.exists() else {}
    meta.update({"enable_nli": True, "nli_model": NLI_MODEL, "nli_pairs_max": MAX_PAIRS})
    meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print({"nli_rows": int(df_nli.shape[0]), "merged_into_doc_table": True})
else:
    print("5.3b skipped (ENABLE_NLI=0).")


5.3b is the optional “tiny NLI” step that samples sentence pairs between adjacent versions and estimates contradiction/entailment rates.

It’s off by default to keep runs fast and CPU-friendly. The cell checks the environment variable `ENABLE_NLI`; if it is not set to `"1"`, the cell prints `5.3b skipped (ENABLE_NLI=0).` and exits without doing anything.

If you want it on later, set os.environ["ENABLE_NLI"] = "1" in a small setup cell, then re-run 5.3b. (It can be slow on CPU — that’s why it’s opt-in.)

In [None]:
# 5.4 — visuals: coherence distributions, trends, heatmap, window trends
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, os, re

# Same output locations as 5.2/5.3; the plots folder is created on demand
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
SEM_OUT = BASE_DIR.joinpath("outputs", "semantic").resolve()
PLOTS_DIR = SEM_OUT.joinpath("plots")
os.makedirs(PLOTS_DIR, exist_ok=True)

def _safe_slug(s: str) -> str:
    return re.sub(r"[^A-Za-z0-9._-]+", "_", str(s))

# Load the tables written by 5.3; a missing file becomes an empty frame
doc_path = SEM_OUT / "semantic_metrics.parquet"
win_path = SEM_OUT / "semantic_windows.parquet"
dfd, dfw = (
    pd.read_parquet(path) if path.exists() else pd.DataFrame()
    for path in (doc_path, win_path)
)

# 1) Global doc coherence violin (finite, non-null values only)
fig, ax = plt.subplots(figsize=(7.5, 4.2))
if "coherence_doc" in dfd:
    vals = dfd["coherence_doc"].replace([np.inf, -np.inf], np.nan).dropna().to_numpy()
else:
    vals = np.array([])
if vals.size:
    ax.violinplot(vals, showmeans=True)
    ax.set_ylabel("coherence_doc (cosine to centroid)")
    ax.set_title("Global distribution of semantic coherence (doc-level)")
else:
    # Still write a placeholder image so the manifest stays predictable
    ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)
fig.tight_layout()
fig.savefig(PLOTS_DIR / "coherence_doc_violin.png", dpi=150)
plt.close(fig)

# 2) Per-article trend lines (coherence_doc v1..v4)
# Guarded: when semantic_metrics.parquet is missing, dfd is an empty frame
# without columns and groupby("article_id") would raise KeyError.
if {"article_id", "version_id", "coherence_doc"}.issubset(dfd.columns):
    for art, g in dfd.groupby("article_id"):
        g = g.sort_values("version_id")
        plt.figure(figsize=(7.5,4.2))
        plt.plot(g["version_id"], g["coherence_doc"], marker="o")
        plt.xticks(g["version_id"], [f"v{v}" for v in g["version_id"]])
        plt.xlabel("version"); plt.ylabel("coherence_doc")
        plt.title(f"Coherence by version — {art}")
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig(PLOTS_DIR / f"trend_coherence_{_safe_slug(art)}_.png", dpi=150)
        # Close each figure so a large corpus does not accumulate open figures
        plt.close()

# 3) Window coherence lines per article (one line per version)
if not dfw.empty and "coherence_win" in dfw.columns:
    for art, by_article in dfw.groupby("article_id"):
        fig, ax = plt.subplots(figsize=(9, 4.8))
        for vid, by_version in by_article.groupby("version_id"):
            ordered = by_version.sort_values("win_id")
            ax.plot(ordered["win_id"], ordered["coherence_win"],
                    marker=".", alpha=0.85, label=f"v{int(vid)}")
        ax.set_xlabel("window id")
        ax.set_ylabel("coherence_win")
        ax.set_title(f"Window coherence — {art}")
        ax.legend()
        fig.tight_layout()
        fig.savefig(PLOTS_DIR / f"win_coherence_{_safe_slug(art)}.png", dpi=150)
        plt.close(fig)

# 4) Cosine heatmap (first article/version; cap 120x120)
# Sentence embeddings are recomputed for that one document on its preferred
# text basis. The text comes from the in-memory DOCS list only; without it the
# heatmap is skipped.
def _find_doc(art, vid):
    """Return the DOCS entry whose (article_id, version_id) matches, or None."""
    if "DOCS" in globals():
        for d in DOCS:
            if str(d["article_id"])==art and int(d["version_id"])==vid:
                return d
    return None

try:
    d = None
    if not dfd.empty:
        first = dfd.sort_values(["article_id","version_id"]).iloc[0]
        art, vid = str(first["article_id"]), int(first["version_id"])
        d = _find_doc(art, vid)
    if d:
        import nltk
        _tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
        basis = "text_clean" if ("text_clean" in d and isinstance(d["text_clean"], str) and d["text_clean"]) else "text"
        text = str(d.get(basis) if basis in d else d.get("text",""))
        spans = list(_tokenizer.span_tokenize(text or ""))
        sents = [text[a:b] for (a,b) in spans][:120]
        if sents:
            # Load the model only once there is something to plot
            from sentence_transformers import SentenceTransformer
            DEVICE = "cuda" if (os.environ.get("LSA_ALLOW_CUDA","1")=="1" and __import__("torch").cuda.is_available()) else "cpu"
            MODEL_NAME = os.environ.get("LSA_SEM_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
            model = SentenceTransformer(MODEL_NAME, device=DEVICE)
            model_kwargs = dict(normalize_embeddings=True, convert_to_numpy=True, batch_size=int(os.environ.get("LSA_SEM_BATCH","64")))
            E = model.encode(sents, **model_kwargs).astype(np.float32)
            S = E @ E.T
            fig = plt.figure(figsize=(6,5))
            try:
                plt.imshow(S, vmin=0, vmax=1, aspect="auto")
                plt.colorbar(label="cosine")
                plt.title(f"Sentence cosine heatmap — {art} v{vid} (n={len(sents)})")
                plt.tight_layout()
                plt.savefig(PLOTS_DIR / f"cosine_heatmap_{_safe_slug(art)}_v{vid}.png", dpi=150)
            finally:
                # Release the figure even if drawing or saving failed
                plt.close(fig)
except Exception as e:
    # Best-effort: the heatmap is optional, but say why it was skipped
    print(f"(5.4) cosine heatmap skipped: {type(e).__name__}: {e}")

# Write plots manifest: the sorted names of every PNG now in the plots folder
import json
png_names = sorted(png.name for png in PLOTS_DIR.glob("*.png"))
manifest = {"files": png_names}
index_file = PLOTS_DIR / "plots_index.json"
index_file.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print({"plots": len(manifest["files"]), "dir": str(PLOTS_DIR)})


In [None]:
# 5.8 — metadata enrich (Module 5)
import json
from pathlib import Path
import pandas as pd

import os  # local to this cell: its import list does not include os

# Resolve the base directory like the cells that wrote these artifacts, so an
# LSA_BASE_DIR override is enriched in place and the working directory is not
# used by mistake.
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
SEM_OUT  = (BASE_DIR / "outputs" / "semantic").resolve()
PLOTS    = SEM_OUT / "plots"
meta_p   = SEM_OUT / "metadata.json"
# The manifest and metadata are written unconditionally below, so make sure
# their directories exist.
PLOTS.mkdir(parents=True, exist_ok=True)

# Load current metadata (create if missing)
meta = {}
if meta_p.exists():
    meta = json.loads(meta_p.read_text(encoding="utf-8"))

# Gather doc/window stats
doc_p = SEM_OUT / "semantic_metrics.parquet"
win_p = SEM_OUT / "semantic_windows.parquet"
dlt_p = SEM_OUT / "semantic_deltas.parquet"

arts = vers_min = vers_max = 0
skipped_windows = 0
total_windows = 0

if doc_p.exists():
    df_doc = pd.read_parquet(doc_p)
    arts = df_doc["article_id"].nunique()
    # NOTE(review): stored under "versions_per_article_*", but these are the
    # smallest / largest version_id in the table, not a per-article count.
    vers_min = int(df_doc["version_id"].min()) if not df_doc.empty else 0
    vers_max = int(df_doc["version_id"].max()) if not df_doc.empty else 0

if win_p.exists():
    df_win = pd.read_parquet(win_p)
    total_windows = int(df_win.shape[0])
    # Count windows with fewer than two sentences; fall back to NaN coherence
    # when the sentence count column is absent.
    if "n_sents_win" in df_win.columns:
        skipped_windows = int((df_win["n_sents_win"] < 2).sum())
    else:
        skipped_windows = int(df_win["coherence_win"].isna().sum())

# Plot manifest
plots = sorted(p.name for p in PLOTS.glob("*.png"))
(PLOTS / "plots_index.json").write_text(
    json.dumps({"files": plots}, indent=2), encoding="utf-8"
)

# Notes are merged without duplicates, so re-running this cell is idempotent.
# NOTE(review): 5.3's coherence() returns a value for a single-sentence window,
# so the "<2 sentences → NaN" note holds for the pairwise/PCA metrics only.
# Confirm the wording before relying on it.
MODULE_NOTES = [
    "Window inclusion rule: full sentence containment within [char_start, char_end).",
    "Windows with <2 sentences emit NaN for window metrics.",
    "Cosine metrics use normalized embeddings; ranges within [0,1]."
]
notes = []
for note in list(meta.get("notes", [])) + MODULE_NOTES:
    if note not in notes:
        notes.append(note)

# Update metadata
meta.update({
    "articles": int(arts),
    "versions_per_article_min": int(vers_min),
    "versions_per_article_max": int(vers_max),
    "windows_available": bool(win_p.exists()),
    "windows_total": int(total_windows),
    "windows_with_lt2_sentences": int(skipped_windows),
    "notes": notes,
})

meta_p.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print({"metadata_updated": str(meta_p), "plots_index": str(SEM_OUT / 'plots' / 'plots_index.json')})


In [None]:
# 5.9 — audit & schema/range checks for Module 5
from pathlib import Path
import pandas as pd
import numpy as np
import json

BASE_DIR = Path.cwd().resolve()
SEM_OUT  = (BASE_DIR / "outputs" / "semantic").resolve()

doc_p = SEM_OUT / "semantic_metrics.parquet"
win_p = SEM_OUT / "semantic_windows.parquet"
dlt_p = SEM_OUT / "semantic_deltas.parquet"

res = {"exists": {"doc": doc_p.exists(), "win": win_p.exists(), "dlt": dlt_p.exists()}}

req_doc = {"article_id","version_id","version_tag","doc_id","n_sents_sem",
           "coherence_doc","dispersion_doc","redundancy_doc","semantic_drift_from_prev"}
req_win = {"article_id","version_id","version_tag","doc_id","win_id","win_label",
           "span_basis","char_start","char_end","sent_start_index","sent_end_index","is_partial_tail",
           "n_sents_win","coherence_win","neighbor_sim_win","redundancy_win"}
req_dlt = {"article_id","from_version","to_version","semantic_drift","d_coherence_doc","d_redundancy_doc"}

if res["exists"]["doc"]:
    doc = pd.read_parquet(doc_p)
    res["doc_missing"] = sorted(req_doc - set(doc.columns))
    res["doc_rows"] = int(doc.shape[0])
    res["coherence_doc_in_0_1"] = bool(doc["coherence_doc"].dropna().between(0,1).all()) if "coherence_doc" in doc else None
    res["redundancy_doc_in_0_1"] = bool(doc["redundancy_doc"].dropna().between(0,1).all()) if "redundancy_doc" in doc else None

if res["exists"]["win"]:
    win = pd.read_parquet(win_p).sort_values(["article_id","version_id","win_id"])
    res["win_missing"] = sorted(req_win - set(win.columns))
    res["win_rows"] = int(win.shape[0])
    # contiguity per (article,version)
    ok_ids=[]
    for (a,v), g in win.groupby(["article_id","version_id"], sort=False):
        ok_ids.append(list(g["win_id"]) == list(range(1, len(g)+1)))
    res["win_id_contiguous"] = all(ok_ids) if ok_ids else None
    # window metric range sanity
    if "coherence_win" in win:
        wv = win["coherence_win"].replace([np.inf,-np.inf], np.nan).dropna()
        res["coherence_win_in_0_1"] = bool(wv.between(0,1).all())
    if "neighbor_sim_win" in win:
        nv = win["neighbor_sim_win"].replace([np.inf,-np.inf], np.nan).dropna()
        res["neighbor_sim_win_in_0_1"] = bool(nv.between(0,1).all())

if res["exists"]["dlt"]:
    dlt = pd.read_parquet(dlt_p)
    res["dlt_missing"] = sorted(req_dlt - set(dlt.columns))
    res["dlt_rows"] = int(dlt.shape[0])
    if {"to_version","from_version"}.issubset(dlt.columns):
        res["adjacent_only"] = bool(((dlt["to_version"] - dlt["from_version"])==1).all())
    if "semantic_drift" in dlt.columns:
        res["semantic_drift_nonneg"] = bool(dlt["semantic_drift"].dropna().ge(0).all())

print(json.dumps(res, indent=2))


In [None]:
# 6.1 — BERTopic | UMAP | HDBSCAN: install (CPU) + deterministic seeds + thread caps
import sys, subprocess, os, random, importlib.util, json
from pathlib import Path

def _pip(*args):
    return subprocess.check_call([sys.executable, "-m", "pip", "install", *args])

def _missing(modname: str) -> bool:
    return importlib.util.find_spec(modname) is None

# (import names that satisfy the requirement, pip specifier), in install order.
# A specifier is requested only when none of its import names can be found.
_REQUIREMENTS = (
    (("bertopic",), 'bertopic==0.16.*'),
    # UMAP can be found as "umap" (package "umap-learn")
    (("umap", "umap_learn"), 'umap-learn>=0.5.4,<0.6'),
    (("hdbscan",), 'hdbscan>=0.8.33'),
    (("sentence_transformers",), 'sentence-transformers>=2.2.2'),
    (("sklearn",), 'scikit-learn>=1.3,<2'),
    (("scipy",), 'scipy>=1.10,<2'),
    (("pyarrow",), 'pyarrow>=14'),
)
need = [spec for names, spec in _REQUIREMENTS if all(_missing(n) for n in names)]

if not need:
    print("✓ Dependencies already installed")
else:
    print("Installing:", need)
    _pip(*need)

# Determinism: seed every RNG this cell imports, not only torch
import numpy as np
try:
    import torch
except Exception:
    # torch is optional here; seeding is skipped when it cannot be imported
    torch = None

SEED = int(os.environ.get("LSA_SEED", "42"))
random.seed(SEED)
# NumPy's legacy global seed must fit in 32 bits
np.random.seed(SEED % (2 ** 32))
if torch is not None and hasattr(torch, "manual_seed"):
    torch.manual_seed(SEED)

# Output dirs + metadata shell
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
TOP_OUT = BASE_DIR.joinpath("outputs", "bertopic").resolve()
for leaf in ("embeddings", "plots"):
    (TOP_OUT / leaf).mkdir(parents=True, exist_ok=True)

# Record package versions
def _ver(modname):
    try:
        m = __import__(modname)
        return getattr(m, "__version__", "unknown")
    except Exception:
        return None

# Versions of the libraries this module relies on (None = not importable)
vers = {
    name: _ver(name)
    for name in ("bertopic", "umap", "hdbscan", "sentence_transformers", "sklearn",
                 "scipy", "numpy", "pandas", "matplotlib", "pyarrow", "torch")
}

# Module-6 metadata stub. The path and the dict are defined here, so this cell
# works on a fresh kernel and never writes into Module 5's metadata file via
# leftover `meta_path` / `meta` globals. Existing Module-6 metadata is kept and
# only these keys are refreshed.
meta_path = TOP_OUT / "metadata.json"
meta = json.loads(meta_path.read_text(encoding="utf-8")) if meta_path.exists() else {}
meta.update({
    "seed": SEED,
    "library_versions": vers,
})
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("✓ 6.1 ready → metadata stub at", meta_path)


In [None]:
# 6.2 — BERTopic: init & fit on window texts; write topics.parquet + topic_info.parquet
from pathlib import Path
import os, json, math, re
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

from umap import UMAP
import hdbscan
from bertopic import BERTopic

# ---- Paths / inputs ----
BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
TOP_OUT = BASE_DIR.joinpath("outputs", "bertopic").resolve()
PLOT_DIR, EMB_DIR = TOP_OUT / "plots", TOP_OUT / "embeddings"
M2_PATH = BASE_DIR.joinpath("outputs", "nltk", "fw_burstiness_windows.parquet")
if not M2_PATH.exists():
    raise FileNotFoundError(f"Missing Module-2 windows: {M2_PATH}")

# Windows in a stable (article, version, window) order
DFW = (
    pd.read_parquet(M2_PATH)
    .sort_values(["article_id", "version_id", "win_id"])
    .reset_index(drop=True)
)
# Every column read further down must be present
need_cols = {"article_id", "version_id", "version_tag", "doc_id", "win_id", "win_label",
             "span_basis", "char_start", "char_end", "sent_start_index", "sent_end_index"}
missing = need_cols.difference(DFW.columns)
if missing:
    raise ValueError(f"Module-2 windows missing columns: {sorted(missing)}")

# ---- Discover documents ----
def _discover_docs():
    # Priority: global DOCS (list of dicts), then df_docs DataFrame, then input_docs folder
    if "DOCS" in globals() and isinstance(globals()["DOCS"], list):
        return globals()["DOCS"]
    if "df_docs" in globals():
        try:
            return globals()["df_docs"].to_dict("records")
        except Exception:
            pass
    SRC = BASE_DIR / "input_docs"
    docs=[]
    if SRC.exists():
        for p in sorted(SRC.iterdir()):
            if not p.is_file():
                continue
            stem = p.stem
            try:
                vid = int(stem[:2])
            except Exception:
                continue
            art = stem[3:] if len(stem)>3 and stem[2] in ("-","_") else stem
            txt = p.read_text(encoding="utf-8", errors="ignore")
            docs.append({"article_id": art, "version_id": vid, "version_tag": f"v{vid}",
                         "doc_id": f"{art}__v{vid}", "text": txt})
    return docs

# Window texts are cut out of these documents, so an empty result is fatal
DOCS = _discover_docs()
if len(DOCS) == 0:
    raise RuntimeError("Module 6: No DOCS/df_docs/input_docs found for reconstructing window texts.")

def _basis_text(doc: dict) -> dict:
    """
    Always return a dict with keys 'text' and 'text_clean' (text_clean may be None).
    No KeyError even if doc lacks 'text_clean'.
    """
    txt = doc.get("text", "") or ""
    txt_clean = doc.get("text_clean", None)
    if isinstance(txt_clean, str) and txt_clean.strip():
        return {"text": txt, "text_clean": txt_clean}
    else:
        return {"text": txt, "text_clean": None}

# Map (article, version) → basis texts; keys are (str article_id, int version_id)
TEXTS = {}
for _doc in DOCS:
    TEXTS[(str(_doc.get("article_id")), int(_doc.get("version_id")))] = _basis_text(_doc)

# One-element list used as a mutable counter: number of times 'text_clean' was
# requested but the raw text had to be used instead
_basis_mismatch_count = [0]

def _slice_text(art, vid, basis, a, b) -> str:
    bt = TEXTS.get((str(art), int(vid)))
    if not bt:
        return ""
    want_clean = (str(basis) == "text_clean")
    src = bt.get("text_clean") if want_clean else bt.get("text")
    if want_clean and (src is None or src == ""):
        # Fallback to raw text, count a basis mismatch
        _basis_mismatch_count[0] += 1
        src = bt.get("text", "")
    if not isinstance(src, str):
        src = str(src or "")
    a = max(0, int(a)); b = max(a, int(b))
    return src[a:b]

# ---- Build corpus in stable order ----
# One text per Module-2 window, in DFW row order; `rows` keeps the identifying
# columns so that texts[i] and keys_df.iloc[i] describe the same window.
KEY_COLUMNS = ["article_id", "version_id", "version_tag", "doc_id", "win_id", "win_label",
               "span_basis", "char_start", "char_end", "sent_start_index", "sent_end_index"]
rows, texts = [], []
for _, r in DFW.iterrows():
    w_art, w_vid, w_basis = r["article_id"], int(r["version_id"]), str(r["span_basis"])
    w_start, w_end = int(r["char_start"]), int(r["char_end"])
    snippet = _slice_text(w_art, w_vid, w_basis, w_start, w_end)
    rows.append((w_art, w_vid, str(r["version_tag"]), r["doc_id"], int(r["win_id"]), str(r["win_label"]),
                 w_basis, w_start, w_end, int(r["sent_start_index"]), int(r["sent_end_index"])))
    texts.append(snippet if isinstance(snippet, str) else "")

keys_df = pd.DataFrame(rows, columns=KEY_COLUMNS)
N = len(texts)
if N == 0:
    raise RuntimeError("Module 6: No windows to process (empty corpus).")

# ---- Embeddings (MiniLM, normalized); cache ----
MODEL_NAME = os.environ.get("LSA_SEM_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
BATCH = int(os.environ.get("LSA_BERTOPIC_BATCH","32"))
import torch
# CUDA is opt-in here: LSA_ALLOW_CUDA must be "1" and a device must be available
if os.environ.get("LSA_ALLOW_CUDA","0") == "1" and torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
st = SentenceTransformer(MODEL_NAME, device=DEVICE)
emb = st.encode(
    texts,
    normalize_embeddings=True,
    convert_to_numpy=True,
    batch_size=BATCH,
).astype(np.float32)

# Save the matrix plus a row-id → window key table so rows can be traced back
model_slug = re.sub(r'[^A-Za-z0-9._-]+', '_', MODEL_NAME)
EMB_PATH = EMB_DIR / f"sbert_{model_slug}.npy"
np.save(EMB_PATH, emb)
row_index = keys_df[["article_id","version_id","win_id"]].assign(row_id=np.arange(N))
row_index.to_parquet(EMB_DIR / "embedding_rows.parquet", index=False)
print(f"Embedded {N} windows → {EMB_PATH} (dim={emb.shape[1]}, device={DEVICE})")

# ---- Fit BERTopic (UMAP → HDBSCAN), with fallback ----
# Strategy: HDBSCAN with min_cluster_size=8, then 5 if every window is noise (-1),
# then plain KMeans on the embeddings if HDBSCAN still yields only noise.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine",
                  random_state=42, low_memory=True)
hdb_params_attempted = []   # every min_cluster_size tried, in order (written to metadata)
hdbscan_used = None         # params of the HDBSCAN run that was kept; stays None on KMeans fallback

def _make_hdb(min_cluster_size):
    """Build an HDBSCAN clusterer and record the attempted min_cluster_size."""
    hdb_params_attempted.append(min_cluster_size)
    return hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=None, metric="euclidean",
                           cluster_selection_method="eom", prediction_data=False)

# min_df grows with corpus size (N//500) but is kept within [1, 3].
vectorizer = CountVectorizer(stop_words="english", min_df=min(3, max(1, N//500)), ngram_range=(1,2))

# Attempt 1: mcs=8
hdb_model  = _make_hdb(8)
# embedding_model=None: the precomputed `emb` matrix is passed to fit_transform instead.
topic_model = BERTopic(embedding_model=None, umap_model=umap_model, hdbscan_model=hdb_model,
                       vectorizer_model=vectorizer, calculate_probabilities=False, verbose=False)
topics, probs = topic_model.fit_transform(texts, embeddings=emb)

# Attempt 2: mcs=5 if all noise
fallback_used = None
if all(t == -1 for t in topics):
    hdb_model_small = _make_hdb(5)
    topic_model_small = BERTopic(embedding_model=None, umap_model=umap_model, hdbscan_model=hdb_model_small,
                                 vectorizer_model=vectorizer, calculate_probabilities=False, verbose=False)
    topics, probs = topic_model_small.fit_transform(texts, embeddings=emb)
    # Adopt the second model only if it actually found at least one non-noise topic.
    if not all(t == -1 for t in topics):
        topic_model = topic_model_small
        hdbscan_used = {"min_cluster_size": 5, "min_samples": None, "metric":"euclidean", "cluster_selection_method":"eom"}

# Fallback 3: KMeans if still all noise
if all(t == -1 for t in topics):
    # k scales with corpus size (N//50), kept within [2, 10].
    k = min(10, max(2, N // 50))
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km_ids = km.fit_predict(emb)
    topics = km_ids.tolist()
    # topic_model=None is the downstream signal that labels/top terms come from KMeans ids.
    topic_model = None
    fallback_used = {"fallback": "kmeans", "k": int(k)}
else:
    # An HDBSCAN run was kept; if attempt 2 did not set the params, attempt 1 was the one used.
    if hdbscan_used is None:
        hdbscan_used = {"min_cluster_size": 8, "min_samples": None, "metric":"euclidean", "cluster_selection_method":"eom"}

# ---- Labels & topic info ----
def _topic_label_from_model(model, tid: int) -> str:
    try:
        if model is None or tid == -1:
            return "noise" if tid == -1 else f"km_{tid}"
        words = model.get_topics().get(tid, [])
        toks = [w for (w, _score) in words[:3]] if words else []
        return " ".join(toks) if toks else f"topic_{tid}"
    except Exception:
        return f"topic_{tid}"

# One label per window, aligned with `topics`.
labels = [_topic_label_from_model(topic_model, t) for t in topics]

# topic_info parquet
if topic_model is None:
    # KMeans fallback: no vocabulary is available, so top_terms is left empty.
    _km_topic_ids = sorted(set(topics))
    _km_info = pd.DataFrame({
        "topic_id": _km_topic_ids,
        "size": [topics.count(t) for t in _km_topic_ids],
        "name": [f"km_{t}" for t in _km_topic_ids],
        "top_terms": [""] * len(_km_topic_ids),
    })
    _km_info.to_parquet(TOP_OUT / "topic_info.parquet", index=False)
else:
    def _top_terms(tid):
        """Comma-separated first ten words of a topic, or "" if it has none."""
        ranked = topic_model.get_topics().get(tid, [])
        if not ranked:
            return ""
        return ", ".join([word for (word, _weight) in ranked[:10]])

    info = topic_model.get_topic_info().rename(
        columns={"Topic": "topic_id", "Count": "size", "Name": "name"}
    )
    info["top_terms"] = info["topic_id"].apply(_top_terms)
    info.to_parquet(TOP_OUT / "topic_info.parquet", index=False)

# ---- Window-level assignments ----
# keys_df has a default RangeIndex, so the positional Series below aligns row-for-row.
assign = keys_df.copy()
assign["topic_id"] = pd.Series(topics, dtype="int64")
assign["topic_label"] = labels
assign["model_name"] = MODEL_NAME
assign["topic_prob"] = np.nan  # probabilities disabled

# Full table (all key columns) and a slim "raw" table with only ids/labels/probs.
assign.to_parquet(TOP_OUT / "topics.parquet", index=False)
assign[["article_id","version_id","win_id","topic_id","topic_label","topic_prob"]].to_parquet(
    TOP_OUT / "topics_raw.parquet", index=False
)

# ---- Update metadata (correct fields + basis mismatches) ----
# Existing metadata.json (if any) is merged with this run's settings; keys written
# here overwrite older values of the same name.
meta_path = TOP_OUT / "metadata.json"
# Read with the same encoding the file is written with below.
meta = json.loads(meta_path.read_text(encoding="utf-8")) if meta_path.exists() else {}
meta.update({
    "embedding_model": MODEL_NAME,
    "embedding_dim": int(emb.shape[1]),
    # Record the device the encoder was actually created on instead of re-deriving it.
    "device": DEVICE,
    "batch_size": int(BATCH),
    "umap_params": {"n_neighbors":15,"n_components":5,"min_dist":0.0,"metric":"cosine","random_state":42,"low_memory":True},
    "hdbscan_params_used": hdbscan_used,                 # dict or None (if KMeans fallback)
    "hdbscan_params_attempted": hdb_params_attempted,    # e.g., [8, 5]
    "vectorizer": {"stop_words":"english","min_df": int(vectorizer.min_df), "ngram_range": list(vectorizer.ngram_range)},
    "corpus_windows": int(N),
    "fallback": fallback_used,                           # {"fallback":"kmeans","k":...} or None
    "basis_mismatch_windows": int(_basis_mismatch_count[0])  # windows requested 'text_clean' but fell back to 'text'
})
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

print({
    "windows": N,
    "unique_topics_excl_noise": len({t for t in topics if t != -1}),
    "topics_path": str(TOP_OUT / "topics.parquet"),
    "fallback": fallback_used,
    "basis_mismatch_windows": int(_basis_mismatch_count[0]),
})


In [None]:
# 6.3 — Topic stability & churn metrics: write topic_metrics.parquet (with noise_rate + NaN reassignment when no valid)
from pathlib import Path
import numpy as np
import pandas as pd
import json
from scipy.optimize import linear_sum_assignment

# This cell reads os.environ and builds a file slug with re; import both here so
# the cell also runs on a fresh kernel where no earlier cell imported them.
import os
import re

BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
TOP_OUT  = (BASE_DIR / "outputs" / "bertopic").resolve()
EMB_DIR  = TOP_OUT / "embeddings"

assign = pd.read_parquet(TOP_OUT / "topics.parquet").sort_values(["article_id","version_id","win_id"])
keys_rows = pd.read_parquet(EMB_DIR / "embedding_rows.parquet")  # row_id per (article,version,win)

# Choose the embedding matrix deterministically. Prefer the file that matches the
# model recorded in metadata.json (same slug rule used when the cache was written);
# otherwise fall back to the alphabetically first sbert_*.npy.
_meta_file = TOP_OUT / "metadata.json"
_emb_model = None
if _meta_file.exists():
    _emb_model = json.loads(_meta_file.read_text(encoding="utf-8")).get("embedding_model")
_emb_files = sorted(EMB_DIR.glob("sbert_*.npy"))
_emb_file = None
if _emb_model:
    _emb_slug = re.sub(r"[^A-Za-z0-9._-]+", "_", str(_emb_model))
    _emb_wanted = EMB_DIR / f"sbert_{_emb_slug}.npy"
    if _emb_wanted.exists():
        _emb_file = _emb_wanted
if _emb_file is None and _emb_files:
    _emb_file = _emb_files[0]
if _emb_file is None:
    raise FileNotFoundError(f"No sbert_*.npy embedding cache found in {EMB_DIR}; run the embedding cell first.")
emb = np.load(_emb_file)  # (N, d)

# Every row_id must index into emb, otherwise the coherence lookups below would
# raise IndexError far from the real cause (a stale or mismatched cache).
if len(keys_rows) and int(keys_rows["row_id"].max()) >= emb.shape[0]:
    raise ValueError(
        f"embedding_rows.parquet references row {int(keys_rows['row_id'].max())} "
        f"but {_emb_file.name} has only {emb.shape[0]} rows."
    )

# Build (article,version,win) -> embedding row_id
row_map = {(str(a),int(v),int(w)): int(rid) for a,v,w,rid in keys_rows[["article_id","version_id","win_id","row_id"]].itertuples(index=False)}

def _topic_entropy(counts):
    total = counts.sum()
    if total <= 0:
        return np.nan
    p = counts[counts>0] / total
    return float(-(p * np.log(p)).sum())

def _coherence_of_topic(indices):
    if len(indices) < 2:
        return np.nan
    X = emb[indices]  # normalized
    c = X.mean(axis=0); n = np.linalg.norm(c); c = c/n if n>0 else c
    sims = X @ c
    return float(np.mean(sims))

# ---- Per-version metrics (add noise_rate) ----
# One "doc"-level row per (article_id, version_id): topic count, entropy with and
# without the noise bucket, coherence summary, and share of noise windows.
ver_rows = []
for (art, vid), g in assign.groupby(["article_id","version_id"], sort=False):
    total_wins = int(g.shape[0])
    noise_ct = int((g["topic_id"] == -1).sum())
    noise_rate = float(noise_ct / total_wins) if total_wins > 0 else np.nan

    g_non = g[g["topic_id"]!=-1]
    n_topics = int(g_non["topic_id"].nunique()) if not g_non.empty else 0
    counts = g_non["topic_id"].value_counts().sort_index().to_numpy() if not g_non.empty else np.array([0])
    ent_excl = _topic_entropy(counts) if n_topics>0 else np.nan
    # Noise is treated as one extra bucket for the "incl_noise" variant.
    counts_incl = np.append(counts, noise_ct)
    ent_incl = _topic_entropy(counts_incl) if counts_incl.sum()>0 else np.nan

    # Coherence per topic; topics with fewer than two windows return NaN and are
    # dropped here, so the summaries below never run on an all-NaN list (which
    # made np.nanmean / np.nanmedian emit RuntimeWarnings).
    coh_vals = []
    for tid, gt in g_non.groupby("topic_id"):
        keys = [(str(art), int(vid), int(w)) for w in gt["win_id"].tolist()]
        idx = [row_map[k] for k in keys if k in row_map]
        if idx:
            coh = _coherence_of_topic(idx)
            if not np.isnan(coh):
                coh_vals.append(coh)
    coh_mean = float(np.mean(coh_vals)) if coh_vals else np.nan
    coh_median = float(np.median(coh_vals)) if coh_vals else np.nan

    ver_rows.append({
        "level": "doc",
        "article_id": str(art), "version_id": int(vid),
        "n_topics_doc": int(n_topics),
        "topic_entropy": ent_excl,
        "topic_entropy_incl_noise": ent_incl,
        "coherence_mean": coh_mean,
        "coherence_median": coh_median,
        "noise_rate": noise_rate,  # NEW
    })

df_ver = pd.DataFrame(ver_rows).sort_values(["article_id","version_id"]).reset_index(drop=True)

# ---- Adjacent-pair churn & overlap (align by win_id) ----
# One "pair"-level row per article and consecutive version pair (v1 → v2).
# Column order of the pair table; also gives an empty result a usable schema.
PAIR_COLUMNS = ["level", "article_id", "from_version", "to_version",
                "reassignment_rate", "topic_overlap_jaccard", "births", "deaths",
                "delta_n_topics", "delta_entropy", "delta_coherence_mean", "regularization_cue"]

pair_rows = []
for art, gA in assign.groupby("article_id", sort=False):
    gA = gA.sort_values(["version_id","win_id"])
    versions = sorted(gA["version_id"].unique().tolist())
    for i in range(len(versions)-1):
        v1, v2 = int(versions[i]), int(versions[i+1])
        A = gA[gA["version_id"]==v1]
        B = gA[gA["version_id"]==v2]

        # Reassignment: share of windows present in both versions (same win_id,
        # both non-noise) whose topic id changed.
        merged = pd.merge(A[["win_id","topic_id"]], B[["win_id","topic_id"]],
                          on="win_id", suffixes=("_a","_b"))
        valid = merged[(merged["topic_id_a"]!=-1) & (merged["topic_id_b"]!=-1)]
        if valid.shape[0] == 0:
            reassignment = np.nan  # FIX: no valid aligned non-noise windows → NaN, not 0
        else:
            reassignment = float((valid["topic_id_a"] != valid["topic_id_b"]).sum() / valid.shape[0])

        ta = sorted([t for t in A["topic_id"].unique() if t!=-1])
        tb = sorted([t for t in B["topic_id"].unique() if t!=-1])
        if len(ta)==0 or len(tb)==0:
            jaccard = np.nan; births = int(len(tb)); deaths = int(len(ta))
        else:
            # Window-id set of every topic, built once per version instead of
            # once per (topic, topic) cell of the overlap matrix.
            wins_a = {t: set(A.loc[A["topic_id"]==t, "win_id"].tolist()) for t in ta}
            wins_b = {u: set(B.loc[B["topic_id"]==u, "win_id"].tolist()) for u in tb}

            mat = np.zeros((len(ta), len(tb)), dtype=np.int64)
            for i_t, t in enumerate(ta):
                for j_t, u in enumerate(tb):
                    mat[i_t, j_t] = len(wins_a[t] & wins_b[u])

            # One-to-one topic matching that maximizes total shared windows.
            # NOTE(review): the assignment always pairs min(len(ta), len(tb)) topics,
            # including pairs with zero shared windows, so births/deaths only count
            # the surplus topics on either side — confirm this is the intended definition.
            ri, cj = linear_sum_assignment(-mat)
            inter = int(mat[ri, cj].sum())
            union = sum(len(wins_a[ta[i_t]] | wins_b[tb[j_t]]) for i_t, j_t in zip(ri, cj))
            jaccard = float(inter/union) if union>0 else np.nan
            births = int(len(set(tb) - set(tb[j] for j in cj)))
            deaths = int(len(set(ta) - set(ta[k] for k in ri)))

        # Deltas of the per-version ("doc") metrics computed earlier in df_ver.
        va = df_ver[(df_ver.article_id==str(art)) & (df_ver.version_id==v1)].iloc[0]
        vb = df_ver[(df_ver.article_id==str(art)) & (df_ver.version_id==v2)].iloc[0]
        dn_topics = int((vb["n_topics_doc"] or 0) - (va["n_topics_doc"] or 0))
        d_entropy = float((vb["topic_entropy"] - va["topic_entropy"])) if pd.notna(va["topic_entropy"]) and pd.notna(vb["topic_entropy"]) else np.nan
        d_coh = float((vb["coherence_mean"] - va["coherence_mean"])) if pd.notna(va["coherence_mean"]) and pd.notna(vb["coherence_mean"]) else np.nan

        # Flag: fewer topics AND lower entropy AND higher coherence (all three must be known).
        regularization_cue = (dn_topics < 0) and ( (not pd.isna(d_entropy)) and (d_entropy < 0) ) and ( (not pd.isna(d_coh)) and (d_coh > 0) )

        pair_rows.append({
            "level": "pair",
            "article_id": str(art),
            "from_version": v1, "to_version": v2,
            "reassignment_rate": reassignment,
            "topic_overlap_jaccard": jaccard,
            "births": births, "deaths": deaths,
            "delta_n_topics": dn_topics,
            "delta_entropy": d_entropy,
            "delta_coherence_mean": d_coh,
            "regularization_cue": bool(regularization_cue),
        })

# Passing the column list keeps sort_values working when no article has two
# versions (an empty list of dicts would otherwise produce a column-less frame
# and raise KeyError on the sort).
df_pair = (pd.DataFrame(pair_rows, columns=PAIR_COLUMNS)
           .sort_values(["article_id","from_version"])
           .reset_index(drop=True))

# ---- Write metrics ----
# Doc-level and pair-level rows share one file; columns that exist for only one
# level are left missing on the other (sort=False keeps column order as given).
out = pd.concat([df_ver, df_pair], ignore_index=True, sort=False)
out.to_parquet(TOP_OUT / "topic_metrics.parquet", index=False)

# Update metadata (add summary of noise_rate)
meta_path = TOP_OUT / "metadata.json"
meta = json.loads(meta_path.read_text()) if meta_path.exists() else {}
# Best-effort summary: any failure computing the mean is recorded as None.
try:
    mean_noise = float(df_ver["noise_rate"].dropna().mean()) if not df_ver.empty else None
except Exception:
    mean_noise = None
meta.update({
    "metrics_written": True,
    "articles": int(assign["article_id"].nunique()),
    "windows": int(assign.shape[0]),
    "mean_noise_rate": mean_noise,
})
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

print({
    "doc_rows": int(df_ver.shape[0]),
    "pair_rows": int(df_pair.shape[0]),
    "mean_noise_rate": mean_noise,
    "path": str(TOP_OUT / "topic_metrics.parquet")
})


In [None]:
# 6.3-safety — ensure metrics fields exist on disk (idempotent)
import pandas as pd, json, pathlib
# NOTE(review): this path is relative to the current working directory, whereas
# cell 6.3 writes under LSA_BASE_DIR — the two agree only when LSA_BASE_DIR is
# unset or equals the cwd. Confirm before relying on this cell.
p = pathlib.Path("outputs/bertopic/topic_metrics.parquet")
if p.exists():
    df = pd.read_parquet(p)
    changed = False
    # Back-fill noise_rate for files that carry "Topic" (and optionally "Count")
    # columns. The 6.3 cell above writes neither column and already writes
    # noise_rate, so for its output this branch does not fire.
    if "Topic" in df.columns and "noise_rate" not in df.columns:
        total = int(df["Count"].sum()) if "Count" in df.columns else None
        noise_mask = (df["Topic"] == -1).astype("int")
        df["noise_rate"] = (noise_mask if total is None else noise_mask / max(total, 1))
        changed = True
    # Back-fill reassignment_rate from event counts when those columns are present;
    # the denominator is forced to at least 1 to avoid division by zero.
    if {"reassignment_events","unique_docs"}.issubset(df.columns):
        if "reassignment_denominator" not in df.columns or "reassignment_rate" not in df.columns:
            denom = df["unique_docs"].where(df["unique_docs"] > 0, 1)
            df["reassignment_denominator"] = denom
            df["reassignment_rate"] = df["reassignment_events"] / denom
            changed = True
    # Rewrite the file only if something was added, so re-running is a no-op.
    if changed:
        df.to_parquet(p, index=False)
        print("6.3-safety: metrics augmented and saved.")
    else:
        print("6.3-safety: metrics already complete; no change.")
else:
    print("6.3-safety: topic_metrics.parquet not found (skip).")


In [None]:
# 6.4 — BERTopic: visuals — topic timeline & coherence (robust)
import os, re, json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# inline backend
# Best effort: outside IPython get_ipython is undefined and the NameError is swallowed.
try:
    ip = get_ipython()
    if ip: ip.run_line_magic("matplotlib", "inline")
except Exception:
    pass

BASE_DIR = Path(os.environ.get("LSA_BASE_DIR", Path.cwd())).resolve()
TOP_OUT  = (BASE_DIR / "outputs" / "bertopic").resolve()
PLOT_DIR = TOP_OUT / "plots"
PLOT_DIR.mkdir(parents=True, exist_ok=True)

assign_path  = TOP_OUT / "topics.parquet"
metrics_path = TOP_OUT / "topic_metrics.parquet"

# Assignments are mandatory; metrics are optional and degrade to an empty frame.
if not assign_path.exists():
    raise FileNotFoundError(f"Missing topics assignments at {assign_path}")
assign = pd.read_parquet(assign_path)

# Load metrics if present, and keep ONLY per-version rows (version_id not null)
# (pair-level rows written by 6.3 carry from_version/to_version instead of version_id).
if metrics_path.exists():
    raw_metrics = pd.read_parquet(metrics_path)
    if "version_id" in raw_metrics.columns:
        mver = raw_metrics.loc[raw_metrics["version_id"].notna()].copy()
        # coerce to int safely
        mver["version_id"] = mver["version_id"].astype(int)
        # ensure required cols exist
        for col in ["coherence_mean","coherence_median","n_topics_doc","topic_entropy","topic_entropy_incl_noise"]:
            if col not in mver.columns: mver[col] = np.nan
    else:
        mver = pd.DataFrame(columns=["article_id","version_id","coherence_mean","n_topics_doc",
                                     "topic_entropy","topic_entropy_incl_noise"])
else:
    mver = pd.DataFrame(columns=["article_id","version_id","coherence_mean","n_topics_doc",
                                 "topic_entropy","topic_entropy_incl_noise"])

# If no n_topics_doc provided, derive a minimal proxy from assignments (excl. noise)
if mver.empty or "n_topics_doc" not in mver.columns or mver["n_topics_doc"].isna().all():
    nt = (assign[assign["topic_id"]!=-1]
          .groupby(["article_id","version_id"])["topic_id"].nunique()
          .rename("n_topics_doc").reset_index())
    # Outer merge keeps versions that appear in only one of the two sources; the
    # derived column gets the "_derived" suffix when n_topics_doc already exists.
    mver = pd.merge(mver, nt, on=["article_id","version_id"], how="outer", suffixes=("","_derived"))
    if "n_topics_doc_derived" in mver:
        mver["n_topics_doc"] = mver["n_topics_doc"].fillna(mver["n_topics_doc_derived"])
        mver.drop(columns=["n_topics_doc_derived"], inplace=True)

# helpers
def safe_slug(s: str) -> str:
    """Turn an arbitrary value into a file-name-safe slug of at most 120 characters.

    Surrounding whitespace is trimmed, each space becomes "-", and every run of
    characters outside ``[A-Za-z0-9._-]`` collapses to a single "_".
    """
    dashed = "-".join(str(s).strip().split(" "))
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", dashed)
    return cleaned[:120]

# Seeded generator used only for the scatter jitter in the per-article timelines.
SEED = int(os.environ.get("LSA_SEED", "42"))
rng = np.random.default_rng(SEED)
# Per-article plots are capped: only the first MAX_ARTS article ids (sorted) are drawn.
MAX_ARTS = int(os.environ.get("LSA_MAX_ARTICLES", "10"))
slugs = sorted(assign["article_id"].unique().tolist())[:MAX_ARTS]
# Manifest of every file written by this cell, grouped by plot family.
plots_index = {"topic_timeline": [], "topic_coherence": [], "globals": []}

# ---- Global histogram of per-version topic counts (excl. noise)
tmp = (assign[assign["topic_id"]!=-1]
       .groupby(["article_id","version_id"])["topic_id"].nunique())
vals = tmp.to_numpy()
fig, ax = plt.subplots(figsize=(7,4))
if vals.size > 0:
    # Bin count follows sqrt(n), kept within [5, 20].
    ax.hist(vals, bins=min(20, max(5, int(np.sqrt(vals.size)))), edgecolor="black")
    ax.set_title("Distribution of per-version topic counts (excl. noise)")
    ax.set_xlabel("# unique topics"); ax.set_ylabel("Frequency")
else:
    # Nothing to bin: still emit a placeholder image so the manifest entry exists.
    ax.text(0.5,0.5,"No non-noise topics", ha="center", va="center", fontsize=12)
    ax.axis("off")
plt.tight_layout()
out = PLOT_DIR / "topic_counts_hist.png"
plt.savefig(out, dpi=150); plt.show(); plt.close()
plots_index["globals"].append(str(out))

# ---- Per-article plots
# For each selected article: (1) a scatter of windows by version and topic id,
# (2) a bar chart of mean coherence per version with the topic count overlaid.
for art in slugs:
    g  = assign.loc[assign["article_id"]==art].sort_values(["version_id","win_id"])
    mv = mver.loc[mver["article_id"]==art].dropna(subset=["version_id"]).sort_values("version_id")

    # 1) Topic timeline (scatter of windows; color by topic)
    if not g.empty:
        fig, ax = plt.subplots(figsize=(8, 4))
        # Named art_topic_ids (not `topics`) so the window-level `topics` list
        # produced by the fitting cell is not overwritten in the notebook namespace.
        art_topic_ids = sorted(g["topic_id"].unique().tolist())
        cmap = plt.get_cmap("tab20")
        # Deterministic mapping: grey for noise, otherwise the tab20 color at the
        # topic's position in the sorted id list (wrapping after 20 topics).
        colors = {tid: ((0.75,0.75,0.75,0.6) if tid==-1 else cmap(i % 20))
                  for i, tid in enumerate(art_topic_ids)}

        y = g["topic_id"].to_numpy(dtype=float)
        yj = y + rng.normal(0, 0.04, size=y.shape)  # jitter
        ax.scatter(g["version_id"].to_numpy(int), yj,
                   c=[colors[int(t)] for t in g["topic_id"]],
                   s=18, alpha=0.85, edgecolors="none")
        ax.set_xlabel("Version (v1..v4)")
        ax.set_ylabel("Topic ID (-1=noise)")
        ax.set_title(f"Topic timeline — {art}")
        ax.set_xticks(sorted(g["version_id"].unique().tolist()))
        ax.grid(True, axis="y", alpha=0.2, linestyle=":")
        plt.tight_layout()
        out1 = PLOT_DIR / f"topic_timeline_{safe_slug(art)}.png"
        # Close this specific figure so figures do not accumulate across the loop.
        plt.savefig(out1, dpi=150); plt.show(); plt.close(fig)
        plots_index["topic_timeline"].append(str(out1))

    # 2) Coherence bars per version (with #topics overlay)
    if not mv.empty:
        # ensure unique per version (drop duplicates if any)
        mv = mv.drop_duplicates(subset=["article_id","version_id"])
        x = mv["version_id"].astype(int).to_numpy()
        y = mv["coherence_mean"].astype(float).to_numpy() if "coherence_mean" in mv else np.full_like(x, np.nan, dtype=float)
        nt = mv["n_topics_doc"].astype(float).to_numpy() if "n_topics_doc" in mv else np.full_like(x, np.nan, dtype=float)

        fig, ax1 = plt.subplots(figsize=(7,4))
        # bars for coherence (NaNs will simply not render bars)
        ax1.bar(x, y, width=0.6, alpha=0.85)
        ax1.set_ylim(0, 1.0)
        ax1.set_ylabel("Coherence (mean)")
        ax1.set_xlabel("Version")
        xticks = np.unique(x)
        ax1.set_xticks(xticks)
        ax1.set_title(f"Topic coherence — {art}")

        # overlay #topics if available
        if np.isfinite(nt).any():
            ax2 = ax1.twinx()
            ax2.plot(x, nt, marker="o", linewidth=1.5, alpha=0.8)
            ymax = max(1, int(np.nanmax(nt)) + 1)
            ax2.set_ylim(0, ymax)
            ax2.set_ylabel("# topics")

        plt.tight_layout()
        out2 = PLOT_DIR / f"topic_coherence_{safe_slug(art)}.png"
        plt.savefig(out2, dpi=150); plt.show(); plt.close(fig)
        plots_index["topic_coherence"].append(str(out2))

# manifest
(PLOT_DIR / "plots_index.json").write_text(json.dumps(plots_index, indent=2), encoding="utf-8")
print("✓ 6.4 visuals complete. Wrote:", json.dumps(plots_index, indent=2))


In [None]:
# Pipeline stop marker read by cell 6.Z: a value <= 6 makes 6.Z raise SystemExit
# after bundling; a larger value lets execution continue into the 7.x cells.
RUN_TO = 7
print({"RUN_TO": RUN_TO})


In [None]:
# cell 6.Z: bundle Modules 1–6 artifacts safely and stop (unless RUN_TO > 6)
import os, json, time, hashlib, zipfile, sys
from pathlib import Path

# Fallback if RUN_TO was not set by an earlier cell: read it from the RUN_TO
# environment variable, defaulting to 6 (stop after bundling).
try:
    RUN_TO
except NameError:
    RUN_TO = int(os.environ.get("RUN_TO", "6"))

# Paths are relative to the current working directory.
OUTPUTS = Path("outputs")
BUNDLES = OUTPUTS / "bundles"
BUNDLES.mkdir(parents=True, exist_ok=True)

# Only bundle these module folders under outputs/
module_dirs = ["lexical", "nltk", "spacy", "transformers", "semantic", "bertopic"]

# Timestamped name so repeated runs never overwrite an earlier bundle.
stamp = time.strftime("%Y%m%d_%H%M%S")
zip_path = BUNDLES / f"module6_everything_{stamp}.zip"

def _sha256(p: Path, chunk: int = 65536) -> str:
    h = hashlib.sha256()
    with open(p, "rb") as f:
        for b in iter(lambda: f.read(chunk), b""):
            h.update(b)
    return h.hexdigest()

# Manifest mirrors the archive: one record (relative path, sha256, size) per file.
manifest = {"zip": str(zip_path), "files": []}

with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as z:
    for d in module_dirs:
        root = OUTPUTS / d
        # Module folders that were never produced are silently skipped.
        if root.exists() and root.is_dir():
            for p in root.rglob("*"):
                if p.is_file():
                    # Archive names are relative to outputs/, e.g. "bertopic/topics.parquet".
                    arc = p.relative_to(OUTPUTS)
                    z.write(p, arc)
                    manifest["files"].append({
                        "path": str(arc),
                        "sha256": _sha256(p),
                        "bytes": p.stat().st_size
                    })

# Write a manifest alongside the zip for quick inspection
(BUNDLES / f"manifest_{stamp}.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print(f"Bundled {len(manifest['files'])} files → {zip_path}")

# SystemExit(0) is used as a deliberate "stop here" for a run-all execution.
if RUN_TO <= 6:
    raise SystemExit(0)  # default: stop after bundling
else:
    print(f"6.Z: RUN_TO={RUN_TO} → skipping stop; continuing to 7.x")


In [None]:
# cell 7.0: pre-7.x cleanup (safe: old zips & pip caches only)
from pathlib import Path
import os

# subprocess/sys are needed to run pip for the *kernel's* interpreter and to see
# its exit status; imported here so the cell is self-contained.
import subprocess
import sys

# Keep only the newest bundle: names carry a sortable timestamp, so the last
# element after sorting is the most recent one.
bundles = sorted((Path("outputs") / "bundles").glob("module6_everything_*.zip"))
for z in bundles[:-1]:
    try:
        z.unlink(); print("Deleted old bundle:", z)
    except Exception as e:
        print("Skip delete:", z, e)

# Purge pip's cache via `python -m pip` and report the real outcome. The previous
# os.system call never raised, so "purged" was printed even when pip failed.
try:
    rc = subprocess.call([sys.executable, "-m", "pip", "cache", "purge", "-q"])
except OSError as e:
    print("Pip cache purge skipped:", e)
else:
    if rc == 0:
        print("Pip cache purged.")
    else:
        print(f"Pip cache purge skipped: pip exited with code {rc}")

print("Pre-7.x cleanup finished (kept current outputs).")


In [None]:
# cell 7.1 — rapidfuzz: install (CPU-friendly)
# ------------------------------------------------
# Installs only what's needed here; quiet, wheels-first.
import sys, subprocess, pkgutil, json, os, random
import importlib.util

def need(pname: str) -> bool:
    """Report whether the importable module ``pname`` is missing from this environment."""
    spec = importlib.util.find_spec(pname)
    return spec is None

# Install only the packages that cannot be imported in this kernel.
to_install = []
if need("rapidfuzz"):            to_install.append("rapidfuzz==3.*")
if need("nltk"):                 to_install.append("nltk>=3.8")
if need("pandas"):               to_install.append("pandas>=2.0")
if need("numpy"):                to_install.append("numpy>=1.23")
if need("matplotlib"):           to_install.append("matplotlib>=3.7")
if need("pyarrow"):              to_install.append("pyarrow>=14")

if to_install:
    print("Installing:", to_install)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *to_install])

# Ensure NLTK punkt is available (Module 2 should've fetched it, but be safe)
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Create output folders
for p in [
    "outputs/rapidfuzz",
    "outputs/rapidfuzz/plots",
    "outputs/rapidfuzz/bundles",
]:
    os.makedirs(p, exist_ok=True)

# Determinism knobs
# Actually seed the global generators; previously SEED was only recorded in the
# metadata and nothing was seeded. numpy's legacy seed must fit in 32 bits.
import numpy as np
SEED = int(os.environ.get("LSA_SEED", "7"))
random.seed(SEED)
np.random.seed(SEED % (2**32))

# Metadata stub
# Timestamp keeps the historical "<naive ISO time>Z" format but is produced from an
# aware UTC clock, because datetime.utcnow() is deprecated as of Python 3.12.
import datetime as _dt
meta_path = "outputs/rapidfuzz/metadata.json"
stub = {
    "module": 7,
    "seed": SEED,
    "created": _dt.datetime.now(_dt.timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "version_order_source": "filename_prefix",
}
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(stub, f, indent=2)
print("✓ 7.1 ready → metadata stub at", meta_path)


In [None]:
# cell 7.2 — rapidfuzz: paraphrase entropy & repetition (crash-safe, blocked, thread-capped)
# -----------------------------------------------------------------------------------------
import os

import re, json, math, gc
from collections import Counter
from dataclasses import dataclass
import numpy as np
import pandas as pd
from rapidfuzz import distance, fuzz, process
import nltk

# ---------------- Config ----------------
# All knobs are read from the environment so a run can be tuned without editing code.
SIM_METRIC = os.environ.get("LSA_RAPIDFUZZ_SIM", "levenshtein_normalized")  # or "token_set_ratio"
SAFE_MODE = int(os.environ.get("LSA_RF_SAFE_MODE", "1"))  # 1=skip sentence-level table; 0=store it
BLOCK = int(os.environ.get("LSA_RF_BLOCK", "64"))         # blocked similarity batch
TAU = float(os.environ.get("LSA_RAPIDFUZZ_TAU", "0.15"))
INTRA_WINDOW_M = int(os.environ.get("LSA_RAPIDFUZZ_INTRA_M", "12"))
CANDIDATE_POLICY = os.environ.get("LSA_RAPIDFUZZ_CANDIDATE", "same_win")  # or "same_win±1"

# THETA_*_RAW are thresholds on the scorer's own scale (SIM_SCALE); to_unit maps
# raw scores onto 0..1. A threshold given on the other scale is converted in BOTH
# branches: previously only the Levenshtein branch did so, so e.g. 0.90 with
# token_set_ratio was compared against scores in 0..100.
# Any SIM_METRIC other than "token_set_ratio" selects normalized Levenshtein.
if SIM_METRIC == "token_set_ratio":
    SIM_SCORER = fuzz.token_set_ratio      # 0..100
    SIM_SCALE  = "0..100"
    ts = float(os.environ.get("LSA_RAPIDFUZZ_THETA_SAME",  "90"))
    tc = float(os.environ.get("LSA_RAPIDFUZZ_THETA_CROSS", "95"))
    THETA_SAME_RAW  = ts*100.0 if ts <= 1.0 else ts
    THETA_CROSS_RAW = tc*100.0 if tc <= 1.0 else tc
    to_unit = lambda x: np.asarray(x, dtype=np.float64) / 100.0
else:
    SIM_SCORER = distance.Levenshtein.normalized_similarity  # 0..1
    SIM_SCALE  = "0..1"
    ts = float(os.environ.get("LSA_RAPIDFUZZ_THETA_SAME",  "0.90"))
    tc = float(os.environ.get("LSA_RAPIDFUZZ_THETA_CROSS", "0.95"))
    THETA_SAME_RAW  = ts/100.0 if ts > 1.0 else ts
    THETA_CROSS_RAW = tc/100.0 if tc > 1.0 else tc
    to_unit = lambda x: np.asarray(x, dtype=np.float64)

# ---------------- Inputs ----------------
WIN_PATH = "outputs/nltk/fw_burstiness_windows.parquet"
if not os.path.exists(WIN_PATH):
    raise FileNotFoundError(f"Missing {WIN_PATH}. Run Module 2 first.")
win = pd.read_parquet(WIN_PATH).sort_values(["article_id","version_id","win_id"]).reset_index(drop=True)

# texts: prefer in-memory DOCS; else load from inputs/
def _sanitize_slug(s):
    """Lowercase ``s`` and collapse every run of unsafe characters to a single "-"."""
    return re.sub(r"[^A-Za-z0-9._-]+", "-", str(s).strip().lower()).strip("-")

def _read_text(fp):
    """Read a file as UTF-8, replacing undecodable bytes instead of raising."""
    with open(fp,"rb") as fh: return fh.read().decode("utf-8", errors="replace")

# (article_id, version_id) → {"text": raw text, "text_clean": cleaned text or None}
TEXTS = {}
if "DOCS" in globals() and isinstance(globals()["DOCS"], (list, tuple)):
    for d in DOCS:
        art = str(d.get("article_id")); vid = int(d.get("version_id"))
        TEXTS[(art,vid)] = {"text": d.get("text","") or "", "text_clean": d.get("text_clean") if isinstance(d.get("text_clean"), str) else None}
if not TEXTS:
    # Fallback: scan the known input folders for files named "NN-<slug>.txt|md";
    # NN is the version id and the sanitized slug is the article id.
    for root in ["inputs","data","content/inputs","/content/inputs"]:
        if os.path.isdir(root):
            for dirpath,_,files in os.walk(root):
                for fn in files:
                    m = re.match(r"^(\d{2})-(.+?)\.(txt|md)$", fn)
                    if not m: continue
                    vid = int(m.group(1)); art = _sanitize_slug(m.group(2))
                    TEXTS[(art,vid)] = {"text": _read_text(os.path.join(dirpath,fn)), "text_clean": None}

# Every (article, version) that has windows must have text. The list is named
# missing_texts (not `need`) so it does not rebind the `need()` helper function
# defined by the 7.1 install cell.
missing_texts = [(str(a),int(v)) for a,v in win[["article_id","version_id"]].drop_duplicates().itertuples(index=False) if (str(a),int(v)) not in TEXTS]
if missing_texts:
    raise RuntimeError(f"Missing raw text for: {missing_texts[:4]} … (ensure inputs/01-<slug>.txt .. 04-<slug>.txt or rerun earlier modules)")

# sentence segmentation (NLTK Punkt) on the SAME basis as M2
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# One sentence of a document version: its index, character span, and normalized text.
@dataclass
class SentRec:
    sid: int    # position of the sentence in the tokenizer's span list
    start: int  # start offset of the sentence span in the source text
    end: int    # end offset of the sentence span in the source text
    text: str   # lowercased, trimmed

# Sentence cache filled lazily by ensure_sentences: (art, vid, basis) -> list[SentRec]
SENTS = {}

def get_text(art, vid, basis):
    """Return the text of one version on the requested basis.

    The cleaned text is returned only when ``basis`` is "text_clean" and a
    non-empty cleaned text exists; in every other case the raw text is used.
    Raises KeyError if (art, vid) is not in TEXTS.
    """
    entry = TEXTS[(art, vid)]
    if basis == "text_clean":
        cleaned = entry.get("text_clean")
        if cleaned:
            return entry["text_clean"]
    return entry["text"]

def ensure_sentences(art, vid, basis):
    """Populate SENTS[(art, vid, basis)] once with the sentence records of that text.

    Sentences come from ``tokenizer.span_tokenize``; each record keeps the span
    offsets and the stripped, lowercased sentence text. A key that is already
    cached is left untouched.
    """
    cache_key = (art, vid, basis)
    if cache_key not in SENTS:
        body = get_text(art, vid, basis)
        records = []
        for idx, (lo, hi) in enumerate(list(tokenizer.span_tokenize(body))):
            records.append(SentRec(idx, lo, hi, body[lo:hi].strip().lower()))
        SENTS[cache_key] = records

def window_sentences(row):
    """Return the sentence records covered by one window row.

    ``row`` must expose article_id, version_id, span_basis, sent_start_index
    and sent_end_index (inclusive). A non-string span_basis is treated as
    "text". Indices are clamped to the available sentences; an inverted range
    or a version without sentences yields [].
    """
    art = str(row.article_id)
    vid = int(row.version_id)
    basis = row.span_basis if isinstance(row.span_basis, str) else "text"
    ensure_sentences(art, vid, basis)
    recs = SENTS[(art, vid, basis)]

    first = int(row.sent_start_index)
    last = int(row.sent_end_index)
    if last < first or not recs:
        return []

    first = max(0, first)
    last = min(last, len(recs) - 1)
    return recs[first:last + 1]

def candidate_texts_same_win(art, vid, wid, basis):
    """Collect sentence texts from the matching window(s) of every other version.

    For each version of ``art`` other than ``vid``, the window with the same
    win_id is used when CANDIDATE_POLICY is "same_win"; any other policy also
    includes the windows at win_id - 1 and win_id + 1. Windows that do not
    exist are skipped. Results follow ascending version order, then window
    order.
    """
    if CANDIDATE_POLICY == "same_win":
        offsets = (0,)
    else:
        offsets = (-1, 0, 1)

    other_versions = sorted(win.loc[win.article_id == art, "version_id"].unique().astype(int))
    collected = []
    for other in other_versions:
        if other == vid:
            continue
        for off in offsets:
            target = wid + off
            hit = win[(win.article_id == art) & (win.version_id == other) & (win.win_id == target)].head(1)
            if hit.empty:
                continue
            ensure_sentences(art, other, basis)
            for row in hit.itertuples(index=False):
                collected.extend([rec.text for rec in window_sentences(row)])
    return collected

# blocked similarity helpers (keeps memory flat)
def blocked_row_sims(q_text, cand_texts, block=BLOCK):
    """Return all similarity scores (raw scale) for a single query vs candidate list, computed in small blocks.

    Args:
        q_text: the query string.
        cand_texts: list of candidate strings; an empty list yields an empty array.
        block: number of candidates scored per call. Values below 1 are treated
            as 1 (a zero or negative block, e.g. from LSA_RF_BLOCK=0, previously
            never advanced and looped forever).

    Returns:
        1-D float64 array with one score per candidate, in candidate order.
    """
    if not cand_texts:
        return np.empty((0,), dtype=np.float64)
    step = max(1, int(block))
    pieces = []
    for start in range(0, len(cand_texts), step):
        chunk = cand_texts[start:start + step]
        mat = process.cdist([q_text], chunk, scorer=SIM_SCORER, workers=1)
        pieces.append(mat.ravel().astype(np.float64))
    return np.concatenate(pieces, axis=0)

def intra_version_max_blocked(texts, M):
    """Max similarity within ±M neighbors per sentence, computed in blocks."""
    n = len(texts)
    out = np.zeros((n,), dtype=np.float32)
    for i in range(n):
        lo, hi = max(0, i-M), min(n, i+M+1)
        cands = texts[lo:i] + texts[i+1:hi]
        if not cands:
            continue
        q = re.sub(r"\s+"," ", texts[i]).strip()
        cands_norm = [re.sub(r"\s+"," ", t).strip() for t in cands]
        if q in set(cands_norm):
            out[i] = 100.0 if SIM_SCALE=="0..100" else 1.0
            continue
        smax = 0.0
        j = 0
        while j < len(cands):
            chunk = cands[j:j+BLOCK]
            mat = process.cdist([texts[i]], chunk, scorer=SIM_SCORER, workers=1)
            smax = max(smax, float(np.max(mat)))
            j += BLOCK
        out[i] = smax
    return out

def softmax_stable(x, tau):
    """Temperature-scaled softmax over the last axis, shifted by the row max for stability.

    `tau` is floored at 1e-6. An empty input is returned unchanged.
    """
    if x.size == 0:
        return x
    temperature = max(tau, 1e-6)
    scaled = (x / temperature).astype(np.float64)
    shifted = scaled - scaled.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    total = weights.sum(axis=-1, keepdims=True)
    # guard the division: a zero row sum is replaced by 1
    total = np.where(total == 0.0, 1.0, total)
    return weights / total

# --- main ---
# For every (article, version, span_basis) group of `win`: score each window's sentences
# against candidate sentences drawn from the other versions, append one row per window to
# rows_win, and append one summary row per version to rows_ver.
rows_win, rows_ver = [], []
if not SAFE_MODE:
    # per-sentence entropy rows are only collected outside SAFE_MODE
    sent_entropy_rows = []

# running count of windows that had at least one candidate sentence
windows_with_cands = 0

for (art, vid, basis), g in win.groupby(["article_id","version_id","span_basis"], dropna=False):
    # normalise the group key; a non-string basis (e.g. a NaN key kept by dropna=False) becomes "text"
    art = str(art); vid = int(vid); basis = basis if isinstance(basis,str) else "text"
    # presumably populates SENTS[(art, vid, basis)] — helper is defined outside this section
    ensure_sentences(art, vid, basis)
    ver_sents = SENTS[(art,vid,basis)]
    ver_texts = [s.text for s in ver_sents]

    # intra-version repetition (blocked): one max-neighbour similarity per sentence of this version
    intra_max = intra_version_max_blocked(ver_texts, INTRA_WINDOW_M)

    # per-sentence entropies / median similarities pooled over all windows of this version
    H_accum, med_accum = [], []

    for r in g.sort_values("win_id").itertuples(index=False):
        sents = window_sentences(r)
        n_s = len(sents)
        sent_texts = [s.text for s in sents]
        cand_texts = candidate_texts_same_win(art, vid, int(r.win_id), basis)
        n_c = len(cand_texts)
        if n_c > 0: windows_with_cands += 1

        if n_s == 0:
            # window without sentences: emit a placeholder row (NaN metrics, zero counts) and move on.
            # Note that n_candidates_win is written as 0 here regardless of n_c.
            rows_win.append({
                "article_id": art, "version_id": vid, "version_tag": f"v{vid}", "doc_id": f"{art}__v{vid}",
                "win_id": int(r.win_id), "win_label": r.win_label, "span_basis": basis,
                "char_start": int(r.char_start), "char_end": int(r.char_end),
                "sent_start_index": int(r.sent_start_index), "sent_end_index": int(r.sent_end_index),
                "is_partial_tail": bool(getattr(r, "is_partial_tail", False)),
                "n_sentences_win": 0, "n_candidates_win": 0,
                "paraphrase_entropy_win": np.nan, "median_crossver_sim_win": np.nan, "max_crossver_sim_win": np.nan,
                "repeat_within_rate_win": np.nan, "boilerplate_cross_rate_win": np.nan,
                "repeat_run_max_len_win": 0, "repeat_exact_count_win": 0, "repeat_fuzzy_count_win": 0,
            })
            continue

        # one entry per sentence of this window
        H_vals, med_sims, max_sims, boiler_flags = [], [], [], []

        for i, q in enumerate(sent_texts):
            # raw similarity of this sentence to every candidate (empty array when there are none)
            row_raw = blocked_row_sims(q, cand_texts, BLOCK) if n_c else np.empty((0,), dtype=np.float64)

            if row_raw.size >= 2:
                # entropy of the softmax over candidates, divided by log(#candidates);
                # to_unit presumably rescales raw scores to 0..1 — defined elsewhere, TODO confirm
                sims01 = to_unit(row_raw)
                P = softmax_stable(sims01, TAU)
                denom = math.log(max(row_raw.size, 2))
                H = float(-(P * np.log(np.maximum(P, 1e-12))).sum() / denom)
            else:
                # fewer than two candidates: entropy is left undefined
                H = np.nan
            H_vals.append(H)

            if row_raw.size > 0:
                med_sims.append(float(np.median(row_raw)))
                max_sims.append(float(np.max(row_raw)))
                # flagged when any candidate reaches the cross-version threshold (raw scale)
                boiler_flags.append(bool(np.any(row_raw >= THETA_CROSS_RAW)))
            else:
                med_sims.append(np.nan)
                max_sims.append(np.nan)
                boiler_flags.append(False)

            if not SAFE_MODE:
                if 'sent_entropy_rows' in globals():
                    sent_entropy_rows.append({"article_id": art, "version_id": vid, "win_id": int(r.win_id), "sid": i, "H_s": H})

        # repetition inside this window (use precomputed doc-wide intra_max slice)
        # s0..s1 are used as inclusive positions into intra_max (hence the s1+1)
        s0, s1 = int(r.sent_start_index), int(r.sent_end_index)
        intra_slice = intra_max[s0:s1+1] if s1 >= s0 else np.array([], dtype=np.float32)
        repeat_within = (intra_slice >= THETA_SAME_RAW) if intra_slice.size else np.array([], dtype=bool)

        # exact = sentences whose whitespace-normalised text occurs more than once in this window;
        # fuzzy = flagged sentences beyond the exact ones, floored at 0
        norm = [re.sub(r"\s+"," ", t).strip() for t in sent_texts]
        counts = Counter(norm)
        exact = sum(1 for t in norm if counts[t] > 1)
        fuzzy = max(int(np.sum(repeat_within)) - exact, 0)

        # longest consecutive repeat run (window)
        run_max = 0; cur = 0
        for flag in repeat_within:
            cur = (cur + 1) if flag else 0
            run_max = max(run_max, cur)

        rows_win.append({
            "article_id": art, "version_id": vid, "version_tag": f"v{vid}", "doc_id": f"{art}__v{vid}",
            "win_id": int(r.win_id), "win_label": r.win_label, "span_basis": basis,
            "char_start": int(r.char_start), "char_end": int(r.char_end),
            "sent_start_index": int(r.sent_start_index), "sent_end_index": int(r.sent_end_index),
            "is_partial_tail": bool(getattr(r, "is_partial_tail", False)),
            "n_sentences_win": int(n_s), "n_candidates_win": int(n_c),
            "paraphrase_entropy_win": float(np.nanmean(H_vals)) if H_vals else np.nan,
            "median_crossver_sim_win": float(np.nanmedian(med_sims)) if med_sims else np.nan,
            "max_crossver_sim_win": float(np.nanmax(max_sims)) if max_sims else np.nan,
            "repeat_within_rate_win": (float(np.mean(repeat_within)) if repeat_within.size else np.nan),
            "boilerplate_cross_rate_win": (float(np.mean(boiler_flags)) if boiler_flags else np.nan),
            "repeat_run_max_len_win": int(run_max),
            "repeat_exact_count_win": int(exact),
            "repeat_fuzzy_count_win": int(fuzzy),
        })

        # keep only defined values for the version-level aggregates
        H_accum.extend([h for h in H_vals if not np.isnan(h)])
        med_accum.extend([m for m in med_sims if not np.isnan(m)])

    # version-level aggregates (robust run-length computation; avoids empty max())
    flags = (intra_max >= THETA_SAME_RAW) if intra_max.size else np.array([], dtype=bool)
    if flags.any():
        # pad with False on both sides so every True run has a detectable start and end edge
        b = np.r_[False, flags, False]
        starts = np.flatnonzero(b[1:] & ~b[:-1])
        ends   = np.flatnonzero(~b[1:] & b[:-1])
        run_max_ver = int((ends - starts).max()) if starts.size else 0
    else:
        run_max_ver = 0

    rows_ver.append({
        "article_id": art, "version_id": vid, "version_tag": f"v{vid}", "doc_id": f"{art}__v{vid}",
        "paraphrase_entropy_ver": float(np.nanmedian(H_accum)) if H_accum else np.nan,
        "median_crossver_sim_ver": float(np.nanmedian(med_accum)) if med_accum else np.nan,
        # 90th percentile of the per-sentence median similarities
        "p90_crossver_sim_ver": float(np.nanpercentile(med_accum,90)) if med_accum else np.nan,
        "repeat_within_rate_ver": float(np.mean(flags)) if flags.size else np.nan,
        # placeholder: not computed in this loop
        "boilerplate_cross_rate_ver": np.nan,
        "repeat_run_max_len_ver": run_max_ver
    })

    # housekeeping: drop the per-version buffers before the next group
    del ver_sents, ver_texts, intra_max
    gc.collect()

# ---------------- DataFrames & dtypes ----------------
df_win, df_ver = pd.DataFrame(rows_win), pd.DataFrame(rows_ver)
if not SAFE_MODE:
    df_sentH = pd.DataFrame(sent_entropy_rows)

# Force the count/index columns of each table to plain int64 (via nullable Int64).
for frame, int_cols in (
    (df_win, ["version_id","win_id","char_start","char_end","sent_start_index","sent_end_index",
              "n_sentences_win","n_candidates_win","repeat_run_max_len_win",
              "repeat_exact_count_win","repeat_fuzzy_count_win"]),
    (df_ver, ["version_id","repeat_run_max_len_ver"]),
):
    for c in int_cols:
        if c not in frame.columns:
            continue
        frame[c] = pd.to_numeric(frame[c], errors="coerce").astype("Int64").astype("int64")

# ---------------- Save ----------------
os.makedirs("outputs/rapidfuzz", exist_ok=True)
out_win = "outputs/rapidfuzz/paraphrase_entropy.parquet"
out_ver = "outputs/rapidfuzz/paraphrase_entropy_doc.parquet"
# window table first, then the version-level table
for table, target in ((df_win, out_win), (df_ver, out_ver)):
    table.to_parquet(target, index=False)

# the sentence-level table only exists outside SAFE_MODE
extras = []
if not SAFE_MODE:
    out_sent = "outputs/rapidfuzz/paraphrase_entropy_sentences.parquet"
    df_sentH.to_parquet(out_sent, index=False)
    extras.append(out_sent)

# ---------------- Metadata ----------------
meta_path = "outputs/rapidfuzz/metadata.json"

# Start from whatever metadata is already on disk; an unreadable file is treated as empty.
meta = {}
if os.path.exists(meta_path):
    try:
        with open(meta_path, "r") as f:
            meta = json.load(f)
    except Exception:
        meta = {}

windows_total = int(len(df_win))
versions_per_article = df_win.groupby("article_id")["version_id"].nunique()

# Settings and coverage of this run; existing keys with the same name are overwritten.
run_summary = {
    "version_order_source": "filename_prefix",
    "sim_metric": SIM_METRIC,
    "sim_scale": SIM_SCALE,
    "theta_same_effective": THETA_SAME_RAW,
    "theta_cross_effective": THETA_CROSS_RAW,
    "tau_softmax": TAU,
    "candidate_policy": CANDIDATE_POLICY,
    "intra_window_M": INTRA_WINDOW_M,
    "blocked_batch": BLOCK,
    "safe_mode": bool(SAFE_MODE),
    "articles": int(df_win["article_id"].nunique()),
    "versions_per_article_min": int(versions_per_article.min()),
    "versions_per_article_max": int(versions_per_article.max()),
    "windows_total": windows_total,
    "windows_with_candidates": int((df_win["n_candidates_win"]>0).sum()),
}
meta.update(run_summary)
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

# Short run report for the cell output.
report = {
    "cell_id": "7.2",
    "rows_win": len(df_win),
    "rows_ver": len(df_ver),
    "saved": [out_win, out_ver] + extras,
    "sim_metric": SIM_METRIC, "sim_scale": SIM_SCALE,
    "theta_same_effective": THETA_SAME_RAW, "theta_cross_effective": THETA_CROSS_RAW,
    "blocked_batch": BLOCK, "safe_mode": bool(SAFE_MODE)
}
print(json.dumps(report, indent=2))


In [None]:
# cell 7.2b — rapidfuzz: finalize doc-level rates & metadata (thresholds + boilerplate_ver)

import os, json, time
import numpy as np
import pandas as pd

# Artifact paths under outputs/rapidfuzz: window-level table, version-level ("doc") table, run metadata.
WIN_FP = "outputs/rapidfuzz/paraphrase_entropy.parquet"
DOC_FP = "outputs/rapidfuzz/paraphrase_entropy_doc.parquet"
META_FP = "outputs/rapidfuzz/metadata.json"

# The window table is the only hard requirement of this cell.
assert os.path.exists(WIN_FP), f"Missing {WIN_FP}. Run 7.2 first."

# Load the window table into a private copy.
dfw = pd.read_parquet(WIN_FP).copy()

# ----- (A) Recompute version-level repetition rates from window table -----
def _wmean(values: pd.Series, weights: pd.Series) -> float:
    v = values.astype(float)
    w = weights.astype(float)
    m = np.isfinite(v) & np.isfinite(w) & (w > 0)
    if not m.any():
        return float("nan")
    return float(np.sum(v[m] * w[m]) / np.sum(w[m]))

grp_keys = ["article_id","version_id","version_tag","doc_id"]
# Without a sentence count every window weighs the same.
if "n_sentences_win" not in dfw:
    dfw["n_sentences_win"] = 1

# One record per (article, version): sentence-weighted means of the window-level rates.
# Missing rates are treated as 0.0 and missing sentence counts as 1 before weighting.
agg_rows = []
for (art_key, ver_key, tag_key, doc_key), part in dfw.groupby(grp_keys, sort=True):
    w = part["n_sentences_win"].fillna(1)
    record = {
        "article_id": art_key,
        "version_id": int(ver_key),
        "version_tag": tag_key,
        "doc_id": doc_key,
        "repeat_within_rate_ver": _wmean(part["repeat_within_rate_win"].fillna(0.0), w),
        "boilerplate_cross_rate_ver": _wmean(part["boilerplate_cross_rate_win"].fillna(0.0), w),
        "sentences_total_ver": int(np.sum(w)),
    }
    agg_rows.append(record)
agg = pd.DataFrame(agg_rows)

# Fold the recomputed rates into the version-level table, or create that table if it is missing.
if not os.path.exists(DOC_FP):
    # No doc table on disk: seed one from the window aggregates and add the
    # remaining version-level columns as NaN so the schema matches downstream joins.
    dfd = agg.copy()
    for col in ["paraphrase_entropy_ver","median_crossver_sim_ver","p90_crossver_sim_ver","repeat_run_max_len_ver"]:
        if col not in dfd.columns:
            dfd[col] = np.nan
else:
    # Drop the columns that are being recomputed, then left-join the fresh values on the version keys.
    dfd = pd.read_parquet(DOC_FP).copy()
    dfd = dfd.drop(
        columns=["repeat_within_rate_ver","boilerplate_cross_rate_ver","sentences_total_ver"],
        errors="ignore",
    )
    dfd = dfd.merge(agg, on=grp_keys, how="left")

# Persist
os.makedirs(os.path.dirname(DOC_FP), exist_ok=True)
dfd.to_parquet(DOC_FP, index=False)

# ----- (B) Patch metadata with explicit thresholds & coverage -----
# Load the existing metadata best-effort: a missing, unreadable or malformed file
# (or a JSON document whose root is not an object) is treated as empty.
meta = {}
if os.path.exists(META_FP):
    try:
        with open(META_FP, "r") as f:
            meta = json.load(f)
    except Exception:
        meta = {}
if not isinstance(meta, dict):
    meta = {}

sim_scale = str(meta.get("sim_scale") or "0..1").strip()

# Threshold resolution order, per key:
#   1) an explicit theta_* value already present in the metadata is kept;
#   2) otherwise the theta_*_effective value recorded by cell 7.2 (the threshold that
#      was actually applied on the raw scale) is used;
#   3) only if neither exists, fall back to a default on the recorded scale.
for key, default_unit, default_pct in (("theta_same", 0.90, 90.0), ("theta_cross", 0.95, 95.0)):
    if meta.get(key) in (None, "", "null"):
        resolved = meta.get(f"{key}_effective")
        if resolved in (None, "", "null"):
            resolved = default_unit if sim_scale == "0..1" else default_pct
        meta[key] = resolved

meta["repeat_thresholds_applied"] = True
meta["candidate_policy"] = meta.get("candidate_policy") or "same_win"

# Coverage counts, recomputed from the window table loaded by this cell
meta["windows_total"] = int(len(dfw))
meta["windows_with_candidates"] = int((dfw["n_candidates_win"].fillna(0) > 0).sum())
meta["windows_no_candidates"] = int(meta["windows_total"] - meta["windows_with_candidates"])
meta["updated_at"] = int(time.time())

with open(META_FP, "w") as f:
    json.dump(meta, f, indent=2)

print({
    "cell_id": "7.2b",
    "doc_rows": len(dfd),
    "windows_total": meta["windows_total"],
    "windows_with_candidates": meta["windows_with_candidates"],
    "theta_same": meta["theta_same"],
    "theta_cross": meta["theta_cross"],
    "sim_scale": sim_scale,
})


In [None]:
# ===== cell 7.X1 — disk auditor: find big dirs/files quickly =====
import os, sys, math, time
from pathlib import Path

def hsize(n):
    """Format a byte count with binary (1024-based) units, e.g. 1536 -> '1.5 KB'."""
    units = ("B", "KB", "MB", "GB", "TB")
    step = 0
    while step < len(units):
        if n < 1024:
            return f"{n:,.1f} {units[step]}"
        n = n / 1024
        step += 1
    # anything that survived five divisions is reported in petabytes
    return f"{n:.1f} PB"

# Directories whose total size is reported: project outputs plus per-user caches/data.
roots = [
    Path.cwd().joinpath("outputs"),
    Path.home().joinpath(".cache", "huggingface"),
    Path.home().joinpath(".cache", "pip"),
    Path.home().joinpath("nltk_data"),
]

def dir_size(p: Path) -> int:
    """Sum the sizes (bytes) of all files below `p`; 0 when `p` does not exist.

    Directory symlinks are not followed, and files that cannot be stat'ed are skipped.
    """
    if not p.exists():
        return 0
    acc = 0
    for folder, _subdirs, names in os.walk(p, followlinks=False):
        folder_path = Path(folder)
        for name in names:
            try:
                acc += (folder_path / name).stat().st_size
            except Exception:
                # best-effort: unreadable or vanished entries simply do not count
                pass
    return acc

print("Scanning…")
# Size every root, then list them largest first.
totals = sorted(((r, dir_size(r)) for r in roots), key=lambda pair: pair[1], reverse=True)
for r, sz in totals:
    print(f"{str(r):<60} {hsize(sz)}")

# Top 25 biggest files under your project tree
big_files = []
for folder, _, names in os.walk(Path.cwd(), followlinks=False):
    for name in names:
        candidate = Path(folder) / name
        try:
            size = candidate.stat().st_size
        except Exception:
            # skip entries that cannot be stat'ed
            continue
        else:
            big_files.append((size, candidate))
big_files.sort(reverse=True)
print("\nTop 25 largest files under project:")
for s, p in big_files[:25]:
    print(f"{hsize(s):>10}  {p}")


In [None]:
# cell 7.3 — rapidfuzz: visuals — entropy & repetition (fixed: single render per plot)

import os, json, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

PLOTS_DIR = "outputs/rapidfuzz/plots"
os.makedirs(PLOTS_DIR, exist_ok=True)

# ---- Inputs: the window-level table is mandatory, the version-level table is optional ----
win_fp = "outputs/rapidfuzz/paraphrase_entropy.parquet"
doc_fp = "outputs/rapidfuzz/paraphrase_entropy_doc.parquet"

assert os.path.exists(win_fp), f"Missing {win_fp}. Run 7.2 first."
dfw = pd.read_parquet(win_fp).copy()
dfw["version_id"] = dfw["version_id"].astype(int)

if os.path.exists(doc_fp):
    dfd = pd.read_parquet(doc_fp).copy()
else:
    dfd = None

# Helper: expand a window mean to sentence-level weights to approximate a ridgeline
def expand_entropy_rows(g: pd.DataFrame) -> np.ndarray:
    """Repeat each window's entropy once per sentence in that window.

    Sentence counts come from `n_sentences_win` (missing -> 1, values below 1 -> 1).
    When the column is absent or entirely missing, every window counts once.
    """
    has_counts = "n_sentences_win" in g and g["n_sentences_win"].notna().any()
    if has_counts:
        repeats = g["n_sentences_win"].fillna(1).clip(lower=1).astype(int).to_numpy()
    else:
        repeats = np.ones(len(g), dtype=int)
    entropies = g["paraphrase_entropy_win"].astype(float).to_numpy()
    return np.repeat(entropies, repeats)

plots_index = {"entropy_ridge": [], "repetition_heatmaps": []}

# =========================
# 1) Global entropy ridgeline by version (single figure)
# =========================
fig, ax = plt.subplots(figsize=(9, 5.5))
versions = sorted(dfw["version_id"].unique())
offset = 1.2  # vertical spacing between ridges

for i, vid in enumerate(versions, start=1):
    vals = expand_entropy_rows(dfw[dfw["version_id"] == vid])
    vals = vals[np.isfinite(vals)]
    if vals.size == 0:
        continue
    y = i * offset
    med = np.median(vals)
    # Normalised 30-bin histogram on [0, 1], clipped at its 99th percentile, drawn as a filled ridge
    hist, bins = np.histogram(vals, bins=30, range=(0.0, 1.0), density=True)
    hist = np.minimum(hist, np.percentile(hist, 99))
    ridge_top = y + hist / hist.max() * 0.9
    ax.fill_between(bins[:-1], y, ridge_top, alpha=0.45)
    # tick mark at the median, just above the ridge
    ax.plot([med], [y + 0.95], marker="|", markersize=14, lw=0,
            label=f"v{vid} median={med:.2f}")

ax.set_title("Entropy ridge by version")
ax.set_xlabel("Paraphrase entropy (sentence-level, approximated)")
ax.set_xlim(0, 1)
ax.set_yticks([(k + 1) * offset + 0.95 for k in range(len(versions))])
ax.set_yticklabels([f"v{vid}" for vid in versions])
ax.grid(alpha=0.2, axis="x")
ax.legend(loc="lower left", ncols=min(len(versions), 4), fontsize=9, frameon=False)

ridge_path = os.path.join(PLOTS_DIR, "entropy_ridge.png")
plt.tight_layout()
plt.savefig(ridge_path, dpi=150)
plt.show()
plt.close(fig)
plots_index["entropy_ridge"].append(ridge_path)

# =========================
# 2) Repetition heatmap(s) — one per article (capped to 10)
# =========================
slugs = sorted(dfw["article_id"].unique())[:10]
for slug in slugs:
    g = dfw[dfw["article_id"] == slug].copy()
    # One row per version: each window's repeat rate is repeated once per sentence
    # of that window, giving a coarse sentence-position axis.
    rows = []
    for vid, gv in g.groupby("version_id", sort=True):
        if "n_sentences_win" in gv:
            counts = gv["n_sentences_win"].fillna(1).clip(lower=1).astype(int).to_numpy()
        else:
            counts = np.ones(len(gv), dtype=int)
        vals = gv["repeat_within_rate_win"].fillna(0.0).to_numpy()
        rows.append(np.repeat(vals.astype(float), counts))

    # Right-pad shorter versions with NaN so all rows share one width
    max_len = max(map(len, rows), default=0)
    if max_len == 0:
        # Nothing to plot; skip gracefully
        continue
    M = np.full((len(rows), max_len), np.nan, dtype=float)
    for row_idx, seq in enumerate(rows):
        M[row_idx, :len(seq)] = seq

    fig, ax = plt.subplots(figsize=(10, 3.6))
    # padded (NaN) cells are drawn as 0.0
    im = ax.imshow(np.nan_to_num(M, nan=0.0), aspect="auto", vmin=0.0, vmax=1.0, cmap="viridis")
    ax.set_title(f"Repetition heatmap — {slug}")
    ax.set_xlabel("Sentence index (approx.)")
    ax.set_ylabel("Version (v1..v4)")
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels([f"v{v}" for v in sorted(g["version_id"].unique())])
    cbar = fig.colorbar(im, ax=ax)
    cbar.set_label("repeat_within_rate (0..1)")

    heat_path = os.path.join(PLOTS_DIR, f"repetition_heatmap_{slug}.png")
    plt.tight_layout()
    plt.savefig(heat_path, dpi=150)
    plt.show()
    plt.close(fig)
    plots_index["repetition_heatmaps"].append(heat_path)

# Manifest of every plot written by this cell
manifest_fp = os.path.join(PLOTS_DIR, "plots_index.json")
with open(manifest_fp, "w") as f:
    json.dump(plots_index, f, indent=2)

print(json.dumps({"cell_id": "7.3", "plots_index": plots_index}, indent=2))


In [None]:
# cell 7.Z — rapidfuzz: bundle artifacts for download

import os, json, time, shutil, glob

import zipfile  # selective archiving: shutil.make_archive cannot exclude the bundles directory

OUT_DIR = "outputs/rapidfuzz"
BUNDLES = os.path.join(OUT_DIR, "bundles")
os.makedirs(BUNDLES, exist_ok=True)

ts = time.strftime("%Y%m%d-%H%M%S")
zip_base = os.path.join(BUNDLES, f"module7_artifacts_{ts}")
zip_path = zip_base + ".zip"

# Collect every file under OUT_DIR except the bundles directory itself. The archive is
# written inside OUT_DIR/bundles, so archiving that directory would embed all earlier
# bundles (and the half-written archive) in each new bundle.
bundles_abs = os.path.abspath(BUNDLES)
files = []
for base, dirs, names in os.walk(OUT_DIR):
    # prune in place so os.walk never descends into the bundles directory
    dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(base, d)) != bundles_abs]
    files.extend(os.path.join(base, fn) for fn in names)
files.sort()

with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in files:
        # store paths relative to OUT_DIR
        z.write(p, os.path.relpath(p, OUT_DIR))

# quick manifest: lists exactly the files that went into the bundle
manifest = {
    "bundle": os.path.basename(zip_path),
    "created_at": ts,
    "file_count": len(files),
    "files": [os.path.relpath(p, OUT_DIR) for p in files]
}
with open(os.path.join(OUT_DIR, "bundle_index.json"), "w") as f:
    json.dump(manifest, f, indent=2)

print({"cell_id":"7.Z","zip": zip_path, "file_count": len(files)})


In [None]:
# cell N.1 — imports & params (no installs here)
import os, json, math, time
import numpy as np, pandas as pd
# import any heavy libs you need (already installed in 0.0)
# e.g., from rapidfuzz import process, fuzz

# module identity and where its artifacts are written
MOD = "module-<N>"
OUT_BASE = "/".join(["outputs", MOD])
PLOTS = "/".join([OUT_BASE, "plots"])
os.makedirs(PLOTS, exist_ok=True)

# module-level config: seed numpy's global RNG from LSA_SEED (defaults to 42)
np.random.seed(int(os.getenv("LSA_SEED", "42")))

print(f"{MOD} ready → outputs under: {OUT_BASE}")

# ------------------------------------------------------------------------

# cell N.2 — compute (write Parquet/JSON to OUT_BASE)
# ... your module logic ...
# pd.DataFrame(...).to_parquet(f"{OUT_BASE}/<table>.parquet", index=False)

# ------------------------------------------------------------------------

# cell N.3 — visuals (write PNGs to PLOTS)
# ... matplotlib-only plotting that writes files, then plt.close('all')

# ------------------------------------------------------------------------

# (optional) cell N.Z — bundle just this module
# Keep ONE global bundler at the very end if you prefer; otherwise:
"""
import zipfile, time, os
ts = time.strftime("%Y%m%d-%H%M%S")
zip_fp = f"{OUT_BASE}/{MOD}_bundle_{ts}.zip"
with zipfile.ZipFile(zip_fp, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for base, _, files in os.walk(OUT_BASE):
        for fn in files:
            if fn.endswith(".zip"):  # avoid nesting prior bundles
                continue
            p = os.path.join(base, fn)
            z.write(p, os.path.relpath(p, OUT_BASE))
print("Created:", zip_fp)
"""


In [None]:
# cell Z.ALL — BUNDLE ENTIRE outputs/ AND STOP KERNEL
import os, zipfile, time, sys
from pathlib import Path

ROOT = Path("outputs")
ROOT.mkdir(parents=True, exist_ok=True)
ts = time.strftime("%Y%m%d-%H%M%S")
zip_fp = ROOT / f"bundle_all_{ts}.zip"

# Archive everything under outputs/, stored relative to it. Files named
# bundle_all_*.zip (earlier bundles and the one being written) are left out.
with zipfile.ZipFile(zip_fp, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for base, _, files in os.walk(ROOT):
        base_dir = Path(base)
        wanted = [fn for fn in files
                  if not (fn.startswith("bundle_all_") and fn.endswith(".zip"))]
        for fn in wanted:
            p = base_dir / fn
            z.write(p, p.relative_to(ROOT))

print(f"✓ Created: {zip_fp}")

# Colab-friendly download (best-effort)
# Any failure here (including the import failing outside Colab) only prints the
# manual-download hint; it never aborts the cell.
try:
    from google.colab import files  # type: ignore
    files.download(str(zip_fp))
except Exception:
    print("Download manually from the file browser if no prompt appeared.")

# clean shutdown to prevent accidental extra execution
# Ask the IPython kernel to shut down without restarting; if that fails for any
# reason (IPython missing, no kernel object, ...), exit the interpreter instead.
try:
    import IPython
    IPython.get_ipython().kernel.do_shutdown(restart=False)
except Exception:
    sys.exit(0)


In [None]:
# cell X.Y: Cleanup /content directory (OPTIONAL, MANUAL RUN ONLY)
import os
import re
from pathlib import Path

def cleanup_content_dir(target_dir: Path = Path("/content")):
    """
    Interactively remove every entry directly inside ``target_dir`` except files
    whose names match ``0[1-4]-*.md``.

    The entries to delete and to keep are listed first, and nothing is removed
    unless the user types ``yes`` (case-insensitive, surrounding whitespace ignored)
    at the prompt.

    Handling by entry type:
      * files matching the keep pattern are kept;
      * symbolic links are unlinked, never followed, so their targets are untouched
        (this includes dangling links and links to directories);
      * real directories are removed recursively, even if their name matches the pattern;
      * every other entry is unlinked.

    Args:
        target_dir: directory to clean. Defaults to ``/content``.

    Returns:
        None. Progress and per-entry errors are printed; a failure to delete one
        entry does not stop the remaining deletions.
    """
    import shutil  # local import: the enclosing cell only imports os, re and pathlib

    if not target_dir.is_dir():
        print(f"Error: Directory not found: {target_dir}")
        return

    # Define the pattern for files to keep
    keep_pattern = re.compile(r"0[1-4]-.*\.md$")

    items_to_keep = []
    items_to_delete = []
    # sorted() makes the listing (and deletion order) deterministic
    for item in sorted(target_dir.iterdir()):
        if item.is_file() and keep_pattern.match(item.name):
            items_to_keep.append(item.name)
        else:
            # directories are never kept; neither are dangling symlinks or special files
            items_to_delete.append(item.name)

    if not items_to_delete:
        print(f"No files or directories to delete in {target_dir} (excluding files matching 0[1-4]-*.md).")
        return

    print(f"Found {len(items_to_delete)} items to delete in {target_dir}:")
    for name in items_to_delete:
        print(f"- {name}")

    print("\nFiles to keep:")
    for name in items_to_keep:
        print(f"- {name}")

    confirmation = input("\nAre you sure you want to delete these items? Type 'yes' to confirm: ")

    if confirmation.strip().lower() != 'yes':
        print("Cleanup cancelled.")
        return

    print("Deleting items...")
    for name in items_to_delete:
        item_path = target_dir / name
        try:
            # Check for symlinks first: shutil.rmtree refuses to operate on a symlink,
            # and a dangling link is neither a file nor a directory.
            if item_path.is_symlink() or not item_path.is_dir():
                item_path.unlink()
                print(f"Deleted file: {name}")
            else:
                shutil.rmtree(item_path)
                print(f"Deleted directory: {name}")
        except Exception as e:
            # best-effort: report and continue with the remaining entries
            print(f"Error deleting {name}: {e}")
    print("Cleanup complete.")

# To run this cleanup, execute this cell manually. It will prompt for confirmation.
cleanup_content_dir()  # active call: runs whenever this cell executes; comment it out to disable the cleanup