In [None]:
# cell 0.0: foundations: helpers (no installs, no heavy imports)
from __future__ import annotations

import os, sys, json, random, re, unicodedata, datetime, textwrap, math, subprocess
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Tuple, Dict, Any, Optional

def _pip(*args: str) -> None:
    """Run ``<current interpreter> -m pip <args...>`` and wait for it to finish.

    Uses ``subprocess.check_call``, so a non-zero pip exit status surfaces as
    ``subprocess.CalledProcessError``.
    """
    command = [sys.executable, "-m", "pip"]
    command.extend(args)
    subprocess.check_call(command)

def try_import(name: str, import_name: Optional[str] = None):
    """Import a module by name, returning ``None`` instead of raising.

    Args:
        name: Module name to import when ``import_name`` is not given.
        import_name: Optional importable name that takes precedence over
            ``name`` (useful when the pip name and the module name differ).

    Returns:
        The imported module object, or ``None`` if importing it raised.
    """
    import importlib

    try:
        # import_module returns the named module itself, including for dotted
        # names ("pkg.sub" -> pkg.sub); __import__ would hand back top-level "pkg".
        return importlib.import_module(import_name or name)
    except Exception:
        # Deliberately broad: a broken install can raise more than ImportError,
        # and callers only need to know "usable or not".
        return None

def ensure_packaging():
    """Install the ``packaging`` distribution with pip if it cannot be imported."""
    try:
        # Probe both submodules; the imported names themselves are not used here.
        from packaging.specifiers import SpecifierSet  # noqa: F401
        from packaging.version import Version  # noqa: F401
    except Exception:
        pip_args = ("install", "-q", "--no-cache-dir", "packaging")
        _pip(*pip_args)

def versions_dict(pkgs=("numpy","pandas","matplotlib","scipy","sklearn")) -> Dict[str, Optional[str]]:
    """Map each module name in ``pkgs`` to its ``__version__``.

    A module that cannot be imported (``try_import`` returns ``None``) or that
    has no ``__version__`` attribute maps to ``None``.
    """
    found: Dict[str, Optional[str]] = {}
    for module_name in pkgs:
        module = try_import(module_name)
        if module:
            found[module_name] = getattr(module, "__version__", None)
        else:
            found[module_name] = None
    return found

# cell-level self-report
_helpers_report = {"cell_id": "0.0 foundations: helpers", "status": "ready"}
print(json.dumps(_helpers_report, indent=2))


In [None]:
# cell 0.1: foundations: installs (only-if-missing)
# A package is pip-installed only when it cannot be imported. A package that is
# already importable is left untouched, even if its version is outside `specs`.
import importlib

_CELL_0_1_ID = "0.1 foundations: installs (only-if-missing)"

try:
    installed = {}
    # Install order: numpy first, then the packages that build on it.
    order = ["numpy", "scipy", "scikit-learn", "pandas", "matplotlib"]
    specs = {
        "numpy": ">=2.0,<3",
        "pandas": "==2.2.2",
        "matplotlib": ">=3.8,<3.9",
        "scipy": ">=1.13,<2.0",
        "scikit-learn": ">=1.6,<1.7",
    }
    for pip_name in order:
        # scikit-learn is the only package whose import name differs from its pip name.
        import_name = "sklearn" if pip_name == "scikit-learn" else pip_name
        if try_import(import_name) is None:
            _pip("install", "-q", "--no-cache-dir", "--only-binary=:all:", f"{pip_name}{specs[pip_name]}")
            # Path finders cache directory listings; refresh them so the package
            # installed a moment ago is importable in this same kernel
            # (versions_dict() below imports it).
            importlib.invalidate_caches()
            installed[pip_name] = specs[pip_name]
    print(json.dumps({
        "cell_id": _CELL_0_1_ID,
        "status":"pass","installed_if_missing": installed,"versions": versions_dict()
    }, indent=2))
except Exception as e:
    # Report the failure in the same JSON shape, then re-raise so the run stops here.
    print(json.dumps({"cell_id": _CELL_0_1_ID,
                      "status":"fail","error":f"{type(e).__name__}: {e}"} , indent=2))
    raise


In [None]:
# cell 0.2: foundations: imports, determinism, version capture (CPU-only)

# stdlib (import locally so this cell is idempotent)
import os, sys, json, random

# Cap threads BEFORE importing numeric libs (repro & predictable perf).
# LSA_THREADS overrides the default of 1. The same value is written to every
# backend-specific variable, replacing whatever was set before.
threads = os.environ.get("LSA_THREADS", "1")
for _v in ("OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","NUMEXPR_NUM_THREADS","OMP_NUM_THREADS"):
    os.environ[_v] = threads

# heavy stack (single imports cell)
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy
import sklearn

# Seed the stdlib and NumPy global RNGs; LSA_SEED overrides the default.
SEED = int(os.environ.get("LSA_SEED","12345"))
os.environ["PYTHONHASHSEED"] = str(SEED)  # note: only affects new processes' hash order
random.seed(SEED); np.random.seed(SEED)

# Optional strict hash guard (off by default).
# When enabled, every Python-level call to hash() raises, including calls made by
# library code, so only switch it on to hunt for hash-order dependence.
if os.environ.get("LSA_HASH_GUARD","0") == "1":
    import builtins
    # Install at most once: re-running this cell must not overwrite _orig_hash
    # with the guard itself, or the real hash() can no longer be restored.
    if not getattr(builtins.hash, "_lsa_hash_guard", False):
        _orig_hash = builtins.hash
        def _det_hash_guard(x):
            raise RuntimeError("Avoid builtins.hash() for logic; nondeterministic across runs.")
        _det_hash_guard._lsa_hash_guard = True
        builtins.hash = _det_hash_guard

# Plot defaults (matplotlib per your preference)
plt.rcParams.update({"figure.figsize": (8,3.5), "axes.grid": True, "grid.alpha": 0.25, "figure.dpi": 110})

# Snapshot of interpreter and library versions, echoed in the self-report below.
VERSIONS = {
    "python": sys.version.split()[0],
    "numpy": np.__version__,
    "pandas": pd.__version__,
    "matplotlib": matplotlib.__version__,
    "scipy": scipy.__version__,
    "scikit_learn": sklearn.__version__,
}
print(json.dumps({"cell_id":"0.2 foundations: imports+determinism","status":"pass","seed":SEED,"threads":threads,"versions":VERSIONS}, indent=2))


In [None]:
# cell 0.3: foundations: configuration & directories (portable)
# Root directory for the notebook's folders: LSA_BASE_DIR if set, else "/content".
BASE_DIR = Path(os.getenv("LSA_BASE_DIR", "/content"))

@dataclass
class LSAPaths:
    """Directory layout used by the notebook, rooted at ``BASE_DIR``.

    The defaults are computed once, when this class is defined, so changing
    ``BASE_DIR`` afterwards does not move them; pass explicit paths instead.
    """
    # Input folder; presumably holds the documents to analyse (TODO confirm against the loader).
    input_dir: Path = BASE_DIR / "input_docs"
    # Output folder; dump_status_json writes module_status.json here by default.
    out_dir: Path = BASE_DIR / "lsa_outputs"
    # Models folder; NOTE(review): no reader or writer is visible in this section.
    models_dir: Path = BASE_DIR / "models"

@dataclass
class LSAConfig:
    """Tunable analysis parameters with their default values.

    NOTE(review): none of these fields is read in the visible part of the
    notebook; the per-field notes below are assumptions drawn from the names
    and should be confirmed against the code that consumes ``cfg``.
    """
    # Language code; presumably the language of the input documents.
    language: str = "en"
    # Presumably the `win` / `stride` arguments for window_sentences (TODO confirm).
    window_sentences: int = 4
    window_stride: int = 2
    # Fractions in [0, 1]; presumably percentile cut-offs for change labels (TODO confirm).
    percentile_minor: float = 0.60
    percentile_unchanged: float = 0.85
    # Presumably lower bounds below which content counts as "new" (TODO confirm).
    new_floor: float = 0.20
    new_floor_tfidf: float = 0.15

# Shared instances used by later cells.
paths = LSAPaths()
cfg = LSAConfig()

# Create every working directory up front; existing directories are left as-is.
for p in (paths.input_dir, paths.out_dir, paths.models_dir):
    p.mkdir(parents=True, exist_ok=True)

# cell-level self-report
_dirs_report = {
    "input": str(paths.input_dir),
    "out": str(paths.out_dir),
    "models": str(paths.models_dir),
}
_config_report = {
    "cell_id": "0.3 foundations: config+dirs",
    "status": "pass",
    "base_dir": str(BASE_DIR),
    "dirs": _dirs_report,
}
print(json.dumps(_config_report, indent=2))


In [None]:
# cell 0.4: foundations: status logging & self-report
# Registry of the latest status per module name; report_status overwrites entries.
MODULE_STATUS: Dict[str, Dict[str, Any]] = {}

def report_status(module: str, ok: bool, note: str = "", extra: Optional[Dict[str,Any]] = None):
    """Record a module's status in ``MODULE_STATUS`` and print it as one JSON line.

    Args:
        module: Registry key; a later call with the same key replaces the entry.
        ok: Success flag; coerced with ``bool()`` for both the entry and the printout.
        note: Free-form human-readable message.
        extra: Optional additional details; stored as given (``{}`` when omitted).

    The stored entry also carries ``ts``, a naive-UTC ISO-8601 timestamp with a
    ``"Z"`` suffix. The printed line contains module/ok/note/extra only.
    """
    # utcnow() is deprecated since Python 3.12; take an aware UTC "now" and drop
    # the tzinfo so the string keeps the historical "...Z" shape (no "+00:00").
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    entry = {
        "ok": bool(ok),
        "note": note,
        "extra": extra or {},
        "ts": now_utc.isoformat() + "Z",
    }
    MODULE_STATUS[module] = entry
    # Print what was stored. default=str keeps reporting from crashing on values
    # json cannot encode natively (Path objects, numpy scalars, ...).
    print(json.dumps(
        {"module": module, "ok": entry["ok"], "note": note, "extra": entry["extra"]},
        default=str,
    ))

def dump_status_json(out_path: Path = paths.out_dir / "module_status.json"):
    """Write ``MODULE_STATUS`` to ``out_path`` as indented UTF-8 JSON and print a receipt.

    Args:
        out_path: Destination file. The default is evaluated once, when this
            function is defined, from ``paths.out_dir`` as it was at that time.
    """
    out_path = Path(out_path)  # also accept a plain string path
    # The target folder may have been removed since setup, or be a custom location.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # default=str: "extra" payloads are free-form and may hold values json
    # cannot encode natively (Path objects, numpy scalars, ...).
    out_path.write_text(json.dumps(MODULE_STATUS, indent=2, default=str), encoding="utf-8")
    print(json.dumps({"wrote": str(out_path), "entries": len(MODULE_STATUS)}))

# cell-level self-report (per rule #9)
_status_utils_report = {"cell_id": "0.4 foundations: status utilities", "status": "pass"}
print(json.dumps(_status_utils_report))


In [None]:
# cell 0.5: foundations: offset-preserving normalization (fixed CRLF + maps)
from typing import NamedTuple

class NormResult(NamedTuple):
    """Normalized text plus index maps between original and normalized offsets."""
    # Normalized string.
    text: str
    # norm_to_orig[i] is the index in the original string that produced text[i].
    norm_to_orig: List[int]
    # orig_to_norm[j] is the index in `text` of the first character produced by
    # original index j; a dropped character maps to the last index emitted before
    # it (0 if nothing was emitted yet).
    orig_to_norm: List[int]

def normalize_with_offsets(s: str) -> NormResult:
    """Normalize ``s`` character by character while tracking offsets both ways.

    Rules applied to each original character:
      * ``"\\r\\n"`` and a bare ``"\\r"`` become one ``"\\n"`` anchored at the ``"\\r"``.
      * Every other character is NFKC-normalized on its own, so sequences that
        only compose across characters are left uncomposed.
      * U+2028 / U+2029 become ``"\\n"``.
      * Any other whitespace becomes ``" "``, and a run of such whitespace
        collapses to a single space. ``"\\n"`` ends a run and is never collapsed.

    Returns:
        ``NormResult`` where ``len(norm_to_orig) == len(text)`` and
        ``len(orig_to_norm) == len(s)``.
    """
    norm_to_orig: List[int] = []
    out_chars: List[str] = []
    prev_space = False
    n = len(s)
    j = 0

    while j < n:
        ch = s[j]
        if ch == "\r":
            # CRLF consumes two original characters, a bare CR just one.
            out_chars.append("\n"); norm_to_orig.append(j)
            prev_space = False
            j += 2 if (j + 1 < n and s[j + 1] == "\n") else 1
            continue

        # NFKC can expand one character into several (e.g. the "fi" ligature).
        # Emit them one at a time, each mapped to j, so that norm_to_orig stays
        # exactly as long as the normalized text.
        for piece in unicodedata.normalize("NFKC", ch):
            if piece in ("\u2028", "\u2029"):
                piece = "\n"
            if piece != "\n" and piece.isspace():
                if prev_space:
                    continue  # collapse the whitespace run
                piece = " "
                prev_space = True
            else:
                prev_space = False
            out_chars.append(piece); norm_to_orig.append(j)
        j += 1

    text = "".join(out_chars)

    # Invert the map. Keep the FIRST normalized index for each original index
    # so an expanded character points at the start of its expansion.
    orig_to_norm = [-1] * n
    for norm_i, orig_j in enumerate(norm_to_orig):
        if orig_to_norm[orig_j] == -1:
            orig_to_norm[orig_j] = norm_i
    # Dropped originals (collapsed spaces, the "\n" of a CRLF) inherit the most
    # recently emitted index.
    last = 0
    for k in range(n):
        if orig_to_norm[k] == -1:
            orig_to_norm[k] = last
        else:
            last = orig_to_norm[k]

    return NormResult(text=text, norm_to_orig=norm_to_orig, orig_to_norm=orig_to_norm)

# sanity self-report
try:
    # Exercises double spaces, a no-break space, CRLF, a bare CR and a plain LF.
    demo = "A  test\u00A0string\r\nwith\rmixed  spaces.\nNext line."
    res = normalize_with_offsets(demo)
    # Each assert carries a message: a bare AssertionError formats as an empty
    # string, which would leave the failure note below blank.
    assert "\r" not in res.text, "carriage return survived normalization"
    assert len(res.norm_to_orig) == len(res.text), "norm_to_orig length != normalized text length"
    assert len(res.orig_to_norm) == len(demo), "orig_to_norm length != original text length"
    report_status("0.foundation.normalize", True, "Normalization OK", {"len": len(res.text)})
except Exception as e:
    report_status("0.foundation.normalize", False, f"Init error: {e}")


In [None]:
# cell 0.6: foundations: sentence segmentation & windowing (regex heuristic)
# Split on whitespace that follows . ! ? and precedes an uppercase letter or a
# digit, except directly after a single-letter initial ("A.") or one of the
# titles Dr. / Mr. / Ms. / Mrs. / Prof. Python look-behinds must be fixed-width,
# hence one look-behind per abbreviation length.
SENT_SPLIT_RE = re.compile(
    r'(?<!\b[A-Z]\.)'
    r'(?<!\b(?:Dr|Mr|Ms)\.)(?<!\bMrs\.)(?<!\bProf\.)'
    r'(?<=[.!?])\s+(?=[A-Z0-9])'
)

def split_sentences(text: str) -> List[Tuple[str, Tuple[int,int]]]:
    """Split ``text`` into sentences using ``SENT_SPLIT_RE``.

    Returns:
        A list of ``(sentence, (start, end))`` pairs where ``text[start:end]``
        is the sentence. Separator whitespace between sentences belongs to
        neither neighbour. Leading or trailing whitespace of ``text`` itself is
        kept in the first or last sentence. Empty input yields ``[]``.
    """
    spans: List[Tuple[int,int]] = []
    start = 0
    for m in SENT_SPLIT_RE.finditer(text):
        # The match covers only the separator whitespace (the look-arounds are
        # zero-width), so the sentence ends where the match starts. Using
        # m.start() + 1 would pull one whitespace character into the span.
        spans.append((start, m.start()))
        start = m.end()
    if start < len(text):
        spans.append((start, len(text)))
    return [(text[a:b], (a, b)) for a, b in spans if b > a]

def window_sentences(sents: List[Tuple[str, Tuple[int,int]]], win: int, stride: int) -> List[Dict[str, Any]]:
    """Group sentences into overlapping windows of exactly ``win`` sentences.

    Args:
        sents: ``(sentence, (start, end))`` pairs as returned by ``split_sentences``.
        win: Number of sentences per window; must be >= 1.
        stride: Step between window starts, in sentences; must be >= 1.

    Returns:
        One dict per window with keys ``sent_start_idx`` (inclusive),
        ``sent_end_idx`` (exclusive), ``char_span`` (``[start of first
        sentence, end of last sentence]``) and ``text`` (the stripped sentences
        joined by single spaces). Only full windows are produced: fewer than
        ``win`` sentences gives ``[]``, and trailing sentences that do not fill
        a whole window at the given stride are not covered.

    Raises:
        ValueError: If ``win`` or ``stride`` is less than 1.
    """
    if win < 1 or stride < 1:
        raise ValueError(f"win and stride must be >= 1 (got win={win}, stride={stride})")

    windows: List[Dict[str, Any]] = []
    # Last index at which a full window still fits; negative means none fits,
    # which makes the range below empty.
    last_start = len(sents) - win
    for i in range(0, last_start + 1, stride):
        chunk = sents[i:i + win]
        windows.append({
            "sent_start_idx": i,
            "sent_end_idx": i + win,
            "char_span": [chunk[0][1][0], chunk[-1][1][1]],
            "text": " ".join(sentence.strip() for sentence, _span in chunk),
        })
    return windows

# sanity self-report
try:
    sample = "Dr. A. Smith wrote this. Second sentence! Third sentence? Fourth one. Fifth here."
    sents = split_sentences(sample)
    wins = window_sentences(sents, 4, 2)
    # Check the output instead of reporting OK for anything that did not raise.
    assert sents, "no sentences produced"
    assert all(sample[a:b] == s for s, (a, b) in sents), "sentence spans do not index back into the sample"
    assert wins, "no windows produced"
    report_status("0.foundation.segmentation", True, "Splitter/windowing OK", {"sents": len(sents), "windows": len(wins)})
except Exception as e:
    report_status("0.foundation.segmentation", False, f"Error: {e}")


In [None]:
# cell 0.7: foundations: visualization smoke test (matplotlib only) — use plt from 0.2
fig = None
try:
    xs = np.arange(0, 10); ys = np.sqrt(xs)
    # Explicit figure/axes handles so the exact figure can be closed afterwards.
    fig, ax = plt.subplots()
    ax.plot(xs, ys, marker="o")
    ax.set_title("Matplotlib render check (0.#)"); ax.set_xlabel("x"); ax.set_ylabel("sqrt(x)")
    axes = fig.get_axes()
    assert axes and any(a.has_data() for a in axes), "No plotted data detected"
    plt.show()
    report_status("0.foundation.viz", True, "Matplotlib rendering OK")
except Exception as e:
    report_status("0.foundation.viz", False, f"Matplotlib failed: {e}")
finally:
    # Release the smoke-test figure so it does not stay registered with pyplot
    # on backends that do not close figures on show, or when the cell is re-run.
    if fig is not None:
        plt.close(fig)
