In [1]:
!pip -q uninstall -y autogen pyautogen ag2 >/dev/null 2>&1
!pip -q install -U "ag2[openai]" python-dotenv pypdf sentence-transformers >/dev/null 2>&1

!pip show ag2
!python -c "import autogen; print('autogen version:', getattr(autogen,'__version__', 'unknown'))"


Name: ag2
Version: 0.10.5
Summary: A programming framework for agentic AI
Home-page: https://ag2.ai/
Author: 
Author-email: Chi Wang & Qingyun Wu <support@ag2.ai>
License: 
Location: /usr/local/lib/python3.12/dist-packages
Requires: anyio, diskcache, docker, httpx, packaging, pydantic, python-dotenv, termcolor, tiktoken
Required-by: 
autogen version: 0.10.5


In [None]:
import os, json, time, re, hashlib, sqlite3, sys
from dotenv import load_dotenv
load_dotenv()

# Ensure API key exists
os.environ["OPENAI_API_KEY"] =  # set via Secrets
os.environ["OPENAI_MODEL_NAME"] = os.environ.get("OPENAI_MODEL_NAME", "gpt-4o-mini")

print("OPENAI_MODEL_NAME:", os.environ["OPENAI_MODEL_NAME"])
print("OPENAI_API_KEY present?", bool(os.getenv("OPENAI_API_KEY")))


OPENAI_MODEL_NAME: gpt-4o-mini
OPENAI_API_KEY present? True


## Mount Drive + Set Base Directories (no single FILE_PATH hardcode)

In [3]:
# ================================
# CELL 1 — Mount Drive + Set Dataset Paths
# ================================
# This cell mounts Google Drive and defines the dataset directory + pinned CSV file.
# We also export PINNED_FILE_PATH to an environment variable so later cells don't rely
# on Python variable persistence.

import os
from google.colab import drive

# Mount drive if not already mounted
if not os.path.exists("/content/drive/MyDrive"):
    drive.mount("/content/drive", force_remount=False)

# Your dataset directory (folder containing CSVs)
DATASET_DIR = "/content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data"

# Your pinned CSV file (this specific file will be used as the dataset)
PINNED_FILE_PATH = "/content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data/2017-2019-maintenance-dataset.csv"

# Fail fast if paths are invalid
assert os.path.exists(DATASET_DIR), f"DATASET_DIR not found: {DATASET_DIR}"
assert os.path.exists(PINNED_FILE_PATH), f"Pinned CSV not found: {PINNED_FILE_PATH}"

# Export pinned path to env var (used by mm.validate_registry and audit checks)
os.environ["PINNED_FILE_PATH"] = os.path.abspath(PINNED_FILE_PATH)

print("DATASET_DIR:", DATASET_DIR)
print("PINNED_FILE_PATH (env):", os.environ["PINNED_FILE_PATH"])


Mounted at /content/drive
DATASET_DIR: /content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data
PINNED_FILE_PATH (env): /content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data/2017-2019-maintenance-dataset.csv


In [4]:
# ================================
# CELL 2 — Set WORK_DIR
# ================================
# WORK_DIR is where we will write:
# - mm_runtime.py
# - registry file_path_registry.json
# - artifacts like dataset_profile.json, plots, etc.
# This is the "auditable workspace".

import os

WORK_DIR = "/content/pavement_agentic_workspace"
WORK_DIR = os.path.abspath(WORK_DIR)

os.makedirs(WORK_DIR, exist_ok=True)
os.environ["WORK_DIR"] = WORK_DIR  # mm_runtime reads this env var

print("WORK_DIR:", WORK_DIR)


WORK_DIR: /content/pavement_agentic_workspace


## Memory

In [5]:
mm_runtime_code = r'''
import os, re, json, time, glob, hashlib, sqlite3
from typing import Optional, List, Dict, Any, Tuple, Iterable
import numpy as np

# -------------------------
# Optional KG dependency
# -------------------------
try:
    import networkx as nx  # type: ignore
except Exception:
    nx = None

# -------------------------
# Workspace paths
# -------------------------
WORK_DIR = os.getenv("WORK_DIR", "/content/pavement_agentic_workspace")
os.makedirs(WORK_DIR, exist_ok=True)

MEM_PATH      = os.path.join(WORK_DIR, "memory.jsonl")
DB_PATH       = os.path.join(WORK_DIR, "memory_hybrid.sqlite")
VEC_PATH      = os.path.join(WORK_DIR, "memory_vectors.npz")
KG_PATH       = os.path.join(WORK_DIR, "knowledge_graph.graphml")
REGISTRY_PATH = os.path.join(WORK_DIR, "file_path_registry.json")

_ENFORCE_LIMITS = os.getenv("ENFORCE_LIMITS", "1").strip() != "0"
_EPS = 1e-12

# ============================================================
# BASIC HELPERS
# ============================================================
def _now_ts() -> str:
    """Return local timestamp string for audit logging."""
    return time.strftime("%Y-%m-%d %H:%M:%S")

def _sha24(s: str) -> str:
    """Short stable-ish ID from sha256, used for memory record IDs."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:24]

def _stable_file_fingerprint(file_path: str) -> str:
    """
    Create a small dataset_id fingerprint using:
    abs_path + file_size + mtime.
    """
    st = os.stat(file_path)
    raw = f"{os.path.abspath(file_path)}|{st.st_size}|{int(st.st_mtime)}"
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:12]

def _ensure_files():
    """Ensure workspace + base memory file exist."""
    os.makedirs(WORK_DIR, exist_ok=True)
    if not os.path.exists(MEM_PATH):
        open(MEM_PATH, "a", encoding="utf-8").close()

def validate_registry_or_fail(work_dir: str) -> Dict[str, Any]:
    """
    Governance gate for dataset access.
    Ensures:
    - registry exists
    - latest_path exists on disk
    - registry does NOT point inside WORK_DIR
    - if PINNED_FILE_PATH env var is set, it must match latest_path exactly

    Returns the registry dict if all checks pass.
    """
    reg = registry_load()
    if reg is None:
        raise RuntimeError("Registry not found. Run mm.registry_build(...) or mm.registry_pin_to_file(...).")

    FILE_PATH = reg.get("latest_path", "")
    if not FILE_PATH:
        raise RuntimeError("Registry missing latest_path.")

    if not os.path.exists(FILE_PATH):
        raise FileNotFoundError(FILE_PATH)

    # Guard: registry must not point inside WORK_DIR
    if os.path.commonpath([os.path.abspath(FILE_PATH), os.path.abspath(work_dir)]) == os.path.abspath(work_dir):
        raise RuntimeError("Registry points inside WORK_DIR; fix registry pinning.")

    # Guard: pinned env var must match exactly if set
    pinned = os.environ.get("PINNED_FILE_PATH", "").strip()
    if pinned:
        if os.path.abspath(FILE_PATH) != os.path.abspath(pinned):
            raise AssertionError(f"Pinned file mismatch. registry={FILE_PATH} env={pinned}")

    return reg

# ============================================================
# KNOWLEDGE GRAPH (OPTIONAL)
# ============================================================
def _kg_enabled() -> bool:
    """Return True if networkx is available."""
    return nx is not None

def _kg_load():
    """Load KG from GraphML if available; otherwise return new MultiDiGraph."""
    if not _kg_enabled():
        return None
    if os.path.exists(KG_PATH):
        try:
            g = nx.read_graphml(KG_PATH)
            mg = nx.MultiDiGraph()
            mg.add_nodes_from(g.nodes(data=True))
            mg.add_edges_from(g.edges(data=True))
            return mg
        except Exception:
            pass
    return nx.MultiDiGraph()

_KG = _kg_load()

def _kg_save():
    """Persist KG to GraphML (lossy for multiedges but keeps core relations)."""
    if not _kg_enabled() or _KG is None:
        return
    g = nx.DiGraph()
    g.add_nodes_from(_KG.nodes(data=True))
    for u, v, data in _KG.edges(data=True):
        g.add_edge(u, v, **data)
    nx.write_graphml(g, KG_PATH)

def kg_add_fact(subj: str, pred: str, obj: str, confidence: float = 1.0, meta: Optional[dict] = None):
    """Add a simple subject-predicate-object fact to KG and save."""
    if not _kg_enabled() or _KG is None:
        return
    meta = meta or {}
    subj = str(subj); pred = str(pred); obj = str(obj)
    _KG.add_node(subj)
    _KG.add_node(obj)
    _KG.add_edge(subj, obj, relation=pred, confidence=float(confidence), **meta)
    _kg_save()

# ============================================================
# JSONL MEMORY — APPEND ONLY
# ============================================================
def _enforce_limits(kind: str, text: str):
    """
    Prevent storing huge text blobs in memory (audit logs should be light).
    """
    if not _ENFORCE_LIMITS:
        return
    if len(text) > 5000 and kind not in {"pipeline_state", "results"}:
        raise RuntimeError("Memory text too large; store only summaries and artifact paths.")

def mem_add_jsonl(kind: str, text: str, meta: Optional[dict] = None) -> str:
    """
    Append one JSON record per line to memory.jsonl.
    Returns memory_id.
    """
    meta = meta or {}
    ts = _now_ts()
    text = str(text).strip()
    _enforce_limits(kind, text)
    raw = f"{ts}|{kind}|{text}|{json.dumps(meta, sort_keys=True)}"
    memory_id = _sha24(raw)
    rec = {"id": memory_id, "ts": ts, "kind": str(kind), "text": text, "meta": meta}
    _ensure_files()
    with open(MEM_PATH, "a", encoding="utf-8") as f:
        # IMPORTANT: real newline, not literal backslash+n
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    return memory_id

# ============================================================
# SQLITE STRUCT + FTS
# ============================================================
def _db_connect() -> sqlite3.Connection:
    """
    Connect to SQLite and ensure both:
    - memory_struct: canonical storage
    - memory_fts: FTS5 index for fast keyword search
    """
    conn = sqlite3.connect(DB_PATH)
    conn.execute("PRAGMA journal_mode=WAL;")
    conn.execute(
        "CREATE TABLE IF NOT EXISTS memory_struct ("
        "id TEXT PRIMARY KEY, ts TEXT, kind TEXT, text TEXT, meta_json TEXT);"
    )
    conn.execute(
        "CREATE VIRTUAL TABLE IF NOT EXISTS memory_fts "
        "USING fts5(id UNINDEXED, kind, text, content='', tokenize='porter');"
    )
    conn.commit()
    return conn

def keyword_index_add(memory_id: str, ts: str, kind: str, text: str, meta: dict):
    """
    Store record in memory_struct AND index it in memory_fts.
    """
    conn = _db_connect()
    conn.execute(
        "INSERT OR REPLACE INTO memory_struct(id, ts, kind, text, meta_json) VALUES (?, ?, ?, ?, ?)",
        (memory_id, ts, kind, text, json.dumps(meta, ensure_ascii=False))
    )
    conn.execute("DELETE FROM memory_fts WHERE id = ?", (memory_id,))
    conn.execute("INSERT INTO memory_fts(id, kind, text) VALUES (?, ?, ?)", (memory_id, kind, text))
    conn.commit()
    conn.close()

def _fts_safe_query(q: str) -> str:
    """
    Convert a user query to a safe FTS5 query using AND over tokens.
    """
    q = (q or "").strip()
    if not q:
        return '""'
    toks = re.findall(r"[a-zA-Z0-9_]+", q)
    if not toks:
        return '""'
    return " AND ".join([f'"{t}"' for t in toks])

def keyword_search(query: str, k: int = 5, kind: Optional[str] = None):
    """
    Keyword search using SQLite FTS5; falls back to LIKE if needed.
    Returns list of dicts: {id, kind, text, score}
    """
    conn = _db_connect()
    where_kind = " AND kind = ? " if kind else " "
    params = [_fts_safe_query(query)]
    if kind:
        params.append(kind)
    params.append(k)

    sql = (
        "SELECT id, kind, text, bm25(memory_fts) AS score "
        "FROM memory_fts WHERE memory_fts MATCH ? "
        + where_kind +
        "ORDER BY score LIMIT ?;"
    )
    try:
        rows = conn.execute(sql, params).fetchall()
    except Exception:
        like = f"%{(query or '').strip()}%"
        if kind:
            rows = conn.execute(
                "SELECT id, kind, text, 0.0 AS score FROM memory_struct WHERE kind = ? AND text LIKE ? LIMIT ?;",
                (kind, like, k)
            ).fetchall()
        else:
            rows = conn.execute(
                "SELECT id, kind, text, 0.0 AS score FROM memory_struct WHERE text LIKE ? LIMIT ?;",
                (like, k)
            ).fetchall()
    conn.close()

    # FTS bm25 lower is better; we return inverted sign as "higher is better"
    return [{"id": rid, "kind": rkind, "text": rtext, "score": float(-score)} for rid, rkind, rtext, score in rows]

def db_fetch_by_ids(ids: Iterable[str]) -> Dict[str, Dict[str, Any]]:
    """
    Fetch structured records by IDs from memory_struct.
    """
    ids = [i for i in ids if i]
    if not ids:
        return {}
    conn = _db_connect()
    qmarks = ",".join(["?"] * len(ids))
    rows = conn.execute(
        f"SELECT id, ts, kind, text, meta_json FROM memory_struct WHERE id IN ({qmarks});",
        ids
    ).fetchall()
    conn.close()

    out: Dict[str, Dict[str, Any]] = {}
    for rid, ts, kind, text, meta_json in rows:
        try:
            meta = json.loads(meta_json) if meta_json else {}
        except Exception:
            meta = {}
        out[rid] = {"id": rid, "ts": ts, "kind": kind, "text": text, "meta": meta}
    return out

# ============================================================
# SEMANTIC STORE (NPZ)
# ============================================================
def _load_vec_store() -> Tuple[List[str], np.ndarray]:
    """Load vector store from NPZ (ids + vecs)."""
    if not os.path.exists(VEC_PATH):
        return [], np.zeros((0, 0), dtype=np.float32)
    data = np.load(VEC_PATH, allow_pickle=True)
    return data["ids"].tolist(), data["vecs"].astype(np.float32)

def _save_vec_store(ids: List[str], vecs: np.ndarray):
    """Save vector store to NPZ."""
    np.savez(VEC_PATH, ids=np.array(ids, dtype=object), vecs=vecs.astype(np.float32))

def _normalize_rows(X: np.ndarray) -> np.ndarray:
    """Row-normalize vectors for cosine similarity."""
    if X.size == 0:
        return X
    norms = np.linalg.norm(X, axis=1, keepdims=True) + _EPS
    return X / norms

def embed_text(text: str) -> np.ndarray:
    """
    Embed text using SentenceTransformers if available.
    Fallback: deterministic hashing into 512-d sparse-ish vector.
    """
    try:
        from sentence_transformers import SentenceTransformer  # type: ignore
        global _ST_MODEL
        if "_ST_MODEL" not in globals():
            _ST_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
        v = _ST_MODEL.encode([text], normalize_embeddings=True)[0]
        return np.array(v, dtype=np.float32)
    except Exception:
        dim = 512
        v = np.zeros(dim, dtype=np.float32)
        for token in re.findall(r"[a-z0-9_]+", (text or "").lower()):
            h = int(hashlib.sha256(token.encode()).hexdigest(), 16)
            v[h % dim] += 1.0
        v = v / (np.linalg.norm(v) + _EPS)
        return v

def semantic_add(item_id: str, text: str):
    """Add or update embedding for an item_id in NPZ store."""
    ids, vecs = _load_vec_store()
    v = embed_text(text).reshape(1, -1)
    if vecs.size == 0 or (v.shape[1] != vecs.shape[1]):
        _save_vec_store([item_id], v)
        return
    try:
        idx = ids.index(item_id)
        vecs[idx:idx+1, :] = v
        _save_vec_store(ids, vecs)
    except ValueError:
        ids.append(item_id)
        vecs = np.vstack([vecs, v])
        _save_vec_store(ids, vecs)

def semantic_search_ids(query: str, k: int = 5) -> List[Tuple[str, float]]:
    """
    Return top-k (id, similarity) by cosine similarity.
    """
    ids, vecs = _load_vec_store()
    if vecs.size == 0:
        return []
    q = embed_text(query).reshape(1, -1).astype(np.float32)
    sims = (_normalize_rows(vecs) @ _normalize_rows(q).T).ravel()
    top = np.argsort(-sims)[: min(k, len(ids))]
    return [(ids[i], float(sims[i])) for i in top]

# ============================================================
# REGISTRY
# ============================================================
def registry_build(
    dataset_dir: str,
    allow_globs: Optional[List[str]] = None,
    exclude_dirs: Optional[List[str]] = None,
    prefer_regex: Optional[str] = None
) -> Dict[str, Any]:
    """
    Build a registry by scanning dataset_dir for CSVs and selecting the latest by mtime.
    Also stores mapping: path -> dataset_id fingerprint.
    """
    dataset_dir = os.path.abspath(dataset_dir)
    allow_globs = allow_globs or ["**/*.csv"]
    exclude_dirs = exclude_dirs or []
    if os.path.abspath(WORK_DIR) not in [os.path.abspath(d) for d in exclude_dirs]:
        exclude_dirs = exclude_dirs + [WORK_DIR]

    csvs: List[str] = []
    for g in allow_globs:
        csvs.extend(glob.glob(os.path.join(dataset_dir, g), recursive=True))
    csvs = sorted(set(csvs))

    def _keep(p: str) -> bool:
        for d in exclude_dirs or []:
            try:
                if os.path.commonpath([os.path.abspath(p), os.path.abspath(d)]) == os.path.abspath(d):
                    return False
            except Exception:
                pass
        return True

    csvs = [p for p in csvs if _keep(p)]
    if not csvs:
        raise FileNotFoundError(f"No CSV found under dataset_dir={dataset_dir}")

    if prefer_regex:
        try:
            rx = re.compile(prefer_regex)
            preferred = [p for p in csvs if rx.search(os.path.basename(p)) or rx.search(p)]
            if preferred:
                csvs = preferred
        except Exception:
            pass

    mapping = {p: _stable_file_fingerprint(p) for p in csvs}
    latest_path = max(mapping.keys(), key=lambda p: os.path.getmtime(p))

    reg = {
        "created_ts": _now_ts(),
        "dataset_dir": dataset_dir,
        "allow_globs": allow_globs,
        "exclude_dirs": [os.path.abspath(d) for d in exclude_dirs],
        "prefer_regex": prefer_regex,
        "path_to_dataset_id": mapping,
        "latest_path": latest_path,
        "latest_dataset_id": mapping[latest_path],
        "pinned": False,  # important for downstream logic
    }
    with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
        json.dump(reg, f, ensure_ascii=False, indent=2)
    return reg

def registry_load() -> Optional[Dict[str, Any]]:
    """Load registry json from disk."""
    if not os.path.exists(REGISTRY_PATH):
        return None
    try:
        with open(REGISTRY_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None

def registry_pin_to_file(pinned_path: str) -> Dict[str, Any]:
    """
    Pin the registry to a single CSV file (federally auditable pinning).
    """
    pinned_path = os.path.abspath(pinned_path)
    if not os.path.exists(pinned_path):
        raise FileNotFoundError(f"Pinned file not found: {pinned_path}")

    dsid = _stable_file_fingerprint(pinned_path)
    reg = {
        "created_ts": _now_ts(),
        "path_to_dataset_id": {pinned_path: dsid},
        "latest_path": pinned_path,
        "latest_dataset_id": dsid,
        "pinned": True,
    }
    with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
        json.dump(reg, f, ensure_ascii=False, indent=2)
    return reg

# ============================================================
# CONDITION CANDIDATE DISCOVERY (NO TIME-SIDE INFERENCE)
# ============================================================
def _norm(s: str) -> str:
    """Normalize column name for matching."""
    return re.sub(r"\s+", " ", (s or "").strip().lower())

def _family_match(col_lower: str) -> Optional[Dict[str, Any]]:
    """
    Classify column into: iri, pci, cracking(_percent), rutting
    Using explicit patterns (interpretable, auditable).
    """
    reasons: List[str] = []

    if re.search(r"(^|[^a-z0-9])iri([^a-z0-9]|$)", col_lower) or "roughness" in col_lower:
        reasons.append("Matched IRI/Roughness via token 'iri' or substring 'roughness'.")
        return {"family": "iri", "confidence": "high", "reasons": reasons}

    if re.search(r"(^|[^a-z0-9])pci([^a-z0-9]|$)", col_lower):
        reasons.append("Matched PCI via token 'pci'.")
        return {"family": "pci", "confidence": "high", "reasons": reasons}

    if "rutting" in col_lower or re.search(r"(^|[^a-z0-9])rut([^a-z0-9]|$)", col_lower):
        reasons.append("Matched Rutting via substring 'rutting' or token 'rut'.")
        return {"family": "rutting", "confidence": "medium", "reasons": reasons}

    if "cracking" in col_lower or re.search(r"(^|[^a-z0-9])crack([^a-z0-9]|$)", col_lower):
        reasons.append("Matched Cracking via substring 'cracking' or token 'crack'.")
        if ("%" in col_lower) or ("percent" in col_lower) or ("pct" in col_lower):
            reasons.append("Name suggests percent measure (%, percent, or pct).")
            return {"family": "cracking_percent", "confidence": "high", "reasons": reasons}
        return {"family": "cracking", "confidence": "medium", "reasons": reasons}

    return None

def discover_condition_candidates(columns: List[str]) -> List[Dict[str, Any]]:
    """
    Return ranked list of candidate condition metric columns.
    No time-side inference; downstream agents handle leakage rules.
    """
    cols = list(columns or [])
    out: List[Dict[str, Any]] = []

    for col in cols:
        fam = _family_match(_norm(col))
        if not fam:
            continue
        out.append({
            "column": col,
            "family": fam["family"],
            "family_confidence": fam["confidence"],
            "reasons": fam["reasons"],
            "leakage_guard": (
                "Potential condition metric. Downstream agents must prevent leakage by excluding "
                "any post-period / outcome / future condition columns from features if applicable."
            ),
        })

    family_rank = {"iri": 1, "pci": 2, "cracking_percent": 3, "cracking": 4, "rutting": 5}
    conf_rank = {"high": 1, "medium": 2, "low": 3}

    out.sort(key=lambda r: (
        family_rank.get(r["family"], 99),
        conf_rank.get(r["family_confidence"], 99),
        _norm(r["column"]),
    ))
    return out

# ============================================================
# META FILTER
# ============================================================
def _meta_matches_filters(meta: dict, filters: Optional[Dict[str, Any]]) -> bool:
    """Return True if meta satisfies all key=value filters."""
    if not filters:
        return True
    for k, v in filters.items():
        if meta.get(k, None) != v:
            return False
    return True

# ============================================================
# MM CLASS (PUBLIC INTERFACE)
# ============================================================
class MM:
    def health_check(self) -> bool:
        """Initialize storage and verify DB connectivity."""
        _ensure_files()
        conn = _db_connect()
        conn.close()
        return True

    def rag_add(self, kind: str, text: str, meta: Optional[dict] = None) -> str:
        """Add an auditable record to JSONL + SQLite + vector store."""
        meta = meta or {}
        ts = _now_ts()
        mid = mem_add_jsonl(kind, text, meta)
        keyword_index_add(mid, ts, kind, text, meta)
        semantic_add(mid, text)
        return mid

    # Convenience alias used by your agents (prevents AttributeError)
    def record_artifact(self, kind: str, meta: Optional[dict] = None, text: str = "") -> str:
        """Alias for rag_add for artifact logging."""
        meta = meta or {}
        if not text:
            text = f"artifact kind={kind}"
        return self.rag_add(kind=kind, text=text, meta=meta)

    def rag_search(
        self,
        query: str,
        k: int = 5,
        kind: Optional[str] = None,
        alpha: float = 0.65,
        meta_filters: Optional[Dict[str, Any]] = None
    ):
        """
        Hybrid search: keyword (FTS) + semantic (vectors), then meta-filter.
        Returns list of (score, record_dict).
        """
        kw = keyword_search(query, k=max(10, k * 4), kind=kind)
        sem_ids = semantic_search_ids(query, k=max(10, k * 4))
        sem_map = {rid: sc for rid, sc in sem_ids}

        scores: Dict[str, Dict[str, float]] = {}
        for r in kw:
            scores.setdefault(r["id"], {"kw": 0.0, "sem": 0.0})
            scores[r["id"]]["kw"] = max(scores[r["id"]]["kw"], r["score"])
        for rid, sc in sem_map.items():
            scores.setdefault(rid, {"kw": 0.0, "sem": 0.0})
            scores[rid]["sem"] = max(scores[rid]["sem"], sc)

        if not scores:
            return []

        ids = list(scores.keys())
        kw_vals = np.array([scores[rid]["kw"] for rid in ids], dtype=np.float32)
        sem_vals = np.array([scores[rid]["sem"] for rid in ids], dtype=np.float32)

        def _minmax(v):
            if v.size == 0:
                return v
            vmin = float(v.min()); vmax = float(v.max())
            if abs(vmax - vmin) < _EPS:
                return np.full_like(v, 0.5, dtype=np.float32)
            return (v - vmin) / (vmax - vmin + _EPS)

        kw_norm = _minmax(kw_vals)
        sem_norm = _minmax(sem_vals)

        for i, rid in enumerate(ids):
            scores[rid]["hybrid"] = float(alpha * kw_norm[i] + (1 - alpha) * sem_norm[i])

        id_to_row = db_fetch_by_ids(ids)

        filtered: List[Tuple[str, float]] = []
        for rid in ids:
            row = id_to_row.get(rid)
            if not row:
                continue
            if _meta_matches_filters(row.get("meta", {}), meta_filters):
                filtered.append((rid, scores[rid]["hybrid"]))

        filtered.sort(key=lambda x: x[1], reverse=True)
        filtered = filtered[:k]
        return [(sc, id_to_row[rid]) for rid, sc in filtered]

    def rag_get_latest(
        self,
        kind: str,
        meta_filters: Optional[Dict[str, Any]] = None,
        contains_text: Optional[str] = None
    ):
        """
        Return the most recent memory record of a given kind, optionally filtered by meta and text.
        """
        conn = _db_connect()
        params = [kind]
        sql = "SELECT id, ts, kind, text, meta_json FROM memory_struct WHERE kind = ? "
        if contains_text:
            sql += " AND text LIKE ? "
            params.append(f"%{contains_text}%")
        sql += " ORDER BY ts DESC LIMIT 200;"
        rows = conn.execute(sql, params).fetchall()
        conn.close()

        for rid, ts, knd, text, meta_json in rows:
            try:
                meta = json.loads(meta_json) if meta_json else {}
            except Exception:
                meta = {}
            if _meta_matches_filters(meta, meta_filters):
                return {"id": rid, "ts": ts, "kind": knd, "text": text, "meta": meta}
        return None

    # -------------------------
    # Registry APIs
    # -------------------------
    def registry_build(self, dataset_dir: str, allow_globs=None, exclude_dirs=None, prefer_regex=None):
        return registry_build(dataset_dir, allow_globs, exclude_dirs, prefer_regex)

    def registry_pin_to_file(self, pinned_path: str, extra_meta: Optional[dict] = None, write_mm_record: bool = True) -> Dict[str, Any]:
        reg = registry_pin_to_file(pinned_path)
        if write_mm_record:
            meta = {
                "dataset_id": reg["latest_dataset_id"],
                "file_path": reg["latest_path"],
                "state": "REGISTRY_PINNED",
                "pinned": True,
            }
            if extra_meta:
                meta.update(extra_meta)
            self.rag_add(
                kind="pinned_dataset",
                text=f"Registry pinned to {reg['latest_path']} (dataset_id={reg['latest_dataset_id']})",
                meta=meta
            )
        return reg

    def validate_registry(self, work_dir: Optional[str] = None) -> Dict[str, Any]:
        """Public wrapper around validate_registry_or_fail."""
        work_dir = work_dir or WORK_DIR
        return validate_registry_or_fail(work_dir)

    def rag_get_latest_registry(self) -> Dict[str, Any]:
        """Load registry (no extra validation)."""
        reg = registry_load()
        if reg is None:
            raise RuntimeError("Registry not found. Run mm.registry_build(...) or mm.registry_pin_to_file(...).")
        return reg

    # -------------------------
    # Condition candidate discovery API
    # -------------------------
    def discover_condition_candidates(self, columns: List[str]) -> List[Dict[str, Any]]:
        return discover_condition_candidates(columns)

    # -------------------------
    # KG API
    # -------------------------
    def kg_add_fact(self, subj: str, pred: str, obj: str, confidence: float = 1.0, meta: Optional[dict] = None) -> bool:
        kg_add_fact(subj, pred, obj, confidence=confidence, meta=meta)
        return True

mm = MM()
'''


In [6]:
# ================================
# CELL 4 — Write mm_runtime.py to disk
# ================================
# This cell creates an importable module at WORK_DIR/mm_runtime.py.

import os, textwrap

MM_PATH = os.path.join(WORK_DIR, "mm_runtime.py")

assert "mm_runtime_code" in globals(), "mm_runtime_code not defined. Run CELL 3 first."

with open(MM_PATH, "w", encoding="utf-8") as f:
    # dedent + lstrip keeps file clean if mm_runtime_code was indented
    f.write(textwrap.dedent(mm_runtime_code).lstrip())

print("✅ Wrote mm_runtime.py:", MM_PATH)
print("Exists?", os.path.exists(MM_PATH), "Size:", os.path.getsize(MM_PATH), "bytes")


✅ Wrote mm_runtime.py: /content/pavement_agentic_workspace/mm_runtime.py
Exists? True Size: 25125 bytes


In [7]:
# ================================
# CELL 5 — Import mm_runtime robustly (path-based)
# ================================
# Normal "import mm_runtime" can fail in Colab due to path resolution issues.
# This method loads it directly from file path and registers it into sys.modules
# so later cells can do: from mm_runtime import mm

import os, sys, importlib.util

MM_PATH = os.path.join(WORK_DIR, "mm_runtime.py")
assert os.path.exists(MM_PATH), f"mm_runtime.py missing at: {MM_PATH}"

spec = importlib.util.spec_from_file_location("mm_runtime", MM_PATH)
mm_runtime = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mm_runtime)

# Register so standard imports work in later cells
sys.modules["mm_runtime"] = mm_runtime

from mm_runtime import mm

print("✅ Imported mm_runtime from:", MM_PATH)
print("mm has validate_registry?", hasattr(mm, "validate_registry"))
print("mm has registry_pin_to_file?", hasattr(mm, "registry_pin_to_file"))
print("mm.health_check():", mm.health_check())


✅ Imported mm_runtime from: /content/pavement_agentic_workspace/mm_runtime.py
mm has validate_registry? True
mm has registry_pin_to_file? True
mm.health_check(): True


In [8]:
# ================================
# CELL 6 — Pin registry
# ================================
# This writes WORK_DIR/file_path_registry.json pinned to your dataset file.
# It also writes an mm record kind="pinned_dataset" for audit traceability.

import os
from mm_runtime import mm

mm.health_check()

PINNED = os.environ.get("PINNED_FILE_PATH", "").strip()
if not PINNED:
    raise RuntimeError("PINNED_FILE_PATH env var is not set. Run CELL 1 first.")
if not os.path.exists(PINNED):
    raise FileNotFoundError(f"Pinned CSV not found: {PINNED}")

reg = mm.registry_pin_to_file(
    PINNED,
    extra_meta={"note": "Pinned from Drive"},
    write_mm_record=True
)

print("✅ Registry pinned:")
print(reg)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Registry pinned:
{'created_ts': '2026-02-10 08:22:34', 'path_to_dataset_id': {'/content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data/2017-2019-maintenance-dataset.csv': '9bc07dd2ac12'}, 'latest_path': '/content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data/2017-2019-maintenance-dataset.csv', 'latest_dataset_id': '9bc07dd2ac12', 'pinned': True}


In [9]:
# ================================
# CELL 7 — DISCOVER_DATASET state
# ================================
# This produces the required auditable artifacts for discovery:
# - dataset_profile.json
# - dataset_head.csv
# - target_candidates.csv (from mm.discover_condition_candidates)
# - plots/discovery_diagnostic.png
# - final_code.py (reproducibility script)
# and logs mm records:
# - dataset_artifact
# - target_candidates
# - pipeline_state

import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mm_runtime import mm

def discover_dataset_state(work_dir: str):
    # Ensure mm storage + sqlite are ready
    mm.health_check()

    work_dir = os.path.abspath(work_dir)
    os.makedirs(work_dir, exist_ok=True)

    # 1) Resolve dataset path using the quality gate (fails fast if registry missing / mismatch)
    reg = mm.validate_registry(work_dir)
    FILE_PATH = reg["latest_path"]
    DATASET_ID = reg["latest_dataset_id"]

    # 2) Load the pinned dataset (no modifications allowed here)
    df = pd.read_csv(FILE_PATH)

    # Required debug prints (audit-friendly)
    print("FILE_PATH", FILE_PATH)
    print("PINNED_FILE_PATH", os.environ.get("PINNED_FILE_PATH",""))
    print("DATASET_SHAPE", df.shape)
    print("COLUMNS", df.columns.tolist())
    print("HEAD10"); print(df.head(10))
    print("NULL_COUNTS"); print(df.isna().sum())
    print("DESCRIBE_NUMERIC"); print(df.describe(include=[np.number]).T)

    # 3) Paths for artifacts
    profile_path = os.path.join(work_dir, "dataset_profile.json")
    head_path    = os.path.join(work_dir, "dataset_head.csv")
    target_csv   = os.path.join(work_dir, "target_candidates.csv")

    plots_dir = os.path.join(work_dir, "plots")
    os.makedirs(plots_dir, exist_ok=True)
    plot_path = os.path.join(plots_dir, "discovery_diagnostic.png")

    final_code_path = os.path.join(work_dir, "final_code.py")

    # 4) Write dataset_profile.json
    profile = {
        "file_path": FILE_PATH,
        "dataset_id": DATASET_ID,
        "shape": [int(df.shape[0]), int(df.shape[1])],
        "columns": df.columns.tolist(),
        "dtypes": {c: str(t) for c, t in df.dtypes.items()},
        "missingness": {c: float(df[c].isna().mean()) for c in df.columns}
    }
    with open(profile_path, "w", encoding="utf-8") as f:
        json.dump(profile, f, indent=2)

    # 5) Save first 10 rows
    df.head(10).to_csv(head_path, index=False)

    # 6) Plot missingness for top 50 columns
    miss = pd.Series(profile["missingness"]).sort_values(ascending=False).head(50)
    plt.figure(figsize=(12,4))
    miss.plot(kind="bar")
    plt.title("Top 50 Missingness")
    plt.tight_layout()
    plt.savefig(plot_path, dpi=150)
    plt.close()

    # 7) Condition candidates (NO time_side in your current mm_runtime)
    cands = mm.discover_condition_candidates(df.columns.tolist()) or []

    # Candidate CSV schema aligned to your mm_runtime
    required_cols = ["column","family","family_confidence","coverage_non_null_pct","reasons","leakage_guard"]

    rows = []
    for c in cands:
        col = c.get("column") or c.get("col") or c.get("name")
        if not col or col not in df.columns:
            continue
        rows.append({
            "column": col,
            "family": c.get("family", ""),
            "family_confidence": c.get("family_confidence", ""),
            "coverage_non_null_pct": float(df[col].notna().mean() * 100.0),
            "reasons": c.get("reasons", []),
            "leakage_guard": c.get("leakage_guard", ""),
        })

    cand_df = pd.DataFrame(rows, columns=required_cols)
    cand_df.to_csv(target_csv, index=False)

    if cand_df.empty:
        print("WARNING: No condition candidates discovered")

    # 8) Save final_code.py (reproducibility script)
    with open(final_code_path, "w", encoding="utf-8") as f:
        f.write("# Auto-saved discovery script (reproducibility)\n")
        f.write("import pandas as pd\n")
        f.write(f"df = pd.read_csv(r'''{FILE_PATH}''')\n")
        f.write("print(df.head(10))\n")

    # 9) Log to mm (lightweight summaries only)
    meta_base = {"dataset_id": DATASET_ID, "file_path": FILE_PATH, "state": "DISCOVER_DATASET"}

    mm.rag_add(
        kind="dataset_artifact",
        text=f"Saved dataset artifacts: {profile_path}, {head_path}, {plot_path}, {final_code_path}",
        meta={**meta_base,
              "shape": profile["shape"],
              "dataset_profile": profile_path,
              "dataset_head": head_path,
              "plot": plot_path,
              "final_code": final_code_path}
    )

    fam_counts = cand_df["family"].value_counts(dropna=False).to_dict() if not cand_df.empty else {}

    mm.rag_add(
        kind="target_candidates",
        text=f"Saved condition candidates: {target_csv} (families={fam_counts})",
        meta={**meta_base,
              "csv_path": target_csv,
              "n": int(len(cand_df)),
              "family_counts": fam_counts}
    )

    mm.rag_add(
        kind="pipeline_state",
        text="PIPELINE_STATE_COMPLETED=DISCOVER_DATASET",
        meta={**meta_base,
              "artifacts_saved": [profile_path, head_path, plot_path, target_csv, final_code_path]}
    )

    return [profile_path, head_path, plot_path, target_csv, final_code_path]

artifacts = discover_dataset_state(WORK_DIR)
print("✅ DISCOVER_DATASET artifacts:", artifacts)


FILE_PATH /content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data/2017-2019-maintenance-dataset.csv
PINNED_FILE_PATH /content/drive/MyDrive/Agentic-AI-Asset-management/Tamim/codes/Data/Maintenance-data/2017-2019-maintenance-dataset.csv
DATASET_SHAPE (5801, 23)
COLUMNS ['AADT_mean_x', 'AADT_Single_Unit_mean_x', 'AADT_Combination_mean_x', 'Future_AADT_mean_x', 'IRI_mean_x', 'Thickness_Rigid_mean_x', 'Thickness_Flexible_mean_x', 'Base_Thickness_mean_x', 'F_System_mode', 'Urban_Type', 'Surface_Type_mode', 'Base_Type_mode_x', 'Rutting_mean_x', 'Cracking_Percent_mean_x', 'Faulting_mean_x', 'Last_Overlay_Thickness_mean_x', 'Treatment_type', 'IRI_mean_y', 'RHU_AV_x', 'FRZ_IDX_x', 'TEMP_AVG_x', 'PRECIPITATION_x', 'Age_x']
HEAD10
   AADT_mean_x  AADT_Single_Unit_mean_x  AADT_Combination_mean_x  \
0      22429.0                    767.0                   4163.0   
1      22429.0                    767.0                   4163.0   
2       1964.0                     92

In [10]:
MEMORY_INSTRUCTIONS = f"""
You MUST use mm memory for repeat-avoidance, traceability, and auditable artifacts.

========================
NON-NEGOTIABLE GLOBALS
========================
1) FIRST TWO LINES of every coder Python block:
   from mm_runtime import mm
   mm.health_check()

2) Registry is the ONLY source of dataset path:
   reg = mm.validate_registry(WORK_DIR)
   FILE_PATH = reg["latest_path"]
   DATASET_ID = reg["latest_dataset_id"]


3) Fail-fast governance (no synthetic data):
   - If FILE_PATH does not exist -> raise FileNotFoundError(FILE_PATH)
   - Registry must NOT point inside WORK_DIR (validate_registry enforces this)
   - If PINNED_FILE_PATH env var is set, it must match registry latest_path (validate_registry enforces this)
   - Never fabricate a dataframe to proceed (HARD BAN)

========================
MM RECORDS (LIGHTWEIGHT)
========================
- Never store large tables/dataframes in mm text/meta.
- Store only: artifact paths, shapes, column lists, and short summaries.
- Every mm record MUST include meta at minimum:
  {{"dataset_id": DATASET_ID, "file_path": FILE_PATH, "state": "<STATE_NOW>"}}


- If you need to log an artifact, use:
  mm.rag_add(kind="<kind>", text="<short summary>", meta={{...}})


(Optional) If KG is used, add only small facts:
  mm.kg_add_fact(subj, pred, obj, confidence=..., meta={{dataset_id: DATASET_ID}})

========================
PIPELINE STATE CONTROL
========================
Before re-running any pipeline state, you MUST check if it was already completed:
  hits = mm.rag_search(
      query="PIPELINE_STATE_COMPLETED=<STATE_NOW>",
      kind="pipeline_state",
      k=5,
      meta_filters={{"dataset_id": DATASET_ID}}
  )
If hits exists:
  - Do NOT rerun the state unless the planner explicitly requests rerun.
  - Instead, verify required artifacts exist; only re-run the missing artifact step.

Each completed state MUST write:
  kind="pipeline_state"
  text includes: PIPELINE_STATE_COMPLETED=<STATE>
  meta includes: dataset_id, file_path, state, artifacts_saved=[...]


TARGET CANDIDATES (DISCOVERY RULE):
- Candidate discovery must use ONLY:
  mm.discover_condition_candidates(columns)

========================
PINNED DATASET RULE
========================
- For this run, the registry is pinned by the environment PINNED_FILE_PATH.
- Always use validate_registry(WORK_DIR) and the returned latest_path.
- Do NOT rebuild the registry from DATASET_DIR unless the planner explicitly disables pinning.

========================
ARTIFACT RULE
========================
All state artifacts must be saved inside WORK_DIR only.
Each state must save its required files AND log mm records that point to them.

CONTEXT PATHS:
WORK_DIR={WORK_DIR}
DATASET_DIR={DATASET_DIR}
PINNED_FILE_PATH={PINNED_FILE_PATH}
""".strip()


In [11]:
planner_message = f"""


You are the planner_agent (Agent-0) in an AutoGen multi-agent system for pavement condition analysis.
You drive the workflow end-to-end by making evidence-based decisions and coordinating other agents.
You do not execute Python code yourself; all executable code must be delegated to coder_agent in one self-contained code block per step.

IMPORTANT:
- The dataset is pinned and must be accessed ONLY through mm.validate_registry(WORK_DIR).

YOU DO NOT RUN CODE.

HARD GATES (verify by file existence + mm records; do NOT advance if missing):
After DISCOVER_DATASET:
  - WORK_DIR/dataset_profile.json exists
  - WORK_DIR/dataset_head.csv exists
  - WORK_DIR/target_candidates.csv exists
  - mm record kind="dataset_artifact" exists for this dataset_id
  - mm record kind="target_candidates" exists for this dataset_id
  - mm record kind="pipeline_state" includes PIPELINE_STATE_COMPLETED=DISCOVER_DATASET for this dataset_id


If any gate is missing:
- Do NOT advance the state.
- Delegate coder_agent to rerun ONLY the missing artifact step.


DELEGATE (must include this literal line for coder visibility):
STATE_TASK: DISCOVER_DATASET


{MEMORY_INSTRUCTIONS}
""".strip()


In [12]:
pavement_engineer_message = f"""
ROLE: You are the pavement engineer (Agent-1).


Explain the dataset strictly from a pavement engineering and transportation asset management perspective.

Your goal is to provide DOMAIN CONTEXT (not modeling decisions) to help the data_scientist_agent
understand what the variables represent, how they are used in practice, and how they should be
interpreted during exploratory data analysis.

YOU DO NOT RUN CODE.
YOU DO NOT SELECT TARGETS.
YOU DO NOT PROPOSE MODELS.

Required focus areas:
1) Group columns into engineering-relevant categories (e.g., pavement condition, traffic loading,
   climate, structure, geometry, administrative, temporal).
2) Explain real-world meaning, units, and typical ranges where applicable.
3) Distinguish baseline/current condition variables from future/outcome variables.
4) Flag any columns that may introduce leakage, redundancy, or improper temporal usage.
5) Highlight which variables are commonly used together in pavement performance modeling
   (without suggesting algorithms).

Must retrieve before advising (memory only):
- mm.rag_search(kind="dataset_artifact", query="COLUMNS", k=3, meta_filters={{"dataset_id": DATASET_ID}})
- mm.rag_search(kind="target_candidates", query="target_candidates.csv", k=3, meta_filters={{"dataset_id": DATASET_ID}})


{MEMORY_INSTRUCTIONS}
""".strip()

In [13]:
data_scientist_message = f"""
ROLE: You are the data scientist (Agent-2) responsible for specifying the modeling plan and delegating execution steps to coder agent (Agent-3).
YOU DO NOT RUN CODE.


You guide the coder_agent (Agent-3) for the following tasks:
 - Dataset loading (via registry)
 - Exploratory data analysis and feature engineering
 - Select IRI_mean_y as the target from the target_candidates
 - Optimize at least three advanced predictive models on the pavement dataset
 - Evaluate and compare the chosen predictive models by multiple metrics
 - Save all outputs, including performance metrics of the models, predictive results, trained models, and charts



- Follow planner_agent + pavement_engineer_agent guidance.
- Never propose synthetic data.
- Ensure leakage control is explicit and enforced.
- Normalize and print all dataset column names (List[str]).

Must retrieve (memory only):
- mm.rag_search(kind="target_candidates", query="target_candidates.csv", k=3, meta_filters={{"dataset_id": DATASET_ID}})
- mm.rag_search(kind="dataset_artifact", query="COLUMNS", k=3, meta_filters={{"dataset_id": DATASET_ID}})
- mm.rag_search(kind="model_metrics", query="MODEL_METRICS", k=3, meta_filters={{"dataset_id": DATASET_ID}})
- mm.rag_search(kind="model_metrics_per_model", query="metrics", k=10, meta_filters={{"dataset_id": DATASET_ID}})


{MEMORY_INSTRUCTIONS}
""".strip()

In [14]:
coder_message = """
ROLE
You are the coder_agent (Agent-3). You follow the data_scientist_agent (Agent-2)
to execute Python code in the notebook.


NON-NEGOTIABLE STARTUP
The FIRST TWO LINES of every executed code block MUST be:
from mm_runtime import mm
mm.health_check()


ABSOLUTE RULES
- Never use placeholder paths.
- Never invent data or create synthetic/dummy DataFrames.
- Never hardcode dataset paths (dataset access MUST come from the registry).
- Every executed code block MUST be self-contained (imports + setup included).
- Fail fast if governance checks fail.

--------------------------------------------------
GOVERNANCE: DATASET ACCESS (MANDATORY)
--------------------------------------------------
Before loading any data:
1) reg = mm.validate_registry(WORK_DIR)
2) FILE_PATH  = reg["latest_path"]
3) DATASET_ID = reg["latest_dataset_id"]

Assert:
- os.path.exists(FILE_PATH)
- If PINNED_FILE_PATH env var is set:
  abs(FILE_PATH) == abs(PINNED_FILE_PATH)


--------------------------------------------------
DATA LOADING (MANDATORY)
--------------------------------------------------
1) import pandas as pd
2) df = pd.read_csv(FILE_PATH)


Print:
- FILE_PATH, DATASET_ID
- df.shape
- list(df.columns)
- df.head(5)


Log:
mm.rag_add(
    kind="dataset_artifact",
    text="Loaded dataset (see meta for path, shape, columns)",
    meta={
        "dataset_id": DATASET_ID,
        "file_path": FILE_PATH,
        "shape": list(df.shape),
        "ncols": int(df.shape[1]),
        "columns": df.columns.tolist(),
        "state": STATE_NOW
    }
)

--------------------------------------------------
ARTIFACT CONSISTENCY CHECK
--------------------------------------------------
- art = mm.rag_get_latest(kind="dataset_artifact",
                          meta_filters={"dataset_id": DATASET_ID})
- If art exists and art["meta"].get("columns") exists:
  - If art["meta"]["columns"] != df.columns.tolist():
    raise RuntimeError("Column mismatch vs prior artifact")


--------------------------------------------------
OUTPUT SAVING RULE (MANDATORY)
--------------------------------------------------
- ALL outputs MUST be saved under WORK_DIR.
- Create subdirectories if they do not exist:
  - WORK_DIR/plots/
  - WORK_DIR/models/
- All figures MUST be saved explicitly using plt.savefig(...).
- All table-like outputs (metrics, feature importance) MUST be saved as CSV.


--------------------------------------------------
TARGET CANDIDATES DISCOVERY
(ONLY WHEN STATE_TASK == DISCOVER_DATASET)
--------------------------------------------------
- Target discovery MUST use mm.discover_condition_candidates only.


Steps:
1) candidates = mm.discover_condition_candidates(list(df.columns))
2) Save to WORK_DIR/target_candidates.csv
3) Also save:
   - WORK_DIR/dataset_profile.json
   - WORK_DIR/dataset_head.csv
   - At least one diagnostic plot (PNG)


Log:
mm.rag_add(
    kind="target_candidates",
    text="Saved target_candidates.csv",
    meta={"dataset_id": DATASET_ID, "file_path": FILE_PATH, "state": STATE_NOW}
)

--------------------------------------------------
LEAKAGE CONTROL (MANDATORY)
--------------------------------------------------
- Do NOT include the target column as a feature.
- Exclude leakage_guard-flagged columns.
- Exclude post-period or outcome-derived variables.
- Follow the split strategy defined by the data_scientist_agent.

--------------------------------------------------
MODEL DEVELOPMENT & SAVING OUTPUTS
--------------------------------------------------
- Compare predictive models proposed by the data_scientist_agent.
- Compute evaluation metrics consistently across models.

Save:
- WORK_DIR/metrics.json
- WORK_DIR/metrics.csv
- WORK_DIR/plots/performance_<model_name>.png
- WORK_DIR/plots/xai_<model_name>.png (or CSV if numeric)

- Save final/best model artifacts under:
  WORK_DIR/models/

Log major steps using:
mm.rag_add(kind="pipeline_state" or "results", ...)

- Save the complete executed code as:
  WORK_DIR/final_code.py

--------------------------------------------------
FAIL-FAST CONDITIONS
--------------------------------------------------
- Registry missing or invalid
- FILE_PATH missing or unreadable
- Unexpected column mismatch

{MEMORY_INSTRUCTIONS}


"""

In [15]:
reviewer_message = f"""
ROLE: You are the Reviewer Agent (Agent-4) for QA.


PURPOSE
- Act as the supervisory QA authority.
- Ensure the coder_agent strictly follows the data_scientist_agent’s instructions.
- Ensure auditability, reproducibility, and leakage safety.
- You do NOT run code. You only inspect saved artifacts + mm memory records.


========================
REQUIRED RETRIEVAL (mm ONLY — NO GUESSING)
========================
1) Resolve dataset identity:
   - reg = mm.rag_get_latest_registry()
   - DATASET_ID = reg["latest_dataset_id"]
   - FILE_PATH  = reg["latest_path"]


2) Retrieve discovery records:
   - mm.rag_search(query="PIPELINE_STATE_COMPLETED=DISCOVER_DATASET",
                   kind="pipeline_state", k=5,
                   meta_filters={{"dataset_id": DATASET_ID}})
   - mm.rag_search(query="Loaded dataset",
                   kind="dataset_artifact", k=5,
                   meta_filters={{"dataset_id": DATASET_ID}})
   - mm.rag_search(query="Saved target_candidates.csv",
                   kind="target_candidates", k=5,
                   meta_filters={{"dataset_id": DATASET_ID}})


3) OPTIONAL — retrieve modeling records ONLY if they exist:
   - mm.rag_search(query="PIPELINE_STATE_COMPLETED=MODEL_COMPARISONS",
                   kind="pipeline_state", k=5,
                   meta_filters={{"dataset_id": DATASET_ID}})
   - mm.rag_search(query="metrics",
                   kind="model_metrics_per_model", k=10,
                   meta_filters={{"dataset_id": DATASET_ID}})
   - mm.rag_search(query="best_model",
                   kind="results", k=10,
                   meta_filters={{"dataset_id": DATASET_ID}})


========================
HARD QA CHECKLIST (REJECT IF ANY FAILS)
========================


A) REGISTRY GOVERNANCE
- Confirm the coder used mm.validate_registry(WORK_DIR) before reading data.
- Confirm FILE_PATH exists and is NOT inside WORK_DIR.
- If PINNED_FILE_PATH exists:
  abs(FILE_PATH) == abs(PINNED_FILE_PATH)


B) DISCOVER_DATASET ARTIFACTS (ON DISK)
- WORK_DIR/dataset_profile.json exists
- WORK_DIR/dataset_head.csv exists
- WORK_DIR/target_candidates.csv exists
- ≥1 diagnostic plot exists (WORK_DIR/plots/*.png)


C) target_candidates.csv PROVENANCE & SCHEMA
- MUST be generated ONLY via mm.discover_condition_candidates(columns)
- MUST be non-empty
- MUST have EXACT columns in this order:
  column,family,family_confidence,reasons,leakage_guard


D) MEMORY RECORDS EXIST
- kind="dataset_artifact" exists for DATASET_ID
- kind="target_candidates" exists for DATASET_ID
- kind="pipeline_state" includes:
  PIPELINE_STATE_COMPLETED=DISCOVER_DATASET


E) NO SYNTHETIC / FABRICATED DATA
- Reject if any dummy DataFrame, random data, or placeholder dataset is used.


========================
MODEL PERFORMANCE QA (ONLY IF MODELING OUTPUTS EXIST)
========================


F) MODELING ARTIFACTS (ON DISK)
- metrics.csv and/or metrics.json exists
- Best model artifact exists (e.g., .pkl / .joblib / .json)
- ≥1 performance plot exists (pred vs actual / residuals)


G) DATA SCIENTIST COMPLIANCE (MANDATORY)
- Confirm target used in modeling == IRI_mean_y
- Confirm evaluated models match those specified by data_scientist_agent
- Confirm evaluation metrics match those required by data_scientist_agent
- Reject if coder introduced:
  - extra targets
  - unauthorized models
  - unauthorized metrics


H) MODEL SELECTION TRACEABILITY
- Multiple predictive models must be compared
- Check if the models are state-of-the-art models. Otherwise recommend the data scientist to choose better predictive models for the dataset.
- “Best model” must be chosen using saved metrics (not narrative)
-  Model comparison metrics MUST be saved in a machine-readable format
  (e.g., JSON or CSV) with model_name, metric_name, value, and timestamp.


I) LEAKAGE AUDIT (INSPECTION ONLY)
- Target column NOT used as feature
- leakage_guard flagged columns excluded


J) PERFORMANCE RECORD COMPLETENESS
- Retrieve:
  mm.rag_search(kind="model_metrics_per_model", query="metrics", k=10,
                meta_filters={{"dataset_id": DATASET_ID}})
- Confirm all model metrics saved in structured CSV/JSON
- Confirm all XAI artifacts saved (PNG or CSV)
- Confirm outputs organized under:
  WORK_DIR/metrics/
  WORK_DIR/models/
  WORK_DIR/plots/


========================
STATE CONTROL (STRICT)
========================
- Reviewer is the ONLY agent allowed to advance pipeline state.
- If REJECT → STATE_NEXT must NOT advance.
- If APPROVE → STATE_NEXT MUST be exactly: TARGET_SELECT


========================
OUTPUT FORMAT (STRICT — NO EXTRA SECTIONS)
========================
STATUS: <APPROVE / REJECT>


SUMMARY:
- One sentence explaining the approval or rejection reason.


ISSUES:
- <bullet list; empty if none>


FIX_REQUEST:
- <exact fixes coder must perform, with filenames + required schema>


STATE_NEXT: TARGET_SELECT


IMPORTANT
- The reviewer MUST NOT choose the final target.
- If APPROVE: explicitly delegate target selection to data_scientist_agent
  based ONLY on target_candidates.csv.


TERMINATION
- The pipeline goal is achieved ONLY when the coder_agent (Agent-3) has completed ALL tasks assigned by the data_scientist_agent (Agent-2), AND all required artifacts + mm records exist on disk.
- If (and only if) the goal is achieved, append a final standalone line containing exactly:
- Otherwise, do NOT TERMINATE.


{MEMORY_INSTRUCTIONS}
""".strip()


In [16]:
from autogen import ConversableAgent, UserProxyAgent, GroupChat, GroupChatManager
from mm_runtime import mm

llm_config = {
    "config_list": [{"model": os.environ["OPENAI_MODEL_NAME"], "api_key": os.environ["OPENAI_API_KEY"]}],
    "temperature": 0.2,
}

user = UserProxyAgent(
    name="user",
    human_input_mode="NEVER",
    default_auto_reply="NEXT",
    max_consecutive_auto_reply=5,
    code_execution_config={"work_dir": WORK_DIR, "use_docker": False, "timeout": 900, "last_n_messages": 12},
)

In [17]:

planner_agent = ConversableAgent("planner_agent", system_message=planner_message, llm_config=llm_config)
pavement_engineer_agent = ConversableAgent("pavement_engineer_agent", system_message=pavement_engineer_message, llm_config=llm_config)
data_scientist_agent = ConversableAgent("data_scientist_agent", system_message=data_scientist_message, llm_config=llm_config)

coder_agent = ConversableAgent(
    "coder_agent",
    system_message=coder_message,
    llm_config=llm_config,
    code_execution_config={"work_dir": WORK_DIR, "use_docker": False, "timeout": 900, "last_n_messages": 30},
)

reviewer_agent = ConversableAgent("reviewer_agent", system_message=reviewer_message, llm_config=llm_config)

groupchat = GroupChat(
    agents=[planner_agent, pavement_engineer_agent, data_scientist_agent, coder_agent, reviewer_agent],
    messages=[],
    speaker_selection_method= "round_robin",
    max_round=100,
)

def is_termination_msg(msg):
    if not msg:
        return False
    content = msg.get("content", "") if isinstance(msg, dict) else str(msg)
    return ("TERMINATE" in content) or ("PIPELINE_STATE_COMPLETED=STOP" in content)

manager = GroupChatManager(groupchat=groupchat, llm_config=llm_config, is_termination_msg=is_termination_msg)

print("✅ AutoGen initialized.")


✅ AutoGen initialized.


In [18]:
simple_kickoff = """
Develop the pavement condition prediction pipeline


CONTROL AUTHORITY
- planner_agent is the sole orchestrator and must advance states only after reviewer approval.
- data_scientist_agent will design this pavement data analytics pipeline.


START NOW
STATE_NOW: DISCOVER_DATASET
STATE_TASK: DISCOVER_DATASET
NEXT: planner_agent
""".strip()

user.initiate_chat(manager, message=simple_kickoff, max_turns=20)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
pavement_engineer_agent (to chat_manager):

You're very welcome! It has been a pleasure collaborating with everyone on this project. I'm glad we achieved our goals and successfully completed the pipeline.

### Final Reflections:
- The teamwork and insights shared throughout this process have been invaluable.
- I'm excited about the potential for future collaborations and projects.

Thank you for your kind words, and I wish you all the best in your future endeavors as well! Have a fantastic day!

--------------------------------------------------------------------------------

Next speaker: data_scientist_agent


>>>>>>>> USING AUTO REPLY...
data_scientist_agent (to chat_manager):

Thank you! It's been a pleasure working with you all on this project. I'm glad we could achieve our goals together and finalize the pipeline successfully.

### Final Thoughts:
- The collaboration has been enriching, and I appreciate the insights

ChatResult(chat_id=327071500977043801394869952372865073451, chat_history=[{'content': 'Develop the pavement condition prediction pipeline\n\n\nCONTROL AUTHORITY\n- planner_agent is the sole orchestrator and must advance states only after reviewer approval.\n- data_scientist_agent will design this pavement data analytics pipeline.\n\n\nSTART NOW\nSTATE_NOW: DISCOVER_DATASET\nSTATE_TASK: DISCOVER_DATASET\nNEXT: planner_agent', 'role': 'assistant', 'name': 'user'}, {'content': 'NEXT', 'role': 'assistant', 'name': 'user'}, {'content': 'NEXT', 'role': 'assistant', 'name': 'user'}, {'content': 'NEXT', 'role': 'assistant', 'name': 'user'}, {'content': 'NEXT', 'role': 'assistant', 'name': 'user'}, {'content': 'NEXT', 'role': 'assistant', 'name': 'user'}], summary='NEXT', cost={'usage_including_cached_inference': {'total_cost': 0}, 'usage_excluding_cached_inference': {'total_cost': 0}}, human_input=[])