In [14]:
# ========== STEP 1: CONFIG & SECRETS ==========
import os

# Paste keys each session (safer than hard-coding in a public notebook)
AIX_KEY = os.getenv("AIXPLAIN_API_KEY") or input("Paste your aiXplain API key (won't be saved): ").strip()
os.environ["AIXPLAIN_API_KEY"] = AIX_KEY

# These are aiXplain marketplace tool IDs; set them via env or paste here.
TAVILY_TOOL_ID     = os.getenv("TAVILY_TOOL_ID",     "6736411cf127849667606689")
GPT4O_MINI_TOOL_ID = os.getenv("GPT4O_MINI_TOOL_ID", "669a63646eb56306647e1091")

print("aiXplain key set:", bool(os.environ.get("AIXPLAIN_API_KEY")))


aiXplain key set: True


In [15]:
# ========== STEP 2: DEPENDENCIES & INDEX ==========
!pip -q install aixplain kaggle beautifulsoup4

from aixplain.factories import IndexFactory
INDEX_ID = "68c3d266846e880471f96476"   # <-- your existing index id
index = IndexFactory.get(INDEX_ID)
print("Loaded index:", INDEX_ID)


Loaded index: 68c3d266846e880471f96476


In [None]:
try:
    from aixplain.modules.agent.tool.model_tool import ModelTool
    index_tool = ModelTool(
        model=index,
        name="aiR Index Search",
        description="Semantic search over policy corpus (returns top-k passages)."
    )
    print("[OK] Wrapped index with ModelTool")
except Exception as e:
    index_tool = index  # fallback
    print("[INFO] ModelTool unavailable, using raw index as tool. Error:", e)


[OK] Wrapped index with ModelTool


In [None]:
# ========== STEP 3: KAGGLE AUTH & DATASETS ==========
from google.colab import files

print("Upload kaggle.json (Kaggle > Account > Create New API Token) …")
uploaded = files.upload()  # select kaggle.json

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Example datasets (feel free to change/add)
!kaggle datasets download -d jessemostipak/gdpr-violations -p /content --unzip
!kaggle datasets download -d umerhaddii/ai-governance-documents-data -p /content --unzip

# Quick peek
import pandas as pd
df = pd.read_csv("/content/gdpr_violations.csv")
print("gdpr_violations:", df.shape)
df.head(3)


Upload kaggle.json (Kaggle > Account > Create New API Token) …


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/jessemostipak/gdpr-violations
License(s): unknown
Downloading gdpr-violations.zip to /content
  0% 0.00/108k [00:00<?, ?B/s]
100% 108k/108k [00:00<00:00, 322MB/s]
Dataset URL: https://www.kaggle.com/datasets/umerhaddii/ai-governance-documents-data
License(s): CC0-1.0
Downloading ai-governance-documents-data.zip to /content
  0% 0.00/5.26M [00:00<?, ?B/s]
100% 5.26M/5.26M [00:00<00:00, 800MB/s]
gdpr_violations: (437, 11)


Unnamed: 0,id,picture,name,price,authority,date,controller,article_violated,type,source,summary
0,1,https://www.privacyaffairs.com/wp-content/uplo...,Poland,9380,Polish National Personal Data Protection Offic...,10/18/2019,Polish Mayor,Art. 28 GDPR,Non-compliance with lawful basis for data proc...,https://uodo.gov.pl/decyzje/ZSPU.421.3.2019,No data processing agreement has been conclude...
1,2,https://www.privacyaffairs.com/wp-content/uplo...,Romania,2500,Romanian National Supervisory Authority for Pe...,10/17/2019,UTTIS INDUSTRIES,Art. 12 GDPR|Art. 13 GDPR|Art. 5 (1) c) GDPR|A...,Information obligation non-compliance,https://www.dataprotection.ro/?page=A_patra_am...,A controller was sanctioned because he had unl...
2,3,https://www.privacyaffairs.com/wp-content/uplo...,Spain,60000,Spanish Data Protection Authority (AEPD),10/16/2019,Xfera Moviles S.A.,Art. 5 GDPR|Art. 6 GDPR,Non-compliance with lawful basis for data proc...,https://www.aepd.es/resoluciones/PS-00262-2019...,The company had unlawfully processed the perso...


In [None]:
# ========== STEP 4: STABLE-ID INGESTION (BATCH + RETRY) ==========
import os, csv, time, hashlib, re, requests
from typing import List
from textwrap import wrap
from bs4 import BeautifulSoup
from aixplain.modules.model.record import Record

def stable_id_from_str(s: str, salt: str) -> str:
    return hashlib.sha256(f"{salt}||{s}".encode("utf-8")).hexdigest()

def batch(iterable, size):
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) == size:
            yield buf
            buf = []
    if buf:
        yield buf

def upsert_batched(records: List[Record], batch_size=500, retries=3, name="records"):
    done_total, fail_total = 0, 0
    for chunk in batch(records, batch_size):
        for attempt in range(1, retries+1):
            try:
                index.upsert(chunk)
                done_total += len(chunk)
                break
            except Exception as e:
                if attempt < retries:
                    time.sleep(1.5 * attempt)
                else:
                    fail_total += len(chunk)
                    print(f"[WARN] Final failure on {name} batch after {retries} tries: {e}")
    print(f"[UPSERT] {name} | Success: {done_total} | Failed: {fail_total}")

# ---- CSV → Records helpers ----
def stable_id_from_row(row: dict, salt: str, key_fields=None) -> str:
    key_fields = key_fields or ["name", "company", "organization", "violation", "date", "country", "fine", "amount"]
    blob = "|".join([str(row.get(k, "")) for k in key_fields]).strip()
    if not blob:
        blob = "|".join([f"{k}={v}" for k,v in row.items()])
    return stable_id_from_str(blob, salt)

def record_from_row(row: dict, salt: str, source_tag: str, dataset_tag: str) -> Record:
    fields_order = ("name", "company", "organization", "violation", "summary", "date", "fine", "amount", "country")
    parts = [f"{k}: {row[k]}" for k in fields_order if k in row and str(row[k]).strip()]
    text = " ; ".join(parts) if parts else " ; ".join([f"{k}: {v}" for k,v in list(row.items())[:6]])
    rid = stable_id_from_row(row, salt=salt)
    return Record(
        id=rid,
        value=text,
        attributes={"source": source_tag, "dataset": dataset_tag}
    )

def ingest_csv(csv_path: str, salt: str, source_tag: str, dataset_tag: str, max_rows=None, name="csv"):
    assert os.path.exists(csv_path), f"CSV not found: {csv_path}"
    rows: List[Record] = []
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if max_rows is not None and i >= max_rows:
                break
            rows.append(record_from_row(row, salt=salt, source_tag=source_tag, dataset_tag=dataset_tag))
    print(f"[PREP] {name}: {len(rows)} records")
    upsert_batched(rows, name=name)

# ---- Ingest the example CSVs ----
ingest_csv("/content/gdpr_violations.csv",
           salt="gdpr",
           source_tag="kaggle:gdpr_violations",
           dataset_tag="jessemostipak/gdpr-violations",
           max_rows=None,
           name="gdpr_violations")

# This dataset file name may differ; adjust if needed
# Try to locate a CSV within the second dataset folder:
import glob
gov_csvs = glob.glob("/content/*.csv") + glob.glob("/content/*/*.csv")
gov_csv = None
for c in gov_csvs:
    if "governance" in c.lower() or "ai" in c.lower():
        gov_csv = c
        break
print("Detected governance CSV:", gov_csv)

if gov_csv:
    ingest_csv(gov_csv,
               salt="ai_governance",
               source_tag="kaggle:ai_governance",
               dataset_tag="umerhaddii/ai-governance-documents-data",
               max_rows=None,
               name="ai_governance")
else:
    print("[INFO] Could not auto-detect governance CSV—skip for now.")


[PREP] gdpr_violations: 437 records
[UPSERT] gdpr_violations | Success: 437 | Failed: 0
Detected governance CSV: /content/sample_data/mnist_train_small.csv
[PREP] ai_governance: 19999 records
[UPSERT] ai_governance | Success: 19999 | Failed: 0


In [None]:
# ========== STEP 5 (FIXED): WEBSITE INGESTION ==========

import requests, re, hashlib
from bs4 import BeautifulSoup
from aixplain.modules.model.record import Record

def fetch_url_text(url: str, max_bytes: int = 2_000_000) -> tuple[str, str]:
    """Fetch HTML safely:
    - Adds a UA header (some sites block default).
    - Caps payload to avoid OOM.
    - Skips non-HTML content types (e.g., PDFs).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Colab/PolicyNavigator (+https://aixplain.com)"
    }
    r = requests.get(url, timeout=20, headers=headers, stream=True)
    r.raise_for_status()
    ctype = r.headers.get("Content-Type", "").lower()
    if "text/html" not in ctype:
        raise ValueError(f"Non-HTML content detected ({ctype}); skipping: {url}")

    # Read with cap
    data = bytearray()
    for chunk in r.iter_content(chunk_size=65536):
        if chunk:
            data.extend(chunk)
        if len(data) > max_bytes:
            break
    html = data.decode(errors="ignore")

    soup = BeautifulSoup(html, "html.parser")
    for bad in soup(["script", "style", "noscript"]):
        bad.extract()

    title = (soup.title.string.strip() if soup.title and soup.title.string else url).strip()
    # Normalize whitespace
    text = re.sub(r"\s+", " ", soup.get_text(separator=" ").strip())

    # Optional: cap very long pages to prevent huge chunk lists
    MAX_TEXT_LEN = 300_000
    if len(text) > MAX_TEXT_LEN:
        text = text[:MAX_TEXT_LEN]

    return title, text

def chunk_text(text: str, max_chars: int = 1200, overlap: int = 100):
    """Robust chunker:
    - Fast path for short text.
    - Guarantees forward progress even if overlap >= max_chars.
    """
    n = len(text)
    if n == 0:
        return []
    if n <= max_chars:
        return [text]

    # Ensure step > 0
    step = max(1, max_chars - max(0, overlap))
    chunks = []
    i = 0
    while i < n:
        j = min(i + max_chars, n)
        chunks.append(text[i:j])
        if j == n:
            break
        i += step  # always move forward
    return chunks

def make_web_records(url: str, source_tag: str = "web:gov_site") -> list[Record]:
    title, text = fetch_url_text(url)
    chunks = chunk_text(text)
    out = []
    for idx, chunk in enumerate(chunks):
        rid = hashlib.sha256(f"{url}||{idx}".encode("utf-8")).hexdigest()
        out.append(Record(
            id=rid,
            value=f"{title}\n\n{chunk}",
            attributes={"source": source_tag, "url": url, "title": title, "chunk_id": idx}
        ))
    return out

# Example pages — feel free to add more gov/regulatory URLs
WEB_URLS = [
    "https://www.federalregister.gov/executive-orders",
]

web_records = []
for u in WEB_URLS:
    try:
        recs = make_web_records(u, source_tag="web:federalregister")
        web_records.extend(recs)
        print(f"[PREP] {u} → {len(recs)} chunks")
    except Exception as e:
        print(f"[SKIP] {u} → {e}")

print(f"[PREP] Web records total: {len(web_records)}")

# Reuse the existing batched upsert helper from Step 4
upsert_batched(web_records, name="web_sources")


[PREP] https://www.federalregister.gov/executive-orders → 9 chunks
[PREP] Web records total: 9
[UPSERT] web_sources | Success: 9 | Failed: 0


In [None]:
# ===== COMPAT IMPORTS for Agent / Tool / TeamAgent =====
import importlib

def _load(symbol, candidates):
    last_err = None
    for mod, name in candidates:
        try:
            m = importlib.import_module(mod)
            obj = getattr(m, name)
            print(f"[OK] {symbol}: {mod}.{name}")
            return obj
        except Exception as e:
            last_err = e
    raise ImportError(f"Could not import {symbol}. Tried: {candidates}. Last error: {last_err}")

# Try common locations across SDK versions
Agent = _load("Agent", [
    ("aixplain.modules.agent", "Agent"),
    ("aixplain.modules.agents", "Agent"),
    ("aixplain", "Agent"),
])

Tool = _load("Tool", [
    ("aixplain", "Tool"),
    ("aixplain.modules.asset.tool", "Tool"),
    ("aixplain.modules.agent.tool", "Tool"),
])

TeamAgent = _load("TeamAgent", [
    ("aixplain.modules.team_agent", "TeamAgent"),
    ("aixplain.modules.agent.team_agent", "TeamAgent"),
])


[OK] Agent: aixplain.modules.agent.Agent
[OK] Tool: aixplain.modules.agent.tool.Tool
[OK] TeamAgent: aixplain.modules.team_agent.TeamAgent


In [None]:
# ===== COMPAT: loaders for Agent / TeamAgent / marketplace tools =====
import importlib

def _pick(mod_names, attr):
    last = None
    for m in mod_names:
        try:
            mod = importlib.import_module(m)
            obj = getattr(mod, attr)
            print(f"[OK] {attr} from {m}")
            return obj
        except Exception as e:
            last = e
    raise ImportError(f"Couldn't import {attr} from {mod_names}. Last error: {last}")

# Agent + TeamAgent (paths vary by SDK version)
Agent     = _pick(["aixplain.modules.agent",
                   "aixplain.modules.agents",
                   "aixplain"], "Agent")
TeamAgent = _pick(["aixplain.modules.team_agent",
                   "aixplain.modules.agent.team_agent"], "TeamAgent")

# Try several ways to load marketplace tools (tool/model assets)
try:
    from aixplain.factories import ToolFactory
except Exception:
    ToolFactory = None
try:
    from aixplain.factories import ModelFactory
except Exception:
    ModelFactory = None

def load_marketplace_tool(asset_id: str):
    """
    Works with different SDKs:
    - ToolFactory.get(id) if available
    - ModelFactory.get(id) (many tools are 'model' assets)
    """
    # 1) ToolFactory
    if ToolFactory is not None:
        try:
            t = ToolFactory.get(asset_id)
            print(f"[OK] ToolFactory.get → {asset_id}")
            return t
        except Exception as e:
            print(f"[INFO] ToolFactory.get failed for {asset_id}: {e}")
    # 2) ModelFactory
    if ModelFactory is not None:
        try:
            t = ModelFactory.get(asset_id)
            print(f"[OK] ModelFactory.get → {asset_id}")
            return t
        except Exception as e:
            print(f"[INFO] ModelFactory.get failed for {asset_id}: {e}")
    # 3) give up
    raise RuntimeError(f"Could not load tool/model asset '{asset_id}'. "
                       f"Double-check the ID and that your account has access.")


[OK] Agent from aixplain.modules.agent
[OK] TeamAgent from aixplain.modules.team_agent


In [None]:
from aixplain.factories import AgentFactory
from aixplain.factories import TeamAgentFactory


In [None]:
# ========== STEP 6 (FACTORY-BASED): AGENT TEAM WITH INSPECTOR ==========
from aixplain.factories import AgentFactory

# Load marketplace tools via the compat loader we added earlier
tavily = load_marketplace_tool(TAVILY_TOOL_ID)       # external corroboration / freshness
summ   = load_marketplace_tool(GPT4O_MINI_TOOL_ID)   # LLM (summarizer & inspector)

TOP_K = 8

retriever = AgentFactory.create(
    name="Retriever",
    description="Queries the aiR index and returns top-k relevant passages with metadata.",
    instructions=(
        "Given the user's question, call the aiR index tool semantically and return the "
        f"top-{TOP_K} most relevant passages.\n"
        "- Return passages as a concise list with text and attributes (id, url/title if present).\n"
        "- Do NOT fabricate content; only return what the index yields."
    ),
    tools=[index_tool]
)

verifier = AgentFactory.create(
    name="Verifier",
    description="Checks for freshness/corroboration using Tavily when needed.",
    instructions=(
        "If the question is time-sensitive or mentions 'latest/status/updated', call the Tavily tool to find a "
        "recent corroborating source. Return a one-line freshness note and the link(s). "
        "If not time-sensitive, reply exactly: 'No verification needed.'"
    ),
    tools=[tavily]
)

summarizer = AgentFactory.create(
    name="Summarizer",
    description="Writes a concise, grounded answer with sources.",
    instructions=(
        "Write a concise, accurate answer grounded ONLY in the retrieved passages (and verifier note if present).\n"
        "Format:\n"
        "• 4–6 bullet points (short, factual)\n"
        "• 'Sources:' followed by the list of source IDs or URLs from the passages/verifier\n"
        "If info is missing, say so. No hallucinations."
    ),
    tools=[summ]
)

inspector = AgentFactory.create(
    name="Inspector",
    description="Quality gate that enforces grounding, sources, and bullet format.",
    instructions=(
        "Quality gate for the final answer:\n"
        "1) Ensure the answer is grounded in retrieved content; no hallucinations.\n"
        "2) Ensure a 'Sources:' section exists with at least one ID or URL.\n"
        "3) Ensure bullet points are 4–6, concise, and factual.\n"
        "If any requirement is missing, revise the answer to fix issues. "
        "If everything is fine, return the answer unchanged."
    ),
    tools=[summ]
)

# Team
team = TeamAgentFactory.create(
    name="Policy Navigator (with Inspector)",
    description="Agentic RAG: aiR index → (optional) Tavily verify → LLM summarize → Inspector QA.",
    agents=[retriever, verifier, summarizer, inspector]
)



print("Team ready:", [a.name for a in team.agents])


[OK] ToolFactory.get → 6736411cf127849667606689
[OK] ToolFactory.get → 669a63646eb56306647e1091




Team ready: ['Retriever', 'Verifier', 'Summarizer', 'Inspector']


In [None]:
# ========== STEP 7: ASK() WITH LIGHT POST-CHECK ==========
import re

def needs_sources_fix(text: str) -> bool:
    if not isinstance(text, str):
        return True
    m = re.search(r"(?i)^sources\s*:\s*(.+)$", text, flags=re.MULTILINE)
    if not m:
        return True
    sources_line = m.group(1).strip()
    return len(sources_line) < 3  # very short/empty

def ask(question: str, top_k: int = TOP_K):
    print(f"\n[Q] {question}")
    result = team.run({"query": question, "top_k": top_k})
    trace_id = getattr(result, "trace_id", None)
    output = getattr(result, "output", result)

    # Post-check: if Inspector somehow let it through without 'Sources:', ask summarizer to append a sources line.
    if needs_sources_fix(str(output)):
        print("[POST-CHECK] Missing/weak 'Sources:' — requesting summarizer to append/clarify sources.")
        # Minimal repair prompt; uses the same summarizer tool directly.
        repair_prompt = (
            "The following answer is good, but it lacks a proper 'Sources:' section. "
            "Append a 'Sources:' line listing IDs or URLs that were actually referenced. "
            "If none are available, add 'Sources: (not available)'.\n\n"
            f"{output}"
        )
        try:
            fixed = summ.run({"input": repair_prompt})
            # Some tools return dict; try to extract
            output = getattr(fixed, "output", fixed)
        except Exception as e:
            print("[POST-CHECK] Repair attempt failed:", e)

    print("\n[Answer]\n", output)
    if trace_id:
        print("\n[trace_id]", trace_id)
    return output, trace_id

# Smoke test
_ = ask("Is Executive Order 14067 still in effect, and are there exemptions for small businesses?")



[Q] Is Executive Order 14067 still in effect, and are there exemptions for small businesses?
[POST-CHECK] Missing/weak 'Sources:' — requesting summarizer to append/clarify sources.

[Answer]
 ModelResponse(status=SUCCESS, data='Hello! How can I assist you today?', details=[{'index': 0, 'message': {'role': 'assistant', 'content': 'Hello! How can I assist you today?', 'refusal': None, 'annotations': []}, 'logprobs': None, 'finish_reason': 'stop'}], completed=True, used_credits=6.45e-06, run_time=0.366, usage={'prompt_tokens': 7, 'completion_tokens': 9, 'total_tokens': 16})
