# Notebook 2 · Self-Improving Micro-Loop

This notebook demonstrates the smallest possible agentic feedback loop.

You will:

1. Run a retrieval → answer pipeline over a tiny local corpus.
2. Evaluate the answer with a lightweight keyword proxy: the fraction of required phrases that appear in it.
3. Let an improver agent write a memory note capturing the missing facts.
4. Re-run retrieval so the new memory boosts the second attempt.

> 🧪 **Tip:** This entire demo runs offline. You can safely extend it with real embeddings, LLM calls, and groundedness metrics later.

## Setup

The corpus lives in `../data/mini_corpus.jsonl`. Memory notes are persisted to
`../data/memory_notes.jsonl` so you can inspect how knowledge accumulates between
iterations. Delete the memory file if you want to reset the loop.

In [None]:
from __future__ import annotations

import json
import uuid
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Tuple

import numpy as np
import pandas as pd

# Locations of the base corpus and of the persisted memory notes. The relative
# paths are resolved to absolute ones against the current working directory.
DATA_PATH = (Path("../data/mini_corpus.jsonl").resolve())
MEM_PATH = (Path("../data/memory_notes.jsonl").resolve())
EMBED_DIM = 256  # default vector length used by toy_embed
TOP_K = 3  # default number of documents returned by retrieve
SCORE_THRESHOLD = 0.75  # first-attempt scores below this trigger a memory note

# Create the data directory and an empty memory file on first use so that
# later reads and appends do not have to special-case a missing file.
DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
if not MEM_PATH.exists():
    MEM_PATH.touch()

In [None]:
def load_jsonl(path: Path) -> List[dict]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One parsed object per non-blank line, in file order. An empty list is
        returned when the file does not exist or has zero size.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists() or path.stat().st_size == 0:
        return []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with path.open(encoding="utf-8") as handle:
        # Skip blank / whitespace-only lines (e.g. a trailing newline left by
        # hand editing); json.loads would otherwise raise on them.
        return [json.loads(line) for line in handle if line.strip()]


# Base documents and previously written memory notes, both read from disk.
# `memory` is appended to in place by write_memory_note during the loop.
corpus = load_jsonl(DATA_PATH)
memory = load_jsonl(MEM_PATH)
print(f"Loaded {len(corpus)} base docs and {len(memory)} memory notes.")

In [None]:
def toy_embed(text: str, dim: int = EMBED_DIM) -> np.ndarray:
    """Turn text into a fixed-length, normalised bag-of-words vector.

    The text is lower-cased and split on whitespace. Each distinct token is
    mapped to one of ``dim`` slots through its UUIDv5 digest, and the token's
    occurrence count is added to that slot, so different tokens may share a
    slot.

    Args:
        text: Input text to embed.
        dim: Length of the returned vector.

    Returns:
        A float32 array of length ``dim`` divided by its Euclidean norm. Text
        without tokens yields an all-zero vector.
    """
    embedding = np.zeros(dim, dtype=np.float32)
    for word, occurrences in Counter(text.lower().split()).items():
        # UUID.int is the same 128-bit value as int(UUID.hex, 16).
        slot = uuid.uuid5(uuid.NAMESPACE_DNS, word).int % dim
        embedding[slot] += float(occurrences)
    # The small epsilon keeps the division defined for an all-zero vector.
    length = np.linalg.norm(embedding) + 1e-9
    return embedding / length


@dataclass
class RetrievedDoc:
    """A retrieved document together with its similarity score for a query."""

    # The document record (a corpus entry or a memory note).
    doc: dict
    # Dot product of the query embedding with this document's embedding.
    score: float


def embed_document(doc: dict) -> np.ndarray:
    """Embed a document from its title and body text.

    Args:
        doc: Record providing the ``"title"`` and ``"text"`` keys.

    Returns:
        The ``toy_embed`` vector of the title and text joined by one space.

    Raises:
        KeyError: If either key is missing from ``doc``.
    """
    combined = f"{doc['title']} {doc['text']}"
    return toy_embed(combined)


# Searchable index: every base document followed by every memory note, plus a
# parallel list holding one embedding per document (same order, same length).
ALL_DOCS: List[dict] = corpus + memory
DOC_EMBEDDINGS: List[np.ndarray] = [embed_document(doc) for doc in ALL_DOCS]


def refresh_index() -> None:
    """Rebuild the module-level index from the current corpus and memory.

    Rebinds ``ALL_DOCS`` to the corpus documents followed by the memory notes
    and recomputes ``DOC_EMBEDDINGS`` for every entry, in the same order.
    """
    global ALL_DOCS, DOC_EMBEDDINGS
    combined = [*corpus, *memory]
    ALL_DOCS = combined
    DOC_EMBEDDINGS = list(map(embed_document, combined))


# Rebuild the index from the current `corpus` and `memory` lists, so re-running
# this cell picks up any notes added since the lists were first indexed.
refresh_index()

In [None]:
def retrieve(query: str, top_k: int = TOP_K) -> List[RetrievedDoc]:
    """Return the documents most similar to ``query``.

    Similarity is the dot product between the query embedding and each entry
    of ``DOC_EMBEDDINGS``.

    Args:
        query: Free-text query to embed and match against the index.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` ``RetrievedDoc`` items ordered by descending score.
        An empty list is returned when the index holds no documents or when
        ``top_k`` is not positive.
    """
    # With no documents np.dot would receive an empty first operand and raise
    # a shape-mismatch ValueError; a negative top_k would slice from the wrong
    # end. Both cases simply mean "nothing to return".
    if not DOC_EMBEDDINGS or top_k <= 0:
        return []
    query_vec = toy_embed(query)
    sims = np.dot(DOC_EMBEDDINGS, query_vec)
    # Negate so that argsort's ascending order yields the highest scores first.
    top_indices = np.argsort(-sims)[:top_k]
    return [RetrievedDoc(ALL_DOCS[i], float(sims[i])) for i in top_indices]


def assemble_context(retrieved: Iterable[RetrievedDoc]) -> str:
    """Render retrieved documents as a single context string.

    Each document becomes a header line with its id, title and score (three
    decimals) followed by its text; documents are separated by a blank line.

    Args:
        retrieved: Retrieved documents, in the order they should appear.

    Returns:
        The formatted context, or an empty string when nothing was retrieved.
    """
    return "\n\n".join(
        f"[{hit.doc['id']}] {hit.doc['title']} (score={hit.score:.3f})\n{hit.doc['text']}"
        for hit in retrieved
    )


def generate_answer(question: str, context: str) -> str:
    """Build a simulated answer that echoes the question and the context.

    No model is called: the result is a fixed template wrapped around the
    inputs, which keeps the demo runnable offline.

    Args:
        question: The user question.
        context: The assembled retrieval context to include verbatim.

    Returns:
        The simulated answer text.
    """
    return (
        "(SIMULATED ANSWER)\n"
        + f"Question: {question}\n"
        # The line break must be the escape sequence "\n": a raw newline inside
        # a single-quoted literal is a syntax error.
        + "Context used:\n"
        + context
        + "\n---\nThe assistant summarises the retrieved notes above."
    )

In [None]:
def eval_keywords(answer: str, required_phrases: Iterable[str]) -> Tuple[float, List[str]]:
    """Score an answer by the share of required phrases it contains.

    Matching is a case-insensitive substring test.

    Args:
        answer: The answer text to inspect.
        required_phrases: Phrases expected in the answer. Any iterable is
            accepted, including one-shot iterators and generators.

    Returns:
        A ``(score, missing)`` tuple: ``score`` is the fraction of phrases
        found (``0.0`` when no phrases are given) and ``missing`` lists the
        phrases that were not found, in their original order.
    """
    # Materialise once: the phrases are both iterated and counted, which a
    # generator or iterator cannot support.
    phrases = list(required_phrases)
    answer_lower = answer.lower()
    missing = [phrase for phrase in phrases if phrase.lower() not in answer_lower]
    # max(1, ...) avoids dividing by zero when no phrases are required.
    score = (len(phrases) - len(missing)) / max(1, len(phrases))
    return score, missing


def write_memory_note(question: str, missing_phrases: Iterable[str]) -> dict:
    """Record the missing phrases as a new memory note and re-index.

    The note is appended to the in-memory ``memory`` list, written as one JSON
    line to ``MEM_PATH``, and the retrieval index is then rebuilt so the note
    is searchable immediately.

    Args:
        question: The question being answered; its first 48 characters are
            used in the note title.
        missing_phrases: Phrases absent from the answer, joined with ``"; "``
            in the note text.

    Returns:
        The new note, with ``"id"``, ``"title"`` and ``"text"`` keys.
    """
    additions = "; ".join(missing_phrases)
    note = {
        # A random 8-hex-digit suffix gives each note its own id.
        "id": f"mem-{uuid.uuid4().hex[:8]}",
        "title": f"Memory Note: {question[:48]}",
        "text": "Key additions: " + additions + ". Reminder: revisit these concepts next time.",
    }
    memory.append(note)
    with MEM_PATH.open("a") as sink:
        serialized = json.dumps(note)
        sink.write(serialized + "\n")
    refresh_index()
    return note

In [None]:
# Questions to run through the feedback loop. "required" lists the phrases
# that eval_keywords looks for in the generated answer.
evaluation_plan = [
    {
        "question": "Explain RAG and why retrieval reduces hallucination.",
        "required": ["retrieval", "grounding", "context", "hallucination"],
    },
    {
        "question": "What is the role of embeddings in a vector database?",
        "required": ["embeddings", "similarity search", "vector database"],
    },
]


def rag_attempt(question: str):
    """Run one retrieve-then-answer pass for a question.

    Args:
        question: The question to answer.

    Returns:
        An ``(answer, retrieved)`` tuple: the generated answer text and the
        retrieved documents that were used as its context.
    """
    hits = retrieve(question)
    answer = generate_answer(question, assemble_context(hits))
    return answer, hits


def run_feedback_loop(question: str, required_phrases: Iterable[str]):
    """Answer a question twice, writing a memory note in between if needed.

    The first attempt is scored against ``required_phrases``. When the score
    is below ``SCORE_THRESHOLD`` and phrases are missing, a memory note with
    those phrases is written, which also refreshes the index. The question is
    then attempted and scored a second time.

    NOTE: the simulated answer echoes the question itself, so required phrases
    that already occur in the question count as present.

    Args:
        question: The question to answer.
        required_phrases: Phrases the answer is expected to contain. Any
            iterable is accepted, including one-shot iterators.

    Returns:
        A report dict with the question, both scores, the ids of the documents
        retrieved on each attempt, and the id of the memory note written
        (``None`` when no note was needed).
    """
    # The phrases are evaluated twice; materialise them so a generator or
    # iterator is not already exhausted by the time of the second evaluation.
    required = list(required_phrases)

    answer1, retrieved1 = rag_attempt(question)
    score1, missing = eval_keywords(answer1, required)
    memory_note = None
    if score1 < SCORE_THRESHOLD and missing:
        memory_note = write_memory_note(question, missing)
    answer2, retrieved2 = rag_attempt(question)
    score2, _ = eval_keywords(answer2, required)
    return {
        "question": question,
        "attempt_1_score": score1,
        "attempt_1_docs": [doc.doc["id"] for doc in retrieved1],
        "memory_written": memory_note["id"] if memory_note else None,
        "attempt_2_score": score2,
        "attempt_2_docs": [doc.doc["id"] for doc in retrieved2],
    }

In [None]:
# Run the feedback loop once per planned question; each call may append a
# memory note to disk as a side effect.
reports = [run_feedback_loop(item["question"], item["required"]) for item in evaluation_plan]

# Tabulate the reports; the bare expression renders the frame as cell output.
df = pd.DataFrame(reports)
df

In [None]:
# Plain-text summary of every report: one block of four lines per question,
# followed by a blank line.
for item in reports:
    print(
        f"Question: {item['question']}",
        f"  Attempt 1 score: {item['attempt_1_score']:.2f} | docs: {item['attempt_1_docs']}",
        f"  Memory note: {item['memory_written']}",
        f"  Attempt 2 score: {item['attempt_2_score']:.2f} | docs: {item['attempt_2_docs']}\n",
        sep="\n",
    )

## Next steps

* Replace `toy_embed` with real embeddings such as `text-embedding-3-small` or
  `sentence-transformers` to mirror production behaviour.
* Swap the simulated summariser with a live LLM call via LangChain or the
  OpenAI client.
* Upgrade the keyword score to a groundedness metric like RAGAS.
* Track additional telemetry (token cost, run metadata) so you can compare the
  micro-loop against the larger agentic notebooks.