In [1]:
# =============================================================================
# Stage 1: Statement Extraction with DSPy
# =============================================================================
# This notebook orchestrates statement extraction training/evaluation.
# All application logic is in ontological_engineer - this notebook coordinates.
#
# Outputs (with CID provenance):
#   - data/training/chunks/*.ipynb - Chunked Wikipedia pages
#   - data/training/statements.ipynb - Extracted statements
#   - data/training/classifications.ipynb - Per-statement judgments
# =============================================================================

import sys
sys.path.insert(0, '/workspaces/wiki3-kg-project')

import dspy
import json
from pathlib import Path
from random import shuffle, seed as random_seed
from tqdm import tqdm

from ontological_engineer import (
    # LM Configuration
    configure_lm,
    # DSPy Modules
    StatementExtractor,
    StatementQualityJudge,
    StatementClassifier,
    StatementClassification,
    # Data Loading (from provenance-tracked notebooks)
    WikipediaPage,
    WikipediaChunk,
    load_sample_from_notebook,
    load_chunks_from_notebook,
    # Processing (all logic in module!)
    process_wikipedia_sample,
    fetch_page_content,
    chunk_article,
    # Provenance notebook generation
    save_notebook,
    get_processed_chunk_cids,
)
from ontological_engineer.judges import statement_quality_metric
from ontological_engineer.training.bootstrap import (
    load_chunks_from_notebook as load_albert_chunks,
    load_facts_from_notebook,
    create_training_examples,
)

## 1. Configure Language Model

Connect to LM Studio running Qwen-30B (or your preferred model).

In [2]:
# Configure the LM (defaults to Qwen-30B via LM Studio)
lm = configure_lm(
    model="qwen/qwen3-coder-30b",
    api_base="http://host.docker.internal:1234/v1",
    temperature=0.7,
)

print(f"Configured LM: {lm}")

Configured LM: <dspy.clients.lm.LM object at 0xffff8d3debf0>


## 2. Load Few-Shot Examples (Albert Einstein)

Albert Einstein is our gold-standard example. These chunks and their extracted facts
serve as few-shot demonstrations for the extractor and judge.

In [3]:
# Load Albert Einstein data for few-shot examples
fewshot_dir = Path("/workspaces/wiki3-kg-project/data/albert_einstein/20251218_231446")

# Use load_albert_chunks (from bootstrap module) - NOT load_chunks_from_notebook
fewshot_chunks = load_albert_chunks(fewshot_dir / "chunks.ipynb")
fewshot_facts = load_facts_from_notebook(fewshot_dir / "facts.ipynb")

print(f"Loaded {len(fewshot_chunks)} chunks from Albert Einstein")
print(f"Loaded {len(fewshot_facts)} fact sets")

# Create few-shot examples
fewshot_examples = create_training_examples(fewshot_chunks, fewshot_facts)
print(f"Created {len(fewshot_examples)} few-shot examples")

Loaded 63 chunks from Albert Einstein
Loaded 19 fact sets
Created 19 few-shot examples


In [4]:
# Show a few-shot example
if fewshot_examples:
    ex = fewshot_examples[0]
    print("Sample few-shot example:")
    print(f"  Context: {ex.section_context}")
    print(f"  Text: {ex.chunk_text[:200]}...")
    print(f"  Statements: {len(ex.statements)} items")
    for stmt in ex.statements[:3]:
        print(f"    - {stmt}")

Sample few-shot example:
  Context: Albert Einstein > Introduction
  Text: Albert Einstein (14 March 1879 ‚Äì 18 April 1955) was a German-born theoretical physicist best known for developing the theory of relativity. Einstein also made important contributions to quantum theory...
  Statements: 28 items
    - Albert Einstein was a German-born theoretical physicist.
    - Albert Einstein developed the theory of relativity.
    - Albert Einstein made important contributions to quantum theory.


## 3. Load Wikipedia Sample for Training

Load the 100-page Wikipedia sample from the provenance-tracked notebook.
If the notebook doesn't exist, fall back to JSON format.

In [5]:
# Load the Wikipedia sample (prefer provenance-tracked notebook)
sample_notebook = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.ipynb")
sample_json = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.json")

if sample_notebook.exists():
    # Load from provenance-tracked notebook
    wiki_pages = load_sample_from_notebook(sample_notebook)
    print(f"‚úÖ Loaded {len(wiki_pages)} pages from provenance-tracked notebook")
    print(f"   Source: {sample_notebook}")
elif sample_json.exists():
    # Fall back to JSON format
    with open(sample_json) as f:
        wiki_sample = json.load(f)
    wiki_pages = [WikipediaPage(title=p['title'], views=p['views']) for p in wiki_sample['pages']]
    print(f"‚ö†Ô∏è  Loaded {len(wiki_pages)} pages from JSON (no provenance)")
    print(f"   Run sample_wikipedia_pages.ipynb to generate provenance-tracked version")
else:
    raise FileNotFoundError("No Wikipedia sample found. Run sample_wikipedia_pages.ipynb first.")

print(f"\nFirst 10 pages:")
for p in wiki_pages[:10]:
    print(f"  - {p.title} ({p.views:,} views)")

‚úÖ Loaded 100 pages from provenance-tracked notebook
   Source: /workspaces/wiki3-kg-project/data/training/wikipedia_sample.ipynb

First 10 pages:
  - Zohran Mamdani (9,344,963 views)
  - ChatGPT (3,639,485 views)
  - James A. Garfield (3,524,531 views)
  - 1989 Tiananmen Square protests and massacre (2,867,005 views)
  - 2025 Bihar Legislative Assembly election (2,555,071 views)
  - Mira Nair (2,503,516 views)
  - Dick Cheney (2,186,840 views)
  - 2026 FIFA World Cup (2,155,565 views)
  - 1xBet (1,831,684 views)
  - Survivor Series: WarGames (2025) (1,590,263 views)


## 4. Fetch and Chunk Wikipedia Pages (with CID Provenance)

Fetch page content and chunk it. Each page's chunks are saved to a 
provenance-tracked notebook with CID signatures.

**Note**: Uses `fetch_page_content` and `chunk_article` from `ontological_engineer` - 
no application logic defined in this notebook!

In [6]:
# Processing parameters
chunks_dir = Path("/workspaces/wiki3-kg-project/data/training/chunks")
MAX_PAGES = len(wiki_pages)
MIN_CHUNK_LENGTH = 60  # Skip very short chunks

# Quick test on one page first
test_page = wiki_pages[0]
print(f"Testing on: {test_page.title}")

content = fetch_page_content(test_page.title)
if content:
    chunks = chunk_article(test_page.title, content)
    chunks = [c for c in chunks if len(c.text) >= MIN_CHUNK_LENGTH]
    print(f"  ‚Üí {len(chunks)} chunks (filtered by min_length={MIN_CHUNK_LENGTH})")
    if chunks:
        print(f"  First chunk preview: {chunks[0].text[:200]}...")
else:
    print(f"  ‚ö†Ô∏è Could not fetch content")

Testing on: Zohran Mamdani
  ‚Üí 20 chunks (filtered by min_length=60)
  First chunk preview: Zohran Kwame Mamdani (born October 18, 1991) is an American politician who is the mayor-elect of New York City. A member of the Democratic Party and the Democratic Socialists of America, he is set to ...


In [7]:
# Process all pages - logic is in process_wikipedia_sample()
# Handles: fetching, chunking, saving with CID provenance, incremental processing

training_chunks, pages_processed = process_wikipedia_sample(
    pages=wiki_pages,
    output_dir=chunks_dir,
    max_pages=MAX_PAGES,
    min_chunk_length=MIN_CHUNK_LENGTH,
)

print(f"\n‚úÖ Processed {pages_processed} pages")
print(f"   Total training chunks: {len(training_chunks)}")
print(f"   Chunks saved to: {chunks_dir}")

Processing pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 261.16it/s]


‚úÖ Processed 99 pages
   Total training chunks: 3428
   Chunks saved to: /workspaces/wiki3-kg-project/data/training/chunks





## 5. Initialize Extractor with Few-Shot Examples

Create the statement extractor and provide Albert Einstein examples as demonstrations.

In [8]:
# Select best few-shot examples (ones with good variety of statements)
NUM_FEWSHOT = 3

# Sort by statement count to get diverse examples
sorted_fewshot = sorted(fewshot_examples, key=lambda x: len(x.statements), reverse=True)
selected_fewshot = sorted_fewshot[:NUM_FEWSHOT]

print(f"Selected {len(selected_fewshot)} few-shot examples:")
for i, ex in enumerate(selected_fewshot, 1):
    print(f"  {i}. {ex.section_context[:50]}... ({len(ex.statements)} statements)")

Selected 3 few-shot examples:
  1. Albert Einstein > Introduction... (28 statements)
  2. Albert Einstein > Life and career > Personal views... (28 statements)
  3. Albert Einstein > Introduction... (27 statements)


In [9]:
# Create extractor with few-shot demonstrations
extractor = StatementExtractor()

# In DSPy, we can provide demonstrations directly
# The few-shot examples will be used by MIPROv2 for bootstrapping
print("Extractor initialized")
print(f"Few-shot examples available: {len(selected_fewshot)}")

Extractor initialized
Few-shot examples available: 3


## 6. Test Extraction on Training Sample

Run the extractor on a few training chunks to verify it works.

In [10]:
# Test on a training chunk
if training_chunks:
    test_chunk = training_chunks[0]
    
    print(f"Testing on: {test_chunk.section_context}")
    print(f"Text: {test_chunk.text[:300]}...")
    print("\n" + "="*60 + "\n")
    
    result = extractor(
        chunk_text=test_chunk.text,
        section_context=test_chunk.section_context,
    )
    
    print(f"Extracted {len(result.statements)} statements:")
    for i, stmt in enumerate(result.statements[:10], 1):
        print(f"  {i}. {stmt}")
    if len(result.statements) > 10:
        print(f"  ... and {len(result.statements) - 10} more")

Testing on: Zohran Mamdani > Zohran Mamdani
Text: Zohran Kwame Mamdani (born October 18, 1991) is an American politician who is the mayor-elect of New York City. A member of the Democratic Party and the Democratic Socialists of America, he is set to become New York's first Muslim and Asian American mayor. Mamdani has served as a member of the New Y...


Extracted 22 statements:
  1. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) was born on October 18, 1991.
  2. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is an American politician.
  3. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is the mayor-elect of [New York City](/wiki/New_York_City).
  4. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is a member of the [Democratic Party](/wiki/Democratic_Party).
  5. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is a member of the [Democratic Socialists of America](/wiki/Democratic_Socialists_of_America).
  6. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is set t

## 7. Create Training Dataset

Convert chunks into DSPy examples. For training, we need to generate initial extractions
that can be scored and optimized.

In [11]:
# Create training examples (without labels - we'll generate and judge them)
# For DSPy optimization, we just need the inputs

random_seed(42)  # For reproducibility

# Convert WikipediaChunk objects to DSPy examples
trainset_chunks = list(training_chunks)  # Make a copy
shuffle(trainset_chunks)

trainset = []
for chunk in trainset_chunks:
    ex = dspy.Example(
        chunk_text=chunk.text,
        section_context=chunk.section_context,
    ).with_inputs('chunk_text', 'section_context')
    trainset.append(ex)

# Split into train/dev
split_idx = int(len(trainset) * 0.8)
devset = trainset[split_idx:]
trainset = trainset[:split_idx]

print(f"Training set: {len(trainset)} examples")
print(f"Dev set: {len(devset)} examples")

Training set: 2742 examples
Dev set: 686 examples


## 8. Initialize Judge with Few-Shot Guidance

The judge scores extraction quality. We use Albert Einstein examples to calibrate.

In [12]:
# Initialize judge
judge = StatementQualityJudge()

# Test judge on a known good example (Albert Einstein few-shot)
if selected_fewshot:
    test_ex = selected_fewshot[0]
    
    evaluation = judge(
        chunk_text=test_ex.chunk_text,
        section_context=test_ex.section_context,
        statements=test_ex.statements,
    )
    
    print("Judge calibration on few-shot example:")
    print(f"  Completeness:      {evaluation.completeness:.2f}")
    print(f"  Atomicity:         {evaluation.atomicity:.2f}")
    print(f"  Accuracy:          {evaluation.accuracy:.2f}")
    print(f"  Link preservation: {evaluation.link_preservation:.2f}")
    print(f"  ---")
    print(f"  Weighted score:    {evaluation.weighted_score:.2f}")

Judge calibration on few-shot example:
  Completeness:      0.95
  Atomicity:         0.85
  Accuracy:          0.95
  Link preservation: 1.00
  ---
  Weighted score:    0.94


## 9. Baseline Evaluation

Evaluate the unoptimized extractor on the dev set.

In [13]:
# Evaluate baseline on dev set
EVAL_SIZE = min(10, len(devset))  # Limit for speed

evaluator = dspy.Evaluate(
    devset=devset[:EVAL_SIZE],
    metric=statement_quality_metric,
    num_threads=1,
    display_progress=True,
)

baseline_extractor = StatementExtractor()
baseline_result = evaluator(baseline_extractor)

baseline_score = baseline_result.score if hasattr(baseline_result, 'score') else float(baseline_result)
print(f"\nBaseline quality score: {baseline_score:.2f}")

Average Metric: 8.24 / 10 (82.4%): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 240.28it/s]

2025/12/20 07:01:27 INFO dspy.evaluate.evaluate: Average Metric: 8.2375 / 10 (82.4%)




Baseline quality score: 82.38


## 9b. MLflow Observability Setup

MLflow provides tracing, evaluation, and human feedback tools for DSPy pipelines.

### Quick Setup (One-time)

1. **Install MLflow** (already in requirements or run cell below)
2. **Start the MLflow server** in a terminal:
   ```bash
   cd /workspaces/wiki3-kg-project
   mlflow server \
     --backend-store-uri sqlite:///mlflow.sqlite \
     --default-artifact-root ./mlflow-artifacts \
     --host 0.0.0.0 \
     --port 5000
   ```
3. **Open the UI** at http://localhost:5000 (or via VS Code port forwarding)

### What MLflow Provides
- **Tracing**: See every LM call, inputs, outputs, latency
- **Evaluation**: Compare model versions side-by-side
- **Human Feedback**: Add labels/assessments directly in the UI
- **Experiment Tracking**: Track metrics across optimization runs

In [14]:
# =============================================================================
# MLflow Setup for DSPy Optimization Tracking
# =============================================================================
# Reference: https://dspy.ai/tutorials/optimizer_tracking/
#
# Prerequisites:
#   1. Install: pip install "mlflow>=2.21.1"
#   2. Start server (use SQL store for tracing):
#      mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000
#   3. Open UI: http://localhost:5000
# =============================================================================

import mlflow

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

try:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("wiki3-kg-stage1-statements")
    
    # Enable autologging with full optimizer tracking (per DSPy docs)
    # https://dspy.ai/tutorials/optimizer_tracking/
    mlflow.dspy.autolog(
        log_compiles=True,           # Track optimization process
        log_evals=True,              # Track evaluation results  
        log_traces_from_compile=True # Track program traces during optimization
    )
    
    print(f"‚úÖ MLflow configured successfully")
    print(f"   Tracking URI: {MLFLOW_TRACKING_URI}")
    print(f"   Experiment: wiki3-kg-stage1-statements")
    print(f"\nüìä Open MLflow UI: {MLFLOW_TRACKING_URI}")
    MLFLOW_ENABLED = True
except Exception as e:
    print(f"‚ö†Ô∏è  MLflow not available: {e}")
    print(f"\nüí° To enable MLflow, start the server:")
    print(f"   mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000")
    MLFLOW_ENABLED = False

‚úÖ MLflow configured successfully
   Tracking URI: http://127.0.0.1:5000
   Experiment: wiki3-kg-stage1-statements

üìä Open MLflow UI: http://127.0.0.1:5000


In [15]:
# Run evaluation with MLflow tracing (if enabled)
# Each prediction creates a trace viewable in the MLflow UI

if MLFLOW_ENABLED:
    with mlflow.start_run(run_name="baseline_evaluation"):
        # Log parameters for reproducibility
        mlflow.log_param("eval_size", EVAL_SIZE)
        mlflow.log_param("model", "qwen/qwen3-coder-30b")
        mlflow.log_param("num_fewshot", NUM_FEWSHOT)
        
        # Run extractions on dev set - each one is traced
        results = []
        for i, ex in enumerate(tqdm(devset[:EVAL_SIZE], desc="Evaluating")):
            with mlflow.start_span(name=f"example_{i}") as span:
                # Run extraction
                pred = baseline_extractor(
                    chunk_text=ex.chunk_text,
                    section_context=ex.section_context,
                )
                
                # Run judge
                eval_result = judge(
                    chunk_text=ex.chunk_text,
                    section_context=ex.section_context,
                    statements=pred.statements,
                )
                
                # Log to span for MLflow UI review
                span.set_inputs({
                    "chunk_text": ex.chunk_text[:500],
                    "section_context": ex.section_context,
                })
                span.set_outputs({
                    "statements": list(pred.statements),
                    "completeness": float(eval_result.completeness),
                    "atomicity": float(eval_result.atomicity),
                    "accuracy": float(eval_result.accuracy),
                    "link_preservation": float(eval_result.link_preservation),
                    "weighted_score": float(eval_result.weighted_score),
                    "reasoning": eval_result.reasoning,
                })
                
                results.append({
                    "index": i,
                    "score": float(eval_result.weighted_score),
                })
        
        # Log aggregate metrics
        avg_score = sum(r["score"] for r in results) / len(results)
        mlflow.log_metric("avg_quality_score", avg_score)
        
        print(f"\n‚úÖ Evaluation complete!")
        print(f"   Average score: {avg_score:.2f}")
        print(f"   Traces logged: {len(results)}")
        print(f"\nüìä Review in MLflow UI: {MLFLOW_TRACKING_URI}")
        print(f"   ‚Üí Click 'Traces' tab to see all predictions")
        print(f"   ‚Üí Click individual traces to review inputs/outputs")
        print(f"   ‚Üí Use 'Feedback' to add human labels")
else:
    print("‚è≠Ô∏è  Skipping MLflow evaluation (server not running)")
    print("   Run baseline evaluation with dspy.Evaluate instead")

Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:01<00:00,  5.62it/s]



‚úÖ Evaluation complete!
   Average score: 0.82
   Traces logged: 10

üìä Review in MLflow UI: http://127.0.0.1:5000
   ‚Üí Click 'Traces' tab to see all predictions
   ‚Üí Click individual traces to review inputs/outputs
   ‚Üí Use 'Feedback' to add human labels
üèÉ View run baseline_evaluation at: http://127.0.0.1:5000/#/experiments/1/runs/3b98426ce0954065a119d35246a740b2
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [16]:
# Alternative: Use dspy.inspect_history() for quick debugging
# This shows recent LM calls without needing MLflow server

print("Recent LM calls (use MLflow UI for full traces):")
print("=" * 60)
dspy.inspect_history(n=2)

Recent LM calls (use MLflow UI for full traces):




[34m[2025-12-20T07:01:30.404005][0m

[31mSystem message:[0m

Your input fields are:
1. `chunk_text` (str): Wikipedia article chunk with markdown links preserved
2. `section_context` (str): Breadcrumb showing location: Article > Section > Subsection
Your output fields are:
1. `reasoning` (str): 
2. `statements` (list[str]): List of atomic statements, each preserving [Entity](/wiki/...) links
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## chunk_text ## ]]
{chunk_text}

[[ ## section_context ## ]]
{section_context}

[[ ## reasoning ## ]]
{reasoning}

[[ ## statements ## ]]
{statements}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Extract atomic, verifiable statements from Wikipedia text.
        
        Each statement must:
 

### MLflow Evaluation with Human Feedback

Use MLflow's evaluation API to systematically review predictions and collect human labels.
The MLflow UI provides a proper interface for reviewing and annotating.

In [17]:
# Create evaluation dataset for MLflow
import pandas as pd

eval_data = []
for i, ex in enumerate(devset[:EVAL_SIZE]):
    eval_data.append({
        "index": i,
        "chunk_text": ex.chunk_text,
        "section_context": ex.section_context,
    })

eval_df = pd.DataFrame(eval_data)
print(f"Created evaluation dataset with {len(eval_df)} examples")
eval_df.head()

Created evaluation dataset with 10 examples


Unnamed: 0,index,chunk_text,section_context
0,0,"Abbey Laurel-Smith, businesswoman\nAdam Oremla...",2021 New York City mayoral election > Failed t...
1,1,"At the time of Gacy's arrest, he had claimed t...",John Wayne Gacy > Possible additional victims
2,2,. Kumar's government also announced a scheme o...,Nitish Kumar > Consolidation of Extremely Back...
3,3,"Queen Elizabeth II died on 8 September 2022, a...","William, Prince of Wales > Prince of Wales"
4,4,"""The Life and Death of Robin Williams"". ABC Ne...",Robin Williams > Further reading


In [19]:
# ============================================================================
# Per-Statement Classification for ALL Evaluation Examples
# ============================================================================
# Uses StatementClassifier to get GOOD/BAD verdicts per statement

from ontological_engineer import StatementClassifier, StatementClassification

classifier = StatementClassifier()

# Store all results for summary
all_classification_results = []

print(f"Classifying statements for {EVAL_SIZE} chunks...")
print("=" * 70)

for idx, ex in enumerate(devset[:EVAL_SIZE]):
    print(f"\n[{idx+1}/{EVAL_SIZE}] {ex.section_context[:50]}...")
    
    # Extract statements
    pred = baseline_extractor(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
    )
    
    # Classify each statement
    result = classifier(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
        statements=list(pred.statements),
    )
    
    # Store result
    all_classification_results.append({
        "idx": idx,
        "section": ex.section_context,
        "chunk_text": ex.chunk_text,
        "statements": list(pred.statements),
        "score": result.score,
        "classifications": result.classifications,
        "missing_facts": result.missing_facts,
    })
    
    # Show quick summary
    good = sum(1 for c in result.classifications if c.is_good)
    total = len(result.classifications)
    print(f"   ‚Üí {good}/{total} GOOD ({result.score:.0%})")

# Summary
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
avg_score = sum(r["score"] for r in all_classification_results) / len(all_classification_results)
total_good = sum(sum(1 for c in r["classifications"] if c.is_good) for r in all_classification_results)
total_bad = sum(sum(1 for c in r["classifications"] if not c.is_good) for r in all_classification_results)
print(f"Average score: {avg_score:.1%}")
print(f"Total GOOD: {total_good}, Total BAD: {total_bad}")

Classifying statements for 10 chunks...

[1/10] 2021 New York City mayoral election > Failed to qu...
   ‚Üí 0/0 GOOD (0%)

[2/10] John Wayne Gacy > Possible additional victims...
   ‚Üí 12/12 GOOD (100%)

[3/10] Nitish Kumar > Consolidation of Extremely Backward...
   ‚Üí 9/9 GOOD (100%)

[4/10] William, Prince of Wales > Prince of Wales...
   ‚Üí 9/9 GOOD (100%)

[5/10] Robin Williams > Further reading...
   ‚Üí 0/0 GOOD (0%)

[6/10] Shah Rukh Khan > 2004‚Äì2009: Comeback...
   ‚Üí 11/11 GOOD (100%)

[7/10] John Wayne Gacy > Cited works...
   ‚Üí 10/10 GOOD (100%)

[8/10] The Black Phone > Plot...
   ‚Üí 23/23 GOOD (100%)

[9/10] Thanksgiving > Australia...
   ‚Üí 5/5 GOOD (100%)

[10/10] John Wayne Gacy > Assault of Donald Voorhees...
   ‚Üí 18/18 GOOD (100%)

SUMMARY
Average score: 80.0%
Total GOOD: 97, Total BAD: 0


In [20]:
# Display detailed results for each chunk
print("DETAILED PER-CHUNK RESULTS")
print("=" * 70)

for r in all_classification_results:
    good = sum(1 for c in r["classifications"] if c.is_good)
    bad = sum(1 for c in r["classifications"] if not c.is_good)
    total = len(r["classifications"])
    
    print(f"\nüìÑ Chunk {r['idx']}: {r['section'][:60]}...")
    print(f"   Score: {r['score']:.0%} ({good}/{total} GOOD)")
    
    # Show BAD statements (these need attention)
    bad_stmts = [c for c in r["classifications"] if not c.is_good]
    if bad_stmts:
        print(f"   ‚ùå BAD statements:")
        for c in bad_stmts:
            print(f"      [{c.index}] {c.statement[:80]}...")
            print(f"          Reason: {c.reason}")
    
    if r["missing_facts"] and r["missing_facts"].lower() != "none":
        print(f"   üìù Missing: {r['missing_facts'][:100]}...")

DETAILED PER-CHUNK RESULTS

üìÑ Chunk 0: 2021 New York City mayoral election > Failed to qualify for ...
   Score: 0% (0/0 GOOD)

üìÑ Chunk 1: John Wayne Gacy > Possible additional victims...
   Score: 100% (12/12 GOOD)

üìÑ Chunk 2: Nitish Kumar > Consolidation of Extremely Backward Castes...
   Score: 100% (9/9 GOOD)

üìÑ Chunk 3: William, Prince of Wales > Prince of Wales...
   Score: 100% (9/9 GOOD)

üìÑ Chunk 4: Robin Williams > Further reading...
   Score: 0% (0/0 GOOD)

üìÑ Chunk 5: Shah Rukh Khan > 2004‚Äì2009: Comeback...
   Score: 100% (11/11 GOOD)

üìÑ Chunk 6: John Wayne Gacy > Cited works...
   Score: 100% (10/10 GOOD)

üìÑ Chunk 7: The Black Phone > Plot...
   Score: 100% (23/23 GOOD)

üìÑ Chunk 8: Thanksgiving > Australia...
   Score: 100% (5/5 GOOD)

üìÑ Chunk 9: John Wayne Gacy > Assault of Donald Voorhees...
   Score: 100% (18/18 GOOD)


### Export Annotations from MLflow

After reviewing and labeling in the MLflow UI, export your annotations for judge improvement.

In [21]:
# Load annotations from MLflow (after you've labeled them in the UI)
# MLflow stores feedback as assessments on traces

client = mlflow.MlflowClient()

# Get the latest evaluation run
experiment = client.get_experiment_by_name("wiki3-kg-stage1-statements")
if experiment:
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"],
        max_results=1,
    )
    
    if runs:
        latest_run = runs[0]
        print(f"Latest run: {latest_run.info.run_id}")
        print(f"Metrics: {latest_run.data.metrics}")
        
        # Get traces with assessments (human feedback)
        try:
            traces = client.search_traces(
                experiment_ids=[experiment.experiment_id],
                max_results=100,
            )
            print(f"Found {len(traces)} traces")
        except Exception as e:
            print(f"Trace search error: {e}")
else:
    print("No experiment found. Run evaluation first.")

Latest run: 3b98426ce0954065a119d35246a740b2
Metrics: {'avg_quality_score': 0.8237500000000001}


  traces = client.search_traces(


Found 100 traces


In [None]:
# Use human feedback to improve the judge
# After collecting labels in MLflow, create DSPy training examples

# For now, save the evaluation data for later use
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
output_dir.mkdir(exist_ok=True)

eval_df.to_json(output_dir / "eval_dataset.json", orient="records", indent=2)
print(f"Saved evaluation dataset to {output_dir / 'eval_dataset.json'}")

print("""
üìã Next steps for human feedback:

1. Start MLflow server:
   mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000

2. Open MLflow UI at http://127.0.0.1:5000

3. Navigate to the experiment 'wiki3-kg-stage1-statements'

4. Click on traces to review predictions

5. Use the feedback/assessment features to label quality

6. Export labeled data for judge improvement
""")

## 10. MIPROv2 Prompt Optimization

Use DSPy's MIPROv2 optimizer to improve the extractor's prompts.
This uses the few-shot examples to bootstrap better demonstrations.

In [22]:
from dspy.teleprompt import MIPROv2

# Configure optimizer
# Reference: https://dspy.ai/tutorials/optimizer_tracking/
optimizer = MIPROv2(
    metric=statement_quality_metric,
    auto="light",  # Use "light" preset (fast), "medium", or "heavy" for more trials
)

# Use training set for optimization
TRAIN_SIZE = len(trainset)

print(f"Optimizing with {TRAIN_SIZE} training examples...")
print(f"Using {len(selected_fewshot)} few-shot demos for bootstrapping...")
print(f"MIPROv2 mode: auto='light'")

# MIPROv2.compile() - MLflow automatically tracks via autolog()
# Parent run: overall optimization, child runs: each intermediate program
optimized_extractor = optimizer.compile(
    StatementExtractor(),
    trainset=trainset[:TRAIN_SIZE],
    max_bootstrapped_demos=NUM_FEWSHOT,
)

print("\nOptimization complete!")
print(f"üìä View optimization traces in MLflow UI: {MLFLOW_TRACKING_URI}")

2025/12/20 07:06:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cdf4428219734698a0708c4dd0003049', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current dspy workflow
2025/12/20 07:06:52 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 16

2025/12/20 07:06:52 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/20 07:06:52 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/20 07:06:52 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Optimizing with 20 training examples...
Using 3 few-shot demos for bootstrapping...
MIPROv2 mode: auto='light'
Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [00:10<00:03,  3.62s/it]

Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.





Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Bootstrapping set 4/6


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

 25%|‚ñà‚ñà‚ñå       | 1/4 [00:00<00:00,  3.75it/s]

Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.





Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Bootstrapping set 5/6


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

 25%|‚ñà‚ñà‚ñå       | 1/4 [00:09<00:27,  9.33s/it]

Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.





Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Bootstrapping set 6/6


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [00:00<00:00,  3.95it/s]

Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.





Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

2025/12/20 07:07:17 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/20 07:07:17 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/12/20 07:07:22 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/20 07:08:11 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/20 07:08:11 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extract atomic, verifiable statements from Wikipedia text.

Each statement must:
- Be self-contained (understandable without the original text)
- Preserve markdown links: [Entity Name](/wiki/Entity_Name)
- Contain exactly one verifiable claim
- Not editorialize or interpret beyond what's stated

Example input chunk:
    "Albert Einstein was born in Ulm, in the Kingdom of W√ºrttemberg 
   

Average Metric: 15.31 / 16 (95.7%): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [02:39<00:00,  9.99s/it]

2025/12/20 07:10:50 INFO dspy.evaluate.evaluate: Average Metric: 15.3125 / 16 (95.7%)





2025/12/20 07:10:51 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 95.7

2025/12/20 07:10:51 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 10 =====


üèÉ View run eval_full_0 at: http://127.0.0.1:5000/#/experiments/1/runs/e3ab7818eb3548a48f4feefad4ba41e4
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
Average Metric: 14.26 / 16 (89.1%): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [03:07<00:00, 11.74s/it]

2025/12/20 07:13:59 INFO dspy.evaluate.evaluate: Average Metric: 14.255000000000003 / 16 (89.1%)





2025/12/20 07:13:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 89.09 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/20 07:13:59 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [95.7, 89.09]
2025/12/20 07:13:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 95.7


2025/12/20 07:13:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 10 =====


üèÉ View run eval_full_1 at: http://127.0.0.1:5000/#/experiments/1/runs/bd7afe8afb444ad2bcbb617e1b16f6ce
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
Average Metric: 14.81 / 16 (92.6%): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [03:16<00:00, 12.26s/it]

2025/12/20 07:17:16 INFO dspy.evaluate.evaluate: Average Metric: 14.812500000000002 / 16 (92.6%)





2025/12/20 07:17:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.58 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/20 07:17:16 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [95.7, 89.09, 92.58]
2025/12/20 07:17:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 95.7


2025/12/20 07:17:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 10 =====


üèÉ View run eval_full_2 at: http://127.0.0.1:5000/#/experiments/1/runs/775e8f2f3e524cb59adc9bad3680b0eb
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
Average Metric: 13.32 / 16 (83.3%): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [03:09<00:00, 11.86s/it]

2025/12/20 07:20:26 INFO dspy.evaluate.evaluate: Average Metric: 13.325 / 16 (83.3%)





2025/12/20 07:20:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 83.28 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/20 07:20:26 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [95.7, 89.09, 92.58, 83.28]
2025/12/20 07:20:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 95.7


2025/12/20 07:20:26 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 10 =====


üèÉ View run eval_full_3 at: http://127.0.0.1:5000/#/experiments/1/runs/a8ddc52f4c2840f78cff091161799153
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
  0%|          | 0/16 [00:00<?, ?it/s]

2025/12/20 07:21:09 ERROR dspy.utils.parallelizer: Error for Example({'chunk_text': "Halloween, also known as All Hallows' Eve, or All Saints' Eve, is a celebration observed in many countries  on 31 October, the eve of the Western Christian feast of All Hallows' Day. It is at the beginning of the observance of Allhallowtide, the time in the Christian liturgical year dedicated to remembering the dead, including saints (hallows), martyrs, and all the faithful departed. In popular culture, Halloween has become a celebration of horror and is associated with the macabre and the supernatural.\nOne theory holds that many Halloween traditions were influenced by Celtic harvest festivals, particularly the Gaelic festival Samhain, which are believed to have pagan roots. Some theories go further and suggest that Samhain may have been Christianized as All Hallows' Day, along with its eve, by the Church. Other academics say Halloween began independently as a Christian holiday, being the vigil  of Al

Average Metric: 0.00 / 0 (0%):   6%|‚ñã         | 1/16 [00:42<10:41, 42.80s/it]

[W 2025-12-20 07:21:50,968] Trial 4 failed with parameters: {'0_predictor_instruction': 2, '0_predictor_demos': 2} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/workspaces/wiki3-kg-project/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "/workspaces/wiki3-kg-project/.venv/lib/python3.10/site-packages/dspy/teleprompt/mipro_optimizer_v2.py", line 510, in objective
    score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng).score
  File "/workspaces/wiki3-kg-project/.venv/lib/python3.10/site-packages/dspy/teleprompt/utils.py", line 53, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, callback_metadata={"metric_key": "eval_full"})
  File "/workspaces/wiki3-kg-project/.venv/lib/python3.10/site-packages/mlflow/utils/autologging_utils/safety.py", line 484, in safe_patch_function
    patch_function(

üèÉ View run eval_full_4 at: http://127.0.0.1:5000/#/experiments/1/runs/90620f9d3d884ca5acf2c33c0d427590
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
üèÉ View run welcoming-toad-10 at: http://127.0.0.1:5000/#/experiments/1/runs/cdf4428219734698a0708c4dd0003049
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


KeyboardInterrupt: 

In [None]:
# Evaluate optimized extractor
optimized_result = evaluator(optimized_extractor)
optimized_score = optimized_result.score if hasattr(optimized_result, 'score') else float(optimized_result)

print(f"Baseline score:  {baseline_score:.2f}")
print(f"Optimized score: {optimized_score:.2f}")
print(f"Improvement:     {optimized_score - baseline_score:+.2f}")

## 11. Inspect Optimized Prompts

See what prompts MIPROv2 discovered.

In [None]:
# Inspect the optimized module
print("Optimized extractor configuration:")
print("="*60)

# Try to access the optimized signature/demos
if hasattr(optimized_extractor, 'demos'):
    print(f"\nDemonstrations: {len(optimized_extractor.demos)}")
    for i, demo in enumerate(optimized_extractor.demos[:2], 1):
        print(f"  Demo {i}: {demo.section_context[:50]}...")

# Check for any instruction changes
if hasattr(optimized_extractor, 'signature'):
    print(f"\nSignature: {optimized_extractor.signature}")

## 12. Save Results

Save the optimized extractor and training data.

In [None]:
# Save training metadata
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
output_dir.mkdir(exist_ok=True)

# Save training results
results = {
    "baseline_score": baseline_score,
    "optimized_score": optimized_score,
    "train_size": TRAIN_SIZE,
    "eval_size": EVAL_SIZE,
    "num_fewshot": NUM_FEWSHOT,
    "pages_processed": pages_processed,
    "total_chunks": len(training_chunks),
}

with open(output_dir / "stage1_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"Saved results to {output_dir / 'stage1_results.json'}")

In [None]:
# Save the optimized extractor state
try:
    optimized_extractor.save(output_dir / "optimized_extractor")
    print(f"Saved optimized extractor to {output_dir / 'optimized_extractor'}")
except Exception as e:
    print(f"Could not save extractor state: {e}")
    # Alternative: save as JSON
    if hasattr(optimized_extractor, 'dump_state'):
        state = optimized_extractor.dump_state()
        with open(output_dir / "optimized_extractor_state.json", "w") as f:
            json.dump(state, f, indent=2)
        print("Saved extractor state as JSON")

In [None]:
# Save few-shot examples for reference
fewshot_data = []
for ex in selected_fewshot:
    fewshot_data.append({
        "chunk_text": ex.chunk_text,
        "section_context": ex.section_context,
        "statements": list(ex.statements),
    })

with open(output_dir / "fewshot_examples.json", "w") as f:
    json.dump(fewshot_data, f, indent=2)

print(f"Saved {len(fewshot_data)} few-shot examples")

## Summary

This notebook:
1. Loaded Albert Einstein as few-shot examples (seed/guidance)
2. Fetched and chunked Wikipedia sample pages for training
3. Established baseline extraction quality
4. Ran MIPROv2 prompt optimization
5. Saved the optimized extractor

Next steps:
- **Stage 2**: Schema matching with optimized statements
- **Stage 3**: RDF generation training
- **Arbor GRPO**: Fine-tune the full pipeline end-to-end