In [1]:
# =============================================================================
# Stage 1: Statement Extraction with DSPy
# =============================================================================
# This notebook orchestrates statement extraction training/evaluation.
# All application logic is in ontological_engineer - this notebook coordinates.
#
# Outputs (with CID provenance):
#   - data/training/chunks/*.ipynb - Chunked Wikipedia pages
#   - data/training/stage1_config.json - Configuration with CID
#   - data/training/trainset.json - Training examples with CID
#   - data/training/devset.json - Dev examples with CID
#   - data/training/fewshot_examples.json - Few-shot examples with CID
#   - data/training/baseline_results.json - Baseline evaluation with CID
#
# Next: Run stage1_optimize.ipynb for MIPROv2 optimization
# =============================================================================

import sys
sys.path.insert(0, '/workspaces/wiki3-kg-project')

import dspy
import json
from pathlib import Path
from random import shuffle, seed as random_seed
from tqdm import tqdm

from ontological_engineer import (
    # LM Configuration
    configure_lm,
    # DSPy Modules
    StatementExtractor,
    StatementQualityJudge,
    StatementClassifier,
    StatementClassification,
    # Data Loading (from provenance-tracked notebooks)
    WikipediaPage,
    WikipediaChunk,
    load_sample_from_notebook,
    load_chunks_from_notebook,
    # Processing (all logic in module!)
    process_wikipedia_sample,
    fetch_page_content,
    chunk_article,
    # Provenance notebook generation
    save_notebook,
    get_processed_chunk_cids,
)
from ontological_engineer.judges import statement_quality_metric
from ontological_engineer.training.bootstrap import (
    load_chunks_from_notebook as load_albert_chunks,
    load_facts_from_notebook,
    create_training_examples,
)
from ontological_engineer.training import (
    save_stage1_config,
    save_trainset,
    save_devset,
    save_fewshot_examples,
    check_baseline_cache,
    save_baseline_results,
)

## 1. Configure Language Model

Connect to LM Studio running Qwen-30B (or your preferred model).

In [2]:
# Language-model settings (default target: Qwen-30B served through LM Studio).
# These constants are reused later when the stage-1 config is saved.
MODEL = "qwen/qwen3-coder-30b"
API_BASE = "http://host.docker.internal:1234/v1"
TEMPERATURE = 0.7
NUM_FEWSHOT = 3

# Gather the keyword arguments first, then hand them to configure_lm in one go.
_lm_kwargs = {
    "model": MODEL,
    "api_base": API_BASE,
    "temperature": TEMPERATURE,
}
lm = configure_lm(**_lm_kwargs)

print(f"Configured LM: {lm}")

Configured LM: <dspy.clients.lm.LM object at 0xffff88cce740>


## 2. Load Few-Shot Examples (Albert Einstein)

Albert Einstein is our gold-standard example. These chunks and their extracted facts
serve as few-shot demonstrations for the extractor and judge.

In [3]:
# Albert Einstein run that supplies the few-shot chunks and fact sets
fewshot_dir = Path("/workspaces/wiki3-kg-project/data/albert_einstein/20251218_231446")
_chunks_notebook = fewshot_dir / "chunks.ipynb"
_facts_notebook = fewshot_dir / "facts.ipynb"

# load_albert_chunks is the loader from the bootstrap module; it is deliberately
# used here instead of the package-level load_chunks_from_notebook.
fewshot_chunks = load_albert_chunks(_chunks_notebook)
fewshot_facts = load_facts_from_notebook(_facts_notebook)

print(f"Loaded {len(fewshot_chunks)} chunks from Albert Einstein")
print(f"Loaded {len(fewshot_facts)} fact sets")

# Pair chunks with their fact sets to build the few-shot examples
fewshot_examples = create_training_examples(fewshot_chunks, fewshot_facts)
print(f"Created {len(fewshot_examples)} few-shot examples")

Loaded 63 chunks from Albert Einstein
Loaded 19 fact sets
Created 19 few-shot examples


In [4]:
# Preview the first few-shot example: its context, the first 200 characters
# of text, the statement count, and the first three statements.
if fewshot_examples:
    ex = fewshot_examples[0]
    preview = [
        "Sample few-shot example:",
        f"  Context: {ex.section_context}",
        f"  Text: {ex.chunk_text[:200]}...",
        f"  Statements: {len(ex.statements)} items",
    ]
    for stmt in ex.statements[:3]:
        preview.append(f"    - {stmt}")
    print("\n".join(preview))

Sample few-shot example:
  Context: Albert Einstein > Introduction
  Text: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist best known for developing the theory of relativity. Einstein also made important contributions to quantum theory...
  Statements: 28 items
    - Albert Einstein was a German-born theoretical physicist.
    - Albert Einstein developed the theory of relativity.
    - Albert Einstein made important contributions to quantum theory.


## 3. Load Wikipedia Sample for Training

Load the 100-page Wikipedia sample from the provenance-tracked notebook.
If the notebook doesn't exist, fall back to JSON format.

In [5]:
# Locate the Wikipedia sample. The provenance-tracked notebook wins when both
# files exist; the plain JSON file is only a fallback.
sample_notebook = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.ipynb")
sample_json = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.json")

_have_notebook = sample_notebook.exists()

# Fail early when neither source is present
if not _have_notebook and not sample_json.exists():
    raise FileNotFoundError("No Wikipedia sample found. Run sample_wikipedia_pages.ipynb first.")

if _have_notebook:
    # Preferred path: provenance-tracked notebook
    wiki_pages = load_sample_from_notebook(sample_notebook)
    print(f"‚úÖ Loaded {len(wiki_pages)} pages from provenance-tracked notebook")
    print(f"   Source: {sample_notebook}")
else:
    # Fallback path: build WikipediaPage objects from the JSON 'pages' list
    with open(sample_json) as f:
        wiki_sample = json.load(f)
    wiki_pages = []
    for entry in wiki_sample['pages']:
        wiki_pages.append(WikipediaPage(title=entry['title'], views=entry['views']))
    print(f"‚ö†Ô∏è  Loaded {len(wiki_pages)} pages from JSON (no provenance)")
    print(f"   Run sample_wikipedia_pages.ipynb to generate provenance-tracked version")

# Show the first ten pages with their view counts
print(f"\nFirst 10 pages:")
for p in wiki_pages[:10]:
    print(f"  - {p.title} ({p.views:,} views)")

‚úÖ Loaded 100 pages from provenance-tracked notebook
   Source: /workspaces/wiki3-kg-project/data/training/wikipedia_sample.ipynb

First 10 pages:
  - Zohran Mamdani (9,344,963 views)
  - ChatGPT (3,639,485 views)
  - James A. Garfield (3,524,531 views)
  - 1989 Tiananmen Square protests and massacre (2,867,005 views)
  - 2025 Bihar Legislative Assembly election (2,555,071 views)
  - Mira Nair (2,503,516 views)
  - Dick Cheney (2,186,840 views)
  - 2026 FIFA World Cup (2,155,565 views)
  - 1xBet (1,831,684 views)
  - Survivor Series: WarGames (2025) (1,590,263 views)


## 4. Fetch and Chunk Wikipedia Pages (with CID Provenance)

Fetch page content and chunk it. Each page's chunks are saved to a 
provenance-tracked notebook with CID signatures.

**Note**: Uses `fetch_page_content` and `chunk_article` from `ontological_engineer` - 
no application logic defined in this notebook!

In [6]:
# Parameters shared with the full processing run in the next cell
chunks_dir = Path("/workspaces/wiki3-kg-project/data/training/chunks")
MAX_PAGES = len(wiki_pages)
MIN_CHUNK_LENGTH = 60  # chunks with shorter text are dropped

# Smoke test: fetch and chunk only the first page before the full run
test_page = wiki_pages[0]
print(f"Testing on: {test_page.title}")

content = fetch_page_content(test_page.title)
if not content:
    print(f"  ‚ö†Ô∏è Could not fetch content")
else:
    # Chunk the article, then keep only chunks whose text is long enough
    chunks = list(
        filter(
            lambda c: len(c.text) >= MIN_CHUNK_LENGTH,
            chunk_article(test_page.title, content),
        )
    )
    print(f"  ‚Üí {len(chunks)} chunks (filtered by min_length={MIN_CHUNK_LENGTH})")
    if chunks:
        print(f"  First chunk preview: {chunks[0].text[:200]}...")

Testing on: Zohran Mamdani
  ‚Üí 20 chunks (filtered by min_length=60)
  First chunk preview: Zohran Kwame Mamdani (born October 18, 1991) is an American politician who is the mayor-elect of New York City. A member of the Democratic Party and the Democratic Socialists of America, he is set to ...


In [7]:
# Full run over the sample. All the work happens inside
# process_wikipedia_sample(); this cell only passes parameters and reports.
_sample_result = process_wikipedia_sample(
    pages=wiki_pages,
    output_dir=chunks_dir,
    max_pages=MAX_PAGES,
    min_chunk_length=MIN_CHUNK_LENGTH,
)
# The call returns the collected chunks and the number of pages processed
training_chunks, pages_processed = _sample_result

print(
    f"\n‚úÖ Processed {pages_processed} pages",
    f"   Total training chunks: {len(training_chunks)}",
    f"   Chunks saved to: {chunks_dir}",
    sep="\n",
)

Processing pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 243.86it/s]


‚úÖ Processed 99 pages
   Total training chunks: 3428
   Chunks saved to: /workspaces/wiki3-kg-project/data/training/chunks





## 5. Initialize Extractor with Few-Shot Examples

Create the statement extractor and provide Albert Einstein examples as demonstrations.

In [8]:
# Keep the NUM_FEWSHOT few-shot examples that carry the most statements
NUM_FEWSHOT = 3

# Rank by statement count, largest first. Negating the key on a stable sort
# keeps ties in their original relative order.
sorted_fewshot = sorted(fewshot_examples, key=lambda item: -len(item.statements))
selected_fewshot = sorted_fewshot[:NUM_FEWSHOT]

print(f"Selected {len(selected_fewshot)} few-shot examples:")
for i, ex in enumerate(selected_fewshot, start=1):
    print(f"  {i}. {ex.section_context[:50]}... ({len(ex.statements)} statements)")

Selected 3 few-shot examples:
  1. Albert Einstein > Introduction... (28 statements)
  2. Albert Einstein > Life and career > Personal views... (28 statements)
  3. Albert Einstein > Introduction... (27 statements)


In [9]:
# Instantiate the statement extractor with its default construction.
# No demonstrations are attached to it here; this cell only reports how many
# selected few-shot examples are available (presumably for the later MIPROv2
# optimization notebook - confirm there).
extractor = StatementExtractor()

print(
    "Extractor initialized",
    f"Few-shot examples available: {len(selected_fewshot)}",
    sep="\n",
)

Extractor initialized
Few-shot examples available: 3


## 6. Test Extraction on Training Sample

Run the extractor on a few training chunks to verify it works.

In [10]:
# Smoke test: run the extractor on the first training chunk and list up to
# ten of the statements it returns.
if training_chunks:
    test_chunk = training_chunks[0]

    print(f"Testing on: {test_chunk.section_context}")
    print(f"Text: {test_chunk.text[:300]}...")
    print("\n" + "="*60 + "\n")

    _extractor_inputs = {
        "chunk_text": test_chunk.text,
        "section_context": test_chunk.section_context,
    }
    result = extractor(**_extractor_inputs)

    print(f"Extracted {len(result.statements)} statements:")
    i = 0
    for stmt in result.statements[:10]:
        i += 1
        print(f"  {i}. {stmt}")
    # Mention how many statements were left out of the listing
    if not len(result.statements) <= 10:
        print(f"  ... and {len(result.statements) - 10} more")

Testing on: Zohran Mamdani > Zohran Mamdani
Text: Zohran Kwame Mamdani (born October 18, 1991) is an American politician who is the mayor-elect of New York City. A member of the Democratic Party and the Democratic Socialists of America, he is set to become New York's first Muslim and Asian American mayor. Mamdani has served as a member of the New Y...


Extracted 22 statements:
  1. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) was born on October 18, 1991.
  2. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) was born in [Kampala](/wiki/Kampala), [Uganda](/wiki/Uganda).
  3. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is an American politician.
  4. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is the mayor-elect of [New York City](/wiki/New_York_City).
  5. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is a member of the [Democratic Party](/wiki/Democratic_Party).
  6. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is a member of the [Democratic Socialists

## 7. Create Training Dataset

Convert chunks into DSPy examples. For training, we need to generate initial extractions
that can be scored and optimized.

In [11]:
# Build unlabeled DSPy examples from the chunks, split them 80/20 into
# train/dev, and save every artifact with its CID.

random_seed(42)  # fixed seed so the shuffle (and therefore the split) is repeatable

# Shuffle a copy so training_chunks itself keeps its order
trainset_chunks = list(training_chunks)
shuffle(trainset_chunks)

# One example per chunk; both fields are declared as inputs (there are no labels)
_all_examples = [
    dspy.Example(
        chunk_text=chunk.text,
        section_context=chunk.section_context,
    ).with_inputs('chunk_text', 'section_context')
    for chunk in trainset_chunks
]

# First 80% goes to train, the remainder to dev
split_idx = int(len(_all_examples) * 0.8)
trainset, devset = _all_examples[:split_idx], _all_examples[split_idx:]

print(f"Training set: {len(trainset)} examples")
print(f"Dev set: {len(devset)} examples")

# Persist the datasets; each save call returns the CID that is printed below
training_dir = Path("/workspaces/wiki3-kg-project/data/training")

trainset_cid = save_trainset(trainset, training_dir)
devset_cid = save_devset(devset, training_dir)
fewshot_cid = save_fewshot_examples(selected_fewshot, training_dir)

# Persist the run configuration; the returned mapping exposes its own 'cid'
_config_fields = dict(
    output_dir=training_dir,
    model=MODEL,
    api_base=API_BASE,
    temperature=TEMPERATURE,
    num_fewshot=NUM_FEWSHOT,
    train_size=len(trainset),
    dev_size=len(devset),
    pages_processed=pages_processed,
    total_chunks=len(training_chunks),
)
config = save_stage1_config(**_config_fields)

print(
    f"\n‚úÖ Saved artifacts with provenance:",
    f"   Config CID: {config['cid']}",
    f"   Trainset CID: {trainset_cid}",
    f"   Devset CID: {devset_cid}",
    f"   Fewshot CID: {fewshot_cid}",
    sep="\n",
)

Training set: 2742 examples
Dev set: 686 examples

‚úÖ Saved artifacts with provenance:
   Config CID: bafkreicshdgzdtbuwtk7efcce2e7bcjq2dskfkwrn3xrq4pjkjgzubwhva
   Trainset CID: bafkreihisr4j4qzgb3gbtjoz6d3frwmrcnif2foi26vedicxkkbk3tn5si
   Devset CID: bafkreictctqrjhp2ikuuoinyim46ddus2epcbrxj7brjfruoq4nlp53yhq
   Fewshot CID: bafkreibsv3kwtrlkob2nzudlmhpql2fpdxtv7ezommob7rqhiqji5elh6i


## 8. Initialize Judge with Few-Shot Guidance

The judge scores extraction quality. We use Albert Einstein examples to calibrate.

In [12]:
# Create the quality judge
judge = StatementQualityJudge()

# Sanity check: score the first selected few-shot example and print each
# dimension of the judge's evaluation plus the weighted score.
if selected_fewshot:
    test_ex = selected_fewshot[0]

    _judge_inputs = {
        "chunk_text": test_ex.chunk_text,
        "section_context": test_ex.section_context,
        "statements": test_ex.statements,
    }
    evaluation = judge(**_judge_inputs)

    _report = (
        "Judge calibration on few-shot example:",
        f"  Completeness:      {evaluation.completeness:.2f}",
        f"  Atomicity:         {evaluation.atomicity:.2f}",
        f"  Accuracy:          {evaluation.accuracy:.2f}",
        f"  Link preservation: {evaluation.link_preservation:.2f}",
        f"  ---",
        f"  Weighted score:    {evaluation.weighted_score:.2f}",
    )
    print("\n".join(_report))

Judge calibration on few-shot example:
  Completeness:      0.95
  Atomicity:         1.00
  Accuracy:          1.00
  Link preservation: 1.00
  ---
  Weighted score:    0.99


## 9. Baseline Evaluation

Evaluate the unoptimized extractor on the dev set.

In [13]:
# Baseline evaluation on the dev set. The evaluation is skipped when
# check_baseline_cache returns a stored result for the same inputs.
from ontological_engineer.training import compute_module_cid

EVAL_SIZE = len(devset)

# The extractor's CID is one of the inputs to the cache lookup
baseline_extractor = StatementExtractor()
extractor_cid = compute_module_cid(baseline_extractor)
print(f"Extractor CID: {extractor_cid}")

# Cache lookup keyed on config, dev set, evaluation size and extractor
cached_baseline = check_baseline_cache(
    training_dir=training_dir,
    config_cid=config['cid'],
    devset_cid=devset_cid,
    eval_size=EVAL_SIZE,
    extractor_cid=extractor_cid,
)

if not cached_baseline:
    # Cache miss: run the evaluation now
    print(f"Computing baseline evaluation ({EVAL_SIZE} examples)...")

    evaluator = dspy.Evaluate(
        devset=devset[:EVAL_SIZE],
        metric=statement_quality_metric,
        num_threads=1,
        display_progress=True,
        provide_traceback=True,  # surface full errors while debugging
    )

    baseline_result = evaluator(baseline_extractor)

    # The evaluator result either carries a .score attribute or is itself numeric
    if hasattr(baseline_result, 'score'):
        baseline_score = baseline_result.score
    else:
        baseline_score = float(baseline_result)
    print(f"\nBaseline quality score: {baseline_score:.2f}")

    # Store the score together with the CIDs of the inputs that produced it
    baseline_cid = save_baseline_results(
        output_dir=training_dir,
        score=baseline_score,
        eval_size=EVAL_SIZE,
        config_cid=config['cid'],
        extractor_cid=extractor_cid,
        devset_cid=devset_cid,
    )
    print(f"Saved baseline results CID: {baseline_cid}")
else:
    # Cache hit: reuse the stored score and CID
    baseline_score = cached_baseline['score']
    baseline_cid = cached_baseline['cid']
    print(f"‚úÖ Using cached baseline results (inputs unchanged)")
    print(f"   Score: {baseline_score:.2f}")
    print(f"   CID: {baseline_cid}")

Extractor CID: bafkreifceqfihybmpbxhqo4ox2ubndggakeuu4wbi7lzepi53xypi5wbnm
Computing baseline evaluation (686 examples)...
Average Metric: 35.85 / 38 (94.3%):   6%|‚ñå         | 38/686 [07:29<2:25:40, 13.49s/it]



KeyboardInterrupt: 

## 9b. MLflow Observability (Optional)

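MLflow provides tracing, evaluation, and human feedback tools for DSPy pipelines.
Once `mlflow.dspy.autolog()` has been called, **DSPy operations run after that call are automatically traced** -
no separate evaluation loop is needed. Because the setup cell below comes after the baseline evaluation in section 9, re-run that evaluation after enabling autolog if you want its LM calls to appear in MLflow.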

### Quick Setup (One-time)

1. **Start the MLflow server** in a terminal:
   ```bash
   mlflow server \
     --backend-store-uri sqlite:///mlflow.sqlite \
     --default-artifact-root ./mlflow-artifacts \
     --host 0.0.0.0 \
     --port 5000
   ```
2. **Open the UI** at http://localhost:5000 (or via VS Code port forwarding)

### What MLflow Provides
- **Tracing**: See every LM call, inputs, outputs, latency (automatic with autolog)
- **Evaluation**: Compare model versions side-by-side
- **Human Feedback**: Add labels/assessments directly in the UI
- **Experiment Tracking**: Track metrics across optimization runs

In [None]:
# =============================================================================
# MLflow Setup for DSPy Optimization Tracking (OPTIONAL)
# =============================================================================
# Reference: https://dspy.ai/tutorials/optimizer_tracking/
#
# Purpose: point MLflow at a local tracking server, select the experiment and
# switch on DSPy autologging. Sets MLFLOW_ENABLED (True on success, False on
# any failure) so later cells can branch on it.
#
# When autolog is enabled, ALL DSPy operations (evaluate, compile, etc.)
# are automatically traced - no manual logging needed!
# NOTE(review): presumably only operations executed AFTER this cell are traced.
# The baseline evaluation earlier in this notebook runs before autolog is
# enabled, so it may need to be re-run to show up in MLflow - confirm.
#
# Prerequisites:
#   1. Install: pip install "mlflow>=2.21.1"
#   2. Start server: mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000
#   3. Open UI: http://localhost:5000
# =============================================================================

try:
    # Imported inside the try so a missing package only disables MLflow
    import mlflow
    
    # Defined only once the import has succeeded; cells that read
    # MLFLOW_TRACKING_URI should check MLFLOW_ENABLED first.
    MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("wiki3-kg-stage1-statements")
    
    # Enable autologging - this automatically traces ALL DSPy operations
    # No separate evaluation loop needed!
    mlflow.dspy.autolog(
        log_compiles=True,           # Track optimization process
        log_evals=True,              # Track dspy.Evaluate results  
        log_traces_from_compile=True # Track program traces during optimization
    )
    
    print(f"‚úÖ MLflow autolog enabled - all DSPy operations will be traced")
    print(f"   Tracking URI: {MLFLOW_TRACKING_URI}")
    print(f"   Experiment: wiki3-kg-stage1-statements")
    print(f"\nüìä Open MLflow UI: {MLFLOW_TRACKING_URI}")
    # NOTE(review): "above" refers to the baseline evaluation cell, which ran
    # before autolog was enabled - confirm those calls are actually traced.
    print(f"   ‚Üí Traces tab shows all LM calls from dspy.Evaluate above")
    MLFLOW_ENABLED = True
except Exception as e:
    # Deliberate best-effort: any exception raised in the block above (including
    # the import failing) is reported and MLflow is simply marked as disabled.
    print(f"‚ö†Ô∏è  MLflow not available: {e}")
    print(f"   (This is optional - baseline evaluation works without it)")
    MLFLOW_ENABLED = False

In [None]:
# Report whether MLflow tracing is active. If it is not, fall back to printing
# the two most recent LM calls with dspy.inspect_history().
#
# To view traces when MLflow is enabled:
#   1. Open MLflow UI at http://127.0.0.1:5000
#   2. Click on the experiment "wiki3-kg-stage1-statements"
#   3. Click "Traces" tab to see all LM calls
#   4. Click individual traces to review inputs/outputs
#   5. Use "Feedback" to add human labels

if not MLFLOW_ENABLED:
    print("‚ÑπÔ∏è  MLflow not running - using dspy.inspect_history() for debugging instead")
    print("\nRecent LM calls:")
    print("=" * 60)
    dspy.inspect_history(n=2)
else:
    print(
        "‚úÖ MLflow autolog is enabled",
        "   All DSPy operations from this notebook are being traced automatically.",
        f"\nüìä View traces at: {MLFLOW_TRACKING_URI}",
        "   ‚Üí No separate evaluation loop needed!",
        sep="\n",
    )

In [None]:
# Quick debugging aid that needs no MLflow server: print a header and then
# the two most recent LM calls recorded by DSPy.

print(
    "Recent LM calls (use MLflow UI for full traces):",
    "=" * 60,
    sep="\n",
)
dspy.inspect_history(n=2)

### MLflow Evaluation with Human Feedback

Use MLflow's evaluation API to systematically review predictions and collect human labels.
The MLflow UI provides a proper interface for reviewing and annotating.

In [None]:
# Tabulate the evaluation inputs: one row per dev example with its position,
# chunk text and section context.
import pandas as pd

eval_data = [
    {
        "index": position,
        "chunk_text": example.chunk_text,
        "section_context": example.section_context,
    }
    for position, example in enumerate(devset[:EVAL_SIZE])
]

eval_df = pd.DataFrame(eval_data)
print(f"Created evaluation dataset with {len(eval_df)} examples")
eval_df.head()

In [None]:
# ============================================================================
# Per-Statement Classification for ALL Evaluation Examples
# ============================================================================
# For each of the first EVAL_SIZE dev examples: run the baseline extractor,
# then ask StatementClassifier for a GOOD/BAD verdict on every statement.
# Results are collected in all_classification_results and summarized at the end.

from ontological_engineer import StatementClassifier, StatementClassification

classifier = StatementClassifier()

# One record per evaluated chunk (also read by the detailed per-chunk report)
all_classification_results = []

print(f"Classifying statements for {EVAL_SIZE} chunks...")
print("=" * 70)

for idx, ex in enumerate(devset[:EVAL_SIZE]):
    print(f"\n[{idx+1}/{EVAL_SIZE}] {ex.section_context[:50]}...")
    
    # Extract statements
    pred = baseline_extractor(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
    )
    # Materialize once so the classifier input and the stored record are
    # guaranteed to be the same list contents.
    statements = list(pred.statements)
    
    # Classify each statement
    result = classifier(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
        statements=statements,
    )
    
    # Store result
    all_classification_results.append({
        "idx": idx,
        "section": ex.section_context,
        "chunk_text": ex.chunk_text,
        "statements": statements,
        "score": result.score,
        "classifications": result.classifications,
        "missing_facts": result.missing_facts,
    })
    
    # Show quick summary
    good = sum(1 for c in result.classifications if c.is_good)
    total = len(result.classifications)
    print(f"   ‚Üí {good}/{total} GOOD ({result.score:.0%})")

# Summary
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
total_good = sum(sum(1 for c in r["classifications"] if c.is_good) for r in all_classification_results)
total_bad = sum(sum(1 for c in r["classifications"] if not c.is_good) for r in all_classification_results)
if all_classification_results:
    avg_score = sum(r["score"] for r in all_classification_results) / len(all_classification_results)
    print(f"Average score: {avg_score:.1%}")
else:
    # Nothing was evaluated (EVAL_SIZE of 0 or an empty dev set): the average
    # is undefined, so report that instead of dividing by zero.
    avg_score = 0.0
    print("Average score: n/a (no examples were classified)")
print(f"Total GOOD: {total_good}, Total BAD: {total_bad}")

In [None]:
# Per-chunk report: score line, every statement judged BAD with its reason,
# and any missing facts the classifier reported.
print("DETAILED PER-CHUNK RESULTS")
print("=" * 70)

for r in all_classification_results:
    verdicts = r["classifications"]
    # Statements that were not judged good are the ones that need attention
    bad_stmts = [c for c in verdicts if not c.is_good]
    total = len(verdicts)
    bad = len(bad_stmts)
    good = total - bad  # every verdict is either good or not good

    print(f"\nüìÑ Chunk {r['idx']}: {r['section'][:60]}...")
    print(f"   Score: {r['score']:.0%} ({good}/{total} GOOD)")

    if bad_stmts:
        print(f"   ‚ùå BAD statements:")
        for c in bad_stmts:
            print(f"      [{c.index}] {c.statement[:80]}...")
            print(f"          Reason: {c.reason}")

    # Skip the line when missing_facts is empty or is the literal text "none"
    missing = r["missing_facts"]
    if missing and not missing.lower() == "none":
        print(f"   üìù Missing: {r['missing_facts'][:100]}...")

### Export Annotations from MLflow

After reviewing and labeling in the MLflow UI, export your annotations for judge improvement.

In [None]:
# Load annotations from MLflow (after you've labeled them in the UI).
# MLflow stores feedback as assessments on traces.
#
# The MLflow setup cell is optional and may have failed, in which case `mlflow`
# may not be imported or the server may be unreachable. MLFLOW_ENABLED is read
# via globals() so this cell also degrades gracefully when the setup cell was
# never run.

if not globals().get("MLFLOW_ENABLED", False):
    print("MLflow is not enabled. Start the MLflow server and run the MLflow setup cell first.")
else:
    client = mlflow.MlflowClient()

    # Get the latest evaluation run
    experiment = client.get_experiment_by_name("wiki3-kg-stage1-statements")
    if experiment:
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["start_time DESC"],
            max_results=1,
        )

        if runs:
            latest_run = runs[0]
            print(f"Latest run: {latest_run.info.run_id}")
            print(f"Metrics: {latest_run.data.metrics}")

            # Get traces with assessments (human feedback)
            try:
                traces = client.search_traces(
                    experiment_ids=[experiment.experiment_id],
                    max_results=100,
                )
                print(f"Found {len(traces)} traces")
            except Exception as e:
                # Best-effort: trace search failing should not abort the notebook
                print(f"Trace search error: {e}")
        else:
            # Previously this case produced no output at all
            print("No runs found in the experiment. Run evaluation first.")
    else:
        print("No experiment found. Run evaluation first.")

In [None]:
# Use human feedback to improve the judge.
# After collecting labels in MLflow, create DSPy training examples.

# For now, save the evaluation data (eval_df) as JSON records for later use.
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
# parents=True so the save also works when the parent directories do not exist
# yet (for example when this cell is run without the dataset-saving cell).
output_dir.mkdir(parents=True, exist_ok=True)

eval_df.to_json(output_dir / "eval_dataset.json", orient="records", indent=2)
print(f"Saved evaluation dataset to {output_dir / 'eval_dataset.json'}")

print("""
üìã Next steps for human feedback:

1. Start MLflow server:
   mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000

2. Open MLflow UI at http://127.0.0.1:5000

3. Navigate to the experiment 'wiki3-kg-stage1-statements'

4. Click on traces to review predictions

5. Use the feedback/assessment features to label quality

6. Export labeled data for judge improvement
""")

## Summary

This notebook:
1. Loaded Albert Einstein as few-shot examples (seed/guidance)
2. Fetched and chunked Wikipedia sample pages for training
3. Created train/dev datasets with CID provenance
4. Established baseline extraction quality
5. Saved all artifacts for optimization

**Artifacts saved** (in `data/training/`):
- `stage1_config.json` - Model config with CID
- `trainset.json` - Training examples with CID
- `devset.json` - Dev examples with CID
- `fewshot_examples.json` - Few-shot examples with CID
- `baseline_results.json` - Baseline score with CID

**Next step**: Run `stage1_optimize.ipynb` for MIPROv2 optimization