In [1]:
# Setup and imports
import sys
sys.path.insert(0, '/workspaces/wiki3-kg-project')

import dspy
import json
import requests
import re
from pathlib import Path
from random import shuffle, seed as random_seed
from tqdm import tqdm

from ontological_engineer import (
    configure_lm,
    StatementExtractor,
    StatementQualityJudge,
)
from ontological_engineer.judges import statement_quality_metric
from ontological_engineer.training.bootstrap import (
    load_chunks_from_notebook,
    load_facts_from_notebook,
    create_training_examples,
)

## 1. Configure Language Model

Connect to LM Studio running Qwen-30B (or your preferred model).

In [2]:
# Configure the LM (defaults to Qwen-30B via LM Studio)
lm = configure_lm(
    model="qwen/qwen3-coder-30b",
    api_base="http://host.docker.internal:1234/v1",
    temperature=0.7,
)

print(f"Configured LM: {lm}")

Configured LM: <dspy.clients.lm.LM object at 0xffffa46693c0>


## 2. Load Few-Shot Examples (Albert Einstein)

Albert Einstein is our gold-standard example. These chunks and their extracted facts
serve as few-shot demonstrations for the extractor and judge.

In [3]:
# Load Albert Einstein data for few-shot examples
fewshot_dir = Path("/workspaces/wiki3-kg-project/data/albert_einstein/20251218_231446")

fewshot_chunks = load_chunks_from_notebook(fewshot_dir / "chunks.ipynb")
fewshot_facts = load_facts_from_notebook(fewshot_dir / "facts.ipynb")

print(f"Loaded {len(fewshot_chunks)} chunks from Albert Einstein")
print(f"Loaded {len(fewshot_facts)} fact sets")

# Create few-shot examples
fewshot_examples = create_training_examples(fewshot_chunks, fewshot_facts)
print(f"Created {len(fewshot_examples)} few-shot examples")

Loaded 63 chunks from Albert Einstein
Loaded 19 fact sets
Created 19 few-shot examples


In [4]:
# Show a few-shot example
if fewshot_examples:
    ex = fewshot_examples[0]
    print("Sample few-shot example:")
    print(f"  Context: {ex.section_context}")
    print(f"  Text: {ex.chunk_text[:200]}...")
    print(f"  Statements: {len(ex.statements)} items")
    for stmt in ex.statements[:3]:
        print(f"    - {stmt}")

Sample few-shot example:
  Context: Albert Einstein > Introduction
  Text: Albert Einstein (14 March 1879 ‚Äì 18 April 1955) was a German-born theoretical physicist best known for developing the theory of relativity. Einstein also made important contributions to quantum theory...
  Statements: 28 items
    - Albert Einstein was a German-born theoretical physicist.
    - Albert Einstein developed the theory of relativity.
    - Albert Einstein made important contributions to quantum theory.


## 3. Load Wikipedia Sample for Training

Load the 100-page Wikipedia sample. We'll fetch article content and chunk it for training.

In [5]:
# Load the Wikipedia sample
sample_file = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.json")

with open(sample_file) as f:
    wiki_sample = json.load(f)

print(f"Loaded {len(wiki_sample['pages'])} Wikipedia pages")
print(f"Sampling method: {wiki_sample['metadata']['sampling_method']}")
print(f"\nFirst 10 pages:")
for p in wiki_sample['pages'][:10]:
    print(f"  - {p['title']} ({p['views']:,} views)")

Loaded 100 Wikipedia pages
Sampling method: power_law

First 10 pages:
  - Zohran Mamdani (9,344,963 views)
  - ChatGPT (3,639,485 views)
  - James A. Garfield (3,524,531 views)
  - 1989 Tiananmen Square protests and massacre (2,867,005 views)
  - 2025 Bihar Legislative Assembly election (2,555,071 views)
  - Mira Nair (2,503,516 views)
  - Dick Cheney (2,186,840 views)
  - 2026 FIFA World Cup (2,155,565 views)
  - 1xBet (1,831,684 views)
  - Survivor Series: WarGames (2025) (1,590,263 views)


## 4. Fetch and Chunk Wikipedia Pages

For each page in our sample, fetch the content and break it into chunks.
We use the same chunking strategy as the original pipeline.

In [6]:
def fetch_wikipedia_content(title):
    """
    Fetch the plain text content of a Wikipedia article.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
        "format": "json",
    }
    headers = {"User-Agent": "Wiki3.ai OntologicalEngineer/0.1"}
    
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        pages = data.get("query", {}).get("pages", {})
        for page_id, page_data in pages.items():
            if page_id == "-1":
                return None
            return page_data.get("extract", "")
    except Exception as e:
        print(f"  Error fetching {title}: {e}")
        return None
    
    return None


def chunk_article(title, text, max_chunk_size=1500):
    """
    Split article into chunks by section.
    """
    if not text:
        return []
    
    chunks = []
    
    # Split by section headers (== Header ==)
    sections = re.split(r'\n(==+\s+.+?\s+==+)\n', text)
    
    current_section = title
    current_text = ""
    
    for i, part in enumerate(sections):
        # Check if this is a header
        if re.match(r'==+\s+.+?\s+==+', part):
            # Save previous chunk if it has content
            if current_text.strip():
                chunks.append({
                    "text": current_text.strip(),
                    "section_context": f"{title} > {current_section}",
                })
            
            # Update section name
            current_section = part.strip('= \n')
            current_text = ""
        else:
            current_text += part
            
            # If chunk gets too large, split it
            while len(current_text) > max_chunk_size:
                # Find a good break point (paragraph)
                break_point = current_text.rfind('\n\n', 0, max_chunk_size)
                if break_point == -1:
                    break_point = current_text.rfind('. ', 0, max_chunk_size)
                if break_point == -1:
                    break_point = max_chunk_size
                
                chunk_text = current_text[:break_point].strip()
                if chunk_text:
                    chunks.append({
                        "text": chunk_text,
                        "section_context": f"{title} > {current_section}",
                    })
                current_text = current_text[break_point:].strip()
    
    # Don't forget the last chunk
    if current_text.strip():
        chunks.append({
            "text": current_text.strip(),
            "section_context": f"{title} > {current_section}",
        })
    
    return chunks


# Test on one page
test_title = wiki_sample['pages'][0]['title']
test_content = fetch_wikipedia_content(test_title)
if test_content:
    test_chunks = chunk_article(test_title, test_content)
    print(f"Test: '{test_title}' -> {len(test_chunks)} chunks")
    if test_chunks:
        print(f"First chunk: {test_chunks[0]['text'][:200]}...")

Test: 'Zohran Mamdani' -> 20 chunks
First chunk: Zohran Kwame Mamdani (born October 18, 1991) is an American politician who is the mayor-elect of New York City. A member of the Democratic Party and the Democratic Socialists of America, he is set to ...


In [7]:
# Fetch and chunk all pages in sample (limit to first N for faster iteration)
MAX_PAGES = 20  # Increase for full training run
MIN_CHUNK_LENGTH = 100  # Skip very short chunks

training_chunks = []
pages_processed = 0

print(f"Fetching content for {min(MAX_PAGES, len(wiki_sample['pages']))} pages...")

for page in tqdm(wiki_sample['pages'][:MAX_PAGES]):
    content = fetch_wikipedia_content(page['title'])
    if content:
        chunks = chunk_article(page['title'], content)
        # Filter short chunks
        chunks = [c for c in chunks if len(c['text']) >= MIN_CHUNK_LENGTH]
        training_chunks.extend(chunks)
        pages_processed += 1

print(f"\nProcessed {pages_processed} pages")
print(f"Total training chunks: {len(training_chunks)}")

Fetching content for 20 pages...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:09<00:00,  2.19it/s]


Processed 20 pages
Total training chunks: 879





## 5. Initialize Extractor with Few-Shot Examples

Create the statement extractor and provide Albert Einstein examples as demonstrations.

In [8]:
# Select best few-shot examples (ones with good variety of statements)
NUM_FEWSHOT = 3

# Sort by statement count to get diverse examples
sorted_fewshot = sorted(fewshot_examples, key=lambda x: len(x.statements), reverse=True)
selected_fewshot = sorted_fewshot[:NUM_FEWSHOT]

print(f"Selected {len(selected_fewshot)} few-shot examples:")
for i, ex in enumerate(selected_fewshot, 1):
    print(f"  {i}. {ex.section_context[:50]}... ({len(ex.statements)} statements)")

Selected 3 few-shot examples:
  1. Albert Einstein > Introduction... (28 statements)
  2. Albert Einstein > Life and career > Personal views... (28 statements)
  3. Albert Einstein > Introduction... (27 statements)


In [9]:
# Create extractor with few-shot demonstrations
extractor = StatementExtractor()

# In DSPy, we can provide demonstrations directly
# The few-shot examples will be used by MIPROv2 for bootstrapping
print("Extractor initialized")
print(f"Few-shot examples available: {len(selected_fewshot)}")

Extractor initialized
Few-shot examples available: 3


## 6. Test Extraction on Training Sample

Run the extractor on a few training chunks to verify it works.

In [10]:
# Test on a training chunk
if training_chunks:
    test_chunk = training_chunks[0]
    
    print(f"Testing on: {test_chunk['section_context']}")
    print(f"Text: {test_chunk['text'][:300]}...")
    print("\n" + "="*60 + "\n")
    
    result = extractor(
        chunk_text=test_chunk['text'],
        section_context=test_chunk['section_context'],
    )
    
    print(f"Extracted {len(result.statements)} statements:")
    for i, stmt in enumerate(result.statements[:10], 1):
        print(f"  {i}. {stmt}")
    if len(result.statements) > 10:
        print(f"  ... and {len(result.statements) - 10} more")

Testing on: Zohran Mamdani > Zohran Mamdani
Text: Zohran Kwame Mamdani (born October 18, 1991) is an American politician who is the mayor-elect of New York City. A member of the Democratic Party and the Democratic Socialists of America, he is set to become New York's first Muslim and Asian American mayor. Mamdani has served as a member of the New Y...


Extracted 22 statements:
  1. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) was born on October 18, 1991.
  2. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is an American politician.
  3. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is the mayor-elect of [New York City](/wiki/New_York_City).
  4. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is a member of the [Democratic Party](/wiki/Democratic_Party).
  5. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is a member of the [Democratic Socialists of America](/wiki/Democratic_Socialists_of_America).
  6. [Zohran Kwame Mamdani](/wiki/Zohran_Kwame_Mamdani) is set t

## 7. Create Training Dataset

Convert chunks into DSPy examples. For training, we need to generate initial extractions
that can be scored and optimized.

In [11]:
# Create training examples (without labels - we'll generate and judge them)
# For DSPy optimization, we just need the inputs

random_seed(42)  # For reproducibility
shuffle(training_chunks)

# Create DSPy examples from chunks
trainset = []
for chunk in training_chunks:
    ex = dspy.Example(
        chunk_text=chunk['text'],
        section_context=chunk['section_context'],
    ).with_inputs('chunk_text', 'section_context')
    trainset.append(ex)

# Split into train/dev
split_idx = int(len(trainset) * 0.8)
devset = trainset[split_idx:]
trainset = trainset[:split_idx]

print(f"Training set: {len(trainset)} examples")
print(f"Dev set: {len(devset)} examples")

Training set: 703 examples
Dev set: 176 examples


## 8. Initialize Judge with Few-Shot Guidance

The judge scores extraction quality. We use Albert Einstein examples to calibrate.

In [12]:
# Initialize judge
judge = StatementQualityJudge()

# Test judge on a known good example (Albert Einstein few-shot)
if selected_fewshot:
    test_ex = selected_fewshot[0]
    
    evaluation = judge(
        chunk_text=test_ex.chunk_text,
        section_context=test_ex.section_context,
        statements=test_ex.statements,
    )
    
    print("Judge calibration on few-shot example:")
    print(f"  Completeness:      {evaluation.completeness:.2f}")
    print(f"  Atomicity:         {evaluation.atomicity:.2f}")
    print(f"  Accuracy:          {evaluation.accuracy:.2f}")
    print(f"  Link preservation: {evaluation.link_preservation:.2f}")
    print(f"  ---")
    print(f"  Weighted score:    {evaluation.weighted_score:.2f}")

Judge calibration on few-shot example:
  Completeness:      0.95
  Atomicity:         0.85
  Accuracy:          0.95
  Link preservation: 1.00
  ---
  Weighted score:    0.94


## 9. Baseline Evaluation

Evaluate the unoptimized extractor on the dev set.

In [13]:
# Evaluate baseline on dev set
EVAL_SIZE = min(10, len(devset))  # Limit for speed

evaluator = dspy.Evaluate(
    devset=devset[:EVAL_SIZE],
    metric=statement_quality_metric,
    num_threads=1,
    display_progress=True,
)

baseline_extractor = StatementExtractor()
baseline_result = evaluator(baseline_extractor)

baseline_score = baseline_result.score if hasattr(baseline_result, 'score') else float(baseline_result)
print(f"\nBaseline quality score: {baseline_score:.2f}")

Average Metric: 8.72 / 10 (87.2%): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:55<00:00, 11.51s/it]

2025/12/20 01:29:59 INFO dspy.evaluate.evaluate: Average Metric: 8.724999999999998 / 10 (87.2%)




Baseline quality score: 87.25


## 9b. Interactive Judgment Review

Review the judge's outputs and provide corrections. Your feedback will be saved
as labeled examples to improve the judge via DSPy's few-shot learning.

In [14]:
# Run detailed evaluation and capture individual results
detailed_results = []

print("Running extraction + judgment on dev set...")
for i, ex in enumerate(tqdm(devset[:EVAL_SIZE])):
    # Run extraction
    pred = baseline_extractor(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
    )
    
    # Run judge
    eval_result = judge(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
        statements=pred.statements,
    )
    
    detailed_results.append({
        "index": i,
        "section_context": ex.section_context,
        "chunk_text": ex.chunk_text,
        "statements": list(pred.statements),
        "completeness": float(eval_result.completeness),
        "atomicity": float(eval_result.atomicity),
        "accuracy": float(eval_result.accuracy),
        "link_preservation": float(eval_result.link_preservation),
        "weighted_score": float(eval_result.weighted_score),
        "reasoning": eval_result.reasoning,
        # Human annotations (to be filled)
        "human_approved": None,
        "human_scores": None,
        "human_notes": None,
    })

# Sort by score (worst first for review)
detailed_results = sorted(detailed_results, key=lambda x: x["weighted_score"])
print(f"\nReady to review {len(detailed_results)} examples (sorted by score, worst first)")

Running extraction + judgment on dev set...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 44.59it/s]


Ready to review 10 examples (sorted by score, worst first)





In [None]:
# Interactive review widget
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# Current review state
review_state = {"current_idx": 0}

def display_example(idx):
    """Display an example for review."""
    r = detailed_results[idx]
    
    print("=" * 80)
    print(f"Example {r['index']} ({idx + 1}/{len(detailed_results)}) | Judge Score: {r['weighted_score']:.1f}")
    print(f"Context: {r['section_context']}")
    print("=" * 80)
    
    print(f"\nüìÑ CHUNK TEXT:\n{'-' * 40}")
    print(r['chunk_text'][:800])
    if len(r['chunk_text']) > 800:
        print(f"... ({len(r['chunk_text']) - 800} more chars)")
    
    print(f"\nüìù EXTRACTED STATEMENTS ({len(r['statements'])}):\n{'-' * 40}")
    for j, stmt in enumerate(r['statements'], 1):
        print(f"  {j}. {stmt}")
    
    print(f"\n‚öñÔ∏è JUDGE SCORES:\n{'-' * 40}")
    print(f"  Completeness:      {r['completeness']:.0f}")
    print(f"  Atomicity:         {r['atomicity']:.0f}")
    print(f"  Accuracy:          {r['accuracy']:.0f}")
    print(f"  Link preservation: {r['link_preservation']:.0f}")
    print(f"  ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
    print(f"  Weighted Score:    {r['weighted_score']:.1f}")
    
    print(f"\nüí≠ JUDGE REASONING:\n{'-' * 40}")
    print(r['reasoning'])
    
    if r['human_approved'] is not None:
        status = "‚úÖ APPROVED" if r['human_approved'] else "‚ùå CORRECTED"
        print(f"\nüè∑Ô∏è Your verdict: {status}")
        if r['human_notes']:
            print(f"   Notes: {r['human_notes']}")

display_example(0)

In [15]:
# Annotation functions - run these cells to provide feedback

def approve(notes=""):
    """Approve the judge's assessment for the current example."""
    idx = review_state["current_idx"]
    detailed_results[idx]["human_approved"] = True
    detailed_results[idx]["human_notes"] = notes
    print(f"‚úÖ Approved example {idx}")
    next_example()

def correct(completeness=None, atomicity=None, accuracy=None, 
            link_preservation=None, notes=""):
    """Override the judge's scores with your own assessment."""
    idx = review_state["current_idx"]
    r = detailed_results[idx]
    
    # Use original scores where not overridden
    r["human_approved"] = False
    r["human_scores"] = {
        "completeness": completeness if completeness is not None else r["completeness"],
        "atomicity": atomicity if atomicity is not None else r["atomicity"],
        "accuracy": accuracy if accuracy is not None else r["accuracy"],
        "link_preservation": link_preservation if link_preservation is not None else r["link_preservation"],
    }
    r["human_notes"] = notes
    
    print(f"‚ùå Corrected example {idx}")
    print(f"   Original: {r['weighted_score']:.0f}")
    hs = r["human_scores"]
    human_weighted = 0.3*hs["completeness"] + 0.25*hs["atomicity"] + 0.3*hs["accuracy"] + 0.15*hs["link_preservation"]
    print(f"   Corrected: {human_weighted:.0f}")
    next_example()

def next_example():
    """Move to the next example."""
    review_state["current_idx"] = min(review_state["current_idx"] + 1, len(detailed_results) - 1)
    clear_output(wait=True)
    display_example(review_state["current_idx"])

def prev_example():
    """Move to the previous example."""
    review_state["current_idx"] = max(review_state["current_idx"] - 1, 0)
    clear_output(wait=True)
    display_example(review_state["current_idx"])

def goto(idx):
    """Jump to a specific example."""
    review_state["current_idx"] = max(0, min(idx, len(detailed_results) - 1))
    clear_output(wait=True)
    display_example(review_state["current_idx"])

def show_progress():
    """Show review progress."""
    approved = sum(1 for r in detailed_results if r["human_approved"] == True)
    corrected = sum(1 for r in detailed_results if r["human_approved"] == False)
    pending = sum(1 for r in detailed_results if r["human_approved"] is None)
    print(f"Progress: ‚úÖ {approved} approved, ‚ùå {corrected} corrected, ‚è≥ {pending} pending")

print("Review functions loaded. Use:")
print("  approve()           - Accept judge's assessment")
print("  approve('notes')    - Accept with notes")
print("  correct(accuracy=95, notes='...')  - Override specific scores")
print("  next_example()      - Skip to next")
print("  prev_example()      - Go back")
print("  goto(5)             - Jump to example 5")
print("  show_progress()     - See review stats")

Review functions loaded. Use:
  approve()           - Accept judge's assessment
  approve('notes')    - Accept with notes
  correct(accuracy=95, notes='...')  - Override specific scores
  next_example()      - Skip to next
  prev_example()      - Go back
  goto(5)             - Jump to example 5
  show_progress()     - See review stats


### Annotate Current Example

Run one of these cells to provide your feedback:

In [None]:
# APPROVE: Judge got it right
approve()

In [None]:
# CORRECT: Override judge's scores (edit values as needed)
correct(
    completeness=90,   # Your score (0-100)
    atomicity=85,      # Your score (0-100)  
    accuracy=95,       # Your score (0-100)
    link_preservation=80,  # Your score (0-100)
    notes="Judge was too harsh on X because Y"
)

In [None]:
# SKIP: Move to next without annotating
next_example()

### Save Annotations as DSPy Training Data

After reviewing, save your corrections as labeled examples for the judge.

In [None]:
# Show review progress
show_progress()

# Save annotations as DSPy Examples for judge improvement
judge_training_examples = []

for r in detailed_results:
    if r["human_approved"] is None:
        continue  # Skip unannotated
    
    # Use human scores if corrected, otherwise use original (approved)
    if r["human_approved"]:
        scores = {
            "completeness": r["completeness"],
            "atomicity": r["atomicity"],
            "accuracy": r["accuracy"],
            "link_preservation": r["link_preservation"],
        }
    else:
        scores = r["human_scores"]
    
    # Create DSPy Example with inputs and outputs
    ex = dspy.Example(
        # Inputs
        chunk_text=r["chunk_text"],
        section_context=r["section_context"],
        statements=r["statements"],
        # Outputs (what judge should produce)
        completeness=scores["completeness"],
        atomicity=scores["atomicity"],
        accuracy=scores["accuracy"],
        link_preservation=scores["link_preservation"],
    ).with_inputs("chunk_text", "section_context", "statements")
    
    judge_training_examples.append(ex)

print(f"\nCreated {len(judge_training_examples)} training examples for judge improvement")

# Save to file
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
output_dir.mkdir(exist_ok=True)

annotations_file = output_dir / "judge_annotations.json"
annotations_data = []
for r in detailed_results:
    if r["human_approved"] is not None:
        annotations_data.append({
            "chunk_text": r["chunk_text"],
            "section_context": r["section_context"],
            "statements": r["statements"],
            "judge_scores": {
                "completeness": r["completeness"],
                "atomicity": r["atomicity"],
                "accuracy": r["accuracy"],
                "link_preservation": r["link_preservation"],
            },
            "judge_reasoning": r["reasoning"],
            "human_approved": r["human_approved"],
            "human_scores": r["human_scores"],
            "human_notes": r["human_notes"],
        })

with open(annotations_file, "w") as f:
    json.dump(annotations_data, f, indent=2)

print(f"Saved annotations to {annotations_file}")

In [None]:
# Use annotations to improve the judge via few-shot learning
# DSPy's LabeledFewShot uses your corrections as demonstrations

from dspy.teleprompt import LabeledFewShot

if judge_training_examples:
    fewshot_optimizer = LabeledFewShot(k=min(3, len(judge_training_examples)))
    
    improved_judge = fewshot_optimizer.compile(
        StatementQualityJudge(),
        trainset=judge_training_examples,
    )
    
    print("‚úÖ Judge improved with your annotations!")
    print(f"   Using {len(judge_training_examples)} labeled examples as few-shot demos")
    
    # Save the improved judge
    try:
        improved_judge.save(output_dir / "improved_judge")
        print(f"   Saved to {output_dir / 'improved_judge'}")
    except:
        pass
else:
    print("No annotations yet. Review some examples first.")

## 10. MIPROv2 Prompt Optimization

Use DSPy's MIPROv2 optimizer to improve the extractor's prompts.
This uses the few-shot examples to bootstrap better demonstrations.

In [None]:
from dspy.teleprompt import MIPROv2

# Configure optimizer
optimizer = MIPROv2(
    metric=statement_quality_metric,
    num_candidates=3,  # Number of prompt candidates to try
    init_temperature=0.7,
)

# Use few-shot examples for bootstrapping demonstrations
# Use training set for optimization
TRAIN_SIZE = min(20, len(trainset))  # Limit for speed

print(f"Optimizing with {TRAIN_SIZE} training examples...")
print(f"Using {len(selected_fewshot)} few-shot demos for bootstrapping...")

optimized_extractor = optimizer.compile(
    StatementExtractor(),
    trainset=trainset[:TRAIN_SIZE],
    num_batches=2,
    max_bootstrapped_demos=NUM_FEWSHOT,
    # Provide few-shot examples as initial demos
    # demos=selected_fewshot,  # Uncomment if supported
)

print("\nOptimization complete!")

In [None]:
# Evaluate optimized extractor
optimized_result = evaluator(optimized_extractor)
optimized_score = optimized_result.score if hasattr(optimized_result, 'score') else float(optimized_result)

print(f"Baseline score:  {baseline_score:.2f}")
print(f"Optimized score: {optimized_score:.2f}")
print(f"Improvement:     {optimized_score - baseline_score:+.2f}")

## 11. Inspect Optimized Prompts

See what prompts MIPROv2 discovered.

In [None]:
# Inspect the optimized module
print("Optimized extractor configuration:")
print("="*60)

# Try to access the optimized signature/demos
if hasattr(optimized_extractor, 'demos'):
    print(f"\nDemonstrations: {len(optimized_extractor.demos)}")
    for i, demo in enumerate(optimized_extractor.demos[:2], 1):
        print(f"  Demo {i}: {demo.section_context[:50]}...")

# Check for any instruction changes
if hasattr(optimized_extractor, 'signature'):
    print(f"\nSignature: {optimized_extractor.signature}")

## 12. Save Results

Save the optimized extractor and training data.

In [None]:
# Save training metadata
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
output_dir.mkdir(exist_ok=True)

# Save training results
results = {
    "baseline_score": baseline_score,
    "optimized_score": optimized_score,
    "train_size": TRAIN_SIZE,
    "eval_size": EVAL_SIZE,
    "num_fewshot": NUM_FEWSHOT,
    "pages_processed": pages_processed,
    "total_chunks": len(training_chunks),
}

with open(output_dir / "stage1_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"Saved results to {output_dir / 'stage1_results.json'}")

In [None]:
# Save the optimized extractor state
try:
    optimized_extractor.save(output_dir / "optimized_extractor")
    print(f"Saved optimized extractor to {output_dir / 'optimized_extractor'}")
except Exception as e:
    print(f"Could not save extractor state: {e}")
    # Alternative: save as JSON
    if hasattr(optimized_extractor, 'dump_state'):
        state = optimized_extractor.dump_state()
        with open(output_dir / "optimized_extractor_state.json", "w") as f:
            json.dump(state, f, indent=2)
        print("Saved extractor state as JSON")

In [None]:
# Save few-shot examples for reference
fewshot_data = []
for ex in selected_fewshot:
    fewshot_data.append({
        "chunk_text": ex.chunk_text,
        "section_context": ex.section_context,
        "statements": list(ex.statements),
    })

with open(output_dir / "fewshot_examples.json", "w") as f:
    json.dump(fewshot_data, f, indent=2)

print(f"Saved {len(fewshot_data)} few-shot examples")

## Summary

This notebook:
1. Loaded Albert Einstein as few-shot examples (seed/guidance)
2. Fetched and chunked Wikipedia sample pages for training
3. Established baseline extraction quality
4. Ran MIPROv2 prompt optimization
5. Saved the optimized extractor

Next steps:
- **Stage 2**: Schema matching with optimized statements
- **Stage 3**: RDF generation training
- **Arbor GRPO**: Fine-tune the full pipeline end-to-end