In [None]:
# =============================================================================
# Stage 1: Statement Extraction with DSPy
# =============================================================================
# This notebook orchestrates statement extraction training/evaluation.
# All application logic is in ontological_engineer - this notebook coordinates.
#
# Outputs (with CID provenance):
#   - data/training/chunks/*.ipynb - Chunked Wikipedia pages
#   - data/training/statements.ipynb - Extracted statements
#   - data/training/classifications.ipynb - Per-statement judgments
# =============================================================================

import sys
sys.path.insert(0, '/workspaces/wiki3-kg-project')

import dspy
import json
from pathlib import Path
from random import shuffle, seed as random_seed
from tqdm import tqdm

from ontological_engineer import (
    # LM Configuration
    configure_lm,
    # DSPy Modules
    StatementExtractor,
    StatementQualityJudge,
    StatementClassifier,
    StatementClassification,
    # Data Loading (from provenance-tracked notebooks)
    WikipediaPage,
    WikipediaChunk,
    load_sample_from_notebook,
    load_chunks_from_notebook,
    # Processing (all logic in module!)
    process_wikipedia_sample,
    fetch_page_content,
    chunk_article,
    # Provenance notebook generation
    save_notebook,
    get_processed_chunk_cids,
)
from ontological_engineer.judges import statement_quality_metric
from ontological_engineer.training.bootstrap import (
    load_chunks_from_notebook as load_albert_chunks,
    load_facts_from_notebook,
    create_training_examples,
)

## 1. Configure Language Model

Connect to LM Studio running Qwen-30B (or your preferred model).

In [None]:
# Point DSPy at the language model served by LM Studio. The connection
# settings are gathered in one mapping so the model id, endpoint, and
# sampling temperature can be read (and tuned) in a single place.
lm_settings = {
    "model": "qwen/qwen3-coder-30b",
    "api_base": "http://host.docker.internal:1234/v1",
    "temperature": 0.7,
}
lm = configure_lm(**lm_settings)

print(f"Configured LM: {lm}")

## 2. Load Few-Shot Examples (Albert Einstein)

Albert Einstein is our gold-standard example. These chunks and their extracted facts
serve as few-shot demonstrations for the extractor and judge.

In [None]:
# Albert Einstein run used as the source of few-shot examples.
fewshot_dir = Path("/workspaces/wiki3-kg-project/data/albert_einstein/20251218_231446")
chunks_notebook = fewshot_dir / "chunks.ipynb"
facts_notebook = fewshot_dir / "facts.ipynb"

# load_albert_chunks is the bootstrap module's loader (imported under an alias);
# it is deliberately used here instead of ontological_engineer's
# load_chunks_from_notebook.
fewshot_chunks = load_albert_chunks(chunks_notebook)
fewshot_facts = load_facts_from_notebook(facts_notebook)

print(f"Loaded {len(fewshot_chunks)} chunks from Albert Einstein")
print(f"Loaded {len(fewshot_facts)} fact sets")

# Build the few-shot examples from the loaded chunks and fact sets.
fewshot_examples = create_training_examples(fewshot_chunks, fewshot_facts)
print(f"Created {len(fewshot_examples)} few-shot examples")

In [None]:
# Preview the first few-shot example: its context, the first 200 characters of
# its text, and up to three of its statements. Nothing is printed when no
# examples were created.
if fewshot_examples:
    ex = fewshot_examples[0]
    preview_lines = [
        "Sample few-shot example:",
        f"  Context: {ex.section_context}",
        f"  Text: {ex.chunk_text[:200]}...",
        f"  Statements: {len(ex.statements)} items",
    ]
    preview_lines.extend(f"    - {stmt}" for stmt in ex.statements[:3])
    print("\n".join(preview_lines))

## 3. Load Wikipedia Sample for Training

Load the 100-page Wikipedia sample from the provenance-tracked notebook.
If the notebook doesn't exist, fall back to JSON format.

In [None]:
# Load the Wikipedia page sample. The provenance-tracked notebook is preferred;
# the plain JSON export is a fallback; if neither exists the cell fails with a
# FileNotFoundError that names the notebook to run first.
sample_notebook = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.ipynb")
sample_json = Path("/workspaces/wiki3-kg-project/data/training/wikipedia_sample.json")

if sample_notebook.exists():
    # Load from provenance-tracked notebook
    wiki_pages = load_sample_from_notebook(sample_notebook)
    print(f"‚úÖ Loaded {len(wiki_pages)} pages from provenance-tracked notebook")
    print(f"   Source: {sample_notebook}")
elif sample_json.exists():
    # Fall back to JSON format. Decode explicitly as UTF-8 so page titles with
    # non-ASCII characters load the same way regardless of the platform's
    # default locale encoding.
    with open(sample_json, encoding="utf-8") as f:
        wiki_sample = json.load(f)
    # Each JSON page entry supplies the 'title' and 'views' of a WikipediaPage.
    wiki_pages = [WikipediaPage(title=p['title'], views=p['views']) for p in wiki_sample['pages']]
    print(f"‚ö†Ô∏è  Loaded {len(wiki_pages)} pages from JSON (no provenance)")
    print(f"   Run sample_wikipedia_pages.ipynb to generate provenance-tracked version")
else:
    raise FileNotFoundError("No Wikipedia sample found. Run sample_wikipedia_pages.ipynb first.")

# Show the first ten pages with their view counts as a quick sanity check.
print(f"\nFirst 10 pages:")
for p in wiki_pages[:10]:
    print(f"  - {p.title} ({p.views:,} views)")

## 4. Fetch and Chunk Wikipedia Pages (with CID Provenance)

Fetch page content and chunk it. Each page's chunks are saved to a 
provenance-tracked notebook with CID signatures.

**Note**: Uses `fetch_page_content` and `chunk_article` from `ontological_engineer` - 
no application logic defined in this notebook!

In [None]:
# Parameters shared by this smoke test and the full processing run.
chunks_dir = Path("/workspaces/wiki3-kg-project/data/training/chunks")
MAX_PAGES = len(wiki_pages)
MIN_CHUNK_LENGTH = 60  # Minimum chunk text length kept by the filter below

# Smoke test: fetch and chunk only the first sampled page before the full run.
test_page = wiki_pages[0]
print(f"Testing on: {test_page.title}")

content = fetch_page_content(test_page.title)
if not content:
    print(f"  ‚ö†Ô∏è Could not fetch content")
else:
    # Keep only chunks whose text reaches the minimum length.
    chunks = list(filter(
        lambda c: len(c.text) >= MIN_CHUNK_LENGTH,
        chunk_article(test_page.title, content),
    ))
    print(f"  ‚Üí {len(chunks)} chunks (filtered by min_length={MIN_CHUNK_LENGTH})")
    if chunks:
        print(f"  First chunk preview: {chunks[0].text[:200]}...")

In [None]:
# Full run over the sampled pages. All processing logic lives in
# process_wikipedia_sample(); this cell only supplies the parameters and
# reports what came back.
# NOTE(review): the helper is described as handling fetching, chunking, saving
# with CID provenance, and incremental processing - confirm against its source.
sample_kwargs = dict(
    pages=wiki_pages,
    output_dir=chunks_dir,
    max_pages=MAX_PAGES,
    min_chunk_length=MIN_CHUNK_LENGTH,
)
training_chunks, pages_processed = process_wikipedia_sample(**sample_kwargs)

print(f"\n‚úÖ Processed {pages_processed} pages")
print(f"   Total training chunks: {len(training_chunks)}")
print(f"   Chunks saved to: {chunks_dir}")

## 5. Initialize Extractor with Few-Shot Examples

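Select the Albert Einstein examples to use as demonstrations and create the statement extractor. The selected examples are only reported in this section; they are not yet attached to the extractor.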

In [None]:
# Number of Albert Einstein examples kept as few-shot demonstrations.
NUM_FEWSHOT = 3


def _statement_count(example):
    """Return how many statements a few-shot example carries."""
    return len(example.statements)


# Rank the examples by statement count (largest first) and keep the top
# NUM_FEWSHOT. The ranking uses the count only; variety is not measured.
sorted_fewshot = sorted(fewshot_examples, key=_statement_count, reverse=True)
selected_fewshot = sorted_fewshot[:NUM_FEWSHOT]

print(f"Selected {len(selected_fewshot)} few-shot examples:")
for i, ex in enumerate(selected_fewshot, start=1):
    print(f"  {i}. {ex.section_context[:50]}... ({len(ex.statements)} statements)")

In [None]:
# Instantiate the statement extractor. The selected few-shot examples are not
# attached to it in this cell; only their count is reported.
extractor = StatementExtractor()

for status_line in (
    "Extractor initialized",
    f"Few-shot examples available: {len(selected_fewshot)}",
):
    print(status_line)

## 6. Test Extraction on Training Sample

Run the extractor on a few training chunks to verify it works.

In [None]:
# Smoke-test the extractor on the first training chunk; skipped when there are
# no training chunks.
if training_chunks:
    test_chunk = training_chunks[0]

    print(f"Testing on: {test_chunk.section_context}")
    print(f"Text: {test_chunk.text[:300]}...")
    print("\n" + "="*60 + "\n")

    extractor_inputs = {
        "chunk_text": test_chunk.text,
        "section_context": test_chunk.section_context,
    }
    result = extractor(**extractor_inputs)

    # List at most ten statements, then summarize how many were left out.
    print(f"Extracted {len(result.statements)} statements:")
    for i, stmt in enumerate(result.statements[:10], start=1):
        print(f"  {i}. {stmt}")
    has_more = len(result.statements) > 10
    if has_more:
        print(f"  ... and {len(result.statements) - 10} more")

## 7. Create Training Dataset

Convert chunks into DSPy examples. For training, we need to generate initial extractions
that can be scored and optimized.

In [None]:
# Build unlabeled DSPy examples from the training chunks. Only the inputs are
# set; no statement labels are attached here.
random_seed(42)  # Fixed seed so the shuffle order is reproducible

# Shuffle a copy so the order of training_chunks itself is left untouched.
trainset_chunks = list(training_chunks)
shuffle(trainset_chunks)

all_examples = [
    dspy.Example(
        chunk_text=chunk.text,
        section_context=chunk.section_context,
    ).with_inputs('chunk_text', 'section_context')
    for chunk in trainset_chunks
]

# 80/20 split: the first 80% of the shuffled examples train, the rest are dev.
split_idx = int(len(all_examples) * 0.8)
trainset, devset = all_examples[:split_idx], all_examples[split_idx:]

print(f"Training set: {len(trainset)} examples")
print(f"Dev set: {len(devset)} examples")

## 8. Initialize Judge with Few-Shot Guidance

The judge scores extraction quality. We run it on one Albert Einstein few-shot example as a calibration check.

In [None]:
# Create the quality judge used to score extractions.
judge = StatementQualityJudge()

# Sanity-check the judge on the first selected few-shot example and print each
# score dimension it reports, followed by the weighted score.
if selected_fewshot:
    test_ex = selected_fewshot[0]

    judge_inputs = {
        "chunk_text": test_ex.chunk_text,
        "section_context": test_ex.section_context,
        "statements": test_ex.statements,
    }
    evaluation = judge(**judge_inputs)

    print("Judge calibration on few-shot example:")
    print(f"  Completeness:      {evaluation.completeness:.2f}")
    print(f"  Atomicity:         {evaluation.atomicity:.2f}")
    print(f"  Accuracy:          {evaluation.accuracy:.2f}")
    print(f"  Link preservation: {evaluation.link_preservation:.2f}")
    print(f"  ---")
    print(f"  Weighted score:    {evaluation.weighted_score:.2f}")

## 9. Baseline Evaluation

Evaluate the unoptimized extractor on the dev set.

In [None]:
# Baseline: score a fresh, unoptimized extractor on a small slice of the dev set.
EVAL_SIZE = min(10, len(devset))  # Cap the number of dev examples for speed

eval_subset = devset[:EVAL_SIZE]
evaluator = dspy.Evaluate(
    devset=eval_subset,
    metric=statement_quality_metric,
    num_threads=1,
    display_progress=True,
)

baseline_extractor = StatementExtractor()
baseline_result = evaluator(baseline_extractor)

# The evaluator's return value either exposes `.score` or is converted with
# float(); both shapes are handled.
if hasattr(baseline_result, 'score'):
    baseline_score = baseline_result.score
else:
    baseline_score = float(baseline_result)
print(f"\nBaseline quality score: {baseline_score:.2f}")

## 9b. MLflow Observability Setup

MLflow provides tracing, evaluation, and human feedback tools for DSPy pipelines.

### Quick Setup (One-time)

1. **Install MLflow** (already in requirements, or run `pip install "mlflow>=3.0"`)
2. **Start the MLflow server** in a terminal:
   ```bash
   cd /workspaces/wiki3-kg-project
   mlflow server \
     --backend-store-uri sqlite:///mlflow.sqlite \
     --default-artifact-root ./mlflow-artifacts \
     --host 0.0.0.0 \
     --port 5000
   ```
3. **Open the UI** at http://localhost:5000 (or via VS Code port forwarding)

### What MLflow Provides
- **Tracing**: See every LM call, inputs, outputs, latency
- **Evaluation**: Compare model versions side-by-side
- **Human Feedback**: Add labels/assessments directly in the UI
- **Experiment Tracking**: Track metrics across optimization runs

In [None]:
# -----------------------------------------------------------------------------
# MLflow setup
# -----------------------------------------------------------------------------
# Before running this cell:
#   1. Install MLflow:  pip install "mlflow>=3.0"
#   2. Start a server in a terminal (or run ./scripts/start_mlflow.sh):
#        mlflow server --backend-store-uri sqlite:///mlflow.sqlite \
#                      --default-artifact-root ./mlflow-artifacts \
#                      --host 0.0.0.0 --port 5000
#   3. Open the UI at http://localhost:5000
#
# The cell sets MLFLOW_ENABLED so later cells can skip MLflow-dependent work
# when the setup below fails.
# -----------------------------------------------------------------------------

import mlflow

# Tracking server address. When MLflow runs on the Docker host rather than in
# this container, host.docker.internal is the address to use instead.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

# Assume MLflow is unavailable until every setup step has succeeded.
MLFLOW_ENABLED = False
try:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("wiki3-kg-stage1-statements")

    # Turn on MLflow's automatic tracing for DSPy.
    mlflow.dspy.autolog()

    print(f"‚úÖ MLflow configured successfully")
    print(f"   Tracking URI: {MLFLOW_TRACKING_URI}")
    print(f"   Experiment: wiki3-kg-stage1-statements")
    print(f"\nüìä Open MLflow UI: {MLFLOW_TRACKING_URI}")
    MLFLOW_ENABLED = True
except Exception as e:
    # Best-effort: report the problem and leave MLflow disabled rather than
    # stopping the notebook.
    print(f"‚ö†Ô∏è  MLflow not available: {e}")
    print(f"\nüí° To enable MLflow, start the server:")
    print(f"   mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000")

In [None]:
# Traced baseline evaluation (only when MLflow is enabled).
# For every dev example in the evaluation slice the baseline extractor and the
# judge are run inside their own MLflow span, the inputs/outputs are attached
# to that span, and the weighted score is collected for an aggregate metric.

if MLFLOW_ENABLED:
    with mlflow.start_run(run_name="baseline_evaluation"):
        # Log parameters for reproducibility
        mlflow.log_param("eval_size", EVAL_SIZE)
        mlflow.log_param("model", "qwen/qwen3-coder-30b")
        mlflow.log_param("num_fewshot", NUM_FEWSHOT)

        # Run extractions on dev set - each one gets its own span
        results = []
        for i, ex in enumerate(tqdm(devset[:EVAL_SIZE], desc="Evaluating")):
            with mlflow.start_span(name=f"example_{i}") as span:
                # Run extraction
                pred = baseline_extractor(
                    chunk_text=ex.chunk_text,
                    section_context=ex.section_context,
                )

                # Run judge on the extracted statements
                eval_result = judge(
                    chunk_text=ex.chunk_text,
                    section_context=ex.section_context,
                    statements=pred.statements,
                )

                # Attach inputs/outputs to the span for review in the MLflow
                # UI; the chunk text is truncated to 500 characters.
                span.set_inputs({
                    "chunk_text": ex.chunk_text[:500],
                    "section_context": ex.section_context,
                })
                span.set_outputs({
                    "statements": list(pred.statements),
                    "completeness": float(eval_result.completeness),
                    "atomicity": float(eval_result.atomicity),
                    "accuracy": float(eval_result.accuracy),
                    "link_preservation": float(eval_result.link_preservation),
                    "weighted_score": float(eval_result.weighted_score),
                    "reasoning": eval_result.reasoning,
                })

                results.append({
                    "index": i,
                    "score": float(eval_result.weighted_score),
                })

        # Log aggregate metrics. An empty dev slice (EVAL_SIZE == 0) yields no
        # results, so the average is only computed and logged when there is
        # something to average; this avoids a ZeroDivisionError.
        if results:
            avg_score = sum(r["score"] for r in results) / len(results)
            mlflow.log_metric("avg_quality_score", avg_score)
        else:
            avg_score = None

        print(f"\n‚úÖ Evaluation complete!")
        if avg_score is not None:
            print(f"   Average score: {avg_score:.2f}")
        else:
            print("   Average score: n/a (no dev examples were evaluated)")
        print(f"   Traces logged: {len(results)}")
        print(f"\nüìä Review in MLflow UI: {MLFLOW_TRACKING_URI}")
        print(f"   ‚Üí Click 'Traces' tab to see all predictions")
        print(f"   ‚Üí Click individual traces to review inputs/outputs")
        print(f"   ‚Üí Use 'Feedback' to add human labels")
else:
    print("‚è≠Ô∏è  Skipping MLflow evaluation (server not running)")
    print("   Run baseline evaluation with dspy.Evaluate instead")

In [None]:
# Quick debugging path that needs no MLflow server: print a banner, then let
# DSPy display its two most recent LM calls.
print(
    "Recent LM calls (use MLflow UI for full traces):",
    "=" * 60,
    sep="\n",
)
dspy.inspect_history(n=2)

### MLflow Evaluation with Human Feedback

Use MLflow's evaluation API to systematically review predictions and collect human labels.
The MLflow UI provides a proper interface for reviewing and annotating.

In [None]:
# Tabulate the evaluation slice of the dev set: one row per example holding its
# position, chunk text, and section context.
import pandas as pd

eval_data = [
    {
        "index": i,
        "chunk_text": ex.chunk_text,
        "section_context": ex.section_context,
    }
    for i, ex in enumerate(devset[:EVAL_SIZE])
]

eval_df = pd.DataFrame(eval_data)
print(f"Created evaluation dataset with {len(eval_df)} examples")
eval_df.head()

In [None]:
# ============================================================================
# Per-Statement Classification for ALL Evaluation Examples
# ============================================================================
# For each dev example in the evaluation slice: extract statements with the
# baseline extractor, classify them with StatementClassifier, and record the
# chunk, its statements, the classifier's score, the per-statement
# classifications, and the reported missing facts. A summary over all chunks
# is printed at the end.
#
# StatementClassifier is already imported in the notebook's first cell, so it
# is not re-imported here.

classifier = StatementClassifier()

# One record per classified chunk; the detailed-results cell reads this list.
all_classification_results = []

print(f"Classifying statements for {EVAL_SIZE} chunks...")
print("=" * 70)

for idx, ex in enumerate(devset[:EVAL_SIZE]):
    print(f"\n[{idx+1}/{EVAL_SIZE}] {ex.section_context[:50]}...")

    # Extract statements
    pred = baseline_extractor(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
    )
    statements = list(pred.statements)

    # Classify each statement
    result = classifier(
        chunk_text=ex.chunk_text,
        section_context=ex.section_context,
        statements=statements,
    )

    # Store result
    all_classification_results.append({
        "idx": idx,
        "section": ex.section_context,
        "chunk_text": ex.chunk_text,
        "statements": statements,
        "score": result.score,
        "classifications": result.classifications,
        "missing_facts": result.missing_facts,
    })

    # Show quick summary
    good = sum(1 for c in result.classifications if c.is_good)
    total = len(result.classifications)
    print(f"   ‚Üí {good}/{total} GOOD ({result.score:.0%})")

# Summary
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
if all_classification_results:
    avg_score = sum(r["score"] for r in all_classification_results) / len(all_classification_results)
    total_good = sum(sum(1 for c in r["classifications"] if c.is_good) for r in all_classification_results)
    total_bad = sum(sum(1 for c in r["classifications"] if not c.is_good) for r in all_classification_results)
    print(f"Average score: {avg_score:.1%}")
    print(f"Total GOOD: {total_good}, Total BAD: {total_bad}")
else:
    # An empty evaluation slice would otherwise divide by zero above.
    print("No chunks were classified; nothing to summarize.")

In [None]:
# Per-chunk report: score, GOOD/total counts, every statement classified as
# BAD together with its reason, and any missing facts reported for the chunk.
print("DETAILED PER-CHUNK RESULTS")
print("=" * 70)

for r in all_classification_results:
    classifications = r["classifications"]
    # A single pass collects the BAD statements; the GOOD count follows from it.
    bad_stmts = [c for c in classifications if not c.is_good]
    total = len(classifications)
    good = total - len(bad_stmts)

    print(f"\nüìÑ Chunk {r['idx']}: {r['section'][:60]}...")
    print(f"   Score: {r['score']:.0%} ({good}/{total} GOOD)")

    # Show BAD statements (these need attention)
    if bad_stmts:
        print(f"   ‚ùå BAD statements:")
        for c in bad_stmts:
            print(f"      [{c.index}] {c.statement[:80]}...")
            print(f"          Reason: {c.reason}")

    # Missing facts are shown unless the field is empty or the literal "none"
    # (compared case-insensitively).
    if r["missing_facts"] and r["missing_facts"].lower() != "none":
        print(f"   üìù Missing: {r['missing_facts'][:100]}...")

### Export Annotations from MLflow

After reviewing and labeling in the MLflow UI, export your annotations for judge improvement.

In [None]:
# Look up the most recent run of the experiment and the traces recorded for
# the experiment (after labeling them in the MLflow UI).
# The whole cell is skipped when the MLflow setup cell did not succeed, so it
# does not try to reach a tracking server that is not available.

if not MLFLOW_ENABLED:
    print("Skipping MLflow annotation export (MLflow is not enabled)")
else:
    client = mlflow.MlflowClient()

    # Get the latest evaluation run
    experiment = client.get_experiment_by_name("wiki3-kg-stage1-statements")
    if not experiment:
        print("No experiment found. Run evaluation first.")
    else:
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["start_time DESC"],
            max_results=1,
        )

        if not runs:
            # Previously this case produced no output at all.
            print("No runs found in the experiment. Run evaluation first.")
        else:
            latest_run = runs[0]
            print(f"Latest run: {latest_run.info.run_id}")
            print(f"Metrics: {latest_run.data.metrics}")

            # Fetch up to 100 traces for the experiment. The search is not
            # restricted to the latest run or to traces that carry feedback.
            try:
                traces = client.search_traces(
                    experiment_ids=[experiment.experiment_id],
                    max_results=100,
                )
                print(f"Found {len(traces)} traces")
            except Exception as e:
                # Best-effort: report the failure instead of stopping the notebook.
                print(f"Trace search error: {e}")

In [None]:
# Persist the evaluation dataset so it can be paired with human labels later.
# No feedback is consumed in this cell; it only writes the dataset and prints
# the manual labeling steps.
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
output_dir.mkdir(exist_ok=True)

eval_dataset_path = output_dir / "eval_dataset.json"
eval_df.to_json(eval_dataset_path, orient="records", indent=2)
print(f"Saved evaluation dataset to {output_dir / 'eval_dataset.json'}")

print("""
üìã Next steps for human feedback:

1. Start MLflow server:
   mlflow server --backend-store-uri sqlite:///mlflow.sqlite --port 5000

2. Open MLflow UI at http://127.0.0.1:5000

3. Navigate to the experiment 'wiki3-kg-stage1-statements'

4. Click on traces to review predictions

5. Use the feedback/assessment features to label quality

6. Export labeled data for judge improvement
""")

## 10. MIPROv2 Prompt Optimization

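Use DSPy's MIPROv2 optimizer to improve the extractor's prompts.
The optimizer is compiled on a subset of the training set. The Albert Einstein few-shot examples are not yet passed to it (the `demos` argument in the cell below is commented out); only their count is reported.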

In [None]:
from dspy.teleprompt import MIPROv2

# Cap the number of training examples handed to the optimizer (for speed).
TRAIN_SIZE = min(20, len(trainset))

# Optimizer scored by the same metric used for the baseline evaluation.
optimizer = MIPROv2(
    metric=statement_quality_metric,
    num_candidates=3,  # Number of prompt candidates to try
    init_temperature=0.7,
)

print(f"Optimizing with {TRAIN_SIZE} training examples...")
print(f"Using {len(selected_fewshot)} few-shot demos for bootstrapping...")

# NOTE(review): selected_fewshot is only reported above; it is not passed to
# compile(). The original hint is kept for reference:
#     demos=selected_fewshot,  # Uncomment if supported
# NOTE(review): keyword names accepted by MIPROv2.compile() (e.g. num_batches)
# appear to differ between DSPy releases - confirm against the installed version.
compile_options = dict(
    trainset=trainset[:TRAIN_SIZE],
    num_batches=2,
    max_bootstrapped_demos=NUM_FEWSHOT,
)
optimized_extractor = optimizer.compile(StatementExtractor(), **compile_options)

print("\nOptimization complete!")

In [None]:
# Score the optimized extractor with the same evaluator used for the baseline,
# then report both scores and their difference.
optimized_result = evaluator(optimized_extractor)
if hasattr(optimized_result, 'score'):
    optimized_score = optimized_result.score
else:
    optimized_score = float(optimized_result)

print(f"Baseline score:  {baseline_score:.2f}")
print(f"Optimized score: {optimized_score:.2f}")
print(f"Improvement:     {optimized_score - baseline_score:+.2f}")

## 11. Inspect Optimized Prompts

See what prompts MIPROv2 discovered.

In [None]:
# Report what the optimized module exposes. Both attributes are probed with
# hasattr because the optimized object may not provide them.
print("Optimized extractor configuration:")
print("="*60)

# Demonstrations: total count, plus the section context of the first two.
exposes_demos = hasattr(optimized_extractor, 'demos')
if exposes_demos:
    print(f"\nDemonstrations: {len(optimized_extractor.demos)}")
    for i, demo in enumerate(optimized_extractor.demos[:2], start=1):
        print(f"  Demo {i}: {demo.section_context[:50]}...")

# Signature, when the module exposes one directly.
exposes_signature = hasattr(optimized_extractor, 'signature')
if exposes_signature:
    print(f"\nSignature: {optimized_extractor.signature}")

## 12. Save Results

Save the optimized extractor and training data.

In [None]:
# Write the headline numbers of this run to stage1_results.json.
output_dir = Path("/workspaces/wiki3-kg-project/data/training")
output_dir.mkdir(exist_ok=True)

# Scores, dataset sizes, and processing counts gathered by the cells above.
results = dict(
    baseline_score=baseline_score,
    optimized_score=optimized_score,
    train_size=TRAIN_SIZE,
    eval_size=EVAL_SIZE,
    num_fewshot=NUM_FEWSHOT,
    pages_processed=pages_processed,
    total_chunks=len(training_chunks),
)

with open(output_dir / "stage1_results.json", "w") as f:
    f.write(json.dumps(results, indent=2))

print(f"Saved results to {output_dir / 'stage1_results.json'}")

In [None]:
# Persist the optimized extractor. The module's own save() is tried first; if
# it raises, the error is reported and the state is dumped as JSON instead
# (when the module offers dump_state()).
# NOTE(review): some DSPy releases appear to require the save path to end in
# .json or .pkl - confirm whether this extension-less path is accepted.
extractor_path = output_dir / "optimized_extractor"
try:
    optimized_extractor.save(extractor_path)
    print(f"Saved optimized extractor to {output_dir / 'optimized_extractor'}")
except Exception as e:
    print(f"Could not save extractor state: {e}")
    # Fallback: serialize the dumped state ourselves.
    if hasattr(optimized_extractor, 'dump_state'):
        state = optimized_extractor.dump_state()
        with open(output_dir / "optimized_extractor_state.json", "w") as f:
            f.write(json.dumps(state, indent=2))
        print("Saved extractor state as JSON")

In [None]:
# Export the selected few-shot examples as plain JSON records for reference.
# Statements are copied into a list so they serialize as a JSON array.
fewshot_data = [
    {
        "chunk_text": ex.chunk_text,
        "section_context": ex.section_context,
        "statements": list(ex.statements),
    }
    for ex in selected_fewshot
]

with open(output_dir / "fewshot_examples.json", "w") as f:
    f.write(json.dumps(fewshot_data, indent=2))

print(f"Saved {len(fewshot_data)} few-shot examples")

## Summary

This notebook:
1. Loaded Albert Einstein as few-shot examples (seed/guidance)
2. Fetched and chunked Wikipedia sample pages for training
3. Established baseline extraction quality
4. Ran MIPROv2 prompt optimization
5. Saved the optimized extractor

Next steps:
- **Stage 2**: Schema matching with optimized statements
- **Stage 3**: RDF generation training
- **Arbor GRPO**: Fine-tune the full pipeline end-to-end