In [1]:
# Install required packages
# mamba install -c conda-forge langchain langchain-text-splitters langchain-openai langchain-community wikipedia pydantic nbformat nbconvert
%pip install -q langchain langchain-text-splitters langchain-openai langchain-community wikipedia pydantic nbformat

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re
import json
from datetime import datetime
from dataclasses import dataclass, field, asdict
from typing import Optional
from urllib.parse import quote

import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell, new_raw_cell
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

print("Dependencies loaded")

Dependencies loaded


## Configuration

In [3]:
# Pipeline configuration
# ARTICLE_TITLE is used both as the WikipediaLoader query and to build the source URL.
ARTICLE_TITLE = "Albert Einstein"
# Directory that receives the generated *_chunks / *_facts / *_rdf notebooks.
OUTPUT_DIR = "data"
# Passed to RecursiveCharacterTextSplitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 128

# LLM configuration (shared across stages)
# NOTE(review): in the cells visible here "provider" is only written into the
# generated notebooks' provenance; the ChatOpenAI client is built from
# model/temperature/base_url regardless of its value — confirm whether
# "openai" is actually supported.
LLM_CONFIG = {
    "provider": "lm_studio",  # or "openai"
    "model": "qwen/qwen3-coder-30b",
    "temperature": 1,
    # Endpoint can be overridden with the LM_STUDIO_BASE_URL environment variable.
    "base_url": os.environ.get("LM_STUDIO_BASE_URL", "http://host.docker.internal:1234/v1"),
}

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Pipeline configured for: {ARTICLE_TITLE}")

Pipeline configured for: Albert Einstein


## Entity Registry

Manages entity identity across chunks with stable URIs derived from the source URL.

In [4]:
@dataclass
class EntityRegistry:
    """Tracks entities with stable IDs derived from the source URL.

    Entities are stored under a normalized form of their label, so repeated
    registrations of the same label (in any casing/punctuation) update one
    record instead of creating duplicates.

    Attributes:
        source_url: Base URL used to build entity URIs (``<source_url>#<id>``).
        entities: Maps normalized label key -> entity record (a plain dict with
            ``id``, ``uri``, ``label``, ``type``, ``descriptions``,
            ``source_chunks`` and ``aliases``).
        aliases: Maps normalized alias key -> canonical entity key.
    """
    source_url: str
    entities: dict = field(default_factory=dict)  # normalized_key -> entity
    aliases: dict = field(default_factory=dict)   # alias -> canonical_key

    def normalize_key(self, label: str) -> str:
        """Create a consistent key from an entity label.

        Lower-cases the label and collapses every run of characters outside
        ``[a-z0-9]`` into a single underscore, trimming underscores at the ends.
        """
        return re.sub(r'[^a-z0-9]+', '_', label.lower().strip()).strip('_')

    def generate_uri(self, entity_type: str, label: str) -> str:
        """Generate an entity URI: the source URL plus a ``#type_key`` fragment."""
        key = self.normalize_key(label)
        # Use source URL as base, add fragment for entity
        return f"{self.source_url}#{entity_type.lower()}_{key}"

    def generate_id(self, entity_type: str, label: str) -> str:
        """Generate the local ``type_key`` ID used for internal reference."""
        key = self.normalize_key(label)
        return f"{entity_type.lower()}_{key}"

    def register(self, label: str, entity_type: str, description: str = "",
                 aliases: Optional[list] = None, source_chunk: Optional[int] = None) -> str:
        """Register a new entity or merge information into an existing one.

        Args:
            label: Display label; its normalized form is the registry key.
            entity_type: Type name used in the ID/URI of a *new* entity. For an
                already-registered label the stored type, ID and URI are kept.
            description: Optional description, appended if not already present.
            aliases: Optional alternative labels that should resolve to this entity.
            source_chunk: Optional index of the chunk the entity was seen in.

        Returns:
            The canonical ID stored for the entity. For an existing entity this
            is the ID assigned at first registration, even when ``entity_type``
            differs on this call, so the returned ID always refers to a record
            that is actually in the registry.
        """
        key = self.normalize_key(label)
        new_aliases = list(aliases or [])

        record = self.entities.get(key)
        if record is None:
            record = {
                "id": self.generate_id(entity_type, label),
                "uri": self.generate_uri(entity_type, label),
                "label": label,
                "type": entity_type,
                "descriptions": [description] if description else [],
                "source_chunks": [source_chunk] if source_chunk is not None else [],
                "aliases": new_aliases,
            }
            self.entities[key] = record
        else:
            if description and description not in record["descriptions"]:
                record["descriptions"].append(description)
            if source_chunk is not None and source_chunk not in record["source_chunks"]:
                record["source_chunks"].append(source_chunk)
            if new_aliases:
                # dict.fromkeys de-duplicates while keeping first-seen order, so
                # the serialized registry is stable from run to run.
                record["aliases"] = list(dict.fromkeys(record["aliases"] + new_aliases))

        # Register aliases
        for alias in new_aliases:
            self.aliases[self.normalize_key(alias)] = key

        return record["id"]

    def lookup(self, label: str) -> Optional[dict]:
        """Find an entity record by label or alias; return None if unknown."""
        key = self.normalize_key(label)
        canonical_key = self.aliases.get(key, key)
        return self.entities.get(canonical_key)

    def to_json(self) -> str:
        """Serialize the registry (source URL, entities, aliases) to JSON."""
        return json.dumps({
            "source_url": self.source_url,
            "entities": self.entities,
            "aliases": self.aliases,
        }, indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> 'EntityRegistry':
        """Rebuild a registry from the JSON produced by :meth:`to_json`."""
        data = json.loads(json_str)
        registry = cls(source_url=data["source_url"])
        registry.entities = data["entities"]
        registry.aliases = data["aliases"]
        return registry

print("EntityRegistry class defined")

EntityRegistry class defined


## Section Hierarchy Parser

Extracts Wikipedia section structure for breadcrumb context.

In [5]:
def extract_section_hierarchy(content: str) -> list[dict]:
    """Locate ``== Heading ==`` lines and describe each with its ancestry.

    Args:
        content: Article text containing Wikipedia-style headings delimited by
            two to six ``=`` characters on each side.

    Returns:
        One dict per heading, in document order, with keys ``level`` (number of
        ``=`` signs), ``title``, ``breadcrumb`` (ancestor titles joined with
        ``" > "``, ending in this title), ``start_pos`` and ``end_pos`` (the
        character span of the heading match inside ``content``).
    """
    heading_re = re.compile(r'^(={2,6})\s*(.+?)\s*\1\s*$', re.MULTILINE)

    outline = []
    open_headings = []  # (level, title) pairs, outermost first

    for hit in heading_re.finditer(content):
        depth = len(hit.group(1))
        heading = hit.group(2).strip()

        # Levels in open_headings are strictly increasing, so keeping the
        # entries shallower than this heading leaves exactly its ancestors.
        open_headings = [pair for pair in open_headings if pair[0] < depth]
        open_headings.append((depth, heading))

        outline.append(dict(
            level=depth,
            title=heading,
            breadcrumb=" > ".join(name for _, name in open_headings),
            start_pos=hit.start(),
            end_pos=hit.end(),
        ))

    return outline


def get_section_context(position: int, sections: list[dict], article_title: str) -> dict:
    """Resolve which section a character offset falls into.

    Args:
        position: Character offset into the article text.
        sections: Section dicts as produced by ``extract_section_hierarchy``;
            the scan stops at the first section starting after ``position``,
            so they are expected in ascending ``start_pos`` order.
        article_title: Title prepended to the breadcrumb.

    Returns:
        Dict with ``section_title`` and ``breadcrumb``. Offsets before the
        first heading are reported as the "Introduction" section.
    """
    # Default used when no heading starts at or before the offset.
    current = {
        "title": "Introduction",
        "breadcrumb": "Introduction",
        "level": 1,
    }

    for candidate in sections:
        if candidate["start_pos"] > position:
            break
        current = candidate

    trail = current["breadcrumb"]
    return {
        "section_title": current["title"],
        "breadcrumb": f"{article_title} > {trail}",
    }

print("Section hierarchy parser defined")

Section hierarchy parser defined


## Fetch Wikipedia Content

In [6]:
# Fetch Wikipedia article
# load_max_docs=1: only the first document returned for the query is used below.
# NOTE(review): doc_content_chars_max presumably caps the returned page text at
# 100000 characters — confirm against the loader documentation.
loader = WikipediaLoader(query=ARTICLE_TITLE, load_max_docs=1, doc_content_chars_max=100000)
docs = loader.load()

if not docs:
    raise ValueError(f"Could not fetch article: {ARTICLE_TITLE}")

raw_content = docs[0].page_content
# metadata is kept as a notebook global; the cells visible here do not read it.
metadata = docs[0].metadata

# Construct source URL and provenance
# NOTE(review): the URL is built from ARTICLE_TITLE, not from the returned
# document — presumably they refer to the same page, but verify against
# docs[0].metadata when the query could resolve to a different article.
source_url = f"https://en.wikipedia.org/wiki/{quote(ARTICLE_TITLE.replace(' ', '_'))}"

provenance = {
    "source_url": source_url,
    "article_title": ARTICLE_TITLE,
    # datetime.now() carries no tzinfo, so this is a naive local timestamp.
    "fetched_at": datetime.now().isoformat(),
    "content_length": len(raw_content),
    # Wikipedia license - standard for all Wikipedia content
    "license": "CC BY-SA 4.0",
    "license_url": "https://creativecommons.org/licenses/by-sa/4.0/",
    "attribution": "Wikipedia contributors",
}

print(f"Fetched: {ARTICLE_TITLE}")
print(f"Source URL: {source_url}")
print(f"Content length: {len(raw_content)} characters")
print(f"License: {provenance['license']}")

Fetched: Albert Einstein
Source URL: https://en.wikipedia.org/wiki/Albert_Einstein
Content length: 87959 characters
License: CC BY-SA 4.0


## Create Contextual Chunks

In [7]:
# Parse section hierarchy
sections = extract_section_hierarchy(raw_content)
print(f"Found {len(sections)} sections")

# Split into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
raw_chunks = splitter.split_text(raw_content)
print(f"Split into {len(raw_chunks)} chunks")

# Add context to each chunk
@dataclass
class ContextualChunk:
    """A text chunk plus the section context it was found in."""
    # Chunk text exactly as returned by the splitter.
    content: str
    # 0-based position of the chunk in the split sequence.
    chunk_index: int
    # Number of chunks the article was split into.
    total_chunks: int
    # "Article > Section > Subsection" path of the section active at char_start.
    breadcrumb: str
    # Title of the section active at char_start.
    section_title: str
    # Offset of the chunk in raw_content (or the search cursor if it was not found).
    char_start: int
    # char_start plus the length of content.
    char_end: int

contextual_chunks = []
# Search cursor: each chunk is looked up at or after this offset in raw_content.
current_pos = 0

for i, chunk_text in enumerate(raw_chunks):
    # Find position in original content
    chunk_start = raw_content.find(chunk_text, current_pos)
    if chunk_start == -1:
        # Chunk text not found verbatim from the cursor onwards: use the cursor
        # itself as an approximate position instead of failing.
        chunk_start = current_pos  # Fallback
    chunk_end = chunk_start + len(chunk_text)

    # Get section context
    # The section is chosen by the chunk's first character only; a chunk that
    # spans a heading is attributed to the section it starts in.
    context = get_section_context(chunk_start, sections, ARTICLE_TITLE)

    contextual_chunks.append(ContextualChunk(
        content=chunk_text,
        chunk_index=i,
        total_chunks=len(raw_chunks),
        breadcrumb=context["breadcrumb"],
        section_title=context["section_title"],
        char_start=chunk_start,
        char_end=chunk_end,
    ))

    # Advance by one character rather than to chunk_end: consecutive chunks
    # overlap, so the next chunk may begin before this one ends.
    current_pos = chunk_start + 1

print(f"\nChunks with context:")
for chunk in contextual_chunks[:3]:
    print(f"  Chunk {chunk.chunk_index + 1}: {chunk.breadcrumb}")

Found 71 sections
Split into 63 chunks

Chunks with context:
  Chunk 1: Albert Einstein > Introduction
  Chunk 2: Albert Einstein > Introduction
  Chunk 3: Albert Einstein > Life and career


## Initialize Entity Registry

In [8]:
# Initialize entity registry with article subject
registry = EntityRegistry(source_url=source_url)

# Pre-seed with the article subject
registry.register(
    label=ARTICLE_TITLE,
    entity_type="Person",  # Adjust based on article type
    description=f"Subject of Wikipedia article: {ARTICLE_TITLE}",
    aliases=[ARTICLE_TITLE.split()[-1]],  # Last name as alias
)

print(f"Entity registry initialized with subject: {ARTICLE_TITLE}")
print(f"Subject URI: {registry.entities[registry.normalize_key(ARTICLE_TITLE)]['uri']}")

Entity registry initialized with subject: Albert Einstein
Subject URI: https://en.wikipedia.org/wiki/Albert_Einstein#person_albert_einstein


## Prompt Templates

These prompts are embedded in the generated notebooks for transparency and adjustability.

In [9]:
FACTS_EXTRACTION_PROMPT = """You are an expert at extracting factual information from text.

Given text from a Wikipedia article, extract simple English statements that capture the consensus factual information. Each statement should:
- Be a single, clear sentence
- Contain one main fact or relationship
- Use the full name of entities on first mention
- Be verifiable from the source text
- Avoid opinions, interpretations, or hedged language

The text comes from: {source_url}
Section context: {breadcrumb}

Known entities (use consistent names):
{known_entities}

---
{chunk_text}
---

Extract factual statements as a bulleted list. Also identify any new entities (people, places, organizations, concepts, events, works) that should be added to the registry.
"""

RDF_GENERATION_PROMPT = """You are an expert at converting factual statements to RDF triples in Turtle format.

Convert the following factual statements to RDF using schema.org vocabulary where possible.

Source: {source_url}
Section: {breadcrumb}

Use these prefixes:
{prefixes}

Entity registry (use these URIs):
{entity_registry}

Guidelines:
- Use schema.org properties (schema:birthDate, schema:birthPlace, schema:worksFor, etc.)
- For relationships not in schema.org, use wiki3: prefix
- Include rdfs:label for entities
- Use xsd datatypes for dates and numbers
- Entity URIs should use the source URL as base with fragment identifiers

---
{facts}
---

Generate Turtle RDF:
"""

RDF_PREFIXES = """@prefix schema: <https://schema.org/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix wiki3: <https://wiki3.ai/vocab/> .
@base <{source_url}> .
"""

print("Prompt templates defined")

Prompt templates defined


## Generate Chunks Notebook

In [10]:
def generate_chunks_notebook(chunks: list, provenance: dict, registry: EntityRegistry, 
                             llm_config: dict, output_path: str):
    """Write the chunks notebook: provenance header, registry snapshot, chunk cells.

    Args:
        chunks: Chunk objects exposing ``breadcrumb``, ``chunk_index``,
            ``total_chunks`` and ``content``.
        provenance: Dict providing ``article_title``, ``source_url``,
            ``fetched_at``, ``content_length``, ``license``, ``license_url``
            and ``attribution``.
        registry: Entity registry; its JSON form is stored in a raw cell.
        llm_config: Accepted for signature parity with the other generators;
            not read here.
        output_path: Destination ``.ipynb`` path (overwritten if present).

    Returns:
        ``output_path``.
    """
    def _render_chunk(item) -> str:
        # Context lines, a `---` separator, then the untouched chunk text.
        return f"""**Context:** {item.breadcrumb}
**Chunk:** {item.chunk_index + 1} of {item.total_chunks}

---

{item.content}
"""

    notebook = new_notebook()

    # First cell: title, provenance block and reading instructions.
    header_md = f"""# Chunked Text: {provenance['article_title']}

## Provenance

```yaml
source_url: {provenance['source_url']}
article_title: {provenance['article_title']}
fetched_at: {provenance['fetched_at']}
content_length: {provenance['content_length']}
license: {provenance['license']}
license_url: {provenance['license_url']}
attribution: {provenance['attribution']}
chunk_size: {CHUNK_SIZE}
chunk_overlap: {CHUNK_OVERLAP}
total_chunks: {len(chunks)}
generated_by: wiki_to_kg_pipeline.ipynb
generated_at: {datetime.now().isoformat()}
```

## Processing Instructions

Each chunk below contains source text with contextual metadata. The context line (before the separator) provides:
- **Context**: Hierarchical breadcrumb showing article > section path
- **Chunk**: Position in sequence

The text below the `---` separator is the unchanged source content.
"""
    notebook.cells.append(new_markdown_cell(header_md))

    # Second cell: registry snapshot kept as raw JSON.
    notebook.cells.append(new_raw_cell(registry.to_json()))

    # Remaining cells: one markdown cell per chunk, in order.
    notebook.cells.extend(new_markdown_cell(_render_chunk(item)) for item in chunks)

    with open(output_path, 'w', encoding='utf-8') as handle:
        nbformat.write(notebook, handle)

    return output_path

# Generate chunks notebook
article_slug = ARTICLE_TITLE.lower().replace(' ', '_')
chunks_path = os.path.join(OUTPUT_DIR, f"{article_slug}_chunks.ipynb")
generate_chunks_notebook(contextual_chunks, provenance, registry, LLM_CONFIG, chunks_path)
print(f"Generated: {chunks_path}")

Generated: data/albert_einstein_chunks.ipynb


## Generate Facts Notebook (Structure Only)

Creates the facts notebook with placeholders. Actual content is generated in the next step.

In [11]:
def generate_facts_notebook(chunks: list, provenance: dict, registry: EntityRegistry,
                            llm_config: dict, prompt_template: str, output_path: str):
    """Generate a notebook for factual statements - content only, no Python code.

    This notebook contains:
    - Provenance header with processing prompt
    - Entity registry (raw cell)
    - One markdown cell per chunk for factual statements (to be filled by LLM)

    Args:
        chunks: Chunk objects exposing ``breadcrumb``, ``chunk_index`` and
            ``total_chunks``.
        provenance: Dict providing ``article_title``, ``source_url``,
            ``license`` and ``license_url``.
        registry: Entity registry; its JSON form is stored in a raw cell.
        llm_config: Dict providing ``provider``, ``model`` and ``temperature``,
            recorded in the provenance block.
        prompt_template: Prompt text embedded verbatim in the header cell.
        output_path: Destination ``.ipynb`` path (overwritten if present).

    Returns:
        ``output_path``.
    """
    nb = new_notebook()

    # Derive the source-notebook slug from the provenance passed in rather than
    # from a notebook-level global, so the function works with whatever article
    # it is called for and does not depend on another cell having run.
    source_slug = provenance['article_title'].lower().replace(' ', '_')

    # Cell 1: Provenance and prompt
    provenance_md = f"""# Factual Statements: {provenance['article_title']}

## Provenance

```yaml
source_url: {provenance['source_url']}
article_title: {provenance['article_title']}
license: {provenance['license']}
license_url: {provenance['license_url']}
source_notebook: {source_slug}_chunks.ipynb
generated_by: wiki_to_kg_pipeline.ipynb
generated_at: {datetime.now().isoformat()}
llm_provider: {llm_config['provider']}
llm_model: {llm_config['model']}
llm_temperature: {llm_config['temperature']}
```

## Processing Instructions

This notebook contains simple English factual statements extracted from source text chunks.
Each cell corresponds to one chunk from the source notebook.

An LLM/agent processes this notebook by:
1. Reading each content cell below
2. The context line indicates the source section
3. Statements can be edited, corrected, or expanded by humans

## Prompt Template

The following prompt was used to extract facts from each chunk:

```
{prompt_template}
```
"""
    nb.cells.append(new_markdown_cell(provenance_md))

    # Cell 2: Entity registry (raw cell)
    nb.cells.append(new_raw_cell(registry.to_json()))

    # Content cells - one per chunk with placeholder for facts
    for chunk in chunks:
        # Markdown cell with context header and placeholder for factual statements
        facts_cell = f"""**Context:** {chunk.breadcrumb}
**Chunk:** {chunk.chunk_index + 1} of {chunk.total_chunks}

---

[Factual statements to be extracted from source chunk]
"""
        nb.cells.append(new_markdown_cell(facts_cell))

    # Write notebook
    with open(output_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    return output_path

# Generate facts notebook
facts_path = os.path.join(OUTPUT_DIR, f"{article_slug}_facts.ipynb")
generate_facts_notebook(contextual_chunks, provenance, registry, LLM_CONFIG, 
                        FACTS_EXTRACTION_PROMPT, facts_path)
print(f"Generated: {facts_path}")

Generated: data/albert_einstein_facts.ipynb


## Generate RDF Notebook (Structure Only)

Creates the RDF notebook with placeholders. Actual content is generated after facts extraction.

In [12]:
def generate_rdf_notebook(chunks: list, provenance: dict, registry: EntityRegistry,
                          llm_config: dict, prompt_template: str, prefixes: str, output_path: str):
    """Generate a notebook for RDF triples - content only, no Python code.

    This notebook contains:
    - Provenance header with RDF generation prompt and prefixes
    - Entity registry (raw cell)
    - One raw cell per chunk for Turtle RDF (to be filled by LLM)

    Args:
        chunks: Chunk objects exposing ``breadcrumb``, ``chunk_index`` and
            ``section_title``.
        provenance: Dict providing ``article_title``, ``source_url``,
            ``license`` and ``license_url``.
        registry: Entity registry; its JSON form is stored in a raw cell.
        llm_config: Dict providing ``provider``, ``model`` and ``temperature``,
            recorded in the provenance block.
        prompt_template: Prompt text embedded verbatim in the header cell.
        prefixes: Turtle prefix block containing a ``{source_url}`` placeholder,
            filled in with ``str.format``.
        output_path: Destination ``.ipynb`` path (overwritten if present).

    Returns:
        ``output_path``.
    """
    nb = new_notebook()

    # Derive the source-notebook slug from the provenance passed in rather than
    # from a notebook-level global, so the function works with whatever article
    # it is called for and does not depend on another cell having run.
    source_slug = provenance['article_title'].lower().replace(' ', '_')

    # Cell 1: Provenance and prompt
    formatted_prefixes = prefixes.format(source_url=provenance['source_url'])
    provenance_md = f"""# RDF Triples: {provenance['article_title']}

## Provenance

```yaml
source_url: {provenance['source_url']}
article_title: {provenance['article_title']}
license: {provenance['license']}
license_url: {provenance['license_url']}
source_notebook: {source_slug}_facts.ipynb
generated_by: wiki_to_kg_pipeline.ipynb
generated_at: {datetime.now().isoformat()}
llm_provider: {llm_config['provider']}
llm_model: {llm_config['model']}
llm_temperature: {llm_config['temperature']}
rdf_format: Turtle
```

## RDF Prefixes

The following prefixes are used throughout this notebook:

```turtle
{formatted_prefixes}
```

## Processing Instructions

This notebook contains RDF triples in Turtle format, one cell per source facts cell.
Each cell corresponds to factual statements from the facts notebook.

An LLM/agent processes this notebook by:
1. Reading the corresponding facts cell
2. Converting statements to RDF using schema.org vocabulary
3. Entity URIs use source URL with fragment identifiers (e.g., `<#person_albert_einstein>`)

Triples can be edited, corrected, or expanded by humans before final export.

## Prompt Template

The following prompt was used to convert facts to RDF:

```
{prompt_template}
```
"""
    nb.cells.append(new_markdown_cell(provenance_md))

    # Cell 2: Entity registry (raw cell)
    nb.cells.append(new_raw_cell(registry.to_json()))

    # Content cells - one raw cell per chunk for Turtle RDF
    for chunk in chunks:
        # Raw cell with context comment and placeholder for RDF; every line is a
        # Turtle comment so the untouched cell contains no triples.
        rdf_cell = f"""# Chunk {chunk.chunk_index + 1}: {chunk.section_title}
# Context: {chunk.breadcrumb}

# [RDF triples to be generated from corresponding facts cell]
"""
        nb.cells.append(new_raw_cell(rdf_cell))

    # Write notebook
    with open(output_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    return output_path

# Generate RDF notebook
rdf_path = os.path.join(OUTPUT_DIR, f"{article_slug}_rdf.ipynb")
generate_rdf_notebook(contextual_chunks, provenance, registry, LLM_CONFIG,
                      RDF_GENERATION_PROMPT, RDF_PREFIXES, rdf_path)
print(f"Generated: {rdf_path}")

Generated: data/albert_einstein_rdf.ipynb


## Process Chunks → Extract Facts

Run the LLM on each chunk to extract factual statements and update the facts notebook.

In [13]:
import time
import signal
from contextlib import contextmanager

# Timeout handling
CELL_TIMEOUT_SECONDS = 300  # 5 minutes

class TimeoutException(Exception):
    """Raised by the alarm handler when a timed operation runs past its limit."""

@contextmanager
def timeout_context(seconds):
    """Context manager for timing out long-running operations.

    Installs a SIGALRM handler that raises ``TimeoutException`` inside the
    wrapped block once ``seconds`` have elapsed, and restores the previous
    handler (and cancels any pending alarm) on exit.

    Args:
        seconds: Whole number of seconds to allow; passed to ``signal.alarm``.

    Raises:
        TimeoutException: From inside the ``with`` body when the alarm fires.

    Notes:
        SIGALRM exists only on POSIX platforms and signal handlers can only be
        installed from the main thread. Where either condition is not met the
        block runs *without* a timeout and a ``RuntimeWarning`` is emitted,
        instead of failing before any work is attempted.
    """
    import threading
    import warnings

    alarm_supported = (
        hasattr(signal, "SIGALRM")
        and threading.current_thread() is threading.main_thread()
    )
    if not alarm_supported:
        warnings.warn(
            "timeout_context: SIGALRM timeouts are unavailable here; "
            "running without a timeout",
            RuntimeWarning,
        )
        yield
        return

    def timeout_handler(signum, frame):
        raise TimeoutException(f"Operation timed out after {seconds} seconds")

    # Set the signal handler
    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    try:
        # Arm the alarm inside the try block so that the previous handler is
        # restored even if arming fails (e.g. a non-integer `seconds`).
        signal.alarm(seconds)
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)

def is_cell_populated(cell_content: str) -> bool:
    """Check if a cell has real content (not just placeholders or error markers).

    Only the text after the first ``---`` separator line is inspected; when the
    cell has no separator the whole cell is inspected.

    Args:
        cell_content: Full source text of a notebook cell.

    Returns:
        False when the inspected text is empty/whitespace or contains a
        placeholder or error marker, True otherwise.
    """
    # Markers that mean "no usable result yet". The extraction loop writes
    # "[Error: Timeout after <n>s]" for timed-out chunks, so that marker must be
    # listed here too or timed-out cells would never be retried on a re-run.
    unfilled_markers = (
        "[Factual statements to be extracted",
        "[Error extracting facts:",
        "[Error: Timeout after",
        "[RDF triples to be generated",  # also matches the "# [RDF ..." comment form
    )
    content_after_separator = cell_content.split("---\n", 1)[-1].strip()

    # Empty or only whitespace
    if not content_after_separator:
        return False

    # Contains placeholder or error text
    return not any(marker in content_after_separator for marker in unfilled_markers)

# Initialize LLM for processing
# The client always targets LLM_CONFIG["base_url"] through the OpenAI-compatible
# ChatOpenAI class; LLM_CONFIG["provider"] is not consulted here.
# NOTE(review): api_key is a fixed literal — presumably the local LM Studio
# server ignores it; confirm before pointing base_url at a hosted endpoint.
llm = ChatOpenAI(
    model=LLM_CONFIG["model"],
    temperature=LLM_CONFIG["temperature"],
    base_url=LLM_CONFIG["base_url"],
    api_key="lm-studio",
)

# Create facts extraction prompt
# NOTE(review): this inline template is not FACTS_EXTRACTION_PROMPT. It uses
# "Source:" instead of "The text comes from:" and its closing instruction does
# not ask for new entities, yet the facts notebook header embeds
# FACTS_EXTRACTION_PROMPT as the prompt that "was used". Confirm which wording
# is intended and keep the two in sync.
facts_prompt = ChatPromptTemplate.from_template("""You are an expert at extracting factual information from text.

Given text from a Wikipedia article, extract simple English statements that capture the consensus factual information. Each statement should:
- Be a single, clear sentence
- Contain one main fact or relationship
- Use the full name of entities on first mention
- Be verifiable from the source text
- Avoid opinions, interpretations, or hedged language

Source: {source_url}
Section context: {breadcrumb}

Known entities (use consistent names):
{known_entities}

---
{chunk_text}
---

Extract factual statements as a bulleted list:""")

# Prompt piped into the model; invoked with the four template variables above.
facts_chain = facts_prompt | llm

def get_known_entities_text(registry: EntityRegistry) -> str:
    """Render registered entities as ``- label (type)`` bullets for a prompt.

    Returns the literal text "None yet" when the registry holds no entities.
    """
    bullets = [
        f"- {record['label']} ({record['type']})"
        for record in registry.entities.values()
    ]
    if not bullets:
        return "None yet"
    return "\n".join(bullets)

# Read chunks notebook
chunks_nb = nbformat.read(chunks_path, as_version=4)

# Extract chunk cells (skip provenance header and entity registry)
chunk_cells = [cell for cell in chunks_nb.cells[2:] if cell.cell_type == 'markdown']

# Read existing facts notebook to check for already-populated cells
# This makes the cell resumable: re-running it after an interruption skips
# chunks whose facts cell already holds real content.
facts_nb = nbformat.read(facts_path, as_version=4)

print(f"Processing {len(chunk_cells)} chunks to extract facts...")
print(f"Timeout per cell: {CELL_TIMEOUT_SECONDS} seconds ({CELL_TIMEOUT_SECONDS // 60} minutes)")

# Process each chunk and update notebook incrementally
# extracted_facts collects one {"chunk_index", "breadcrumb", "facts"} dict per
# chunk, whether freshly extracted, skipped, or failed.
extracted_facts = []
skipped_count = 0
error_count = 0

for i, cell in enumerate(chunk_cells):
    # Chunk i lives at cell i + 2 in the facts notebook because both generated
    # notebooks start with a header cell and a registry cell.
    cell_index = i + 2  # Skip header and registry cells in facts notebook

    # Check if this cell is already populated
    if cell_index < len(facts_nb.cells):
        existing_content = facts_nb.cells[cell_index].source
        if is_cell_populated(existing_content):
            print(f"  Chunk {i + 1}/{len(chunk_cells)}: ⊘ Already populated, skipping")
            skipped_count += 1
            # Still add to extracted_facts for later reference
            parts = existing_content.split("---\n", 1)
            context_match = re.search(r'\*\*Context:\*\*\s*(.+)', parts[0]) if len(parts) > 1 else None
            extracted_facts.append({
                "chunk_index": i,
                "breadcrumb": context_match.group(1) if context_match else f"Chunk {i + 1}",
                "facts": parts[1].strip() if len(parts) > 1 else existing_content,
            })
            continue

    print(f"  Chunk {i + 1}/{len(chunk_cells)}...", end=" ", flush=True)
    start_time = time.time()

    # Parse the chunk cell to extract content after the separator
    # The part before the first "---" line holds the **Context:** header; the
    # part after it is the source text sent to the model.
    cell_content = cell.source
    parts = cell_content.split("---\n", 1)
    if len(parts) > 1:
        chunk_text = parts[1].strip()
        context_match = re.search(r'\*\*Context:\*\*\s*(.+)', parts[0])
        breadcrumb = context_match.group(1) if context_match else f"Chunk {i + 1}"
    else:
        # No separator found: treat the whole cell as chunk text.
        chunk_text = cell_content
        breadcrumb = f"Chunk {i + 1}"

    # Call LLM to extract facts with timeout
    # NOTE(review): registry is not modified inside this loop, so the
    # known-entities text is identical for every chunk — confirm whether
    # entities found in earlier chunks were meant to be registered here.
    try:
        with timeout_context(CELL_TIMEOUT_SECONDS):
            result = facts_chain.invoke({
                "source_url": provenance["source_url"],
                "breadcrumb": breadcrumb,
                "known_entities": get_known_entities_text(registry),
                "chunk_text": chunk_text,
            })
        facts_content = result.content
        elapsed = time.time() - start_time
        print(f"✓ ({len(facts_content)} chars, {elapsed:.1f}s)")
    except TimeoutException as e:
        # A bracketed marker is stored in place of facts so the run can continue.
        facts_content = f"[Error: Timeout after {CELL_TIMEOUT_SECONDS}s]"
        print(f"⏱ Timeout after {CELL_TIMEOUT_SECONDS}s")
        error_count += 1
    except Exception as e:
        # Any other failure is recorded in the cell rather than aborting the run.
        facts_content = f"[Error extracting facts: {e}]"
        elapsed = time.time() - start_time
        print(f"✗ Error after {elapsed:.1f}s: {e}")
        error_count += 1

    extracted_facts.append({
        "chunk_index": i,
        "breadcrumb": breadcrumb,
        "facts": facts_content,
    })

    # Update the facts notebook immediately after each cell
    updated_content = f"""**Context:** {breadcrumb}
**Chunk:** {i + 1} of {len(chunk_cells)}

---

{facts_content}
"""
    # Cells are only overwritten, never appended: a chunk with no matching
    # facts cell is kept in extracted_facts but not written to the notebook.
    if cell_index < len(facts_nb.cells):
        facts_nb.cells[cell_index].source = updated_content

    # Save notebook after each cell
    # Writing on every iteration keeps completed work on disk if a later chunk
    # fails or the kernel is interrupted.
    with open(facts_path, 'w', encoding='utf-8') as f:
        nbformat.write(facts_nb, f)

print(f"\nFacts extraction complete:")
print(f"  - {len(extracted_facts)} total cells")
print(f"  - {skipped_count} skipped (already populated)")
print(f"  - {error_count} errors/timeouts")

Processing 63 chunks to extract facts...
Timeout per cell: 300 seconds (5 minutes)
  Chunk 1/63... ✓ (2040 chars, 6.6s)
  Chunk 2/63... ✓ (1788 chars, 4.7s)
  Chunk 3/63... ✓ (1203 chars, 3.8s)
  Chunk 4/63... ✓ (1144 chars, 3.5s)
  Chunk 5/63... ✓ (1009 chars, 2.7s)
  Chunk 6/63... ✓ (1053 chars, 3.1s)
  Chunk 7/63... ✓ (1503 chars, 4.3s)
  Chunk 8/63... ✓ (1662 chars, 6.0s)
  Chunk 9/63... ✓ (1901 chars, 6.0s)
  Chunk 10/63... ✓ (1720 chars, 4.6s)
  Chunk 11/63... ✓ (1473 chars, 4.7s)
  Chunk 12/63... ✓ (1543 chars, 4.6s)
  Chunk 13/63... ✓ (1622 chars, 5.2s)
  Chunk 14/63... ✓ (399 chars, 1.2s)
  Chunk 15/63... ✓ (1313 chars, 4.1s)
  Chunk 16/63... ✓ (1233 chars, 3.5s)
  Chunk 17/63... ✓ (1149 chars, 3.3s)
  Chunk 18/63... ✓ (854 chars, 3.0s)
  Chunk 19/63... ✓ (1444 chars, 4.3s)
  Chunk 20/63... ✓ (1122 chars, 3.5s)
  Chunk 21/63... ✓ (1873 chars, 5.3s)
  Chunk 22/63... ✓ (1293 chars, 4.4s)
  Chunk 23/63... ✓ (1193 chars, 3.9s)
  Chunk 24/63... ✓ (1021 chars, 3.5s)
  Chunk 25/63...

In [14]:
# Summary of facts extraction (notebook already updated incrementally)
print(f"Facts notebook: {facts_path}")
print(f"  - Updated incrementally during processing")
print(f"  - Ready for review/editing before RDF generation")

Facts notebook: data/albert_einstein_facts.ipynb
  - Updated incrementally during processing
  - Ready for review/editing before RDF generation


## Process Facts → Generate RDF

Run the LLM on each facts cell to generate RDF triples and update the RDF notebook.

In [15]:
# Format prefixes for RDF generation
# Fills the {source_url} placeholder of the @base line in RDF_PREFIXES.
formatted_prefixes = RDF_PREFIXES.format(source_url=provenance['source_url'])

# Create RDF generation prompt
# NOTE(review): this inline template is not RDF_GENERATION_PROMPT. It adds
# schema:alumniOf to the examples, restricts rdfs:label to new entities, spells
# out the <#type_name> URI form and forbids repeating @prefix declarations, yet
# the RDF notebook header embeds RDF_GENERATION_PROMPT as the prompt that "was
# used". Confirm which wording is intended and keep the two in sync.
rdf_prompt = ChatPromptTemplate.from_template("""You are an expert at converting factual statements to RDF triples in Turtle format.

Convert the following factual statements to RDF using schema.org vocabulary where possible.

Source: {source_url}
Section: {breadcrumb}

Use these prefixes:
{prefixes}

Entity registry (use these URIs for known entities):
{entity_registry}

Guidelines:
- Use schema.org properties (schema:birthDate, schema:birthPlace, schema:worksFor, schema:alumniOf, etc.)
- For relationships not in schema.org, use wiki3: prefix
- Include rdfs:label for new entities
- Use xsd datatypes for dates and numbers
- Entity URIs use the base with fragment identifiers: <#type_name> (e.g., <#person_albert_einstein>)
- Do NOT repeat the @prefix declarations - just output the triples

---
{facts}
---

Generate Turtle RDF triples (without prefix declarations):""")

# Prompt piped into the same llm client that the facts stage uses.
rdf_chain = rdf_prompt | llm

def get_entity_registry_text(registry: EntityRegistry) -> str:
    """Render registered entities as ``- label: <#id> (type: T)`` bullets.

    The ``<#id>`` form is a fragment-only reference built from each entity's
    stored ``id``. Returns the literal text "None yet" for an empty registry.
    """
    bullets = [
        f"- {record['label']}: <#{record['id']}> (type: {record['type']})"
        for record in registry.entities.values()
    ]
    if not bullets:
        return "None yet"
    return "\n".join(bullets)

def is_rdf_cell_populated(cell_content: str) -> bool:
    """Report whether an RDF cell holds anything besides blanks and ``#`` comments.

    A cell counts as populated as soon as one line, after trimming
    surrounding whitespace, is non-empty and does not start with ``#``.
    Cells made up only of comment lines or blank lines are not populated.
    """
    trimmed_lines = (raw.strip() for raw in cell_content.strip().split('\n'))
    return any(text and not text.startswith('#') for text in trimmed_lines)

# ---------------------------------------------------------------------------
# Facts -> RDF generation loop
#
# For every markdown facts cell, ask the LLM for Turtle triples and store the
# result in the RDF notebook cell at the same position (offset by the two
# header cells). The RDF notebook is saved after each cell, and cells that
# already contain triples are skipped, so the loop can be re-run after an
# interruption without redoing finished work.
# ---------------------------------------------------------------------------

def _as_turtle_comment(message: str) -> str:
    """Return ``message`` with every line prefixed by ``# `` plus a final newline.

    Exception messages can span several lines. If only the first line were
    commented, the remaining lines would look like real content to
    ``is_rdf_cell_populated`` and the cell would never be retried.
    """
    lines = str(message).splitlines() or [""]
    return "\n".join(f"# {line}" for line in lines) + "\n"


def _store_rdf_cell(nb, nb_path, cell_index, chunk_number, breadcrumb, rdf_content) -> None:
    """Put one chunk's RDF text into its notebook cell and save the notebook.

    Args:
        nb: The RDF notebook object being updated in memory.
        nb_path: Path the notebook is saved to.
        cell_index: Index of the target cell inside ``nb.cells``.
        chunk_number: 1-based chunk number written into the cell header.
        breadcrumb: Section context written into the ``# Context:`` header line.
        rdf_content: Turtle text (or a comment-only placeholder) for the cell.
    """
    updated_content = f"# Chunk {chunk_number}\n# Context: {breadcrumb}\n\n{rdf_content}\n"
    if cell_index < len(nb.cells):
        nb.cells[cell_index].source = updated_content
    else:
        # Make the dropped output visible instead of losing it silently.
        print(f"  ⚠ RDF notebook has no cell at index {cell_index}; "
              f"chunk {chunk_number} output was not stored")

    # Write to a temporary file first and swap it in, so an interrupt in the
    # middle of a save cannot leave a truncated notebook behind.
    tmp_path = f"{nb_path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)
    os.replace(tmp_path, nb_path)


# Read the updated facts notebook
facts_nb = nbformat.read(facts_path, as_version=4)

# Extract facts cells (skip provenance header and entity registry)
facts_cells = [cell for cell in facts_nb.cells[2:] if cell.cell_type == 'markdown']

# Read existing RDF notebook to check for already-populated cells
rdf_nb = nbformat.read(rdf_path, as_version=4)

print(f"Processing {len(facts_cells)} facts cells to generate RDF...")
print(f"Timeout per cell: {CELL_TIMEOUT_SECONDS} seconds ({CELL_TIMEOUT_SECONDS // 60} minutes)")

# Process each facts cell and update notebook incrementally
generated_rdf = []
skipped_count = 0
error_count = 0

for i, cell in enumerate(facts_cells):
    cell_index = i + 2  # Skip header and registry cells in RDF notebook

    # Resume support: keep cells that already hold triples
    if cell_index < len(rdf_nb.cells):
        existing_content = rdf_nb.cells[cell_index].source
        if is_rdf_cell_populated(existing_content):
            print(f"  Facts cell {i + 1}/{len(facts_cells)}: ⊘ Already populated, skipping")
            skipped_count += 1
            # Recover the breadcrumb from the cell's "# Context:" header line
            context_match = re.search(r'# Context:\s*(.+)', existing_content)
            generated_rdf.append({
                "chunk_index": i,
                "breadcrumb": context_match.group(1) if context_match else f"Chunk {i + 1}",
                "turtle": existing_content,
            })
            continue

    print(f"  Facts cell {i + 1}/{len(facts_cells)}...", end=" ", flush=True)
    start_time = time.time()

    # Split the facts cell at the first "---" separator: the part before it is
    # searched for the **Context:** breadcrumb, the part after it is the facts.
    cell_content = cell.source
    parts = cell_content.split("---\n", 1)
    if len(parts) > 1:
        facts_text = parts[1].strip()
        context_match = re.search(r'\*\*Context:\*\*\s*(.+)', parts[0])
        breadcrumb = context_match.group(1) if context_match else f"Chunk {i + 1}"
    else:
        facts_text = cell_content
        breadcrumb = f"Chunk {i + 1}"

    if facts_text.startswith(("[Factual statements", "[Error")):
        # The facts cell starts with one of the two bracketed markers checked
        # above, so there is nothing to convert. The placeholder written here
        # is comment-only and the cell is re-checked on the next run.
        print("⊘ Skipped (no facts to convert)")
        rdf_content = f"# Chunk {i + 1}: No facts to convert\n"
    else:
        # Call LLM to generate RDF with timeout
        try:
            with timeout_context(CELL_TIMEOUT_SECONDS):
                result = rdf_chain.invoke({
                    "source_url": provenance["source_url"],
                    "breadcrumb": breadcrumb,
                    "prefixes": formatted_prefixes,
                    "entity_registry": get_entity_registry_text(registry),
                    "facts": facts_text,
                })
            rdf_content = result.content
            elapsed = time.time() - start_time
            print(f"✓ ({len(rdf_content)} chars, {elapsed:.1f}s)")
        except TimeoutException:
            rdf_content = f"# Error: Timeout after {CELL_TIMEOUT_SECONDS}s\n"
            print(f"⏱ Timeout after {CELL_TIMEOUT_SECONDS}s")
            error_count += 1
        except Exception as e:
            # Comment out every line of the message so the cell stays
            # comment-only and is retried on the next run.
            rdf_content = _as_turtle_comment(f"Error generating RDF: {e}")
            elapsed = time.time() - start_time
            print(f"✗ Error after {elapsed:.1f}s: {e}")
            error_count += 1

    generated_rdf.append({
        "chunk_index": i,
        "breadcrumb": breadcrumb,
        "turtle": rdf_content,
    })

    # Update and save the RDF notebook immediately after each cell
    _store_rdf_cell(rdf_nb, rdf_path, cell_index, i + 1, breadcrumb, rdf_content)

print(f"\nRDF generation complete:")
print(f"  - {len(generated_rdf)} total cells")
print(f"  - {skipped_count} skipped (already populated)")
print(f"  - {error_count} errors/timeouts")

Processing 63 facts cells to generate RDF...
Timeout per cell: 300 seconds (5 minutes)
  Facts cell 1/63... ✓ (12346 chars, 48.9s)
  Facts cell 2/63... ✓ (2865 chars, 11.8s)
  Facts cell 3/63... ✓ (2083 chars, 8.8s)
  Facts cell 4/63... ✓ (3801 chars, 16.3s)
  Facts cell 5/63... ✓ (2676 chars, 11.1s)
  Facts cell 6/63... ✓ (2523 chars, 10.2s)
  Facts cell 7/63... ✓ (2796 chars, 12.3s)
  Facts cell 8/63... ✓ (4173 chars, 21.4s)
  Facts cell 9/63... ✓ (3859 chars, 17.0s)
  Facts cell 10/63... ✓ (2386 chars, 11.2s)
  Facts cell 11/63... ✓ (5194 chars, 20.5s)
  Facts cell 12/63... ✓ (4452 chars, 18.8s)
  Facts cell 13/63... ✓ (6327 chars, 25.5s)
  Facts cell 14/63... ✓ (1470 chars, 5.7s)
  Facts cell 15/63... ✓ (2490 chars, 11.2s)
  Facts cell 16/63... ✓ (6967 chars, 30.7s)
  Facts cell 17/63... ✓ (4074 chars, 15.4s)
  Facts cell 18/63... ✓ (3205 chars, 14.8s)
  Facts cell 19/63... ✓ (3035 chars, 12.5s)
  Facts cell 20/63... ✓ (2972 chars, 11.8s)
  Facts cell 21/63... ✓ (3333 chars, 13.8s)

KeyboardInterrupt: 

In [None]:
# Recap of the RDF-generation stage. Nothing is saved here: per the messages
# below, the RDF notebook was already updated incrementally during processing.
print(
    f"RDF notebook: {rdf_path}",
    "  - Updated incrementally during processing",
    "  - Ready for review/editing before final export",
    sep="\n",
)

## Export Combined RDF

Combine all RDF cells into a single Turtle file with prefixes.

In [None]:
# Combine all RDF into a single Turtle file.
# The RDF notebook is re-read from disk rather than taken from memory, so the
# export reflects whatever the notebook file currently contains.
rdf_nb = nbformat.read(rdf_path, as_version=4)

# Collect RDF from all raw cells (skip provenance and registry). Cells that are
# empty or comment-only are left out; the check is the same one the generation
# loop uses to decide whether a cell is populated.
all_triples = [
    cell.source.strip()
    for cell in rdf_nb.cells[2:]
    if cell.cell_type == 'raw' and is_rdf_cell_populated(cell.source)
]

# Build complete Turtle file: provenance header, prefixes once, then the triples
turtle_output = f"""# RDF Knowledge Graph: {provenance['article_title']}
# Source: {provenance['source_url']}
# License: {provenance['license']}
# Generated: {datetime.now().isoformat()}

{formatted_prefixes}

# === Triples ===

"""

# Each cell's content was stripped above, so end the file with a newline explicitly
turtle_output += "\n\n".join(all_triples) + "\n"

# Save to file
turtle_path = os.path.join(OUTPUT_DIR, f"{article_slug}.ttl")
with open(turtle_path, 'w', encoding='utf-8') as f:
    f.write(turtle_output)

print(f"Exported RDF to: {turtle_path}")
print(f"  - {len(all_triples)} chunks of triples")
print(f"  - {len(turtle_output)} characters total")

## Save Initial Entity Registry

In [None]:
# Persist the entity registry as JSON inside the pipeline's output directory,
# then list each registered entity with its type and URI.
registry_path = os.path.join(OUTPUT_DIR, "entity_registry.json")
with open(registry_path, 'w', encoding='utf-8') as registry_file:
    registry_file.write(registry.to_json())

print(f"Saved: {registry_path}")
print(f"\nInitial entities: {len(registry.entities)}")
for entity in registry.entities.values():
    print(f"  - {entity['label']} ({entity['type']}): {entity['uri']}")

Saved: data/entity_registry.json

Initial entities: 1
  - Albert Einstein (Person): https://en.wikipedia.org/wiki/Albert_Einstein#person_albert_einstein


## Pipeline Summary

In [None]:
# Final pipeline report: the article's provenance followed by each artifact
# path. Every value shown comes from variables set by earlier cells.
banner = "=" * 60
print(banner, "PIPELINE COMPLETE", banner, sep="\n")

print(
    f"\nArticle: {ARTICLE_TITLE}",
    f"Source: {source_url}",
    f"License: {provenance['license']}",
    sep="\n",
)

print(
    "\nGenerated artifacts:",
    f"  1. {chunks_path}",
    f"     - {len(contextual_chunks)} chunks with breadcrumb context",
    f"  2. {facts_path}",
    f"     - {len(extracted_facts)} cells with extracted factual statements",
    f"  3. {rdf_path}",
    f"     - {len(generated_rdf)} cells with RDF triples",
    f"  4. {turtle_path}",
    "     - Combined Turtle file for import",
    sep="\n",
)

print(
    f"\nEntity registry: {registry_path}",
    "\nThe intermediate notebooks can be reviewed and edited before re-export.",
    sep="\n",
)

PIPELINE COMPLETE

Article: Albert Einstein
Source: https://en.wikipedia.org/wiki/Albert_Einstein
License: CC BY-SA 4.0

Generated content notebooks (no Python code):
  1. data/albert_einstein_chunks.ipynb
     - 63 chunks with breadcrumb context
     - Markdown cells with unchanged source text
  2. data/albert_einstein_facts.ipynb
     - Placeholder cells for factual statements
     - Prompt template in header for LLM processing
  3. data/albert_einstein_rdf.ipynb
     - Placeholder cells for Turtle RDF
     - Prefixes and prompt in header

Entity registry: data/entity_registry.json

Workflow:
  1. Review/edit chunks notebook
  2. LLM/agent fills facts notebook from chunks
  3. Human reviews/edits facts
  4. LLM/agent fills RDF notebook from facts
  5. Human reviews/edits RDF
  6. Export final .ttl file
