In [1]:
# Install required packages into the running kernel's environment (-q keeps pip's output quiet).
# NOTE(review): no versions are pinned on this line, so a later rerun may resolve
# to different releases than the ones this notebook was last executed with.
%pip install -q langchain langchain-text-splitters langchain-openai langchain-community wikipedia pydantic nbformat

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
from datetime import datetime
from urllib.parse import quote

import nbformat
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Make the project importable.
# Note: this puts the PARENT of the current working directory (dirname of
# abspath('.')) at the front of sys.path, not a `src` folder itself.
# Presumably the notebook is run from a direct subdirectory of the repo root so
# that the `src.*` imports below resolve — TODO confirm against the repo layout.
sys.path.insert(0, os.path.dirname(os.path.abspath('.')))

# Import from our modules
from src.cid import extract_signatures, extract_statement_signatures
from src.entity_registry import EntityRegistry
from src.section_parser import extract_section_hierarchy, get_section_context
from src.prompts import FACTS_EXTRACTION_PROMPT, RDF_STATEMENT_SYSTEM_PROMPT, RDF_STATEMENT_HUMAN_PROMPT, RDF_PREFIXES
from src.notebook_generators import generate_chunks_notebook, generate_facts_notebook, generate_rdf_notebook_header
from src.rdf_tools import create_rdf_tools
from src.utils import log_progress, ContextualChunk, setup_output_directory, create_contextual_chunks
from src.processors import (
    process_facts_extraction, process_rdf_generation, 
    read_chunk_data, read_facts_data
)
from schema_matcher import SchemaMatcher

print("All modules loaded successfully")

All modules loaded successfully


## Configuration

In [3]:
# === PIPELINE CONFIGURATION ===
# Every tunable value used by the later cells is defined here.

# What to process and where results are written
ARTICLE_TITLE = "Albert Einstein"
OUTPUT_DIR = "data"

# Text splitting parameters handed to the text splitter
CHUNK_SIZE, CHUNK_OVERLAP = 2000, 128

# Path of an earlier run to resume from; use None to start a fresh run
# (format example: "data/albert_einstein/20241218_143022")
CONTINUE_FROM_RUN = "data/albert_einstein/20251219_045359"

# Chat model settings; the endpoint can be overridden through the
# LM_STUDIO_BASE_URL environment variable
_default_llm_endpoint = "http://host.docker.internal:1234/v1"
LLM_CONFIG = dict(
    provider="lm_studio",
    model="qwen/qwen3-coder-30b",
    temperature=1,
    base_url=os.environ.get("LM_STUDIO_BASE_URL", _default_llm_endpoint),
)

# Per-request timeout (seconds) and the iteration cap reported for RDF generation
CELL_TIMEOUT_SECONDS = 60
MAX_ITERATIONS = 150

# Directory the schema matcher is loaded from
VOCAB_CACHE_DIR = "data/vocab_cache"

print(f"Configuration set for: {ARTICLE_TITLE}")

Configuration set for: Albert Einstein


## Setup Output Directory

In [4]:
# Resolve the directory and timestamp for this run. CONTINUE_FROM_RUN is passed
# through so the helper can continue from an earlier run when one is configured.
RUN_OUTPUT_DIR, RUN_TIMESTAMP = setup_output_directory(OUTPUT_DIR, ARTICLE_TITLE, CONTINUE_FROM_RUN)

print(f"Output directory: {RUN_OUTPUT_DIR}\nRun timestamp: {RUN_TIMESTAMP}")

Continuing from previous run: data/albert_einstein/20251219_045359
  Copied 2 files: facts.ipynb, chunks.ipynb
  Existing cells with matching CIDs will be skipped
Output directory: data/albert_einstein/20251219_051452
Run timestamp: 20251219_051452


## Fetch Wikipedia Content

In [6]:
# Fetch the Wikipedia article text for ARTICLE_TITLE.
#
# Two things can silently go wrong here, so both are checked below:
#   * the loader resolves `query` through a Wikipedia search, so the single
#     document returned may belong to a different page than the one requested;
#   * the loader caps page content at `doc_content_chars_max`, so a long
#     article can be cut off without any error.

# Upper bound on article text handed to the loader.
MAX_ARTICLE_CHARS = 100000

loader = WikipediaLoader(
    query=ARTICLE_TITLE,
    load_max_docs=1,
    doc_content_chars_max=MAX_ARTICLE_CHARS,
)
docs = loader.load()

if not docs:
    raise ValueError(f"Could not fetch article: {ARTICLE_TITLE}")

raw_content = docs[0].page_content
metadata = docs[0].metadata

# The source URL and provenance below are built from ARTICLE_TITLE, so make a
# mismatch with the page actually returned visible instead of mislabelling it.
fetched_title = metadata.get("title")
if fetched_title and fetched_title != ARTICLE_TITLE:
    print(f"WARNING: requested '{ARTICLE_TITLE}' but the loader returned '{fetched_title}'")

# Content that reaches the cap has most likely been truncated.
if len(raw_content) >= MAX_ARTICLE_CHARS:
    print(f"WARNING: content reached the {MAX_ARTICLE_CHARS}-character limit and may be truncated")

# Construct source URL and provenance
source_url = f"https://en.wikipedia.org/wiki/{quote(ARTICLE_TITLE.replace(' ', '_'))}"

provenance = {
    "source_url": source_url,
    "article_title": ARTICLE_TITLE,
    "fetched_at": datetime.now().isoformat(),
    "content_length": len(raw_content),
    "license": "CC BY-SA 4.0",
    "license_url": "https://creativecommons.org/licenses/by-sa/4.0/",
    "attribution": "Wikipedia contributors",
}

print(f"Fetched: {ARTICLE_TITLE}")
print(f"Source URL: {source_url}")
print(f"Content length: {len(raw_content)} characters")

Fetched: Albert Einstein
Source URL: https://en.wikipedia.org/wiki/Albert_Einstein
Content length: 87959 characters


## Create Contextual Chunks

In [7]:
# Work out the section structure of the article text
sections = extract_section_hierarchy(raw_content)
print(f"Found {len(sections)} sections")

# Cut the text into pieces using the configured size and overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
raw_chunks = splitter.split_text(raw_content)
print(f"Split into {len(raw_chunks)} chunks")

# Attach context to every piece
contextual_chunks = create_contextual_chunks(raw_chunks, raw_content, sections, ARTICLE_TITLE, get_section_context)

# Preview the breadcrumb of the first three chunks (1-based numbering)
print("\nChunks with context:")
for preview in contextual_chunks[:3]:
    chunk_number = preview.chunk_index + 1
    print(f"  Chunk {chunk_number}: {preview.breadcrumb}")

Found 71 sections
Split into 63 chunks

Chunks with context:
  Chunk 1: Albert Einstein > Introduction
  Chunk 2: Albert Einstein > Introduction
  Chunk 3: Albert Einstein > Life and career


## Initialize Entity Registry

In [8]:
# Create the registry and register the article subject as its first entity
registry = EntityRegistry(source_url=source_url)

# The last whitespace-separated word of the title is used as the only alias
subject_alias = ARTICLE_TITLE.split()[-1]
registry.register(
    label=ARTICLE_TITLE,
    entity_type="Person",  # Adjust based on article type
    description=f"Subject of Wikipedia article: {ARTICLE_TITLE}",
    aliases=[subject_alias],
)

# Look the subject back up through its normalized key to show the assigned URI
subject_key = registry.normalize_key(ARTICLE_TITLE)
subject_uri = registry.entities[subject_key]['uri']

print(f"Entity registry initialized with subject: {ARTICLE_TITLE}")
print(f"Subject URI: {subject_uri}")

Entity registry initialized with subject: Albert Einstein
Subject URI: https://en.wikipedia.org/wiki/Albert_Einstein#person_albert_einstein


## Generate Output Notebooks

In [9]:
# chunks.ipynb is written on every run
chunks_path = os.path.join(RUN_OUTPUT_DIR, "chunks.ipynb")
generate_chunks_notebook(contextual_chunks, provenance, registry, CHUNK_SIZE, CHUNK_OVERLAP, chunks_path)
print(f"Generated: {chunks_path}")

# facts.ipynb is kept when it is already present in the run directory
facts_path = os.path.join(RUN_OUTPUT_DIR, "facts.ipynb")
if os.path.exists(facts_path):
    print(f"Using existing: {facts_path}")
else:
    generate_facts_notebook(contextual_chunks, provenance, registry, LLM_CONFIG, "chunks.ipynb", facts_path)
    print(f"Generated: {facts_path}")

# rdf.ipynb is likewise kept when present; otherwise only its header is written
rdf_path = os.path.join(RUN_OUTPUT_DIR, "rdf.ipynb")
if os.path.exists(rdf_path):
    print(f"Using existing: {rdf_path}")
else:
    rdf_nb = generate_rdf_notebook_header(provenance, registry, LLM_CONFIG)
    with open(rdf_path, 'w', encoding='utf-8') as rdf_file:
        nbformat.write(rdf_nb, rdf_file)
    print(f"Generated: {rdf_path}")

Generated: data/albert_einstein/20251219_051452/chunks.ipynb
Using existing: data/albert_einstein/20251219_051452/facts.ipynb
Generated: data/albert_einstein/20251219_051452/rdf.ipynb


## Process Chunks → Extract Facts

In [10]:
# Chat client used for facts extraction; retries are disabled and every request
# is limited to CELL_TIMEOUT_SECONDS
facts_llm_settings = {
    "model": LLM_CONFIG["model"],
    "temperature": LLM_CONFIG["temperature"],
    "base_url": LLM_CONFIG["base_url"],
    "api_key": "lm-studio",
    "timeout": CELL_TIMEOUT_SECONDS,
    "max_retries": 0,
}
facts_llm = ChatOpenAI(**facts_llm_settings)

# Prompt piped into the model
facts_prompt = ChatPromptTemplate.from_template(FACTS_EXTRACTION_PROMPT)
facts_chain = facts_prompt | facts_llm

# Load the chunks notebook and pull the chunk data out of it
log_progress("Reading notebooks...")
chunks_nb = nbformat.read(chunks_path, as_version=4)
chunk_data = read_chunk_data(chunks_nb)
log_progress(f"Found {len(chunk_data)} chunks")

# Load the facts notebook together with the signatures it already holds
facts_nb = nbformat.read(facts_path, as_version=4)
facts_signatures = extract_signatures(facts_nb)
log_progress(f"Found {len(facts_signatures)} existing fact signatures")
log_progress("-" * 50)

# Run the extraction; the helper returns three counters
facts_job = dict(
    chunk_data=chunk_data,
    facts_nb=facts_nb,
    facts_signatures=facts_signatures,
    facts_chain=facts_chain,
    provenance=provenance,
    registry=registry,
    facts_path=facts_path,
    timeout_seconds=CELL_TIMEOUT_SECONDS,
)
processed, skipped, errors = process_facts_extraction(**facts_job)

# Report the counters
log_progress("-" * 50)
log_progress("Facts extraction complete:")
for report_line in (
    f"  - {processed} generated",
    f"  - {skipped} skipped (up-to-date)",
    f"  - {errors} errors/timeouts",
):
    log_progress(report_line)

Reading notebooks...
Found 63 chunks
Found 63 existing fact signatures
--------------------------------------------------
  Chunk 1: ⊘ Up-to-date (CID match), skipping
  Chunk 2: ⊘ Up-to-date (CID match), skipping
  Chunk 3: ⊘ Up-to-date (CID match), skipping
  Chunk 4: ⊘ Up-to-date (CID match), skipping
  Chunk 5: ⊘ Up-to-date (CID match), skipping
  Chunk 6: ⊘ Up-to-date (CID match), skipping
  Chunk 7: ⊘ Up-to-date (CID match), skipping
  Chunk 8: ⊘ Up-to-date (CID match), skipping
  Chunk 9: ⊘ Up-to-date (CID match), skipping
  Chunk 10: ⊘ Up-to-date (CID match), skipping
  Chunk 11: ⊘ Up-to-date (CID match), skipping
  Chunk 12: ⊘ Up-to-date (CID match), skipping
  Chunk 13: ⊘ Up-to-date (CID match), skipping
  Chunk 14: ⊘ Up-to-date (CID match), skipping
  Chunk 15: ⊘ Up-to-date (CID match), skipping
  Chunk 16: ⊘ Up-to-date (CID match), skipping
  Chunk 17: ⊘ Up-to-date (CID match), skipping
  Chunk 18: ⊘ Up-to-date (CID match), skipping
  Chunk 19: ⊘ Up-to-date (CID match), ski

## Process Facts → Generate RDF

In [11]:
# Schema matcher loaded from the vocabulary cache directory
schema_matcher = SchemaMatcher.load(VOCAB_CACHE_DIR, embed_base_url=LLM_CONFIG["base_url"])
log_progress(f"Loaded schema matcher with {len(schema_matcher.vocabularies)} vocabularies")

# Tool set plus the two triple callbacks, all created from the schema matcher
rdf_tools, get_triples, reset_triples = create_rdf_tools(schema_matcher)

# Chat client for RDF generation (same settings as the facts client), with the
# tools bound to it
rdf_llm_settings = {
    "model": LLM_CONFIG["model"],
    "temperature": LLM_CONFIG["temperature"],
    "base_url": LLM_CONFIG["base_url"],
    "api_key": "lm-studio",
    "timeout": CELL_TIMEOUT_SECONDS,
    "max_retries": 0,
}
rdf_llm = ChatOpenAI(**rdf_llm_settings)
rdf_llm_with_tools = rdf_llm.bind_tools(rdf_tools)

# System + human message template
rdf_prompt_messages = [
    ("system", RDF_STATEMENT_SYSTEM_PROMPT),
    ("human", RDF_STATEMENT_HUMAN_PROMPT),
]
rdf_prompt = ChatPromptTemplate.from_messages(rdf_prompt_messages)

# Re-read the facts notebook from disk and extract its facts data
log_progress("Reading facts notebook...")
facts_nb = nbformat.read(facts_path, as_version=4)
facts_data = read_facts_data(facts_nb)
log_progress(f"Found {len(facts_data)} facts cells")

# Load the RDF notebook and the statement signatures it already contains
rdf_nb = nbformat.read(rdf_path, as_version=4)
rdf_signatures = extract_statement_signatures(rdf_nb)
log_progress(f"Found {len(rdf_signatures)} existing RDF signatures")
log_progress(f"HTTP timeout: {CELL_TIMEOUT_SECONDS}s per request")
log_progress(f"Max iterations: {MAX_ITERATIONS} per statement")
log_progress("-" * 50)

# Run RDF generation; the helper returns five values
rdf_job = dict(
    facts_data=facts_data,
    rdf_nb=rdf_nb,
    rdf_signatures=rdf_signatures,
    provenance=provenance,
    registry=registry,
    rdf_path=rdf_path,
    rdf_prompt=rdf_prompt,
    rdf_llm_with_tools=rdf_llm_with_tools,
    rdf_tools=rdf_tools,
    get_triples_fn=get_triples,
    reset_triples_fn=reset_triples,
    max_iterations=MAX_ITERATIONS,
    timeout_seconds=CELL_TIMEOUT_SECONDS,
)
processed, skipped, errors, total_triples, iteration_counts = process_rdf_generation(**rdf_job)

# Report the counters
log_progress("-" * 50)
log_progress("RDF generation complete:")
for report_line in (
    f"  - {processed} statements processed",
    f"  - {total_triples} total triples emitted",
    f"  - {skipped} skipped (up-to-date)",
    f"  - {errors} errors/timeouts",
):
    log_progress(report_line)

Loaded schema matcher with 1 vocabularies
Reading facts notebook...
Found 63 facts cells
Found 0 existing RDF signatures
HTTP timeout: 60s per request
Max iterations: 150 per statement
--------------------------------------------------
  Chunk 1: + Processing 28 statements... ✓ 33t/5i 117.7s
  Chunk 2: + Processing 26 statements... ✓ 140t/30i 218.1s
  Chunk 3: + Processing 15 statements... 

KeyboardInterrupt: 

## Export Combined RDF

In [None]:
# Merge the RDF held in rdf.ipynb into one Turtle file
rdf_nb = nbformat.read(rdf_path, as_version=4)

all_triples = []
# The first two cells are skipped (presumably the notebook header — confirm
# against generate_rdf_notebook_header).
for nb_cell in rdf_nb.cells[2:]:
    # Only raw cells are considered
    if nb_cell.cell_type != 'raw':
        continue

    block_text = nb_cell.source.strip()

    # Signature cells: text opening with '{' that contains a "cid" key
    looks_like_signature = block_text.startswith('{') and '"cid"' in block_text
    # Empty cells and cells that begin with an error marker
    is_blank_or_error = not block_text or block_text.startswith('# Error:')
    if looks_like_signature or is_blank_or_error:
        continue

    # Keep the cell only when at least one non-blank line is not a '#' comment
    has_statement = any(
        text_line.strip() and not text_line.strip().startswith('#')
        for text_line in block_text.split('\n')
    )
    if has_statement:
        all_triples.append(block_text)

# Assemble the file: comment header, prefixes, then the blocks separated by a blank line
turtle_prefixes = RDF_PREFIXES.format(source_url=provenance['source_url'])
turtle_header = f"""# RDF Knowledge Graph: {provenance['article_title']}
# Source: {provenance['source_url']}
# License: {provenance['license']}
# Generated: {datetime.now().isoformat()}

{turtle_prefixes}

# === Triples ===

"""
turtle_output = turtle_header + "\n\n".join(all_triples)

turtle_path = os.path.join(RUN_OUTPUT_DIR, "triples.ttl")
with open(turtle_path, 'w', encoding='utf-8') as turtle_file:
    turtle_file.write(turtle_output)

print(f"Exported RDF to: {turtle_path}")
print(f"  - {len(all_triples)} statement blocks")
print(f"  - {len(turtle_output)} characters total")

## Save Entity Registry

In [None]:
# Write the registry's JSON form next to the other run artifacts
registry_path = os.path.join(RUN_OUTPUT_DIR, "registry.json")
with open(registry_path, 'w', encoding='utf-8') as registry_file:
    registry_file.write(registry.to_json())

print(f"Saved: {registry_path}")

# List every registered entity by label and type
print(f"\nEntities: {len(registry.entities)}")
for entry in registry.entities.values():
    print(f"  - {entry['label']} ({entry['type']})")

## Pipeline Summary

In [None]:
# Final recap of the run: what was processed and which artifacts were listed for it
banner = "=" * 60
summary_lines = (
    banner,
    "PIPELINE COMPLETE",
    banner,
    f"\nArticle: {ARTICLE_TITLE}",
    f"Source: {source_url}",
    f"License: {provenance['license']}",
    f"Run timestamp: {RUN_TIMESTAMP}",
    f"\nOutput directory: {RUN_OUTPUT_DIR}",
    "\nGenerated artifacts:",
    f"  1. chunks.ipynb - {len(contextual_chunks)} chunks with breadcrumb context",
    "  2. facts.ipynb - Extracted factual statements",
    "  3. rdf.ipynb - RDF triples per statement",
    "  4. triples.ttl - Combined Turtle file",
    "  5. registry.json - Entity registry",
)
for summary_line in summary_lines:
    print(summary_line)