In [1]:
# Install required packages (run once)
%pip install -q langchain langchain-text-splitters langchain-openai langchain-community wikipedia pydantic

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Literal
import json
from datetime import datetime
import hashlib

print("Dependencies loaded")

Dependencies loaded


## Wikipedia Loader & Chunking

In [4]:
article_title = "Albert Einstein"
loader = WikipediaLoader(query=article_title, load_max_docs=1, doc_content_chars_max=100000)
docs = loader.load()

raw_content = docs[0].page_content if docs else ""

print(f"Fetched: {article_title}")
print(f"Content length: {len(raw_content)} characters")
print(f"Preview: {raw_content[:300]}...")

Fetched: Albert Einstein
Content length: 87959 characters
Preview: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist best known for developing the theory of relativity. Einstein also made important contributions to quantum theory. His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called...


## Configure LLM for Knowledge Graph Extraction

In [5]:
# Configuration: Choose your LLM provider
USE_LM_STUDIO = True  # Set to False to use OpenAI

if USE_LM_STUDIO:
    llm = ChatOpenAI(
        model="qwen/qwen3-coder-30b",  # LM Studio uses whatever model is loaded
        temperature=0,
        base_url=os.environ.get("LM_STUDIO_BASE_URL", "http://host.docker.internal:1234/v1"),
        api_key="lm-studio",  # LM Studio doesn't require a real key
    )
else:
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

# Define the extraction schema using Pydantic
class Entity(BaseModel):
    id: str = Field(description="Unique entity identifier (e.g., person_1, org_2)")
    label: str = Field(description="Entity name or label")
    type: Literal["Person", "Organization", "Place", "Concept", "Event", "Work"] = Field(description="Entity type")
    description: str = Field(description="Brief entity description from text")

class Relation(BaseModel):
    source_id: str = Field(description="Source entity ID")
    target_id: str = Field(description="Target entity ID")
    relation_type: str = Field(description="Type of relationship (e.g., BORN_IN, WORKED_AT, DISCOVERED)")
    description: str = Field(description="Description of the relationship")

class KnowledgeGraph(BaseModel):
    """Extracted knowledge graph with entities and relations."""
    entities: List[Entity] = Field(description="List of entities extracted from the text")
    relations: List[Relation] = Field(description="List of relationships between entities")

# Use with_structured_output for reliable JSON extraction
structured_llm = llm.with_structured_output(KnowledgeGraph)

print(f"LLM configured: {'LM Studio (local)' if USE_LM_STUDIO else 'OpenAI'}")

LLM configured: LM Studio (local)


## Extract Knowledge Graph from Text

In [7]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=8000,
    chunk_overlap=128,
)

chunks = splitter.split_text(raw_content)
print(f"Split into {len(chunks)} chunks")
print(f"Chunk 1 preview: {chunks[0][:150]}...")

Split into 13 chunks
Chunk 1 preview: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist best known for developing the theory of relativity. Einstein a...


In [8]:
extraction_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an expert at extracting knowledge graphs from text.
Extract entities and relationships from the provided text.
Assign unique IDs to each entity using the format: type_number (e.g., "person_1", "org_2", "place_3").
Identify relationships between entities that appear in the same context.
Focus on factual relationships like: BORN_IN, WORKED_AT, DISCOVERED, FOUNDED, RECEIVED, MARRIED_TO, DEVELOPED, etc."""),
    ("human", "Extract entities and relationships from this text:\n\n{text}")
])

extraction_chain = extraction_prompt | structured_llm
print("Extraction chain created")

Extraction chain created


In [9]:
all_entities = {}
all_relations = []
process_chunks = 2

for i in range(min(process_chunks, len(chunks))):
    print(f"Processing chunk {i + 1}/{min(process_chunks, len(chunks))}...")
    try:
        result = extraction_chain.invoke({"text": chunks[i]})
        
        # Deduplicate entities by label
        for entity in result.entities:
            key = entity.label.lower()
            if key not in all_entities:
                all_entities[key] = {
                    "id": f"{entity.type.lower()}_{len(all_entities) + 1}",
                    "label": entity.label,
                    "type": entity.type,
                    "description": entity.description,
                }
        
        # Map relations to deduplicated entity IDs
        for relation in result.relations:
            source_entity = next((e for e in result.entities if e.id == relation.source_id), None)
            target_entity = next((e for e in result.entities if e.id == relation.target_id), None)
            
            if source_entity and target_entity:
                source_key = source_entity.label.lower()
                target_key = target_entity.label.lower()
                
                all_relations.append({
                    "source_id": all_entities.get(source_key, {}).get("id", relation.source_id),
                    "target_id": all_entities.get(target_key, {}).get("id", relation.target_id),
                    "relation_type": relation.relation_type,
                    "description": relation.description,
                })
        
        print(f"  Extracted {len(result.entities)} entities, {len(result.relations)} relations")
    except Exception as error:
        print(f"  Error processing chunk {i + 1}: {error}")

print(f"\nTotal unique entities: {len(all_entities)}")
print(f"Total relations: {len(all_relations)}")

Processing chunk 1/2...
  Extracted 53 entities, 60 relations
Processing chunk 2/2...
  Extracted 35 entities, 53 relations

Total unique entities: 79
Total relations: 113


## Inspect Extracted Knowledge Graph

In [10]:
print("\n=== ENTITIES ===")
for entity in all_entities.values():
    print(f"[{entity['id']}] {entity['label']} ({entity['type']})")
    print(f"  → {entity['description']}")


=== ENTITIES ===
[person_1] Albert Einstein (Person)
  → German-born theoretical physicist
[place_2] Ulm (Place)
  → Birthplace of Albert Einstein
[place_3] Kingdom of Württemberg (Place)
  → Location where Albert Einstein was born
[place_4] German Empire (Place)
  → Country where Albert Einstein was born
[person_5] Hermann Einstein (Person)
  → Albert Einstein's father
[person_6] Pauline Koch (Person)
  → Albert Einstein's mother
[place_7] Munich (Place)
  → City where Einstein's family moved
[place_8] Ludwigsvorstadt-Isarvorstadt (Place)
  → Borough in Munich where Einstein's family lived
[organization_9] Elektrotechnische Fabrik J. Einstein & Cie (Organization)
  → Company founded by Hermann and Jakob Einstein
[place_10] Italy (Place)
  → Country where Einstein's family moved
[place_11] Milan (Place)
  → City in Italy where Einstein's family moved
[place_12] Pavia (Place)
  → City in Italy where Einstein's family settled
[place_13] Palazzo Cornazzani (Place)
  → Residence where Ein

In [11]:
print("\n=== RELATIONS ===")
entities_list = list(all_entities.values())
for relation in all_relations:
    source = next((e for e in entities_list if e["id"] == relation["source_id"]), None)
    target = next((e for e in entities_list if e["id"] == relation["target_id"]), None)
    source_label = source["label"] if source else "?"
    target_label = target["label"] if target else "?"
    print(f"{source_label} --[{relation['relation_type']}]--> {target_label}")


=== RELATIONS ===
Albert Einstein --[BORN_IN]--> Ulm
Albert Einstein --[BORN_IN]--> Kingdom of Württemberg
Albert Einstein --[BORN_IN]--> German Empire
Albert Einstein --[PARENT_OF]--> Hermann Einstein
Albert Einstein --[PARENT_OF]--> Pauline Koch
Albert Einstein --[LIVED_IN]--> Munich
Albert Einstein --[LIVED_IN]--> Ludwigsvorstadt-Isarvorstadt
Hermann Einstein --[FOUNDED]--> Elektrotechnische Fabrik J. Einstein & Cie
Albert Einstein --[LIVED_IN]--> Italy
Albert Einstein --[LIVED_IN]--> Milan
Albert Einstein --[LIVED_IN]--> Pavia
Albert Einstein --[LIVED_IN]--> Palazzo Cornazzani
Albert Einstein --[LIVED_IN]--> Munich
Albert Einstein --[ATTENDED]--> Luitpold Gymnasium
Albert Einstein --[ATTENDED]--> St. Peter's Catholic elementary school
Albert Einstein --[LIVED_IN]--> Aarau
Albert Einstein --[ATTENDED]--> Argovian cantonal school
Albert Einstein --[FALL_IN_LOVE_WITH]--> Marie Winteler
Albert Einstein --[FAMILY_RELATIONSHIP]--> Jost Winteler
Albert Einstein --[SIBLING_OF]--> Maja Ein

## Generate Embeddings

In [12]:
def get_mock_embedding(text: str) -> list[float]:
    """Generate deterministic mock embeddings using SHA-256 hash."""
    hash_bytes = hashlib.sha256(text.encode()).digest()
    
    # Expand hash to 384 dimensions (typical embedding size)
    embedding = []
    for i in range(384):
        embedding.append((hash_bytes[i % len(hash_bytes)] - 128) / 128)
    return embedding

entity_embeddings = {}
for entity in all_entities.values():
    entity_embeddings[entity["id"]] = {
        **entity,
        "embedding": get_mock_embedding(entity["label"] + entity["description"]),
    }

print(f"Generated embeddings for {len(entity_embeddings)} entities")

Generated embeddings for 79 entities


## Export to DuckDB-Wasm Format

In [13]:
duckdb_format = {
    "metadata": {
        "source": article_title,
        "extracted_at": datetime.now().isoformat(),
        "entity_count": len(all_entities),
        "relation_count": len(all_relations),
    },
    "entities": list(entity_embeddings.values()),
    "relations": all_relations,
}

print("\n=== EXPORT ===")
print(json.dumps(duckdb_format, indent=2)[:500] + "...")


=== EXPORT ===
{
  "metadata": {
    "source": "Albert Einstein",
    "extracted_at": "2025-12-18T03:56:35.983919",
    "entity_count": 79,
    "relation_count": 113
  },
  "entities": [
    {
      "id": "person_1",
      "label": "Albert Einstein",
      "type": "Person",
      "description": "German-born theoretical physicist",
      "embedding": [
        -0.296875,
        -0.9765625,
        0.8359375,
        -0.2890625,
        -0.515625,
        0.7265625,
        0.9921875,
        0.28125,
        0...


In [None]:
# Optional: Save to file
# with open("knowledge_graph.json", "w") as f:
#     json.dump(duckdb_format, f, indent=2)
# print("Saved to knowledge_graph.json")