# wiki3 Knowledge Graph Extraction Pipeline

Extract structured knowledge from Wikipedia articles using LLMs, generate embeddings, and prepare the results for browser-based querying with DuckDB-Wasm.

## Setup

In [None]:
import { ChatOpenAI } from "npm:@langchain/openai";
import { WikipediaQueryRun } from "npm:@langchain/community/tools/wikipedia_query_run";
import { RecursiveCharacterTextSplitter } from "npm:@langchain/textsplitters";
import { PromptTemplate } from "npm:@langchain/core/prompts";
import { z } from "npm:zod";
import { StructuredOutputParser } from "npm:@langchain/core/output_parsers/structured";

// Reached only when every import in this cell resolved; a visible marker that setup finished.
console.log("Dependencies loaded");

## Wikipedia Loader & Chunking

In [None]:
// Fetch the article text through LangChain's Wikipedia tool.
// NOTE(review): the tool presumably caps the returned text and may combine
// several search hits by default — confirm against the WikipediaQueryRun
// options if full-article coverage matters.
const wikipediaFetcher = new WikipediaQueryRun();
const articleTitle = "Albert Einstein";
// `invoke` is the supported Runnable entry point; `call` is deprecated on LangChain tools.
const rawContent = await wikipediaFetcher.invoke(articleTitle);

// Quick sanity output: title, total size, and the first 300 characters.
console.log(`Fetched: ${articleTitle}`);
console.log(`Content length: ${rawContent.length} characters`);
console.log(`Preview: ${rawContent.substring(0, 300)}...`);

## Configure LLM for Knowledge Graph Extraction

In [None]:
// Chat model used for knowledge-graph extraction.
// - "gpt-4-mini" is not a published OpenAI model id; "gpt-4o-mini" is the
//   small GPT-4-class model this was presumably meant to name.
// - `model` is the current option name (`modelName` is its deprecated alias).
// - temperature 0 requests the least-random sampling for extraction.
// The API key is read from the OPENAI_API_KEY environment variable.
const llm = new ChatOpenAI({
  model: "gpt-4o-mini",
  temperature: 0,
  apiKey: Deno.env.get("OPENAI_API_KEY"),
});

// One node of the graph: an id chosen by the model, a display label,
// a category from a fixed list, and a short description.
const entitySchema = z.object({
  id: z.string().describe("Unique entity identifier"),
  label: z.string().describe("Entity name or label"),
  type: z.enum(["Person", "Organization", "Place", "Concept", "Event", "Work"]),
  description: z.string().describe("Brief entity description from text"),
});

// One directed edge between two entity ids.
const relationSchema = z.object({
  source_id: z.string().describe("Source entity ID"),
  target_id: z.string().describe("Target entity ID"),
  relation_type: z.string().describe("Type of relationship"),
  description: z.string().describe("Description of the relationship"),
});

// Full payload expected back from the model for a single chunk of text.
const extractionSchema = z.object({
  entities: z.array(entitySchema),
  relations: z.array(relationSchema),
});

// The parser validates model output against the schema and also produces
// the format instructions that are later injected into the prompt.
const parser = StructuredOutputParser.fromZodSchema(extractionSchema);
const formatInstructions = parser.getFormatInstructions();

console.log("LLM and schema configured");

## Extract Knowledge Graph from Text

In [None]:
// Cut the article into 1024-character windows that overlap by 128 characters.
const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1024, chunkOverlap: 128 });

const chunks = await splitter.splitText(rawContent);

// Report how many chunks were produced and show the start of the first one.
const chunkTotal = chunks.length;
console.log(`Split into ${chunkTotal} chunks`);
const firstChunkPreview = chunks[0].slice(0, 150);
console.log(`Chunk 1 preview: ${firstChunkPreview}...`);

In [None]:
// Prompt text with two placeholders: {text} (the chunk to analyse) and
// {format_instructions} (the parser-generated output description).
const extractionTemplate = `
Extract entities and relationships from the following text.
Assign unique IDs to each entity (e.g., "person_1", "org_2").
Infer relationships between entities that appear in the same context.

Text: {text}

{format_instructions}

Provide your response as valid JSON.`;

const extractionPrompt = PromptTemplate.fromTemplate(extractionTemplate);

// prompt -> chat model -> structured parser, composed in two explicit steps.
const promptThroughModel = extractionPrompt.pipe(llm);
const extractionChain = promptThroughModel.pipe(parser);
console.log("Extraction chain created");

In [None]:
/** Entity as stored in the merged graph, with an id that is unique across chunks. */
interface GraphEntity {
  id: string;
  label: string;
  type: string;
  description: string;
}

/** Directed edge whose endpoints refer to merged-graph entity ids. */
interface GraphRelation {
  source_id: string;
  target_id: string;
  relation_type: string;
  description: string;
}

// Merged entities keyed by lower-cased label, so the same name seen in
// several chunks collapses into a single node (first occurrence wins).
const allEntities = new Map<string, GraphEntity>();
const allRelations: GraphRelation[] = [];
// Upper bound on how many chunks are sent to the model.
const processChunks = 2;
const chunksToProcess = Math.min(processChunks, chunks.length);

for (let i = 0; i < chunksToProcess; i++) {
  console.log(`Processing chunk ${i + 1}/${chunksToProcess}...`);
  try {
    const result = await extractionChain.invoke({
      text: chunks[i],
      format_instructions: formatInstructions,
    });

    // Model-assigned ids are only meaningful inside one chunk; index them
    // to the dedup key so relations can be re-pointed at merged entities.
    const keyByChunkId = new Map<string, string>();
    for (const entity of result.entities) {
      const key = entity.label.toLowerCase();
      // Keep the first entity per id, matching a first-match lookup.
      if (!keyByChunkId.has(entity.id)) {
        keyByChunkId.set(entity.id, key);
      }
      if (!allEntities.has(key)) {
        allEntities.set(key, {
          id: `${entity.type.toLowerCase()}_${allEntities.size + 1}`,
          label: entity.label,
          type: entity.type,
          description: entity.description,
        });
      }
    }

    let skippedRelations = 0;
    for (const relation of result.relations) {
      const sourceKey = keyByChunkId.get(relation.source_id);
      const targetKey = keyByChunkId.get(relation.target_id);
      // An endpoint the model never declared (or one with an empty label)
      // cannot be resolved; count it instead of dropping it silently.
      if (!sourceKey || !targetKey) {
        skippedRelations++;
        continue;
      }
      allRelations.push({
        source_id: allEntities.get(sourceKey)?.id ?? relation.source_id,
        target_id: allEntities.get(targetKey)?.id ?? relation.target_id,
        relation_type: relation.relation_type,
        description: relation.description,
      });
    }

    console.log(`  Extracted ${result.entities.length} entities, ${result.relations.length} relations`);
    if (skippedRelations > 0) {
      console.warn(`  Skipped ${skippedRelations} relations referencing unknown entity IDs`);
    }
  } catch (error) {
    // Best effort: a failing chunk is reported and the loop moves on.
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`  Error: ${message}`);
  }
}

console.log(`Total entities: ${allEntities.size}`);
console.log(`Total relations: ${allRelations.length}`);

## Inspect Extracted Knowledge Graph

In [None]:
// Print each merged entity as "[id] label (type)" followed by its description.
console.log("\n=== ENTITIES ===");
allEntities.forEach(({ id, label, type, description }) => {
  console.log(`[${id}] ${label} (${type})`);
  console.log(`  → ${description}`);
});

In [None]:
// Print every relation as "source label --[type]--> target label".
console.log("\n=== RELATIONS ===");
const entitiesArr = Array.from(allEntities.values());

// Index labels by entity id once, rather than scanning the array twice per relation.
const labelById = new Map<string, string>();
for (const entity of entitiesArr) {
  // First entry wins, mirroring a first-match array search.
  if (!labelById.has(entity.id)) {
    labelById.set(entity.id, entity.label);
  }
}

for (const relation of allRelations) {
  // Fall back to the raw id so an unresolved endpoint stays identifiable
  // instead of being printed as "undefined".
  const sourceLabel = labelById.get(relation.source_id) ?? relation.source_id;
  const targetLabel = labelById.get(relation.target_id) ?? relation.target_id;
  console.log(`${sourceLabel} --[${relation.relation_type}]--> ${targetLabel}`);
}

## Generate Embeddings (Mock)

In [None]:
// The Deno std `crypto/sha256.ts` module does not appear to exist at this
// std version, so the hash comes from the built-in `node:crypto` module.
import { createHash } from "node:crypto";

/**
 * Builds a deterministic placeholder embedding from the SHA-256 digest of `text`.
 *
 * The 32 digest bytes are repeated cyclically to fill the vector, and each
 * byte is mapped from 0..255 to the range [-1, 1) via `(byte - 128) / 128`.
 * The result carries no semantic meaning; it only stands in for a real
 * embedding model so the rest of the pipeline can be exercised.
 *
 * @param text Input string; hashed as UTF-8.
 * @param dimensions Length of the returned vector (defaults to 384).
 * @returns An array of `dimensions` numbers in [-1, 1).
 */
function getMockEmbedding(text: string, dimensions = 384): number[] {
  const hash = createHash("sha256").update(text).digest();
  const embedding: number[] = [];
  for (let i = 0; i < dimensions; i++) {
    // Wrap around the digest so any requested length can be filled.
    embedding.push((hash[i % hash.length] - 128) / 128);
  }
  return embedding;
}

// Attach a mock embedding to every merged entity, keyed by entity id.
// The embedding input is the label immediately followed by the description.
const entityEmbeddings = new Map();
allEntities.forEach((entity) => {
  const embedding = getMockEmbedding(entity.label + entity.description);
  entityEmbeddings.set(entity.id, { ...entity, embedding });
});

console.log(`Generated embeddings for ${entityEmbeddings.size} entities`);

## Export to DuckDB-Wasm Format

In [None]:
// Assemble the export payload: run metadata, entities (with embeddings), and relations.
const exportedEntities = [...entityEmbeddings.values()];
const duckdbFormat = {
  metadata: {
    source: articleTitle,
    extracted_at: new Date().toISOString(),
    entity_count: allEntities.size,
    relation_count: allRelations.length,
  },
  entities: exportedEntities,
  relations: allRelations,
};

// Show only the first 500 characters of the pretty-printed JSON.
console.log("\n=== EXPORT ===");
const serializedExport = JSON.stringify(duckdbFormat, null, 2);
console.log(serializedExport.substring(0, 500) + "...");