In [8]:
# Automatically reload modules when code changes
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Imports first (third-party, then project), followed by notebook-wide objects.
from rich.console import Console
from rich.table import Table
from rich.theme import Theme

from pgmcp.settings import get_settings

# Project settings as returned by get_settings().
SETTINGS = get_settings()

# Shared rich console for the notebook: automatic highlighting is turned off
# and Jupyter rendering is forced.
console = Console(highlight=False, force_jupyter=True)

# Server

In [15]:
# Mcp Server Imports 
from pgmcp.server_crawl import mcp as mcp_crawl
from pgmcp.server_kb import mcp as mcp_kb


# Ingest Crawl 22 = openwebui.io
# Ingest Crawl 21 = sqlalchemy docs
from fastmcp import Client

async with Client(mcp_kb) as client:
    response = await client.call_tool("ingest_crawl_job", {"crawl_job_id": 22})
    
    print(response)

CallToolResult(content=[TextContent(type='text', text='{"metadata":{"page":1,"per_page":10,"count":0},"record":{"name":"docs.openwebui.com:  ","library_id":1,"documents":[],"id":5,"created_at":"2025-08-06T20:34:04.361977Z","updated_at":"2025-08-06T20:34:04.361977Z","documents_count":69,"chunks_count":479,"chunks_token_total":101318}}', annotations=None, meta=None)], structured_content={'metadata': {'page': 1, 'per_page': 10, 'count': 0}, 'record': {'name': 'docs.openwebui.com:  ', 'library_id': 1, 'documents': [], 'id': 5, 'created_at': '2025-08-06T20:34:04.361977Z', 'updated_at': '2025-08-06T20:34:04.361977Z', 'documents_count': 69, 'chunks_count': 479, 'chunks_token_total': 101318}}, data={'metadata': {'page': 1, 'per_page': 10, 'count': 0}, 'record': {'name': 'docs.openwebui.com:  ', 'library_id': 1, 'documents': [], 'id': 5, 'created_at': '2025-08-06T20:34:04.361977Z', 'updated_at': '2025-08-06T20:34:04.361977Z', 'documents_count': 69, 'chunks_count': 479, 'chunks_token_total': 101

In [35]:
from sqlalchemy.future import select
from pgmcp.models import Corpus, Document, Chunk 

table = Table(show_header=True, header_style="bold magenta")
table.add_column("ID")
table.add_column("Title")
table.add_column("Content-Type")
table.add_column("Content-Length", justify="right")

            
async with Corpus.async_context() as session:
    # 4 = sqlalchemy docs
    # 5 = openwebui docs
    if CORPUS := await Corpus.find(5): # openwebui docs
        await CORPUS.ensure_loaded("documents")
        DOCUMENTS = list(CORPUS.documents)

        for i, doc in enumerate(DOCUMENTS):
            if i >= 10:
                table.add_row("...", "...", "...", "...")
                break
            table.add_row(
                str(doc.id), 
                doc.title, 
                doc.content_type, 
                str(len(str(doc.content)))
            )
            
    display(table)
    
    DOCUMENT = DOCUMENTS[0]
    DOCUMENT = await DOCUMENT.ensure_loaded("chunks")

In [28]:
# get all chunks from corpus
import tiktoken
encoder = tiktoken.get_encoding("cl100k_base")

async with Chunk.async_context() as session:
    chunks_qb = Chunk.query().joins(Chunk.document, Document.corpus).where(Corpus.id == 4)
    
    async for chunk in chunks_qb.find_each():
        embeddable_content = chunk.to_embeddable_input()
        if len(encoder.encode(embeddable_content)) > 8192:
            print(f"Chunk {chunk.id} is too long to embed: {len(encoder.encode(embeddable_content))} tokens")
            

In [10]:
# await DOCUMENT.update_embeddings()

In [44]:
import json
from rich.table import Table

# Compare the stored token count of the first chunks of DOCUMENT with a fresh
# tiktoken count of the same embeddable text. Relies on `DOCUMENT`, `encoder`
# and `console` defined in other cells of this notebook.
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Chunk ID", width=22)
table.add_column("chunk_content")
table.add_column("token_count (tiktoken)", justify="right")
table.add_column("token_count (column)", justify="right")

for i, chunk in enumerate(DOCUMENT.chunks):
    # Only preview the first 10 chunks.
    if i >= 10:
        break

    # Build the embeddable text once; it feeds both the token count and the row.
    embeddable_content = chunk.to_embeddable_input()
    token_count_tiktoken = len(encoder.encode(embeddable_content))
    token_count_column = chunk.token_count

    # Show the chunk's own ID (not the loop index) under the "Chunk ID" header.
    table.add_row(
        str(chunk.id),
        embeddable_content,
        str(token_count_tiktoken),
        str(token_count_column),
    )


console.print(table)

In [None]:
from pgmcp.models import Corpus
import asyncio

async def embed_corpus(corpus_id=5):
    """Look up a corpus by ID and call its ``update_embeddings()``.

    Args:
        corpus_id: ID handed to ``Corpus.find``; defaults to 5.

    Prints a progress line before and after the update. When ``Corpus.find``
    yields a falsy result, prints a not-found message and does nothing else.
    """
    target = await Corpus.find(corpus_id)

    # Guard clause: nothing to embed when the lookup comes back empty.
    if not target:
        print(f"Corpus with ID {corpus_id} not found.")
        return

    print(f"Embedding all chunks for corpus: {target.name} (ID: {corpus_id})")
    await target.update_embeddings()
    print("Embedding complete.")

# Run the embedding update for corpus 5 (noted elsewhere in this notebook as the openwebui docs).
await embed_corpus(5)