# LangChain + pgvector with Supacrawler

- Repo example hub: https://github.com/Supacrawler/supacrawler-py/tree/main/examples
- Enable pgvector first: https://supabase.com/docs/guides/database/extensions/pgvector


In [None]:
# Core LangChain packages
%pip install -U langchain langchain-community langchain-text-splitters  sqlalchemy langchain-postgres "psycopg[binary]" sentence-transformers
# %pip install -U sentence-transformers transformers torch

In [None]:
import os

# Switchable embeddings: HF by default
USE_HF = True
HF_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

SUPACRAWLER_API_KEY = os.environ.get('SUPACRAWLER_API_KEY', 'YOUR_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_KEY') # Optional
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://postgres:postgres@127.0.0.1:64322/postgres?sslmode=disable')

In [16]:
from supacrawler import SupacrawlerClient, JobCreateRequest

crawler = SupacrawlerClient(api_key=SUPACRAWLER_API_KEY)

# Start a small crawl (scoped)
job = crawler.create_job(JobCreateRequest(
    url='https://supabase.com/docs/guides/auth',
    type='crawl',
    depth=1,
    link_limit=10,
    render_js=False,
))
status = crawler.wait_for_job(job.job_id)

In [7]:
# Poll until completion
final = crawler.wait_for_job(job.job_id, interval_seconds=3.0, timeout_seconds=60.0)
print(final.status)
if final.status == "completed" and final.data is not None:
    if hasattr(final.data, "crawl_data"):
        print("Pages:", len(final.data.crawl_data))

completed
Pages: 10


In [23]:
import os
from sqlalchemy import create_engine
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres import PGVector
from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_openai import OpenAIEmbeddings  # optional

# 1) Build LangChain Documents from the crawl result
crawl = getattr(getattr(final, "data", None), "crawl_data", {}) or {}
docs = [
    Document(
        page_content=(page.markdown or ""),
        metadata={"url": url, "title": (getattr(page, "metadata", None) or {}).__dict__.get("title") if hasattr(page, "metadata") else None}
    )
    for url, page in crawl.items()
    if getattr(page, "markdown", None)
]
print(f"Pages with content: {len(docs)}")

# 2) Chunk with overlap (adjust as needed)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print(f"Chunks: {len(chunks)}")

# 3) Embeddings (local HF by default; switch to OpenAI if you prefer)
USE_HF = True
if USE_HF:
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # 384-dim
else:
    from langchain_openai import OpenAIEmbeddings
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=os.environ["OPENAI_API_KEY"])  # 1536-dim

Pages with content: 10
Chunks: 75


In [25]:

# 4) Store in pgvector via LangChain PGVector (keeps Postgres/pgvector as storage)
engine = create_engine(DATABASE_URL)
store = PGVector(connection=engine, collection_name="lc_docs", embeddings=embeddings, use_jsonb=True)

store.add_documents(chunks)
print("Added chunks to pgvector")

# 5) Quick test: semantic search
results = store.similarity_search("What are the possible auth methods?", k=3)
for d in results:
    print(d.metadata.get("url"), (d.metadata.get("title") or ""), d.page_content[:200].replace("\n", " "), "...")

Added chunks to pgvector
https://supabase.com/docs/guides/auth Auth | Supabase Docs Auth  # Auth  ## Use Supabase to authenticate and authorize your users.  * * *  Supabase Auth makes it easy to implement authentication and authorization in your app. We provide client SDKs and API en ...
https://supabase.com/docs/guides/auth Auth | Supabase Docs Auth  # Auth  ## Use Supabase to authenticate and authorize your users.  * * *  Supabase Auth makes it easy to implement authentication and authorization in your app. We provide client SDKs and API en ...
https://supabase.com/docs/guides/auth/users Users | Supabase Docs A user can sign in with one of the following methods:  - Password-based method (with email or phone) - Passwordless method (with email or phone) - OAuth - SAML SSO  An identity describes the authentic ...
