# Supabase Vectors with Supacrawler
This notebook scrapes content with `Supacrawler`, embeds it, and stores vectors in Supabase (pgvector).

Ensure pgvector is enabled before running:
https://supabase.com/docs/guides/database/extensions/pgvector

In [10]:
%pip install -qU vecs datasets llama_index html2text
%pip install transformers torch

Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting torch
  Downloading torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading transformers-4.55.0-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m23.8 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25hDownloading tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl

In [None]:
# Supabase Vectors with Supacrawler (Vecs)
import os
import vecs  # Supabase Python client for vectors
from supacrawler import SupacrawlerClient, ScrapeParams

# Optional: local embeddings (Hugging Face) or hosted (OpenAI)
USE_HF = True
HF_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'  # 384 dims

SUPACRAWLER_API_KEY = os.environ.get('SUPACRAWLER_API_KEY', 'YOUR_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_KEY') # Optional
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://postgres:postgres@127.0.0.1:64322/postgres?sslmode=disable')

In [None]:

crawler = SupacrawlerClient(api_key=SUPACRAWLER_API_KEY)
scrape = crawler.scrape(ScrapeParams(url='https://docs.supacrawler.com/api/install', format='markdown'))

In [19]:
# Chunk + embed utilities
if USE_HF:
    from transformers import AutoTokenizer, AutoModel
    import torch
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
    model = AutoModel.from_pretrained(HF_MODEL)
    def embed_text(text: str):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over token embeddings
        vec = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
        return vec
    def split_text_tokens(text: str, max_tokens: int = 300, overlap: int = 50):
        ids = tokenizer(text, return_tensors=None, add_special_tokens=False)['input_ids']
        chunks = []
        step = max_tokens - overlap
        for i in range(0, len(ids), step):
            window = ids[i:i+max_tokens]
            chunk_txt = tokenizer.decode(window, skip_special_tokens=True)
            chunks.append(chunk_txt)
        return chunks if chunks else [text]
else:
    from openai import OpenAI
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    def embed_text(text: str):
        resp = openai_client.embeddings.create(model='text-embedding-3-small', input=text)
        return resp.data[0].embedding
    def split_text_chars(text: str, max_chars: int = 1200, overlap_chars: int = 200):
        chunks = []
        step = max_chars - overlap_chars
        for i in range(0, len(text), step):
            chunks.append(text[i:i+max_chars])
        return chunks if chunks else [text]

# Create chunks based on the active embedder
chunks = split_text_tokens(scrape.content) if USE_HF else split_text_chars(scrape.content)
num_chunks = len(chunks)
# Probe dim from first chunk
first_vec = embed_text(chunks[0])
vector_dim = len(first_vec)
print(f"Prepared {num_chunks} chunks (dim={vector_dim})")

Prepared 1 chunks (dim=384)


In [20]:
# Upsert via Vecs (all chunks)
vx = vecs.create_client(DATABASE_URL)
col = vx.get_or_create_collection(name='documents', dimension=vector_dim)
records = []
for idx, chunk in enumerate(chunks):
    vec = first_vec if idx == 0 else embed_text(chunk)
    rec_id = f"{scrape.url}#chunk-{idx}"
    records.append((rec_id, vec, {
        'url': scrape.url,
        'title': getattr(scrape, 'title', None),
        'chunk_index': idx,
        'content': chunk,
    }))
col.upsert(records=records)
print(f"Upserted {len(records)} chunks (dim={vector_dim})")

Upserted 1 chunks (dim=384)


In [44]:
# Test query and log results
from vecs import IndexMeasure
query = "What does supacrawler do?"
qvec = embed_text(query)

# Utility: normalize vecs query results into (id, score, metadata)
def normalize_vecs_result(rec):
    if isinstance(rec, tuple):
        if len(rec) == 4:
            rec_id, score, _vec, metadata = rec
        elif len(rec) == 3:
            rec_id, score, metadata = rec
        else:
            return str(rec), None, {}
        return rec_id, score, (metadata or {})
    if isinstance(rec, dict):
        return rec.get('id'), rec.get('score'), rec.get('metadata', {})
    return str(rec), None, {}

# Ensure the index exists
try:
    col.create_index(measure=IndexMeasure.cosine_distance)
except Exception:
    pass

matches = col.query(data=qvec, limit=1, include_metadata=True)

print(f"Query: {query}")
print(f"Matches: {len(matches)}")
for rec in matches:
    rec_id, score, metadata = normalize_vecs_result(rec)
    title = metadata.get('title') if isinstance(metadata, dict) else None
    chunk_idx = metadata.get('chunk_index') if isinstance(metadata, dict) else None
    snippet = (metadata.get('content') or '')[:240] if isinstance(metadata, dict) else ''
    print(rec_id, (round(score, 4) if isinstance(score, (int, float)) else 'n/a'), title, f"chunk={chunk_idx}")
    if snippet:
        print('  ', snippet.replace('\n', ' ') + ('...' if len(snippet) == 240 else ''))

Query: What does supacrawler do?
Matches: 1
('https://docs.supacrawler.com/api/install#chunk-0', {'url': 'https://docs.supacrawler.com/api/install', 'title': 'Installation - Supacrawler API Reference', 'content': '# installation use the official s ... (27 characters truncated) ... experience with supacrawler. # # [ javascript / typescript ] ( \\ # java - script - type - script ) # # [ python ] ( \\ # python )', 'chunk_index': 0}) n/a None chunk=None


In [35]:
# Crawl + embed docs, then ask a question
from supacrawler import JobCreateRequest

crawler = SupacrawlerClient(api_key=SUPACRAWLER_API_KEY)

# 1) Start a small crawl (scoped)
job = crawler.create_job(JobCreateRequest(
    url='https://docs.supacrawler.com',
    type='crawl',
    depth=1,
    link_limit=10,
    render_js=False,
))
status = crawler.wait_for_job(job.job_id)

In [37]:
# Poll until completion
final = crawler.wait_for_job(job.job_id, interval_seconds=3.0, timeout_seconds=60.0)
print(final.status)
if final.status == "completed" and final.data is not None:
    if hasattr(final.data, "crawl_data"):
        print("Pages:", len(final.data.crawl_data))

completed
Pages: 8


In [38]:
# 2) Upsert each page (HuggingFace by default here)
site_records = []
for page_url, page in (status.data.crawl_data or {}).items():
    content = (page.markdown or '')
    if not content:
        continue
    # Chunk and embed
    page_chunks = split_text_tokens(content) if USE_HF else split_text_chars(content)
    for idx, chunk in enumerate(page_chunks):
        vec = embed_text(chunk)
        site_records.append((f"{page_url}#chunk-{idx}", vec, {
            'url': page_url,
            'title': (page.metadata.title if page.metadata else None),
            'chunk_index': idx,
            'content': chunk,
        }))

if site_records:
    col.upsert(records=site_records)
    print(f"Upserted crawl chunks: {len(site_records)}")
else:
    print('No crawl content found')

Token indices sequence length is longer than the specified maximum sequence length for this model (1378 > 512). Running this sequence through the model will result in indexing errors


Upserted crawl chunks: 19


In [46]:
# 3) Ask: "What are the endpoints?"
from vecs import IndexMeasure
try:
    col.create_index(measure=IndexMeasure.cosine_distance)
except Exception:
    pass
q = "What does the scrape endpoint do?"
qv = embed_text(q)
results = col.query(data=qv, limit=3, include_metadata=True)
print(f"\nQ: {q}\nTop {len(results)} matches:")
for rec in results:
    rec_id, score, metadata = normalize_vecs_result(rec)
    title = metadata.get('title') if isinstance(metadata, dict) else None
    snippet = (metadata.get('content') or '')[:240] if isinstance(metadata, dict) else ''
    print(rec_id, (round(score, 4) if isinstance(score, (int, float)) else 'n/a'), title)
    if snippet:
        print('  ', snippet.replace('\n', ' ') + ('...' if len(snippet) == 240 else ''))



Q: What does the scrape endpoint do?
Top 3 matches:
('https://docs.supacrawler.com/api/scrape#chunk-0', {'url': 'https://docs.supacrawler.com/api/scrape', 'title': 'Scrape - Supacrawler API Reference', 'content': '# scrape # # [ quick example ] ( \\ # qu ... (1112 characters truncated) ... eintegerdescription crawl depth used ( only for links format ). * * * get / v1 / scrape # # [ scrape a webpage ] ( \\ # scrape - a', 'chunk_index': 0}) n/a None
('https://docs.supacrawler.com/api/scrape#chunk-3', {'url': 'https://docs.supacrawler.com/api/scrape', 'title': 'Scrape - Supacrawler API Reference', 'content': '##l ` typestringdescription the url of t ... (1219 characters truncated) ...  - name ` 400 bad request ` description invalid url or missing required parameters. - name ` 401 unauthorized ` descriptioninvalid', 'chunk_index': 3}) n/a None
('https://docs.supacrawler.com/quickstart#chunk-1', {'url': 'https://docs.supacrawler.com/quickstart', 'title': 'Quickstart - Supacrawler API Refer