In [1]:
%pip install -U langchain-postgres psycopg[binary] pypdf sentence-transformers


Collecting langchain-postgres
  Downloading langchain_postgres-0.0.17-py3-none-any.whl.metadata (7.1 kB)
Collecting psycopg[binary]
  Downloading psycopg-3.3.3-py3-none-any.whl.metadata (4.3 kB)
Collecting asyncpg>=0.30.0 (from langchain-postgres)
  Downloading asyncpg-0.31.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pgvector<0.4,>=0.2.5 (from langchain-postgres)
  Downloading pgvector-0.3.6-py3-none-any.whl.metadata (13 kB)
Collecting psycopg-pool<4,>=3.2.1 (from langchain-postgres)
  Downloading psycopg_pool-3.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting psycopg-binary==3.3.3 (from psycopg[binary])
  Downloading psycopg_binary-3.3.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Downloading langchain_postgres-0.0.17-py3-none-any.whl (48 kB)
Downloading psycopg_binary-3.3.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m18.1

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_community.embeddings import SentenceTransformerEmbeddings

# 1. Connection Configuration (Matches your Docker Compose)
CONNECTION_STRING = "postgresql+psycopg://postgres:mysecretpassword@localhost:5432/vectordb"
COLLECTION_NAME = "equity_research_docs"

def run_pgvector_pipeline(pdf_path):
    # 2. Extract & Chunk (Same logic as before)
    loader = PyPDFLoader(pdf_path)
    raw_docs = loader.load()
    
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(raw_docs)

    # 3. Initialize Embedding Model (Local & Fast)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # 4. Inject into pgvector
    # This will automatically create the tables if they don't exist
    vector_store = PGVector.from_documents(
        embedding=embeddings,
        documents=chunks,
        collection_name=COLLECTION_NAME,
        connection=CONNECTION_STRING,
        use_jsonb=True, # Optimized for metadata storage
    )
    
    print(f"Successfully injected {len(chunks)} chunks into PostgreSQL.")
    return vector_store

def query_pgvector(query_text):
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    
    # Connect to existing store
    vector_store = PGVector(
        connection=CONNECTION_STRING,
        collection_name=COLLECTION_NAME,
        embeddings=embeddings,
    )
    
    results = vector_store.similarity_search(query_text, k=3)
    return results

if __name__ == "__main__":
    PDF_FILE = "./data/2023-Equity-Derivatives-2023-Latham-Watkins.pdf"
    
    # Step 1: Ingest
    store = run_pgvector_pipeline(PDF_FILE)
    
    # Step 2: Test Query
    query = "What is the outlook for equity derivatives?"
    matches = query_pgvector(query)
    
    for doc in matches:
        print(f"\n[Page {doc.metadata.get('page')}]: {doc.page_content[:200]}...")


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 2883.70it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Successfully injected 432 chunks into PostgreSQL.


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 3103.12it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



[Page 71]: United Kingdom and other instruments the price or value of which depends on or has an effect 
on the price or value of such securities. Accordingly, broadly speaking, equity derivatives are 
within th...

[Page 71]: United Kingdom and other instruments the price or value of which depends on or has an effect 
on the price or value of such securities. Accordingly, broadly speaking, equity derivatives are 
within th...

[Page 71]: United Kingdom and other instruments the price or value of which depends on or has an effect 
on the price or value of such securities. Accordingly, broadly speaking, equity derivatives are 
within th...
