
Adding Postgres Support #145


Merged (18 commits, Jul 3, 2025)
4 changes: 4 additions & 0 deletions code/.env.template
@@ -56,6 +56,10 @@ OPENSEARCH_CREDENTIALS="<TODO>"
ELASTICSEARCH_URL="http://localhost:9200"
ELASTICSEARCH_API_KEY="<TODO>"

# If using Postgres connection string
POSTGRES_CONNECTION_STRING="postgresql://<HOST>:<PORT>/<DATABASE>?user=<USERNAME>&sslmode=require"
POSTGRES_PASSWORD="<PASSWORD>"

# Local Directory for file writes
#NLWEB_OUTPUT_DIR=/home/sites/data/nlweb

13 changes: 12 additions & 1 deletion code/config/config_retrieval.yaml
@@ -106,4 +106,15 @@ endpoints:
db_type: elasticsearch
# Vector properties
vector_type:
type: dense_vector

# PostgreSQL with pgvector extension configuration
postgres:
enabled: true
# Database connection details (e.g. "postgresql://<HOST>:<PORT>/<DATABASE>?user=<USERNAME>&sslmode=require")
api_endpoint_env: POSTGRES_CONNECTION_STRING
# Password for authentication
api_key_env: POSTGRES_PASSWORD
# Index name to search in
index_name: documents
# Specify the database type
db_type: postgres
5 changes: 5 additions & 0 deletions code/requirements.txt
@@ -63,3 +63,8 @@ seaborn>=0.13.0

# For Elasticsearch:
# elasticsearch[async]>=8,<9

# For Postgres:
# psycopg[binary]>=3.1.12 # PostgreSQL adapter (psycopg3)
# psycopg[pool]>=3.2.0 # Connection pooling for psycopg3
# pgvector>=0.4.0
821 changes: 821 additions & 0 deletions code/retrieval/postgres_client.py

Large diffs are not rendered by default.

20 changes: 14 additions & 6 deletions code/retrieval/retriever.py
@@ -2,7 +2,7 @@
# Licensed under the MIT License

"""
Unified vector database interface with support for Azure AI Search, Milvus, Qdrant, and Postgres.
This module provides abstract base classes and concrete implementations for database operations.
"""

@@ -60,7 +60,10 @@ def init():
elif db_type == "elasticsearch":
from retrieval.elasticsearch_client import ElasticsearchClient
_preloaded_modules[db_type] = ElasticsearchClient

elif db_type == "postgres":
from retrieval.postgres_client import PgVectorClient
_preloaded_modules[db_type] = PgVectorClient

print(f"Successfully preloaded {db_type} client module")
except Exception as e:
print(f"Failed to preload {db_type} client module: {e}")
@@ -75,6 +78,7 @@ def init():
"qdrant": ["qdrant-client>=1.14.0"],
"snowflake_cortex_search": ["httpx>=0.28.1"],
"elasticsearch": ["elasticsearch[async]>=8,<9"],
"postgres": ["pgvector>=0.4.0", "psycopg[binary]>=3.1.12", "psycopg[pool]>=3.2.0"]
}

# Cache for installed packages
@@ -398,7 +402,7 @@ def _has_valid_credentials(self, name: str, config) -> bool:
"""
db_type = config.db_type

if db_type in ["azure_ai_search", "snowflake_cortex_search", "opensearch", "milvus", "elasticsearch", "postgres"]:
# These require API key and endpoint
return bool(config.api_key and config.api_endpoint)
elif db_type == "qdrant":
@@ -465,10 +469,14 @@ async def get_client(self, endpoint_name: str) -> VectorDBClientInterface:
elif db_type == "elasticsearch":
from retrieval.elasticsearch_client import ElasticsearchClient
client = ElasticsearchClient(endpoint_name)
elif db_type == "postgres":
from retrieval.postgres_client import PgVectorClient
client = PgVectorClient(endpoint_name)
else:
error_msg = f"Unsupported database type: {db_type}"
logger.error(error_msg)
raise ValueError(error_msg)
except ImportError as e:
logger.error(f"Failed to import client for {db_type}: {e}")
raise ValueError(f"Failed to load client for {db_type}: {e}")
148 changes: 148 additions & 0 deletions code/tools/postgres_load.py
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Microsoft Corporation.
# Licensed under the MIT License

"""
Script to verify and create PostgreSQL schema for pgvector if it doesn't exist
"""

import os
import sys
import asyncio
import argparse

# More robust path handling for imports
script_path = os.path.abspath(__file__)
utils_dir = os.path.dirname(script_path)
code_dir = os.path.dirname(utils_dir)

# Add both directories to path
sys.path.insert(0, code_dir) # Add code dir first
sys.path.insert(1, utils_dir) # Then utils dir

# Import the required modules from the project
try:
    from retrieval.postgres_client import PgVectorClient
except ImportError as e:
    print(f"Failed to import required modules: {e}")
    print(f"Make sure you're running this script from the code directory: {code_dir}")
    sys.exit(1)

# SQL for creating the table and indexes
CREATE_TABLE_SQL = """
-- Create the pgvector extension if it doesn't exist
CREATE EXTENSION IF NOT EXISTS vector;

-- Create the documents table for vector embeddings
CREATE TABLE IF NOT EXISTS documents (
id TEXT PRIMARY KEY, -- Document ID (URL or other unique identifier)
url TEXT NOT NULL, -- URL of the document
name TEXT NOT NULL, -- Name of the document (title or similar)
schema_json JSONB NOT NULL, -- JSON schema of the document
site TEXT NOT NULL, -- Site or domain of the document
embedding vector(1536) NOT NULL -- Vector embedding (adjust dimension to match your model)
);

-- Create a vector index for faster similarity searches
CREATE INDEX IF NOT EXISTS embedding_cosine_idx
ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);
"""

async def setup_postgres_schema(args):
    """Set up the PostgreSQL schema for vector search"""
    print("\n=== PostgreSQL Schema Setup ===\n")

    client = PgVectorClient(args.endpoint)

    # First test the connection
    print("Testing PostgreSQL connection...")
    try:
        connection_info = await client.test_connection()

        if not connection_info.get("success"):
            print(f"ERROR: Could not connect to PostgreSQL: {connection_info.get('error')}")
            return False
    except Exception as e:
        print(f"ERROR: Could not connect to PostgreSQL: {e}")
        return False

    print(f"Successfully connected to PostgreSQL {connection_info.get('database_version')}")

    # Check if pgvector is installed
    if not connection_info.get("pgvector_installed"):
        print("WARNING: pgvector extension is not installed in the database")
        print("Please install the pgvector extension before continuing")
        return False

    # Check if table exists and has correct schema
    print(f"\nChecking schema for table '{client.table_name}'...")
    schema_info = await client.check_table_schema()

    if schema_info.get("error"):
        print(f"ERROR checking table schema: {schema_info.get('error')}")
        return False

    if not schema_info.get("table_exists"):
        print(f"Table '{client.table_name}' does not exist. Creating it...")

        # Create the table and indexes
        async def _create_schema(conn):
            async with conn.cursor() as cur:
                await cur.execute(CREATE_TABLE_SQL)
            await conn.commit()
            return True

        try:
            await client._execute_with_retry(_create_schema)
            print(f"Successfully created table '{client.table_name}' and indexes")
        except Exception as e:
            print(f"ERROR creating schema: {e}")
            await client.close()  # Make sure to close the connection on error
            return False
    else:
        print(f"Table '{client.table_name}' already exists")

    # Check for any schema issues
    if schema_info.get("needs_corrections"):
        print("\nThe following schema issues were detected:")
        for issue in schema_info.get("needs_corrections"):
            print(f"  - {issue}")

        if args.fix:
            print("\nAttempting to fix schema issues...")
            # Implement schema fixes here if --fix is provided
            print("Schema fixes not implemented yet - please fix manually")
        else:
            print("\nRun this script with --fix to attempt to fix these issues")

    # Show schema information
    print("\nCurrent schema:")
    print(f"  Table: {client.table_name}")
    print(f"  Primary key: {schema_info.get('primary_key')}")
    print(f"  Vector column: {schema_info.get('vector_column', 'None')} {schema_info.get('vector_dimension', '')}")
    print(f"  Vector indexes: {len(schema_info.get('vector_indexes', []))}")

    # Close the connection pool when done
    print("\nClosing connection pool...")
    await client.close()

    print("\nSetup complete!")
    return True

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PostgreSQL pgvector Schema Setup")
    parser.add_argument("--endpoint", default="postgres", help="Name of the PostgreSQL endpoint to use")
    parser.add_argument("--fix", action="store_true", help="Attempt to fix schema issues if any are found")
    args = parser.parse_args()

    async def main():
        try:
            return await setup_postgres_schema(args)
        except Exception as e:
            print(f"Unexpected error: {e}")
            import traceback
            traceback.print_exc()
            return False

    asyncio.run(main())
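The script above delegates schema creation to the client's `_execute_with_retry` helper. The general pattern, an async operation that receives a connection and is retried with backoff on transient failure, can be sketched as follows; the function name, backoff policy, and demo are illustrative, not the actual `PgVectorClient` internals:

```python
import asyncio

async def execute_with_retry(operation, get_conn, retries=3, base_delay=0.1):
    """Run `operation(conn)`, retrying with exponential backoff on failure."""
    last_exc = None
    for attempt in range(retries):
        try:
            conn = await get_conn()
            return await operation(conn)
        except Exception as exc:  # a real client would catch only psycopg errors
            last_exc = exc
            await asyncio.sleep(base_delay * (2 ** attempt))
    raise last_exc

async def demo():
    """Demonstrate recovery: the operation fails once, then succeeds."""
    calls = {"n": 0}

    async def flaky_op(conn):
        calls["n"] += 1
        if calls["n"] < 2:
            raise RuntimeError("transient failure")
        return "ok"

    async def get_conn():
        return object()  # stand-in for a pooled psycopg connection

    return await execute_with_retry(flaky_op, get_conn)

print(asyncio.run(demo()))  # → ok
```

Closing the pool in the error path, as the script does, matters with this pattern: a retry loop that keeps acquiring pooled connections will otherwise leak them on repeated failure.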
94 changes: 94 additions & 0 deletions docs/setup-postgres.md
@@ -0,0 +1,94 @@
## PostgreSQL with pgvector

NLWeb supports PostgreSQL with the pgvector extension for vector similarity search. This provides a powerful and scalable option for storing and retrieving vector embeddings using standard SQL database technology.

### Setup Requirements

1. PostgreSQL database (version 11 or higher recommended)
2. pgvector extension installed in the database
3. A table with the following schema (or compatible):

```sql
CREATE TABLE documents (
id TEXT PRIMARY KEY, -- Document ID (URL or other unique identifier)
url TEXT NOT NULL, -- URL of the document
name TEXT NOT NULL, -- Name of the document (title or similar)
schema_json JSONB NOT NULL, -- JSON schema of the document
site TEXT NOT NULL, -- Site or domain of the document
embedding vector(1536) NOT NULL -- Vector embedding (adjust dimension to match your model)
);

-- Create a vector index for faster similarity searches
CREATE INDEX IF NOT EXISTS embedding_cosine_idx
ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);
```
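With this schema in place, a nearest-neighbour query uses pgvector's cosine-distance operator `<=>` (the operator the `vector_cosine_ops` index accelerates). The sketch below shows the query shape; `format_vector` is a hypothetical helper that renders a Python list in pgvector's `[x,y,z]` literal form. With the `pgvector` Python package you could instead register its psycopg adapter and pass the list directly:

```python
def format_vector(embedding):
    """Render a Python list as a pgvector literal, e.g. [0.25,1.0]."""
    return "[" + ",".join(str(float(x)) for x in embedding) + "]"

# Parameterized similarity search against the documents table above
SEARCH_SQL = """
SELECT url, name, site, embedding <=> %s::vector AS distance
FROM documents
ORDER BY embedding <=> %s::vector
LIMIT %s;
"""

# Usage (requires a live server; illustrative only):
# import psycopg
# with psycopg.connect(conninfo, password=password) as conn:
#     vec = format_vector(query_embedding)
#     rows = conn.execute(SEARCH_SQL, (vec, vec, 10)).fetchall()

print(format_vector([0.25, 1.0]))  # → [0.25,1.0]
```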

### Setup Schema

NOTE: If you are using Azure Postgres Flexible Server, make sure the `vector` extension is [allow-listed](https://learn.microsoft.com/azure/postgresql/flexible-server/how-to-use-pgvector#enable-extension).

To set up your PostgreSQL configuration, you can use the provided setup script.

In the `code` directory, run:
```bash
# Setup the Postgres server
python tools/postgres_load.py
```

### Dependencies

Make sure you have the required Python packages installed:

```bash
# Install PostgreSQL client libraries
pip install "psycopg[binary]>=3.1.12"  # PostgreSQL adapter (psycopg3)
pip install "psycopg[pool]>=3.2.0"     # Connection pooling for psycopg3
pip install "pgvector>=0.4.0"          # pgvector type support
```

The following packages are needed:
- `psycopg` - The PostgreSQL adapter for Python (psycopg3)
- `psycopg[binary]` - Binary dependencies for psycopg
- `psycopg[pool]` - Connection pooling support
- `pgvector` - Support for pgvector operations (vector types and indexing)

### Configuration

Update the `.env` file:

```bash
# If using Postgres connection string
POSTGRES_CONNECTION_STRING="postgresql://<HOST>:<PORT>/<DATABASE>?user=<USERNAME>&sslmode=require"
POSTGRES_PASSWORD="<PASSWORD>"
```
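Keeping the password in its own variable means it never appears in the URL. A client can read both values and hand the password to psycopg3 as a keyword argument, which psycopg merges with the URL-style conninfo. A minimal sketch, where `resolve_dsn` and the fallback DSN are illustrative rather than part of the NLWeb codebase:

```python
import os

def resolve_dsn(env):
    """Pull connection settings from an environment mapping (.env names above)."""
    conninfo = env.get(
        "POSTGRES_CONNECTION_STRING",
        "postgresql://localhost:5432/nlweb?user=nlweb&sslmode=require",  # illustrative fallback
    )
    password = env.get("POSTGRES_PASSWORD", "")
    return conninfo, password

conninfo, password = resolve_dsn(os.environ)

# import psycopg
# conn = psycopg.connect(conninfo, password=password)  # keyword merged into the DSN
```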

Configure PostgreSQL in the `config_retrieval.yaml` file:

```yaml
preferred_endpoint: postgres  # Set this to use PostgreSQL as default

endpoints:
  postgres:
    # Database connection details (e.g. "postgresql://<HOST>:<PORT>/<DATABASE>?user=<USERNAME>&sslmode=require")
    api_endpoint_env: POSTGRES_CONNECTION_STRING
    # Password for authentication
    api_key_env: POSTGRES_PASSWORD
    index_name: documents
    # Specify the database type
    db_type: postgres
```

You can provide credentials directly or via environment variables (recommended for security).

### Usage

The PostgreSQL vector client implements the full `VectorDBClientInterface` and supports all standard operations:

- Vector similarity search
- Document upload with vector embeddings
- URL-based document lookup
- Site-specific filtering
- Document deletion
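Similarity search here is ordered by cosine distance, since the index was built with `vector_cosine_ops`: `<=>` returns 1 − cosine similarity, so 0 means identical direction and 2 means opposite. A pure-Python reference for the quantity the database orders by:

```python
import math

def cosine_distance(a, b):
    """Cosine distance as computed by pgvector's <=> under vector_cosine_ops."""
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return 1.0 - dot / norm

print(cosine_distance([1.0, 0.0], [1.0, 0.0]))  # → 0.0
print(cosine_distance([1.0, 0.0], [0.0, 1.0]))  # → 1.0
```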