diff --git a/.env b/.env
index df7acc0..43cf00e 100644
--- a/.env
+++ b/.env
@@ -20,7 +20,6 @@ EMBEDDING_LENGTH=768
 # === Redis ===
 REDIS_URL=redis://localhost:6379
 REDIS_INDEX=docs
-REDIS_SCHEMA=redis_schema.yaml
 
 # === Elasticsearch ===
 ELASTIC_URL=http://localhost:9200
@@ -29,7 +28,7 @@ ELASTIC_USER=elastic
 ELASTIC_PASSWORD=changeme
 
 # === PGVector ===
-PGVECTOR_URL=postgresql://user:pass@localhost:5432/mydb
+PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb
 PGVECTOR_COLLECTION_NAME=documents
 
 # === SQL Server ===
diff --git a/Containerfile b/Containerfile
index 03cd02d..1f30f86 100644
--- a/Containerfile
+++ b/Containerfile
@@ -18,7 +18,6 @@ COPY vector_db ./vector_db
 COPY loaders ./loaders
 COPY embed_documents.py .
 COPY config.py .
-COPY redis_schema.yaml .
 COPY .env .
 
 RUN chown -R 1001:0 .
diff --git a/README.md b/README.md
index a2171c1..7b6e04d 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,24 @@
 It supports Git repositories, web URLs, and file types like Markdown, PDFs,
 and HTML. Designed for local runs, containers, or OpenShift/Kubernetes jobs.
 
+- [📚 vector-embedder](#-vector-embedder)
+  - [⚙️ Features](#️-features)
+  - [🚀 Quick Start](#-quick-start)
+    - [1. Configuration](#1-configuration)
+    - [2. Run Locally](#2-run-locally)
+    - [3. Or Run in a Container](#3-or-run-in-a-container)
+  - [🧪 Dry Run Mode](#-dry-run-mode)
+  - [📦 Dependency Management \& Updates](#-dependency-management--updates)
+    - [🔧 Installing `pip-tools`](#-installing-pip-tools)
+    - [➕ Adding / Updating a Package](#-adding--updating-a-package)
+  - [🗂️ Project Layout](#️-project-layout)
+  - [🧪 Local DB Testing](#-local-db-testing)
+    - [PGVector (PostgreSQL)](#pgvector-postgresql)
+    - [Elasticsearch](#elasticsearch)
+    - [Redis (RediSearch)](#redis-redisearch)
+    - [Qdrant](#qdrant)
+  - [🙌 Acknowledgments](#-acknowledgments)
+
 ---
 
 ## ⚙️ Features
@@ -101,6 +119,43 @@ Run it:
 
 ---
 
+## 📦 Dependency Management & Updates
+
+This project keeps *two* dependency files under version control:
+
+| File | Purpose | Edited by |
+|------|---------|-----------|
+| **`requirements.in`** | Short, human-readable list of *top-level* libraries (no pins) | You |
+| **`requirements.txt`** | Fully resolved, **pinned** lock file for exact, reproducible builds | `pip-compile` |
+
+### 🔧 Installing `pip-tools`
+
+```bash
+python -m pip install --upgrade pip-tools
+```
+
+### ➕ Adding / Updating a Package
+
+1. **Edit `requirements.in`**
+
+   ```diff
+   - sentence-transformers
+   + sentence-transformers>=4.1
+   + llama-index
+   ```
+2. **Re-lock** the environment
+
+   ```bash
+   pip-compile --upgrade
+   ```
+3. **Synchronise** your virtual-env
+
+   ```bash
+   pip-sync
+   ```
+
+---
+
 ## 🗂️ Project Layout
 
 ```
diff --git a/config.py b/config.py
index 0882ff1..91a6691 100644
--- a/config.py
+++ b/config.py
@@ -114,8 +114,7 @@ def _init_db_provider(db_type: str) -> DBProvider:
     if db_type == "REDIS":
         url = get("REDIS_URL")
         index = os.getenv("REDIS_INDEX", "docs")
-        schema = os.getenv("REDIS_SCHEMA", "redis_schema.yaml")
-        return RedisProvider(embedding_model, url, index, schema)
+        return RedisProvider(embedding_model, url, index)
 
     elif db_type == "ELASTIC":
         url = get("ELASTIC_URL")
@@ -127,7 +126,7 @@ def _init_db_provider(db_type: str) -> DBProvider:
     elif db_type == "PGVECTOR":
         url = get("PGVECTOR_URL")
         collection = get("PGVECTOR_COLLECTION_NAME")
-        return PGVectorProvider(embedding_model, url, collection)
+        return PGVectorProvider(embedding_model, url, collection, embedding_length)
 
     elif db_type == "MSSQL":
         connection_string = get("MSSQL_CONNECTION_STRING")
diff --git a/loaders/git.py b/loaders/git.py
index 75c7713..a1bb5ce 100644
--- a/loaders/git.py
+++ b/loaders/git.py
@@ -81,15 +81,27 @@ def load(self) -> List[Document]:
         pdf_files = [f for f in matched_files if f.suffix.lower() == ".pdf"]
         text_files = [f for f in matched_files if f.suffix.lower() != ".pdf"]
 
+        docs: List[Document] = []
         if pdf_files:
             logger.info("Loading %d PDF file(s) from %s", len(pdf_files), repo_url)
-            all_chunks.extend(self.pdf_loader.load(pdf_files))
+            docs.extend(self.pdf_loader.load(pdf_files))
 
         if text_files:
             logger.info(
                 "Loading %d text file(s) from %s", len(text_files), repo_url
             )
-            all_chunks.extend(self.text_loader.load(text_files))
+            docs.extend(self.text_loader.load(text_files))
+
+        for doc in docs:
+            local_src = Path(doc.metadata.get("source", ""))
+            try:
+                rel_path = local_src.relative_to(repo_path)
+            except ValueError:
+                rel_path = local_src
+
+            doc.metadata.update({"source": f"{repo_url}@{rel_path.as_posix()}"})
+
+        all_chunks.extend(docs)
 
         return all_chunks
diff --git a/redis_schema.yaml b/redis_schema.yaml
deleted file mode 100644
index 033683d..0000000
--- a/redis_schema.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-numeric:
-- name: total_pages
-  no_index: false
-  sortable: false
-- name: page
-  no_index: false
-  sortable: false
-- name: page_label
-  no_index: false
-  sortable: false
-text:
-- name: producer
-  no_index: false
-  no_stem: false
-  sortable: false
-  weight: 1
-  withsuffixtrie: false
-- name: creator
-  no_index: false
-  no_stem: false
-  sortable: false
-  weight: 1
-  withsuffixtrie: false
-- name: creationdate
-  no_index: false
-  no_stem: false
-  sortable: false
-  weight: 1
-  withsuffixtrie: false
-- name: title
-  no_index: false
-  no_stem: false
-  sortable: false
-  weight: 1
-  withsuffixtrie: false
-- name: source
-  no_index: false
-  no_stem: false
-  sortable: false
-  weight: 1
-  withsuffixtrie: false
-- name: content
-  no_index: false
-  no_stem: false
-  sortable: false
-  weight: 1
-  withsuffixtrie: false
-vector:
-- algorithm: FLAT
-  datatype: FLOAT32
-  dims: 768
-  distance_metric: COSINE
-  name: content_vector
diff --git a/requirements.in b/requirements.in
new file mode 100644
index 0000000..42a8b13
--- /dev/null
+++ b/requirements.in
@@ -0,0 +1,17 @@
+beautifulsoup4
+hf_xet
+langchain
+langchain-community
+langchain-elasticsearch
+langchain-huggingface
+langchain-postgres
+langchain-qdrant
+langchain-redis
+langchain-sqlserver
+psycopg-binary
+pyodbc
+pypdf
+python-dotenv
+qdrant-client
+sentence-transformers
+unstructured[md]
diff --git a/requirements.txt b/requirements.txt
index c1acd1a..220dfb0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,18 +1,472 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile
+#
+aiofiles==24.1.0
+    # via unstructured-client
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.12.8
+    # via langchain-community
+aiosignal==1.3.2
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.9.0
+    # via httpx
+asyncpg==0.30.0
+    # via langchain-postgres
+attrs==25.3.0
+    # via aiohttp
+azure-core==1.34.0
+    # via azure-identity
+azure-identity==1.23.0
+    # via langchain-sqlserver
+backoff==2.2.1
+    # via unstructured
 beautifulsoup4==4.13.4
-hf_xet==1.1.2
+    # via
+    #   -r requirements.in
+    #   unstructured
+certifi==2025.4.26
+    # via
+    #   elastic-transport
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.17.1
+    # via cryptography
+chardet==5.2.0
+    # via unstructured
+charset-normalizer==3.4.2
+    # via requests
+click==8.2.1
+    # via
+    #   nltk
+    #   python-oxmsg
+coloredlogs==15.0.1
+    # via redisvl
+cryptography==45.0.3
+    # via
+    #   azure-identity
+    #   msal
+    #   pyjwt
+    #   unstructured-client
+dataclasses-json==0.6.7
+    # via
+    #   langchain-community
+    #   unstructured
+elastic-transport==8.17.1
+    # via elasticsearch
+elasticsearch[vectorstore-mmr]==8.18.1
+    # via langchain-elasticsearch
+emoji==2.14.1
+    # via unstructured
+filelock==3.18.0
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+filetype==1.2.0
+    # via unstructured
+frozenlist==1.6.2
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.5.1
+    # via
+    #   huggingface-hub
+    #   torch
+greenlet==3.2.2
+    # via sqlalchemy
+grpcio==1.72.1
+    # via qdrant-client
+h11==0.16.0
+    # via httpcore
+h2==4.2.0
+    # via httpx
+hf-xet==1.1.3
+    # via
+    #   -r requirements.in
+    #   huggingface-hub
+hpack==4.1.0
+    # via h2
+html5lib==1.1
+    # via unstructured
+httpcore==1.0.9
+    # via httpx
+httpx[http2]==0.28.1
+    # via
+    #   langsmith
+    #   qdrant-client
+    #   unstructured-client
+httpx-sse==0.4.0
+    # via langchain-community
+huggingface-hub==0.32.4
+    # via
+    #   langchain-huggingface
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+humanfriendly==10.0
+    # via coloredlogs
+hyperframe==6.1.0
+    # via h2
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.6
+    # via
+    #   langchain-redis
+    #   torch
+joblib==1.5.1
+    # via
+    #   nltk
+    #   scikit-learn
+jsonpatch==1.33
+    # via langchain-core
+jsonpointer==3.0.0
+    # via jsonpatch
+langchain==0.3.25
+    # via
+    #   -r requirements.in
+    #   langchain-community
 langchain-community==0.3.24
+    # via -r requirements.in
+langchain-core==0.3.63
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-elasticsearch
+    #   langchain-huggingface
+    #   langchain-postgres
+    #   langchain-qdrant
+    #   langchain-redis
+    #   langchain-sqlserver
+    #   langchain-text-splitters
 langchain-elasticsearch==0.3.2
+    # via -r requirements.in
 langchain-huggingface==0.2.0
+    # via -r requirements.in
 langchain-postgres==0.0.14
+    # via -r requirements.in
 langchain-qdrant==0.2.0
+    # via -r requirements.in
+langchain-redis==0.2.0
+    # via -r requirements.in
 langchain-sqlserver==0.1.2
-langchain==0.3.25
-psycopg[binary]==3.2.9
-psycopg2-binary==2.9.10
+    # via -r requirements.in
+langchain-text-splitters==0.3.8
+    # via langchain
+langdetect==1.0.9
+    # via unstructured
+langsmith==0.3.44
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+lxml==5.4.0
+    # via unstructured
+markdown==3.8
+    # via unstructured
+markupsafe==3.0.2
+    # via jinja2
+marshmallow==3.26.1
+    # via dataclasses-json
+ml-dtypes==0.4.1
+    # via redisvl
+mpmath==1.3.0
+    # via sympy
+msal==1.32.3
+    # via
+    #   azure-identity
+    #   msal-extensions
+msal-extensions==1.3.1
+    # via azure-identity
+multidict==6.4.4
+    # via
+    #   aiohttp
+    #   yarl
+mypy-extensions==1.1.0
+    # via typing-inspect
+nest-asyncio==1.6.0
+    # via unstructured-client
+networkx==3.5
+    # via torch
+nltk==3.9.1
+    # via unstructured
+numpy==1.26.4
+    # via
+    #   elasticsearch
+    #   langchain-community
+    #   langchain-postgres
+    #   langchain-redis
+    #   langchain-sqlserver
+    #   ml-dtypes
+    #   pgvector
+    #   qdrant-client
+    #   redisvl
+    #   scikit-learn
+    #   scipy
+    #   transformers
+    #   unstructured
+nvidia-cublas-cu12==12.6.4.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.6.80
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.6.77
+    # via torch
+nvidia-cuda-runtime-cu12==12.6.77
+    # via torch
+nvidia-cudnn-cu12==9.5.1.17
+    # via torch
+nvidia-cufft-cu12==11.3.0.4
+    # via torch
+nvidia-cufile-cu12==1.11.1.6
+    # via torch
+nvidia-curand-cu12==10.3.7.77
+    # via torch
+nvidia-cusolver-cu12==11.7.1.2
+    # via torch
+nvidia-cusparse-cu12==12.5.4.2
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.6.3
+    # via torch
+nvidia-nccl-cu12==2.26.2
+    # via torch
+nvidia-nvjitlink-cu12==12.6.85
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvtx-cu12==12.6.77
+    # via torch
+olefile==0.47
+    # via python-oxmsg
+orjson==3.10.18
+    # via langsmith
+packaging==24.2
+    # via
+    #   huggingface-hub
+    #   langchain-core
+    #   langsmith
+    #   marshmallow
+    #   transformers
+pgvector==0.3.6
+    # via langchain-postgres
+pillow==11.2.1
+    # via sentence-transformers
+portalocker==2.10.1
+    # via qdrant-client
+propcache==0.3.1
+    # via
+    #   aiohttp
+    #   yarl
+protobuf==6.31.1
+    # via qdrant-client
+psutil==7.0.0
+    # via unstructured
+psycopg==3.2.9
+    # via langchain-postgres
+psycopg-binary==3.2.9
+    # via -r requirements.in
+psycopg-pool==3.2.6
+    # via langchain-postgres
+pycparser==2.22
+    # via cffi
+pydantic==2.11.5
+    # via
+    #   langchain
+    #   langchain-core
+    #   langchain-qdrant
+    #   langsmith
+    #   pydantic-settings
+    #   qdrant-client
+    #   redisvl
+    #   unstructured-client
+pydantic-core==2.33.2
+    # via pydantic
+pydantic-settings==2.9.1
+    # via langchain-community
+pyjwt[crypto]==2.10.1
+    # via
+    #   msal
+    #   pyjwt
 pyodbc==5.2.0
+    # via
+    #   -r requirements.in
+    #   langchain-sqlserver
 pypdf==5.6.0
+    # via
+    #   -r requirements.in
+    #   unstructured-client
+python-dateutil==2.9.0.post0
+    # via elasticsearch
 python-dotenv==1.1.0
+    # via
+    #   -r requirements.in
+    #   pydantic-settings
+python-iso639==2025.2.18
+    # via unstructured
+python-magic==0.4.27
+    # via unstructured
+python-oxmsg==0.0.2
+    # via unstructured
+python-ulid==3.0.0
+    # via
+    #   langchain-redis
+    #   redisvl
+pyyaml==6.0.2
+    # via
+    #   huggingface-hub
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+    #   redisvl
+    #   transformers
 qdrant-client==1.14.2
+    # via
+    #   -r requirements.in
+    #   langchain-qdrant
+rapidfuzz==3.13.0
+    # via unstructured
 redis==5.2.1
+    # via redisvl
+redisvl==0.4.1
+    # via langchain-redis
+regex==2024.11.6
+    # via
+    #   nltk
+    #   transformers
+requests==2.32.3
+    # via
+    #   azure-core
+    #   huggingface-hub
+    #   langchain
+    #   langchain-community
+    #   langsmith
+    #   msal
+    #   requests-toolbelt
+    #   transformers
+    #   unstructured
+requests-toolbelt==1.0.0
+    # via
+    #   langsmith
+    #   unstructured-client
+safetensors==0.5.3
+    # via transformers
+scikit-learn==1.6.1
+    # via sentence-transformers
+scipy==1.15.3
+    # via
+    #   scikit-learn
+    #   sentence-transformers
 sentence-transformers==4.1.0
+    # via
+    #   -r requirements.in
+    #   langchain-huggingface
+simsimd==6.4.7
+    # via elasticsearch
+six==1.17.0
+    # via
+    #   azure-core
+    #   html5lib
+    #   langdetect
+    #   python-dateutil
+sniffio==1.3.1
+    # via anyio
+soupsieve==2.7
+    # via beautifulsoup4
+sqlalchemy==2.0.41
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-postgres
+    #   langchain-sqlserver
+sympy==1.14.0
+    # via torch
+tabulate==0.9.0
+    # via redisvl
+tenacity==9.1.2
+    # via
+    #   langchain-community
+    #   langchain-core
+    #   redisvl
+threadpoolctl==3.6.0
+    # via scikit-learn
+tokenizers==0.21.1
+    # via
+    #   langchain-huggingface
+    #   transformers
+torch==2.7.0
+    # via sentence-transformers
+tqdm==4.67.1
+    # via
+    #   huggingface-hub
+    #   nltk
+    #   sentence-transformers
+    #   transformers
+    #   unstructured
+transformers==4.52.4
+    # via
+    #   langchain-huggingface
+    #   sentence-transformers
+triton==3.3.0
+    # via torch
+typing-extensions==4.14.0
+    # via
+    #   anyio
+    #   azure-core
+    #   azure-identity
+    #   beautifulsoup4
+    #   elasticsearch
+    #   huggingface-hub
+    #   langchain-core
+    #   psycopg
+    #   psycopg-pool
+    #   pydantic
+    #   pydantic-core
+    #   python-oxmsg
+    #   sentence-transformers
+    #   sqlalchemy
+    #   torch
+    #   typing-inspect
+    #   typing-inspection
+    #   unstructured
+typing-inspect==0.9.0
+    # via dataclasses-json
+typing-inspection==0.4.1
+    # via
+    #   pydantic
+    #   pydantic-settings
 unstructured[md]==0.17.2
+    # via -r requirements.in
+unstructured-client==0.36.0
+    # via unstructured
+urllib3==2.4.0
+    # via
+    #   elastic-transport
+    #   qdrant-client
+    #   requests
+webencodings==0.5.1
+    # via html5lib
+wrapt==1.17.2
+    # via unstructured
+yarl==1.20.0
+    # via aiohttp
+zstandard==0.23.0
+    # via langsmith
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
diff --git a/vector_db/pgvector_provider.py b/vector_db/pgvector_provider.py
index cc3bbe3..3a026f2 100644
--- a/vector_db/pgvector_provider.py
+++ b/vector_db/pgvector_provider.py
@@ -3,7 +3,7 @@
 from urllib.parse import urlparse
 
 from langchain_core.documents import Document
-from langchain_postgres import PGVector
+from langchain_postgres import PGEngine, PGVectorStore
 
 from vector_db.db_provider import DBProvider
 
@@ -26,18 +26,26 @@ class PGVectorProvider(DBProvider):
         embedding_model (str): The model name to use for computing embeddings.
         url (str): PostgreSQL connection string (e.g. "postgresql://user:pass@host:5432/db").
         collection_name (str): Name of the table/collection used for storing vectors.
+        embedding_length (int): Dimensionality of the embeddings (e.g., 768 for all-mpnet-base-v2).
 
     Example:
         >>> from vector_db.pgvector_provider import PGVectorProvider
         >>> provider = PGVectorProvider(
         ...     embedding_model="BAAI/bge-base-en-v1.5",
         ...     url="postgresql://user:pass@localhost:5432/vector_db",
-        ...     collection_name="rag_chunks"
+        ...     collection_name="rag_chunks",
+        ...     embedding_length=768
         ... )
         >>> provider.add_documents(docs)
     """
 
-    def __init__(self, embedding_model: str, url: str, collection_name: str):
+    def __init__(
+        self,
+        embedding_model: str,
+        url: str,
+        collection_name: str,
+        embedding_length: int,
+    ):
         """
         Initialize a PGVectorProvider for use with PostgreSQL.
 
@@ -48,11 +56,10 @@ def __init__(self, embedding_model: str, url: str, collection_name: str):
         """
         super().__init__(embedding_model)
 
-        self.db = PGVector(
-            connection=url,
-            collection_name=collection_name,
-            embeddings=self.embeddings,
-        )
+        engine = PGEngine.from_connection_string(url)
+        engine.init_vectorstore_table(collection_name, embedding_length)
+
+        self.db = PGVectorStore.create_sync(engine, self.embeddings, collection_name)
 
         parsed = urlparse(url)
         postgres_location = (
diff --git a/vector_db/redis_provider.py b/vector_db/redis_provider.py
index 6849ad9..c164137 100644
--- a/vector_db/redis_provider.py
+++ b/vector_db/redis_provider.py
@@ -1,9 +1,8 @@
 import logging
-from typing import List, Optional
+from typing import List
 
-import redis
-from langchain_community.vectorstores.redis import Redis as RedisVectorStore
 from langchain_core.documents import Document
+from langchain_redis import RedisVectorStore
 
 from vector_db.db_provider import DBProvider
 
@@ -14,32 +13,25 @@ class RedisProvider(DBProvider):
     """
     Redis-backed vector DB provider using RediSearch and LangChain's Redis integration.
 
-    This provider connects to a Redis instance, checks if the specified index exists,
-    and either loads from it or creates a new index on first insert. Vectors are stored
-    using the RediSearch module with configurable schema.
-
     Attributes:
-        redis_client (redis.Redis): Raw Redis client for low-level access.
-        db (Optional[RedisVectorStore]): LangChain vector store, lazily created on first add.
+        db (RedisVectorStore): LangChain vector store.
 
     Args:
         embedding_model (str): Name of the embedding model to use for text chunks.
         url (str): Redis connection string (e.g., "redis://localhost:6379").
         index (str): RediSearch index name to use for vector storage.
-        schema (str): Path to schema file where the RediSearch index definition is written.
 
     Example:
         >>> from vector_db.redis_provider import RedisProvider
        >>> provider = RedisProvider(
         ...     embedding_model="BAAI/bge-large-en-v1.5",
         ...     url="redis://localhost:6379",
-        ...     index="validated_docs",
-        ...     schema="redis_schema.yaml"
+        ...     index="validated_docs"
         ... )
         >>> provider.add_documents(docs)
     """
 
-    def __init__(self, embedding_model: str, url: str, index: str, schema: str):
+    def __init__(self, embedding_model: str, url: str, index: str):
         """
         Initialize a Redis-backed vector store provider.
 
@@ -47,48 +39,18 @@
             embedding_model (str): HuggingFace model for embeddings.
             url (str): Redis connection string.
             index (str): Name of the RediSearch index to use.
-            schema (str): Path to write RediSearch schema YAML (used on creation).
         """
         super().__init__(embedding_model)
-        self.url = url
-        self.index = index
-        self.schema = schema
-        self.db: Optional[RedisVectorStore] = None
-
-        try:
-            self.redis_client = redis.from_url(self.url)
-            self.redis_client.ping()
-            logger.info("Connected to Redis instance at %s", self.url)
-        except Exception:
-            logger.exception("Failed to connect to Redis at %s", self.url)
-            raise
 
-        if self._index_exists():
-            logger.info("Using existing Redis index: %s", self.index)
-            self.db = RedisVectorStore.from_existing_index(
-                embedding=self.embeddings,
-                redis_url=self.url,
-                index_name=self.index,
-                schema=self.schema,
-            )
-        else:
-            logger.info(
-                "Redis index %s does not exist. Will create on first add_documents call.",
-                self.index,
-            )
+        self.db = RedisVectorStore(
+            index_name=index, embeddings=self.embeddings, redis_url=url
+        )
 
-    def _index_exists(self) -> bool:
-        """
-        Check whether the Redis index already exists.
-
-        Returns:
-            bool: True if the index exists, False otherwise.
-        """
-        try:
-            self.redis_client.ft(self.index).info()
-            return True
-        except Exception:
-            return False
+        logger.info(
+            "Connected to Redis at %s (index: %s)",
+            url,
+            index,
+        )
 
     def add_documents(self, docs: List[Document]) -> None:
         """
@@ -97,15 +59,4 @@ def add_documents(self, docs: List[Document]) -> None:
         Args:
             docs (List[Document]): LangChain document chunks to embed and store.
         """
-        if self.db is None:
-            logger.info("Creating new Redis index: %s", self.index)
-            self.db = RedisVectorStore.from_documents(
-                documents=docs,
-                embedding=self.embeddings,
-                redis_url=self.url,
-                index_name=self.index,
-            )
-            logger.info("Writing Redis schema to file: %s", self.schema)
-            self.db.write_schema(self.schema)
-        else:
-            self.db.add_documents(docs)
+        self.db.add_documents(docs)
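
Reviewer note (not part of the patch): the `loaders/git.py` change rewrites each chunk's `source` metadata from a local clone path to `<repo_url>@<path-in-repo>`. A standalone sketch of that rewrite, using hypothetical repo URL and paths:

```python
from pathlib import Path

# Hypothetical values; in the loader these come from the clone step.
repo_url = "https://github.com/example/docs.git"
repo_path = Path("/tmp/vector-embedder/docs-clone")

local_src = repo_path / "guides" / "setup.md"
try:
    rel_path = local_src.relative_to(repo_path)
except ValueError:  # source outside the checkout; keep it as-is
    rel_path = local_src

print(f"{repo_url}@{rel_path.as_posix()}")
# https://github.com/example/docs.git@guides/setup.md
```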
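
Reviewer note (not part of the patch): a minimal smoke-test sketch for the two reworked providers, assembled from the docstring examples above. The model name, connection strings, and sample document are placeholders, and it assumes local Redis and PostgreSQL instances matching the `.env` defaults:

```python
from langchain_core.documents import Document

from vector_db.pgvector_provider import PGVectorProvider
from vector_db.redis_provider import RedisProvider

docs = [Document(page_content="hello world", metadata={"source": "test.md"})]

# Redis: no schema file any more; the langchain-redis store creates the index itself.
redis_provider = RedisProvider(
    embedding_model="BAAI/bge-base-en-v1.5",
    url="redis://localhost:6379",
    index="docs",
)
redis_provider.add_documents(docs)

# PGVector: the table is initialized up front with the embedding dimensionality.
pg_provider = PGVectorProvider(
    embedding_model="BAAI/bge-base-en-v1.5",
    url="postgresql+psycopg://user:pass@localhost:5432/mydb",
    collection_name="documents",
    embedding_length=768,
)
pg_provider.add_documents(docs)
```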