From d3c58b5e2dc938517d404c5c9d6c2653fdee667c Mon Sep 17 00:00:00 2001 From: Drew Minnear Date: Mon, 22 Sep 2025 16:05:12 -0400 Subject: [PATCH] update ci to use super-linter and push to both quay-repos --- .env | 16 +-- .github/linters/.hadolint.yaml | 9 ++ .github/linters/.isort.cfg | 2 + .github/linters/.jscpd.json | 3 + .github/linters/.python-lint | 5 + .github/workflows/ci-pipeline.yaml | 188 +++++++++++++++++----------- .github/workflows/super-linter.yaml | 16 +++ Containerfile | 18 +-- Makefile | 32 +++++ README.md | 23 ++-- config.py | 74 +++++------ embed_documents.py | 152 +++++++++++----------- loaders/__init__.py | 1 + loaders/git.py | 2 + loaders/pdf.py | 2 + loaders/text.py | 96 ++++++++------ loaders/web.py | 2 + requirements.txt | 10 +- vector_db/__init__.py | 1 + vector_db/db_provider.py | 3 +- vector_db/dryrun_provider.py | 15 +-- vector_db/elastic_provider.py | 4 +- vector_db/mssql_provider.py | 8 +- vector_db/pgvector_provider.py | 2 + vector_db/qdrant_provider.py | 2 + vector_db/redis_provider.py | 2 + 26 files changed, 418 insertions(+), 270 deletions(-) create mode 100644 .github/linters/.hadolint.yaml create mode 100644 .github/linters/.isort.cfg create mode 100644 .github/linters/.jscpd.json create mode 100644 .github/linters/.python-lint create mode 100644 .github/workflows/super-linter.yaml create mode 100644 Makefile create mode 100644 loaders/__init__.py create mode 100644 vector_db/__init__.py diff --git a/.env b/.env index 34c49fe..e547580 100644 --- a/.env +++ b/.env @@ -5,35 +5,35 @@ TEMP_DIR=/tmp LOG_LEVEL=info # === Git Repo Document Sources === -REPO_SOURCES=[{"repo": "https://github.com/RHEcosystemAppEng/llm-on-openshift.git", "globs": ["examples/notebooks/langchain/rhods-doc/*.pdf"]}] +REPO_SOURCES='[{"repo": "https://github.com/RHEcosystemAppEng/llm-on-openshift.git", "globs": ["examples/notebooks/langchain/rhods-doc/*.pdf"]}]' # === Web Document Sources === -WEB_SOURCES=["https://ai-on-openshift.io/getting-started/openshift/", "https://ai-on-openshift.io/getting-started/opendatahub/", "https://ai-on-openshift.io/getting-started/openshift-ai/", "https://ai-on-openshift.io/odh-rhoai/configuration/", "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/", "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/", "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/", "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/", "https://ai-on-openshift.io/tools-and-applications/minio/minio/"] +WEB_SOURCES='["https://ai-on-openshift.io/getting-started/openshift/", "https://ai-on-openshift.io/getting-started/opendatahub/", "https://ai-on-openshift.io/getting-started/openshift-ai/", "https://ai-on-openshift.io/odh-rhoai/configuration/", "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/", "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/", "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/", "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/", "https://ai-on-openshift.io/tools-and-applications/minio/minio/"]' # === General Embedding Config === -CHUNK_SIZE=1024 CHUNK_OVERLAP=40 +CHUNK_SIZE=1024 DB_TYPE=DRYRUN EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2 # === Redis === -REDIS_URL=redis://localhost:6379 REDIS_INDEX=docs +REDIS_URL=redis://localhost:6379 # === Elasticsearch === -ELASTIC_URL=http://localhost:9200 ELASTIC_INDEX=docs -ELASTIC_USER=elastic ELASTIC_PASSWORD=changeme +ELASTIC_URL=http://localhost:9200 +ELASTIC_USER=elastic # === PGVector === 
-PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb PGVECTOR_COLLECTION_NAME=documents +PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb # === SQL Server === MSSQL_CONNECTION_STRING="Driver={ODBC Driver 18 for SQL Server}; Server=localhost,1433; Database=embeddings; UID=sa; PWD=StrongPassword!; TrustServerCertificate=yes; Encrypt=no;" MSSQL_TABLE=docs # === Qdrant === -QDRANT_URL=http://localhost:6333 QDRANT_COLLECTION=embedded_docs +QDRANT_URL=http://localhost:6333 diff --git a/.github/linters/.hadolint.yaml b/.github/linters/.hadolint.yaml new file mode 100644 index 0000000..e5d771d --- /dev/null +++ b/.github/linters/.hadolint.yaml @@ -0,0 +1,9 @@ +ignored: + # Always tag the version of an image explicitly + - DL3006 + # Using latest is prone to errors if the image will ever update. Pin the version explicitly to a release tag + - DL3007 + # Specify version with `dnf install -y -`. + - DL3041 + # Pin versions in pip. Instead of `pip install ` use `pip install ==` or `pip install --requirement ` + - DL3013 diff --git a/.github/linters/.isort.cfg b/.github/linters/.isort.cfg new file mode 100644 index 0000000..57fae97 --- /dev/null +++ b/.github/linters/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = "black" diff --git a/.github/linters/.jscpd.json b/.github/linters/.jscpd.json new file mode 100644 index 0000000..27a038a --- /dev/null +++ b/.github/linters/.jscpd.json @@ -0,0 +1,3 @@ +{ + "ignore": ["**/.github/**"] +} diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint new file mode 100644 index 0000000..090a4fa --- /dev/null +++ b/.github/linters/.python-lint @@ -0,0 +1,5 @@ +[MESSAGES CONTROL] +disable= + too-few-public-methods, + broad-exception-caught, + import-error diff --git a/.github/workflows/ci-pipeline.yaml b/.github/workflows/ci-pipeline.yaml index 0d193bb..bf8c66e 100644 --- a/.github/workflows/ci-pipeline.yaml +++ b/.github/workflows/ci-pipeline.yaml @@ -1,78 +1,81 @@ -name: CI Pipeline +name: Build and push to quay on: pull_request: + branches: [main] push: branches: [main] tags: - - "v*" + - "v*.*.*" -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: pip install black isort ruff - - run: black --check . - - run: isort --check-only . - - run: ruff check . +permissions: + contents: read + +env: + REGISTRY: localhost + NAME: vector-embedder + TAG: ${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || (github.ref_name == 'main' && 'latest' || github.ref_name) }} - build: +jobs: + build-container: runs-on: ubuntu-latest - needs: lint - outputs: - image_tag: ${{ steps.meta.outputs.sha_tag }} - steps: - - uses: actions/checkout@v4 - - name: Generate tag - id: meta - run: echo "sha_tag=sha-${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + permissions: + contents: read - - name: Build Docker image - uses: docker/build-push-action@v5 + steps: + - name: Checkout code + uses: actions/checkout@v5 with: - context: . 
- file: ./Containerfile - load: true - tags: test-image:${{ steps.meta.outputs.sha_tag }} + fetch-depth: 0 + persist-credentials: false - - name: Save image as artifact - run: docker save test-image:${{ steps.meta.outputs.sha_tag }} -o image.tar + - name: Build container and push to local registry + env: + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + run: | + make build + podman push "${CONTAINER}" "docker-archive:/tmp/image.tar:${CONTAINER}" - name: Upload image artifact uses: actions/upload-artifact@v4 with: - name: test-image - path: image.tar + name: image-${{ github.run_id }} + path: /tmp/image.tar + retention-days: 1 test: - needs: [lint, build] - runs-on: ubuntu-latest + needs: [build-container] + if: github.event_name == 'pull_request' strategy: fail-fast: false matrix: db: [pgvector, redis, elastic, qdrant, mssql] + runs-on: ubuntu-latest + permissions: + contents: read + steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v5 + with: + fetch-depth: 0 + persist-credentials: false - - name: Download image artifact - uses: actions/download-artifact@v4 + - name: Download image + uses: actions/download-artifact@v5 with: - name: test-image - path: . + name: image-${{ github.run_id }} + path: /tmp - - name: Load Docker image - run: docker load -i image.tar + - name: Load image into local containers-storage + run: podman pull docker-archive:/tmp/image.tar - name: Start MSSQL if: matrix.db == 'mssql' run: | - docker run -d --name mssql-vector-test \ + podman run -d --name mssql-vector-test \ -e "ACCEPT_EULA=Y" \ -e "SA_PASSWORD=StrongPassword!" \ -p 1433:1433 \ @@ -81,7 +84,7 @@ jobs: - name: Start PGVector if: matrix.db == 'pgvector' run: | - docker run -d --name pgvector-test \ + podman run -d --name pgvector-test \ -e POSTGRES_USER=user \ -e POSTGRES_PASSWORD=pass \ -e POSTGRES_DB=mydb \ @@ -91,14 +94,14 @@ jobs: - name: Start Redis if: matrix.db == 'redis' run: | - docker run -d --name redis-test \ + podman run -d --name redis-test \ -p 6379:6379 \ redis/redis-stack-server:6.2.6-v19 - name: Start Elasticsearch if: matrix.db == 'elastic' run: | - docker run -d --name es-test \ + podman run -d --name es-test \ -e "discovery.type=single-node" \ -e "xpack.security.enabled=true" \ -e "ELASTIC_PASSWORD=changeme" \ @@ -109,7 +112,7 @@ jobs: - name: Start Qdrant if: matrix.db == 'qdrant' run: | - docker run -d --name qdrant-test \ + podman run -d --name qdrant-test \ -p 6333:6333 \ qdrant/qdrant @@ -117,45 +120,78 @@ jobs: run: sleep 30 - name: Run embed job + env: + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + DB_TYPE: ${{ matrix.db }} run: | - docker run --rm --network host \ + podman run --rm --network host \ -e LOG_LEVEL=debug \ - -e DB_TYPE=${{ matrix.db }} \ - test-image:${{ needs.build.outputs.image_tag }} + -e DB_TYPE="${DB_TYPE}" \ + "${REGISTRY}/${CONTAINER}" + + push-container: + needs: [build-container] + if: github.event_name != 'pull_request' + strategy: + matrix: + include: + - upload_registry: quay.io/validatedpatterns + legacy: false + - upload_registry: quay.io/hybridcloudpatterns + legacy: true - release: - if: (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) && github.event.repository.fork == false runs-on: ubuntu-latest - needs: [lint, build, test] - steps: - - uses: actions/checkout@v4 + permissions: + contents: read + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. 
+ id-token: write - - name: Log in to Quay.io - uses: docker/login-action@v3 + steps: + - name: Checkout code + uses: actions/checkout@v5 with: - registry: quay.io - username: ${{ secrets.QUAY_USERNAME }} - password: ${{ secrets.QUAY_PASSWORD }} + fetch-depth: 0 + persist-credentials: false - - name: Download image artifact - uses: actions/download-artifact@v4 + - name: Download image + uses: actions/download-artifact@v5 with: - name: test-image - path: . + name: image-${{ github.run_id }} + path: /tmp - - name: Load Docker image - run: docker load -i image.tar + - name: Load image into local containers-storage + run: podman pull docker-archive:/tmp/image.tar - - name: Tag and push image + - name: Log into Quay + env: + USERNAME: ${{ matrix.legacy && secrets.LEGACY_QUAY_USERNAME || secrets.QUAY_USERNAME }} + PASSWORD: ${{ matrix.legacy && secrets.LEGACY_QUAY_PASSWORD || secrets.QUAY_PASSWORD }} run: | - docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:${{ needs.build.outputs.image_tag }} + podman login -u "${USERNAME}" -p "${PASSWORD}" quay.io - if [[ $GITHUB_REF == refs/tags/* ]]; then - docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:${GITHUB_REF#refs/tags/} - docker push quay.io/hybridcloudpatterns/vector-embedder:${GITHUB_REF#refs/tags/} - elif [[ $GITHUB_REF == refs/heads/main ]]; then - docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:latest - docker push quay.io/hybridcloudpatterns/vector-embedder:latest - fi + - name: Push image to Quay + id: image-push + env: + UPLOADREGISTRY: ${{ matrix.upload_registry }} + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + run: | + make upload + DIGEST=$(skopeo inspect --format "{{.Digest}}" "docker://${UPLOADREGISTRY}/${CONTAINER}") + echo "digest=$DIGEST" >> "$GITHUB_OUTPUT" - docker push quay.io/hybridcloudpatterns/vector-embedder:${{ needs.build.outputs.image_tag }} + - name: Install cosign + uses: sigstore/cosign-installer@d58896d6a1865668819e1d91763c7751a165e159 # v3.9.2 + with: + cosign-release: "v2.2.4" + + # Cosign expects the docker config.json for registry authentication so we must + # copy it from buildah + - name: Sign the published Docker image + env: + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + DIGEST: ${{ steps.image-push.outputs.digest }} + UPLOADREGISTRY: ${{ matrix.upload_registry }} + run: | + cat "${XDG_RUNTIME_DIR}/containers/auth.json" > ~/.docker/config.json + cosign sign --yes "${UPLOADREGISTRY}/${CONTAINER}@${DIGEST}" diff --git a/.github/workflows/super-linter.yaml b/.github/workflows/super-linter.yaml new file mode 100644 index 0000000..79b7ff9 --- /dev/null +++ b/.github/workflows/super-linter.yaml @@ -0,0 +1,16 @@ +name: Super linter + +on: + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + lint: + uses: validatedpatterns/github-actions-library/.github/workflows/superlinter.yml@v1 + with: + sl_env: | + VALIDATE_TRIVY=false + VALIDATE_PYTHON_ISORT=false diff --git a/Containerfile b/Containerfile index 1f30f86..7c9fc88 100644 --- a/Containerfile +++ b/Containerfile @@ -1,18 +1,22 @@ -FROM registry.access.redhat.com/ubi9/python-312:9.5 +FROM registry.access.redhat.com/ubi10/python-312-minimal:10.0 USER root + WORKDIR /app -RUN dnf install -y \ +RUN microdnf install -y git \ unixODBC \ unixODBC-devel && \ curl -sSL https://packages.microsoft.com/config/rhel/9/prod.repo -o /etc/yum.repos.d/mssql-release.repo && \ - ACCEPT_EULA=Y dnf 
install -y msodbcsql18 && \ - dnf clean all + ACCEPT_EULA=Y microdnf install -y msodbcsql18 && \ + microdnf clean all COPY requirements.txt . -RUN pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip install --no-cache-dir --upgrade pip && \ + pip install \ + --no-cache-dir \ + --compile \ + -r requirements.txt COPY vector_db ./vector_db COPY loaders ./loaders @@ -24,4 +28,4 @@ RUN chown -R 1001:0 . USER 1001 -CMD ./embed_documents.py +CMD ["python", "./embed_documents.py"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3a0508a --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +NAME ?= vector-embedder +TAG ?= latest +CONTAINER ?= $(NAME):$(TAG) +REGISTRY ?= localhost +UPLOADREGISTRY ?= quay.io/validatedpatterns + +##@ Pattern Must Gather Tasks + +.PHONY: help +help: ## This help message + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^(\s|[a-zA-Z_0-9-])+:.*?##/ { printf " \033[36m%-35s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +.PHONY: build +build: podman-build + +.PHONY: podman-build +podman-build: ## build container + podman build -t ${REGISTRY}/${CONTAINER} . + +.PHONY: upload +upload: ## push container + podman tag ${REGISTRY}/${CONTAINER} ${UPLOADREGISTRY}/${CONTAINER} + podman push ${UPLOADREGISTRY}/${CONTAINER} + +.PHONY: super-linter +super-linter: ## Runs super linter locally + rm -rf .mypy_cache + podman run -e RUN_LOCAL=true -e USE_FIND_ALGORITHM=true \ + $(DISABLE_LINTERS) \ + -v $(PWD):/tmp/lint:rw,z \ + -w /tmp/lint \ + ghcr.io/super-linter/super-linter:slim-v8 diff --git a/README.md b/README.md index 6c44dba..c0966c0 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # πŸ“š vector-embedder -[![Quay Repository](https://img.shields.io/badge/Quay.io-vector--embedder-blue?logo=quay)](https://quay.io/repository/hybridcloudpatterns/vector-embedder) -[![CI](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml/badge.svg?branch=main)](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml) - +![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) +[![Quay Repository](https://img.shields.io/badge/Quay.io-vector--embedder-blue?logo=quay)](https://quay.io/repository/validatedpatterns/vector-embedder) +[![CI Pipeline](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml/badge.svg?branch=main)](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml) **vector-embedder** is a flexible, language-agnostic document ingestion and embedding pipeline. It transforms structured and unstructured content from multiple sources into vector embeddings and stores them in your vector database of choice. 
@@ -122,18 +122,18 @@ Run it: ## πŸ“¦ Dependency Management & Updates -This project keeps *two* dependency files under version control: +This project keeps _two_ dependency files under version control: -| File | Purpose | Edited by | -|------|---------|-----------| -| **`requirements.in`** | Short, human-readable list of *top-level* libraries (no pins) | You | +| File | Purpose | Edited by | +| ---------------------- | ------------------------------------------------------------------------------------ | ------------- | +| **`requirements.in`** | Short, human-readable list of _top-level_ libraries (no pins) | You | | **`requirements.txt`** | Fully-resolved, **pinned** lock fileβ€”including hashesβ€”for exact, reproducible builds | `pip-compile` | ### πŸ”§ Installing `pip-tools` ```bash python -m pip install --upgrade pip-tools -```` +``` ### βž• Adding / Updating a Package @@ -144,11 +144,13 @@ python -m pip install --upgrade pip-tools + sentence-transformers>=4.1 + llama-index ``` + 2. **Re-lock** the environment ```bash pip-compile --upgrade ``` + 3. **Synchronise** your virtual-env ```bash @@ -159,7 +161,7 @@ python -m pip install --upgrade pip-tools ## πŸ—‚οΈ Project Layout -``` +```text . β”œβ”€β”€ embed_documents.py # Main entrypoint script β”œβ”€β”€ config.py # Config loader from env @@ -245,7 +247,6 @@ DB_TYPE=QDRANT ./embed_documents.py ### SQL Server (MSSQL) - ```bash podman run --rm -d \ --name mssql \ @@ -253,7 +254,7 @@ podman run --rm -d \ -e SA_PASSWORD=StrongPassword! \ -p 1433:1433 \ mcr.microsoft.com/mssql/rhel/server:2025-latest -```` +``` ```bash DB_TYPE=MSSQL ./embed_documents.py diff --git a/config.py b/config.py index 4ccf99f..e226ecf 100644 --- a/config.py +++ b/config.py @@ -1,3 +1,5 @@ +"""Configuration management for vector database embedder application.""" + import json import logging import os @@ -31,7 +33,6 @@ class Config: web_sources (List[str]): List of web URLs to scrape and embed. repo_sources (List[Dict]): Repositories and glob patterns for file discovery. temp_dir (str): Path to a temporary working directory. - log_level (int): Log verbosity level. Example: >>> config = Config.load() @@ -45,7 +46,6 @@ class Config: web_sources: List[str] repo_sources: List[Dict] temp_dir: str - log_level: int @staticmethod def _get_required_env_var(key: str) -> str: @@ -89,7 +89,8 @@ def _parse_log_level(log_level_name: str) -> int: } if log_level_name not in log_levels: raise ValueError( - f"Invalid LOG_LEVEL: '{log_level_name}'. Must be one of: {', '.join(log_levels.keys())}" + f"Invalid LOG_LEVEL: '{log_level_name}'. 
" + f"Must be one of: {', '.join(log_levels.keys())}" ) return log_levels[log_level_name] @@ -111,37 +112,33 @@ def _init_db_provider(db_type: str) -> DBProvider: db_type = db_type.upper() embeddings = HuggingFaceEmbeddings(model_name=get("EMBEDDING_MODEL")) - if db_type == "REDIS": - url = get("REDIS_URL") - index = os.getenv("REDIS_INDEX", "docs") - return RedisProvider(embeddings, url, index) - - elif db_type == "ELASTIC": - url = get("ELASTIC_URL") - password = get("ELASTIC_PASSWORD") - index = os.getenv("ELASTIC_INDEX", "docs") - user = os.getenv("ELASTIC_USER", "elastic") - return ElasticProvider(embeddings, url, password, index, user) - - elif db_type == "PGVECTOR": - url = get("PGVECTOR_URL") - collection = get("PGVECTOR_COLLECTION_NAME") - return PGVectorProvider(embeddings, url, collection) - - elif db_type == "MSSQL": - connection_string = get("MSSQL_CONNECTION_STRING") - table = get("MSSQL_TABLE") - return MSSQLProvider(embeddings, connection_string, table) - - elif db_type == "QDRANT": - url = get("QDRANT_URL") - collection = get("QDRANT_COLLECTION") - return QdrantProvider(embeddings, url, collection) - - elif db_type == "DRYRUN": - return DryRunProvider(embeddings) - - raise ValueError(f"Unsupported DB_TYPE '{db_type}'") + match db_type: + case "REDIS": + url = get("REDIS_URL") + index = os.getenv("REDIS_INDEX", "docs") + return RedisProvider(embeddings, url, index) + case "ELASTIC": + url = get("ELASTIC_URL") + password = get("ELASTIC_PASSWORD") + index = os.getenv("ELASTIC_INDEX", "docs") + user = os.getenv("ELASTIC_USER", "elastic") + return ElasticProvider(embeddings, url, password, index, user) + case "PGVECTOR": + url = get("PGVECTOR_URL") + collection = get("PGVECTOR_COLLECTION_NAME") + return PGVectorProvider(embeddings, url, collection) + case "MSSQL": + connection_string = get("MSSQL_CONNECTION_STRING") + table = get("MSSQL_TABLE") + return MSSQLProvider(embeddings, connection_string, table) + case "QDRANT": + url = get("QDRANT_URL") + collection = get("QDRANT_COLLECTION") + return QdrantProvider(embeddings, url, collection) + case "DRYRUN": + return DryRunProvider(embeddings) + case _: + raise ValueError(f"Unsupported DB_TYPE '{db_type}'") @staticmethod def load() -> "Config": @@ -162,7 +159,11 @@ def load() -> "Config": # Logging setup log_level = get("LOG_LEVEL").upper() - logging.basicConfig(level=Config._parse_log_level(log_level)) + logging.basicConfig( + level=Config._parse_log_level(log_level), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) logger = logging.getLogger(__name__) logger.debug("Logging initialized at level: %s", log_level) @@ -174,7 +175,7 @@ def load() -> "Config": try: web_sources = json.loads(get("WEB_SOURCES")) except json.JSONDecodeError as e: - raise ValueError(f"WEB_SOURCES must be a valid JSON list: {e}") + raise ValueError(f"WEB_SOURCES must be a valid JSON list: {e}") from e # Git repositories and file matchers try: @@ -196,5 +197,4 @@ def load() -> "Config": web_sources=web_sources, repo_sources=repo_sources, temp_dir=temp_dir, - log_level=log_level, ) diff --git a/embed_documents.py b/embed_documents.py index b365d73..0700cc6 100755 --- a/embed_documents.py +++ b/embed_documents.py @@ -60,6 +60,83 @@ def _fail_and_exit(message: str, exc: Exception) -> None: raise exc +def _process_git_documents() -> None: + """Process Git-based document sources and add to vector DB.""" + if not config.repo_sources: + return + + logger.info("Starting Git-based document embedding...") + try: + 
git_loader = GitLoader(config) + git_chunks = git_loader.load() + + if git_chunks: + logger.info("Adding %d Git document chunks to vector DB", len(git_chunks)) + config.db_provider.add_documents(git_chunks) + else: + logger.info("No documents found in Git sources.") + except Exception as e: + _fail_and_exit("Failed during Git document processing", e) + + +def _process_html_documents(html_urls: list) -> None: + """Process HTML web documents and add to vector DB.""" + if not html_urls: + return + + logger.info("Starting HTML-based web document embedding...") + try: + web_loader = WebLoader(config) + web_chunks = web_loader.load(html_urls) + + if web_chunks: + logger.info("Adding %d HTML web chunks to vector DB", len(web_chunks)) + config.db_provider.add_documents(web_chunks) + else: + logger.info("No chunks produced from HTML URLs.") + except Exception as e: + _fail_and_exit("Failed during HTML web document processing", e) + + +def _process_pdf_documents(pdf_urls: list) -> None: + """Download and process PDF documents from web URLs and add to vector DB.""" + if not pdf_urls: + return + + logger.info("Downloading PDF documents from web URLs...") + pdf_dir = Path(config.temp_dir) / "web_pdfs" + pdf_dir.mkdir(parents=True, exist_ok=True) + + downloaded_files = [] + for url in pdf_urls: + try: + response = requests.get(url) + response.raise_for_status() + + filename = Path(url.split("/")[-1]) + file_path = pdf_dir / filename + with open(file_path, "wb") as f: + f.write(response.content) + + logger.info("Downloaded: %s", file_path) + downloaded_files.append(file_path) + except Exception as e: + _fail_and_exit(f"Failed to download {url}", e) + + if downloaded_files: + try: + pdf_loader = PDFLoader(config) + pdf_chunks = pdf_loader.load(downloaded_files) + + if pdf_chunks: + logger.info("Adding %d PDF web chunks to vector DB", len(pdf_chunks)) + config.db_provider.add_documents(pdf_chunks) + else: + logger.info("No chunks produced from downloaded PDFs.") + except Exception as e: + _fail_and_exit("Failed during PDF web document processing", e) + + def main() -> None: """ Main embedding workflow for Git, HTML, and PDF sources. @@ -72,82 +149,13 @@ def main() -> None: All errors are logged with traceback and will stop execution via `_fail_and_exit`. 
""" - # ─────────────────────────────────────────────────────────────── - # Git-based document ingestion - # ─────────────────────────────────────────────────────────────── - if config.repo_sources: - logger.info("Starting Git-based document embedding...") - try: - git_loader = GitLoader(config) - git_chunks = git_loader.load() - - if git_chunks: - logger.info( - "Adding %d Git document chunks to vector DB", len(git_chunks) - ) - config.db_provider.add_documents(git_chunks) - else: - logger.info("No documents found in Git sources.") - except Exception as e: - _fail_and_exit("Failed during Git document processing", e) + _process_git_documents() - # ─────────────────────────────────────────────────────────────── - # Web-based document ingestion - # ─────────────────────────────────────────────────────────────── pdf_urls = [url for url in config.web_sources if url.lower().endswith(".pdf")] html_urls = [url for url in config.web_sources if not url.lower().endswith(".pdf")] - # HTML documents - if html_urls: - logger.info("Starting HTML-based web document embedding...") - try: - web_loader = WebLoader(config) - web_chunks = web_loader.load(html_urls) - - if web_chunks: - logger.info("Adding %d HTML web chunks to vector DB", len(web_chunks)) - config.db_provider.add_documents(web_chunks) - else: - logger.info("No chunks produced from HTML URLs.") - except Exception as e: - _fail_and_exit("Failed during HTML web document processing", e) - - # PDF documents - if pdf_urls: - logger.info("Downloading PDF documents from web URLs...") - pdf_dir = Path(config.temp_dir) / "web_pdfs" - pdf_dir.mkdir(parents=True, exist_ok=True) - - downloaded_files = [] - for url in pdf_urls: - try: - response = requests.get(url) - response.raise_for_status() - - filename = Path(url.split("/")[-1]) - file_path = pdf_dir / filename - with open(file_path, "wb") as f: - f.write(response.content) - - logger.info("Downloaded: %s", file_path) - downloaded_files.append(file_path) - except Exception as e: - _fail_and_exit(f"Failed to download {url}", e) - - if downloaded_files: - try: - pdf_loader = PDFLoader(config) - pdf_chunks = pdf_loader.load(downloaded_files) - - if pdf_chunks: - logger.info( - "Adding %d PDF web chunks to vector DB", len(pdf_chunks) - ) - config.db_provider.add_documents(pdf_chunks) - else: - logger.info("No chunks produced from downloaded PDFs.") - except Exception as e: - _fail_and_exit("Failed during PDF web document processing", e) + _process_html_documents(html_urls) + _process_pdf_documents(pdf_urls) logger.info("Embedding job complete.") diff --git a/loaders/__init__.py b/loaders/__init__.py new file mode 100644 index 0000000..e17dead --- /dev/null +++ b/loaders/__init__.py @@ -0,0 +1 @@ +"""Document loaders package.""" diff --git a/loaders/git.py b/loaders/git.py index a1bb5ce..733228d 100644 --- a/loaders/git.py +++ b/loaders/git.py @@ -1,3 +1,5 @@ +"""Git repository document loader for cloning and processing repository contents.""" + import logging import shutil import subprocess diff --git a/loaders/pdf.py b/loaders/pdf.py index cf96424..a9c78ba 100644 --- a/loaders/pdf.py +++ b/loaders/pdf.py @@ -1,3 +1,5 @@ +"""PDF document loader for extracting and chunking text from PDF files.""" + import logging from pathlib import Path from typing import List diff --git a/loaders/text.py b/loaders/text.py index 86c8f2a..2a3958c 100644 --- a/loaders/text.py +++ b/loaders/text.py @@ -1,3 +1,5 @@ +"""Text document loader for processing various text-based file formats.""" + import logging from pathlib import 
Path from typing import List @@ -50,6 +52,56 @@ def __init__(self, config: Config): chunk_overlap=config.chunk_overlap, ) + def _process_single_file(self, path: Path) -> List[Document]: + """Process a single file and return its document chunks.""" + logger.info("Partitioning %s", path) + elements = partition(filename=str(path), strategy="fast") + + buf: List[str] = [] + buf_len, chunk_idx = 0, 0 + fname = path.name + source_str = str(path) + chunks = [] + + def _flush(): + nonlocal buf, buf_len, chunk_idx + if not buf_len: + return + chunks.append( + Document( + page_content="\n".join(buf).strip(), + metadata={ + "source": source_str, + "chunk_id": chunk_idx, + }, + ) + ) + buf, buf_len = [], 0 + chunk_idx += 1 + + for el in elements: + txt = getattr(el, "text", "").strip() + if not txt: + continue + if buf_len == 0: + buf.append(f"## {fname}\n") # inject heading + if buf_len + len(txt) > self.config.chunk_size: + _flush() + buf.append(txt) + buf_len += len(txt) + _flush() + + return chunks + + def _add_chunk_totals(self, docs: List[Document]) -> None: + """Add chunk_total metadata to all documents.""" + counts: dict[str, int] = {} + for doc in docs: + source = doc.metadata["source"] + counts[source] = counts.get(source, 0) + 1 + for doc in docs: + doc.metadata["chunk_total"] = counts[doc.metadata["source"]] + def load(self, paths: List[Path]) -> List[Document]: """ Loads and splits a list of text files into semantic chunks. @@ -76,44 +128,12 @@ def load(self, paths: List[Path]) -> List[Document]: - Each chunk begins with a lightweight heading that includes the filename to help orient the LLM when formatting prompts. """ - grouped: list[Document] = [] + grouped = [] for path in paths: try: - logger.info("Partitioning %s", path) - elements = partition(filename=str(path), strategy="fast") - - buf, buf_len, chunk_idx = [], 0, 0 - fname = Path(path).name - - def _flush(): - nonlocal buf, buf_len, chunk_idx - if not buf_len: - return - grouped.append( - Document( - page_content="\n".join(buf).strip(), - metadata={ - "source": str(path), - "chunk_id": chunk_idx, - }, - ) - ) - buf, buf_len = [], 0 - chunk_idx += 1 - - for el in elements: - txt = getattr(el, "text", "").strip() - if not txt: - continue - if buf_len == 0: - buf.append(f"## {fname}\n") # inject heading - if buf_len + len(txt) > self.config.chunk_size: - _flush() - buf.append(txt) - buf_len += len(txt) - _flush() - + chunks = self._process_single_file(path) + grouped.extend(chunks) except Exception as e: logger.warning("Failed to load %s: %s", path, e) @@ -126,11 +146,7 @@ def _flush(): final_docs.append(doc) # Add chunk_total metadata for all docs - counts: dict[str, int] = {} - for d in final_docs: - counts[d.metadata["source"]] = counts.get(d.metadata["source"], 0) + 1 - for d in final_docs: - d.metadata["chunk_total"] = counts[d.metadata["source"]] + self._add_chunk_totals(final_docs) logger.info( "Produced %d chunks (avg %.0f chars)", diff --git a/loaders/web.py b/loaders/web.py index f9d21fb..050ce21 100644 --- a/loaders/web.py +++ b/loaders/web.py @@ -1,3 +1,5 @@ +"""Web document loader for fetching and processing HTML content from URLs.""" + import logging from typing import Dict, List diff --git a/requirements.txt b/requirements.txt index 220dfb0..a6ecb1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile +# pip-compile --strip-extras # aiofiles==24.1.0 # via unstructured-client @@ -60,7 +60,7 @@ 
dataclasses-json==0.6.7 # unstructured elastic-transport==8.17.1 # via elasticsearch -elasticsearch[vectorstore-mmr]==8.18.1 +elasticsearch==8.18.1 # via langchain-elasticsearch emoji==2.14.1 # via unstructured @@ -97,7 +97,7 @@ html5lib==1.1 # via unstructured httpcore==1.0.9 # via httpx -httpx[http2]==0.28.1 +httpx==0.28.1 # via # langsmith # qdrant-client @@ -299,7 +299,7 @@ pydantic-core==2.33.2 # via pydantic pydantic-settings==2.9.1 # via langchain-community -pyjwt[crypto]==2.10.1 +pyjwt==2.10.1 # via # msal # pyjwt @@ -450,7 +450,7 @@ typing-inspection==0.4.1 # via # pydantic # pydantic-settings -unstructured[md]==0.17.2 +unstructured==0.17.2 # via -r requirements.in unstructured-client==0.36.0 # via unstructured diff --git a/vector_db/__init__.py b/vector_db/__init__.py new file mode 100644 index 0000000..0ec320c --- /dev/null +++ b/vector_db/__init__.py @@ -0,0 +1 @@ +"""Vector database providers package.""" diff --git a/vector_db/db_provider.py b/vector_db/db_provider.py index 968bd5c..e5f9f69 100644 --- a/vector_db/db_provider.py +++ b/vector_db/db_provider.py @@ -1,3 +1,5 @@ +"""Abstract base class for vector database providers.""" + from abc import ABC, abstractmethod from typing import List @@ -51,4 +53,3 @@ def add_documents(self, docs: List[Document]) -> None: Args: docs (List[Document]): A list of LangChain `Document` objects to be embedded and added. """ - pass diff --git a/vector_db/dryrun_provider.py b/vector_db/dryrun_provider.py index fb52346..2aaf4e0 100644 --- a/vector_db/dryrun_provider.py +++ b/vector_db/dryrun_provider.py @@ -1,7 +1,8 @@ +"""Dry run vector database provider for testing and debugging.""" + from typing import List from langchain_core.documents import Document -from langchain_huggingface import HuggingFaceEmbeddings from vector_db.db_provider import DBProvider @@ -15,7 +16,8 @@ class DryRunProvider(DBProvider): to validate chunking, structure, and metadata before pushing to a production vector store. Attributes: - embeddings (HuggingFaceEmbeddings): HuggingFace embedding instance, used for interface consistency. + embeddings (HuggingFaceEmbeddings): HuggingFace embedding instance, used for interface + consistency. embedding_length (int): Dimensionality of embeddings (computed for validation, not used). Args: @@ -31,15 +33,6 @@ class DryRunProvider(DBProvider): >>> provider.add_documents(docs) """ - def __init__(self, embeddings: HuggingFaceEmbeddings): - """ - Initialize the dry run provider with a placeholder embedding model. - - Args: - embeddings (HuggingFaceEmbeddings): A HuggingFace embedding model (used for compatibility). - """ - super().__init__(embeddings) - def add_documents(self, docs: List[Document]) -> None: """ Print chunked documents and metadata to stdout for inspection. 
diff --git a/vector_db/elastic_provider.py b/vector_db/elastic_provider.py index 6be9af3..adf27e3 100644 --- a/vector_db/elastic_provider.py +++ b/vector_db/elastic_provider.py @@ -1,3 +1,5 @@ +"""Elasticsearch vector database provider implementation.""" + import logging from typing import List @@ -43,7 +45,7 @@ class ElasticProvider(DBProvider): >>> provider.add_documents(docs) """ - def __init__( + def __init__( # pylint: disable=too-many-arguments,too-many-positional-arguments self, embeddings: HuggingFaceEmbeddings, url: str, diff --git a/vector_db/mssql_provider.py b/vector_db/mssql_provider.py index 5fc0c1b..0576507 100644 --- a/vector_db/mssql_provider.py +++ b/vector_db/mssql_provider.py @@ -1,3 +1,5 @@ +"""Microsoft SQL Server vector database provider implementation.""" + import logging import re from typing import List, Optional @@ -35,7 +37,11 @@ class MSSQLProvider(DBProvider): >>> embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5") >>> provider = MSSQLProvider( ... embeddings=embeddings, - ... connection_string="Driver={ODBC Driver 18 for SQL Server};Server=localhost,1433;Database=docs;UID=sa;PWD=StrongPassword!;TrustServerCertificate=yes;Encrypt=no;", + ... connection_string=( + ... "Driver={ODBC Driver 18 for SQL Server};" + ... "Server=localhost,1433;Database=docs;UID=sa;" + ... "PWD=StrongPassword!;TrustServerCertificate=yes;Encrypt=no;" + ... ), ... table="embedded_docs", ... ) >>> provider.add_documents(docs) diff --git a/vector_db/pgvector_provider.py b/vector_db/pgvector_provider.py index 14b7073..5a2486a 100644 --- a/vector_db/pgvector_provider.py +++ b/vector_db/pgvector_provider.py @@ -1,3 +1,5 @@ +"""PostgreSQL with pgvector extension vector database provider implementation.""" + import logging from typing import List from urllib.parse import urlparse diff --git a/vector_db/qdrant_provider.py b/vector_db/qdrant_provider.py index d05ff7a..4cca990 100644 --- a/vector_db/qdrant_provider.py +++ b/vector_db/qdrant_provider.py @@ -1,3 +1,5 @@ +"""Qdrant vector database provider implementation.""" + import logging from typing import List, Optional diff --git a/vector_db/redis_provider.py b/vector_db/redis_provider.py index b8e0fee..80489ff 100644 --- a/vector_db/redis_provider.py +++ b/vector_db/redis_provider.py @@ -1,3 +1,5 @@ +"""Redis vector database provider implementation.""" + import logging from typing import List
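With the pieces above in place, one way to smoke-test the image locally is to build it with the new Makefile target and run it against the `DRYRUN` provider, which only prints chunked documents to stdout and needs no database. This is a minimal sketch that assumes podman is available and the Makefile defaults (`REGISTRY=localhost`, `NAME=vector-embedder`, `TAG=latest`) are unchanged:

```bash
# Build localhost/vector-embedder:latest with podman (Makefile default variables assumed)
make build

# Run the embed job without a vector database: DRYRUN prints chunked documents
# and their metadata to stdout, mirroring the CI "Run embed job" step.
podman run --rm --network host \
  -e LOG_LEVEL=debug \
  -e DB_TYPE=DRYRUN \
  localhost/vector-embedder:latest
```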
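Because the push job signs each pushed digest keylessly with cosign via the GitHub OIDC token (the `id-token: write` permission and fulcio comment above), consumers can check the signature before pulling. The sketch below assumes cosign v2.x and guesses at the signing identity; the repository path and workflow ref in the regexp are illustrative assumptions, not values confirmed by this patch:

```bash
# Hypothetical keyless verification of the published image; the identity regexp and
# issuer below are assumptions about how the GitHub Actions OIDC certificate is issued.
cosign verify \
  --certificate-oidc-issuer https://token.actions.githubusercontent.com \
  --certificate-identity-regexp 'https://github\.com/validatedpatterns.*/vector-embedder/\.github/workflows/.*' \
  quay.io/validatedpatterns/vector-embedder:latest
```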