From d3c58b5e2dc938517d404c5c9d6c2653fdee667c Mon Sep 17 00:00:00 2001 From: Drew Minnear Date: Mon, 22 Sep 2025 16:05:12 -0400 Subject: [PATCH] update ci to use super-linter and push to both quay-repos --- .env | 16 +-- .github/linters/.hadolint.yaml | 9 ++ .github/linters/.isort.cfg | 2 + .github/linters/.jscpd.json | 3 + .github/linters/.python-lint | 5 + .github/workflows/ci-pipeline.yaml | 188 +++++++++++++++++----------- .github/workflows/super-linter.yaml | 16 +++ Containerfile | 18 +-- Makefile | 32 +++++ README.md | 23 ++-- config.py | 74 +++++------ embed_documents.py | 152 +++++++++++----------- loaders/__init__.py | 1 + loaders/git.py | 2 + loaders/pdf.py | 2 + loaders/text.py | 96 ++++++++------ loaders/web.py | 2 + requirements.txt | 10 +- vector_db/__init__.py | 1 + vector_db/db_provider.py | 3 +- vector_db/dryrun_provider.py | 15 +-- vector_db/elastic_provider.py | 4 +- vector_db/mssql_provider.py | 8 +- vector_db/pgvector_provider.py | 2 + vector_db/qdrant_provider.py | 2 + vector_db/redis_provider.py | 2 + 26 files changed, 418 insertions(+), 270 deletions(-) create mode 100644 .github/linters/.hadolint.yaml create mode 100644 .github/linters/.isort.cfg create mode 100644 .github/linters/.jscpd.json create mode 100644 .github/linters/.python-lint create mode 100644 .github/workflows/super-linter.yaml create mode 100644 Makefile create mode 100644 loaders/__init__.py create mode 100644 vector_db/__init__.py diff --git a/.env b/.env index 34c49fe..e547580 100644 --- a/.env +++ b/.env @@ -5,35 +5,35 @@ TEMP_DIR=/tmp LOG_LEVEL=info # === Git Repo Document Sources === -REPO_SOURCES=[{"repo": "https://github.com/RHEcosystemAppEng/llm-on-openshift.git", "globs": ["examples/notebooks/langchain/rhods-doc/*.pdf"]}] +REPO_SOURCES='[{"repo": "https://github.com/RHEcosystemAppEng/llm-on-openshift.git", "globs": ["examples/notebooks/langchain/rhods-doc/*.pdf"]}]' # === Web Document Sources === -WEB_SOURCES=["https://ai-on-openshift.io/getting-started/openshift/", "https://ai-on-openshift.io/getting-started/opendatahub/", "https://ai-on-openshift.io/getting-started/openshift-ai/", "https://ai-on-openshift.io/odh-rhoai/configuration/", "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/", "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/", "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/", "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/", "https://ai-on-openshift.io/tools-and-applications/minio/minio/"] +WEB_SOURCES='["https://ai-on-openshift.io/getting-started/openshift/", "https://ai-on-openshift.io/getting-started/opendatahub/", "https://ai-on-openshift.io/getting-started/openshift-ai/", "https://ai-on-openshift.io/odh-rhoai/configuration/", "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/", "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/", "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/", "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/", "https://ai-on-openshift.io/tools-and-applications/minio/minio/"]' # === General Embedding Config === -CHUNK_SIZE=1024 CHUNK_OVERLAP=40 +CHUNK_SIZE=1024 DB_TYPE=DRYRUN EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2 # === Redis === -REDIS_URL=redis://localhost:6379 REDIS_INDEX=docs +REDIS_URL=redis://localhost:6379 # === Elasticsearch === -ELASTIC_URL=http://localhost:9200 ELASTIC_INDEX=docs -ELASTIC_USER=elastic ELASTIC_PASSWORD=changeme +ELASTIC_URL=http://localhost:9200 +ELASTIC_USER=elastic # === PGVector === 
-PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb PGVECTOR_COLLECTION_NAME=documents +PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb # === SQL Server === MSSQL_CONNECTION_STRING="Driver={ODBC Driver 18 for SQL Server}; Server=localhost,1433; Database=embeddings; UID=sa; PWD=StrongPassword!; TrustServerCertificate=yes; Encrypt=no;" MSSQL_TABLE=docs # === Qdrant === -QDRANT_URL=http://localhost:6333 QDRANT_COLLECTION=embedded_docs +QDRANT_URL=http://localhost:6333 diff --git a/.github/linters/.hadolint.yaml b/.github/linters/.hadolint.yaml new file mode 100644 index 0000000..e5d771d --- /dev/null +++ b/.github/linters/.hadolint.yaml @@ -0,0 +1,9 @@ +ignored: + # Always tag the version of an image explicitly + - DL3006 + # Using latest is prone to errors if the image will ever update. Pin the version explicitly to a release tag + - DL3007 + # Specify version with `dnf install -y -`. + - DL3041 + # Pin versions in pip. Instead of `pip install ` use `pip install ==` or `pip install --requirement ` + - DL3013 diff --git a/.github/linters/.isort.cfg b/.github/linters/.isort.cfg new file mode 100644 index 0000000..57fae97 --- /dev/null +++ b/.github/linters/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = "black" diff --git a/.github/linters/.jscpd.json b/.github/linters/.jscpd.json new file mode 100644 index 0000000..27a038a --- /dev/null +++ b/.github/linters/.jscpd.json @@ -0,0 +1,3 @@ +{ + "ignore": ["**/.github/**"] +} diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint new file mode 100644 index 0000000..090a4fa --- /dev/null +++ b/.github/linters/.python-lint @@ -0,0 +1,5 @@ +[MESSAGES CONTROL] +disable= + too-few-public-methods, + broad-exception-caught, + import-error diff --git a/.github/workflows/ci-pipeline.yaml b/.github/workflows/ci-pipeline.yaml index 0d193bb..bf8c66e 100644 --- a/.github/workflows/ci-pipeline.yaml +++ b/.github/workflows/ci-pipeline.yaml @@ -1,78 +1,81 @@ -name: CI Pipeline +name: Build and push to quay on: pull_request: + branches: [main] push: branches: [main] tags: - - "v*" + - "v*.*.*" -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: pip install black isort ruff - - run: black --check . - - run: isort --check-only . - - run: ruff check . +permissions: + contents: read + +env: + REGISTRY: localhost + NAME: vector-embedder + TAG: ${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || (github.ref_name == 'main' && 'latest' || github.ref_name) }} - build: +jobs: + build-container: runs-on: ubuntu-latest - needs: lint - outputs: - image_tag: ${{ steps.meta.outputs.sha_tag }} - steps: - - uses: actions/checkout@v4 - - name: Generate tag - id: meta - run: echo "sha_tag=sha-${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + permissions: + contents: read - - name: Build Docker image - uses: docker/build-push-action@v5 + steps: + - name: Checkout code + uses: actions/checkout@v5 with: - context: . 
- file: ./Containerfile - load: true - tags: test-image:${{ steps.meta.outputs.sha_tag }} + fetch-depth: 0 + persist-credentials: false - - name: Save image as artifact - run: docker save test-image:${{ steps.meta.outputs.sha_tag }} -o image.tar + - name: Build container and push to local registry + env: + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + run: | + make build + podman push "${CONTAINER}" "docker-archive:/tmp/image.tar:${CONTAINER}" - name: Upload image artifact uses: actions/upload-artifact@v4 with: - name: test-image - path: image.tar + name: image-${{ github.run_id }} + path: /tmp/image.tar + retention-days: 1 test: - needs: [lint, build] - runs-on: ubuntu-latest + needs: [build-container] + if: github.event_name == 'pull_request' strategy: fail-fast: false matrix: db: [pgvector, redis, elastic, qdrant, mssql] + runs-on: ubuntu-latest + permissions: + contents: read + steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v5 + with: + fetch-depth: 0 + persist-credentials: false - - name: Download image artifact - uses: actions/download-artifact@v4 + - name: Download image + uses: actions/download-artifact@v5 with: - name: test-image - path: . + name: image-${{ github.run_id }} + path: /tmp - - name: Load Docker image - run: docker load -i image.tar + - name: Load image into local containers-storage + run: podman pull docker-archive:/tmp/image.tar - name: Start MSSQL if: matrix.db == 'mssql' run: | - docker run -d --name mssql-vector-test \ + podman run -d --name mssql-vector-test \ -e "ACCEPT_EULA=Y" \ -e "SA_PASSWORD=StrongPassword!" \ -p 1433:1433 \ @@ -81,7 +84,7 @@ jobs: - name: Start PGVector if: matrix.db == 'pgvector' run: | - docker run -d --name pgvector-test \ + podman run -d --name pgvector-test \ -e POSTGRES_USER=user \ -e POSTGRES_PASSWORD=pass \ -e POSTGRES_DB=mydb \ @@ -91,14 +94,14 @@ jobs: - name: Start Redis if: matrix.db == 'redis' run: | - docker run -d --name redis-test \ + podman run -d --name redis-test \ -p 6379:6379 \ redis/redis-stack-server:6.2.6-v19 - name: Start Elasticsearch if: matrix.db == 'elastic' run: | - docker run -d --name es-test \ + podman run -d --name es-test \ -e "discovery.type=single-node" \ -e "xpack.security.enabled=true" \ -e "ELASTIC_PASSWORD=changeme" \ @@ -109,7 +112,7 @@ jobs: - name: Start Qdrant if: matrix.db == 'qdrant' run: | - docker run -d --name qdrant-test \ + podman run -d --name qdrant-test \ -p 6333:6333 \ qdrant/qdrant @@ -117,45 +120,78 @@ jobs: run: sleep 30 - name: Run embed job + env: + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + DB_TYPE: ${{ matrix.db }} run: | - docker run --rm --network host \ + podman run --rm --network host \ -e LOG_LEVEL=debug \ - -e DB_TYPE=${{ matrix.db }} \ - test-image:${{ needs.build.outputs.image_tag }} + -e DB_TYPE="${DB_TYPE}" \ + "${REGISTRY}/${CONTAINER}" + + push-container: + needs: [build-container] + if: github.event_name != 'pull_request' + strategy: + matrix: + include: + - upload_registry: quay.io/validatedpatterns + legacy: false + - upload_registry: quay.io/hybridcloudpatterns + legacy: true - release: - if: (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) && github.event.repository.fork == false runs-on: ubuntu-latest - needs: [lint, build, test] - steps: - - uses: actions/checkout@v4 + permissions: + contents: read + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. 
+ id-token: write - - name: Log in to Quay.io - uses: docker/login-action@v3 + steps: + - name: Checkout code + uses: actions/checkout@v5 with: - registry: quay.io - username: ${{ secrets.QUAY_USERNAME }} - password: ${{ secrets.QUAY_PASSWORD }} + fetch-depth: 0 + persist-credentials: false - - name: Download image artifact - uses: actions/download-artifact@v4 + - name: Download image + uses: actions/download-artifact@v5 with: - name: test-image - path: . + name: image-${{ github.run_id }} + path: /tmp - - name: Load Docker image - run: docker load -i image.tar + - name: Load image into local containers-storage + run: podman pull docker-archive:/tmp/image.tar - - name: Tag and push image + - name: Log into Quay + env: + USERNAME: ${{ matrix.legacy && secrets.LEGACY_QUAY_USERNAME || secrets.QUAY_USERNAME }} + PASSWORD: ${{ matrix.legacy && secrets.LEGACY_QUAY_PASSWORD || secrets.QUAY_PASSWORD }} run: | - docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:${{ needs.build.outputs.image_tag }} + podman login -u "${USERNAME}" -p "${PASSWORD}" quay.io - if [[ $GITHUB_REF == refs/tags/* ]]; then - docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:${GITHUB_REF#refs/tags/} - docker push quay.io/hybridcloudpatterns/vector-embedder:${GITHUB_REF#refs/tags/} - elif [[ $GITHUB_REF == refs/heads/main ]]; then - docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:latest - docker push quay.io/hybridcloudpatterns/vector-embedder:latest - fi + - name: Push image to Quay + id: image-push + env: + UPLOADREGISTRY: ${{ matrix.upload_registry }} + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + run: | + make upload + DIGEST=$(skopeo inspect --format "{{.Digest}}" "docker://${UPLOADREGISTRY}/${CONTAINER}") + echo "digest=$DIGEST" >> "$GITHUB_OUTPUT" - docker push quay.io/hybridcloudpatterns/vector-embedder:${{ needs.build.outputs.image_tag }} + - name: Install cosign + uses: sigstore/cosign-installer@d58896d6a1865668819e1d91763c7751a165e159 # v3.9.2 + with: + cosign-release: "v2.2.4" + + # Cosign expects the docker config.json for registry authentication so we must + # copy it from buildah + - name: Sign the published Docker image + env: + CONTAINER: ${{ env.NAME }}:${{ env.TAG }} + DIGEST: ${{ steps.image-push.outputs.digest }} + UPLOADREGISTRY: ${{ matrix.upload_registry }} + run: | + cat "${XDG_RUNTIME_DIR}/containers/auth.json" > ~/.docker/config.json + cosign sign --yes "${UPLOADREGISTRY}/${CONTAINER}@${DIGEST}" diff --git a/.github/workflows/super-linter.yaml b/.github/workflows/super-linter.yaml new file mode 100644 index 0000000..79b7ff9 --- /dev/null +++ b/.github/workflows/super-linter.yaml @@ -0,0 +1,16 @@ +name: Super linter + +on: + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + lint: + uses: validatedpatterns/github-actions-library/.github/workflows/superlinter.yml@v1 + with: + sl_env: | + VALIDATE_TRIVY=false + VALIDATE_PYTHON_ISORT=false diff --git a/Containerfile b/Containerfile index 1f30f86..7c9fc88 100644 --- a/Containerfile +++ b/Containerfile @@ -1,18 +1,22 @@ -FROM registry.access.redhat.com/ubi9/python-312:9.5 +FROM registry.access.redhat.com/ubi10/python-312-minimal:10.0 USER root + WORKDIR /app -RUN dnf install -y \ +RUN microdnf install -y git \ unixODBC \ unixODBC-devel && \ curl -sSL https://packages.microsoft.com/config/rhel/9/prod.repo -o /etc/yum.repos.d/mssql-release.repo && \ - ACCEPT_EULA=Y dnf 
install -y msodbcsql18 && \ - dnf clean all + ACCEPT_EULA=Y microdnf install -y msodbcsql18 && \ + microdnf clean all COPY requirements.txt . -RUN pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip install --no-cache-dir --upgrade pip && \ + pip install \ + --no-cache-dir \ + --compile \ + -r requirements.txt COPY vector_db ./vector_db COPY loaders ./loaders @@ -24,4 +28,4 @@ RUN chown -R 1001:0 . USER 1001 -CMD ./embed_documents.py +CMD ["python", "./embed_documents.py"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3a0508a --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +NAME ?= vector-embedder +TAG ?= latest +CONTAINER ?= $(NAME):$(TAG) +REGISTRY ?= localhost +UPLOADREGISTRY ?= quay.io/validatedpatterns + +##@ Pattern Must Gather Tasks + +.PHONY: help +help: ## This help message + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^(\s|[a-zA-Z_0-9-])+:.*?##/ { printf " \033[36m%-35s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +.PHONY: build +build: podman-build + +.PHONY: podman-build +podman-build: ## build container + podman build -t ${REGISTRY}/${CONTAINER} . + +.PHONY: upload +upload: ## push container + podman tag ${REGISTRY}/${CONTAINER} ${UPLOADREGISTRY}/${CONTAINER} + podman push ${UPLOADREGISTRY}/${CONTAINER} + +.PHONY: super-linter +super-linter: ## Runs super linter locally + rm -rf .mypy_cache + podman run -e RUN_LOCAL=true -e USE_FIND_ALGORITHM=true \ + $(DISABLE_LINTERS) \ + -v $(PWD):/tmp/lint:rw,z \ + -w /tmp/lint \ + ghcr.io/super-linter/super-linter:slim-v8 diff --git a/README.md b/README.md index 6c44dba..c0966c0 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # πŸ“š vector-embedder -[![Quay Repository](https://img.shields.io/badge/Quay.io-vector--embedder-blue?logo=quay)](https://quay.io/repository/hybridcloudpatterns/vector-embedder) -[![CI](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml/badge.svg?branch=main)](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml) - +![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) +[![Quay Repository](https://img.shields.io/badge/Quay.io-vector--embedder-blue?logo=quay)](https://quay.io/repository/validatedpatterns/vector-embedder) +[![CI Pipeline](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml/badge.svg?branch=main)](https://github.com/validatedpatterns-sandbox/vector-embedder/actions/workflows/ci-pipeline.yaml) **vector-embedder** is a flexible, language-agnostic document ingestion and embedding pipeline. It transforms structured and unstructured content from multiple sources into vector embeddings and stores them in your vector database of choice. 
@@ -122,18 +122,18 @@ Run it: ## πŸ“¦ Dependency Management & Updates -This project keeps *two* dependency files under version control: +This project keeps _two_ dependency files under version control: -| File | Purpose | Edited by | -|------|---------|-----------| -| **`requirements.in`** | Short, human-readable list of *top-level* libraries (no pins) | You | +| File | Purpose | Edited by | +| ---------------------- | ------------------------------------------------------------------------------------ | ------------- | +| **`requirements.in`** | Short, human-readable list of _top-level_ libraries (no pins) | You | | **`requirements.txt`** | Fully-resolved, **pinned** lock fileβ€”including hashesβ€”for exact, reproducible builds | `pip-compile` | ### πŸ”§ Installing `pip-tools` ```bash python -m pip install --upgrade pip-tools -```` +``` ### βž• Adding / Updating a Package @@ -144,11 +144,13 @@ python -m pip install --upgrade pip-tools + sentence-transformers>=4.1 + llama-index ``` + 2. **Re-lock** the environment ```bash pip-compile --upgrade ``` + 3. **Synchronise** your virtual-env ```bash @@ -159,7 +161,7 @@ python -m pip install --upgrade pip-tools ## πŸ—‚οΈ Project Layout -``` +```text . β”œβ”€β”€ embed_documents.py # Main entrypoint script β”œβ”€β”€ config.py # Config loader from env @@ -245,7 +247,6 @@ DB_TYPE=QDRANT ./embed_documents.py ### SQL Server (MSSQL) - ```bash podman run --rm -d \ --name mssql \ @@ -253,7 +254,7 @@ podman run --rm -d \ -e SA_PASSWORD=StrongPassword! \ -p 1433:1433 \ mcr.microsoft.com/mssql/rhel/server:2025-latest -```` +``` ```bash DB_TYPE=MSSQL ./embed_documents.py diff --git a/config.py b/config.py index 4ccf99f..e226ecf 100644 --- a/config.py +++ b/config.py @@ -1,3 +1,5 @@ +"""Configuration management for vector database embedder application.""" + import json import logging import os @@ -31,7 +33,6 @@ class Config: web_sources (List[str]): List of web URLs to scrape and embed. repo_sources (List[Dict]): Repositories and glob patterns for file discovery. temp_dir (str): Path to a temporary working directory. - log_level (int): Log verbosity level. Example: >>> config = Config.load() @@ -45,7 +46,6 @@ class Config: web_sources: List[str] repo_sources: List[Dict] temp_dir: str - log_level: int @staticmethod def _get_required_env_var(key: str) -> str: @@ -89,7 +89,8 @@ def _parse_log_level(log_level_name: str) -> int: } if log_level_name not in log_levels: raise ValueError( - f"Invalid LOG_LEVEL: '{log_level_name}'. Must be one of: {', '.join(log_levels.keys())}" + f"Invalid LOG_LEVEL: '{log_level_name}'. 
" + f"Must be one of: {', '.join(log_levels.keys())}" ) return log_levels[log_level_name] @@ -111,37 +112,33 @@ def _init_db_provider(db_type: str) -> DBProvider: db_type = db_type.upper() embeddings = HuggingFaceEmbeddings(model_name=get("EMBEDDING_MODEL")) - if db_type == "REDIS": - url = get("REDIS_URL") - index = os.getenv("REDIS_INDEX", "docs") - return RedisProvider(embeddings, url, index) - - elif db_type == "ELASTIC": - url = get("ELASTIC_URL") - password = get("ELASTIC_PASSWORD") - index = os.getenv("ELASTIC_INDEX", "docs") - user = os.getenv("ELASTIC_USER", "elastic") - return ElasticProvider(embeddings, url, password, index, user) - - elif db_type == "PGVECTOR": - url = get("PGVECTOR_URL") - collection = get("PGVECTOR_COLLECTION_NAME") - return PGVectorProvider(embeddings, url, collection) - - elif db_type == "MSSQL": - connection_string = get("MSSQL_CONNECTION_STRING") - table = get("MSSQL_TABLE") - return MSSQLProvider(embeddings, connection_string, table) - - elif db_type == "QDRANT": - url = get("QDRANT_URL") - collection = get("QDRANT_COLLECTION") - return QdrantProvider(embeddings, url, collection) - - elif db_type == "DRYRUN": - return DryRunProvider(embeddings) - - raise ValueError(f"Unsupported DB_TYPE '{db_type}'") + match db_type: + case "REDIS": + url = get("REDIS_URL") + index = os.getenv("REDIS_INDEX", "docs") + return RedisProvider(embeddings, url, index) + case "ELASTIC": + url = get("ELASTIC_URL") + password = get("ELASTIC_PASSWORD") + index = os.getenv("ELASTIC_INDEX", "docs") + user = os.getenv("ELASTIC_USER", "elastic") + return ElasticProvider(embeddings, url, password, index, user) + case "PGVECTOR": + url = get("PGVECTOR_URL") + collection = get("PGVECTOR_COLLECTION_NAME") + return PGVectorProvider(embeddings, url, collection) + case "MSSQL": + connection_string = get("MSSQL_CONNECTION_STRING") + table = get("MSSQL_TABLE") + return MSSQLProvider(embeddings, connection_string, table) + case "QDRANT": + url = get("QDRANT_URL") + collection = get("QDRANT_COLLECTION") + return QdrantProvider(embeddings, url, collection) + case "DRYRUN": + return DryRunProvider(embeddings) + case _: + raise ValueError(f"Unsupported DB_TYPE '{db_type}'") @staticmethod def load() -> "Config": @@ -162,7 +159,11 @@ def load() -> "Config": # Logging setup log_level = get("LOG_LEVEL").upper() - logging.basicConfig(level=Config._parse_log_level(log_level)) + logging.basicConfig( + level=Config._parse_log_level(log_level), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) logger = logging.getLogger(__name__) logger.debug("Logging initialized at level: %s", log_level) @@ -174,7 +175,7 @@ def load() -> "Config": try: web_sources = json.loads(get("WEB_SOURCES")) except json.JSONDecodeError as e: - raise ValueError(f"WEB_SOURCES must be a valid JSON list: {e}") + raise ValueError(f"WEB_SOURCES must be a valid JSON list: {e}") from e # Git repositories and file matchers try: @@ -196,5 +197,4 @@ def load() -> "Config": web_sources=web_sources, repo_sources=repo_sources, temp_dir=temp_dir, - log_level=log_level, ) diff --git a/embed_documents.py b/embed_documents.py index b365d73..0700cc6 100755 --- a/embed_documents.py +++ b/embed_documents.py @@ -60,6 +60,83 @@ def _fail_and_exit(message: str, exc: Exception) -> None: raise exc +def _process_git_documents() -> None: + """Process Git-based document sources and add to vector DB.""" + if not config.repo_sources: + return + + logger.info("Starting Git-based document embedding...") + try: + 
git_loader = GitLoader(config) + git_chunks = git_loader.load() + + if git_chunks: + logger.info("Adding %d Git document chunks to vector DB", len(git_chunks)) + config.db_provider.add_documents(git_chunks) + else: + logger.info("No documents found in Git sources.") + except Exception as e: + _fail_and_exit("Failed during Git document processing", e) + + +def _process_html_documents(html_urls: list) -> None: + """Process HTML web documents and add to vector DB.""" + if not html_urls: + return + + logger.info("Starting HTML-based web document embedding...") + try: + web_loader = WebLoader(config) + web_chunks = web_loader.load(html_urls) + + if web_chunks: + logger.info("Adding %d HTML web chunks to vector DB", len(web_chunks)) + config.db_provider.add_documents(web_chunks) + else: + logger.info("No chunks produced from HTML URLs.") + except Exception as e: + _fail_and_exit("Failed during HTML web document processing", e) + + +def _process_pdf_documents(pdf_urls: list) -> None: + """Download and process PDF documents from web URLs and add to vector DB.""" + if not pdf_urls: + return + + logger.info("Downloading PDF documents from web URLs...") + pdf_dir = Path(config.temp_dir) / "web_pdfs" + pdf_dir.mkdir(parents=True, exist_ok=True) + + downloaded_files = [] + for url in pdf_urls: + try: + response = requests.get(url) + response.raise_for_status() + + filename = Path(url.split("/")[-1]) + file_path = pdf_dir / filename + with open(file_path, "wb") as f: + f.write(response.content) + + logger.info("Downloaded: %s", file_path) + downloaded_files.append(file_path) + except Exception as e: + _fail_and_exit(f"Failed to download {url}", e) + + if downloaded_files: + try: + pdf_loader = PDFLoader(config) + pdf_chunks = pdf_loader.load(downloaded_files) + + if pdf_chunks: + logger.info("Adding %d PDF web chunks to vector DB", len(pdf_chunks)) + config.db_provider.add_documents(pdf_chunks) + else: + logger.info("No chunks produced from downloaded PDFs.") + except Exception as e: + _fail_and_exit("Failed during PDF web document processing", e) + + def main() -> None: """ Main embedding workflow for Git, HTML, and PDF sources. @@ -72,82 +149,13 @@ def main() -> None: All errors are logged with traceback and will stop execution via `_fail_and_exit`. 
""" - # ─────────────────────────────────────────────────────────────── - # Git-based document ingestion - # ─────────────────────────────────────────────────────────────── - if config.repo_sources: - logger.info("Starting Git-based document embedding...") - try: - git_loader = GitLoader(config) - git_chunks = git_loader.load() - - if git_chunks: - logger.info( - "Adding %d Git document chunks to vector DB", len(git_chunks) - ) - config.db_provider.add_documents(git_chunks) - else: - logger.info("No documents found in Git sources.") - except Exception as e: - _fail_and_exit("Failed during Git document processing", e) + _process_git_documents() - # ─────────────────────────────────────────────────────────────── - # Web-based document ingestion - # ─────────────────────────────────────────────────────────────── pdf_urls = [url for url in config.web_sources if url.lower().endswith(".pdf")] html_urls = [url for url in config.web_sources if not url.lower().endswith(".pdf")] - # HTML documents - if html_urls: - logger.info("Starting HTML-based web document embedding...") - try: - web_loader = WebLoader(config) - web_chunks = web_loader.load(html_urls) - - if web_chunks: - logger.info("Adding %d HTML web chunks to vector DB", len(web_chunks)) - config.db_provider.add_documents(web_chunks) - else: - logger.info("No chunks produced from HTML URLs.") - except Exception as e: - _fail_and_exit("Failed during HTML web document processing", e) - - # PDF documents - if pdf_urls: - logger.info("Downloading PDF documents from web URLs...") - pdf_dir = Path(config.temp_dir) / "web_pdfs" - pdf_dir.mkdir(parents=True, exist_ok=True) - - downloaded_files = [] - for url in pdf_urls: - try: - response = requests.get(url) - response.raise_for_status() - - filename = Path(url.split("/")[-1]) - file_path = pdf_dir / filename - with open(file_path, "wb") as f: - f.write(response.content) - - logger.info("Downloaded: %s", file_path) - downloaded_files.append(file_path) - except Exception as e: - _fail_and_exit(f"Failed to download {url}", e) - - if downloaded_files: - try: - pdf_loader = PDFLoader(config) - pdf_chunks = pdf_loader.load(downloaded_files) - - if pdf_chunks: - logger.info( - "Adding %d PDF web chunks to vector DB", len(pdf_chunks) - ) - config.db_provider.add_documents(pdf_chunks) - else: - logger.info("No chunks produced from downloaded PDFs.") - except Exception as e: - _fail_and_exit("Failed during PDF web document processing", e) + _process_html_documents(html_urls) + _process_pdf_documents(pdf_urls) logger.info("Embedding job complete.") diff --git a/loaders/__init__.py b/loaders/__init__.py new file mode 100644 index 0000000..e17dead --- /dev/null +++ b/loaders/__init__.py @@ -0,0 +1 @@ +"""Document loaders package.""" diff --git a/loaders/git.py b/loaders/git.py index a1bb5ce..733228d 100644 --- a/loaders/git.py +++ b/loaders/git.py @@ -1,3 +1,5 @@ +"""Git repository document loader for cloning and processing repository contents.""" + import logging import shutil import subprocess diff --git a/loaders/pdf.py b/loaders/pdf.py index cf96424..a9c78ba 100644 --- a/loaders/pdf.py +++ b/loaders/pdf.py @@ -1,3 +1,5 @@ +"""PDF document loader for extracting and chunking text from PDF files.""" + import logging from pathlib import Path from typing import List diff --git a/loaders/text.py b/loaders/text.py index 86c8f2a..2a3958c 100644 --- a/loaders/text.py +++ b/loaders/text.py @@ -1,3 +1,5 @@ +"""Text document loader for processing various text-based file formats.""" + import logging from pathlib import 
Path from typing import List @@ -50,6 +52,56 @@ def __init__(self, config: Config): chunk_overlap=config.chunk_overlap, ) + def _process_single_file(self, path: Path) -> List[Document]: + """Process a single file and return its document chunks.""" + logger.info("Partitioning %s", path) + elements = partition(filename=str(path), strategy="fast") + + buf: List[str] = [] + buf_len, chunk_idx = 0, 0 + fname = path.name + source_str = str(path) + chunks = [] + + def _flush(): + nonlocal buf, buf_len, chunk_idx + if not buf_len: + return + chunks.append( + Document( + page_content="\n".join(buf).strip(), + metadata={ + "source": source_str, + "chunk_id": chunk_idx, + }, + ) + ) + buf, buf_len = [], 0 + chunk_idx += 1 + + for el in elements: + txt = getattr(el, "text", "").strip() + if not txt: + continue + if buf_len == 0: + buf.append(f"## {fname}\n") # inject heading + if buf_len + len(txt) > self.config.chunk_size: + _flush() + buf.append(txt) + buf_len += len(txt) + _flush() + + return chunks + + def _add_chunk_totals(self, docs: List[Document]) -> None: + """Add chunk_total metadata to all documents.""" + counts: dict[str, int] = {} + for doc in docs: + source = doc.metadata["source"] + counts[source] = counts.get(source, 0) + 1 + for doc in docs: + doc.metadata["chunk_total"] = counts[doc.metadata["source"]] + def load(self, paths: List[Path]) -> List[Document]: """ Loads and splits a list of text files into semantic chunks. @@ -76,44 +128,12 @@ def load(self, paths: List[Path]) -> List[Document]: - Each chunk begins with a lightweight heading that includes the filename to help orient the LLM when formatting prompts. """ - grouped: list[Document] = [] + grouped = [] for path in paths: try: - logger.info("Partitioning %s", path) - elements = partition(filename=str(path), strategy="fast") - - buf, buf_len, chunk_idx = [], 0, 0 - fname = Path(path).name - - def _flush(): - nonlocal buf, buf_len, chunk_idx - if not buf_len: - return - grouped.append( - Document( - page_content="\n".join(buf).strip(), - metadata={ - "source": str(path), - "chunk_id": chunk_idx, - }, - ) - ) - buf, buf_len = [], 0 - chunk_idx += 1 - - for el in elements: - txt = getattr(el, "text", "").strip() - if not txt: - continue - if buf_len == 0: - buf.append(f"## {fname}\n") # inject heading - if buf_len + len(txt) > self.config.chunk_size: - _flush() - buf.append(txt) - buf_len += len(txt) - _flush() - + chunks = self._process_single_file(path) + grouped.extend(chunks) except Exception as e: logger.warning("Failed to load %s: %s", path, e) @@ -126,11 +146,7 @@ def _flush(): final_docs.append(doc) # Add chunk_total metadata for all docs - counts: dict[str, int] = {} - for d in final_docs: - counts[d.metadata["source"]] = counts.get(d.metadata["source"], 0) + 1 - for d in final_docs: - d.metadata["chunk_total"] = counts[d.metadata["source"]] + self._add_chunk_totals(final_docs) logger.info( "Produced %d chunks (avg %.0f chars)", diff --git a/loaders/web.py b/loaders/web.py index f9d21fb..050ce21 100644 --- a/loaders/web.py +++ b/loaders/web.py @@ -1,3 +1,5 @@ +"""Web document loader for fetching and processing HTML content from URLs.""" + import logging from typing import Dict, List diff --git a/requirements.txt b/requirements.txt index 220dfb0..a6ecb1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile +# pip-compile --strip-extras # aiofiles==24.1.0 # via unstructured-client @@ -60,7 +60,7 @@ 
dataclasses-json==0.6.7 # unstructured elastic-transport==8.17.1 # via elasticsearch -elasticsearch[vectorstore-mmr]==8.18.1 +elasticsearch==8.18.1 # via langchain-elasticsearch emoji==2.14.1 # via unstructured @@ -97,7 +97,7 @@ html5lib==1.1 # via unstructured httpcore==1.0.9 # via httpx -httpx[http2]==0.28.1 +httpx==0.28.1 # via # langsmith # qdrant-client @@ -299,7 +299,7 @@ pydantic-core==2.33.2 # via pydantic pydantic-settings==2.9.1 # via langchain-community -pyjwt[crypto]==2.10.1 +pyjwt==2.10.1 # via # msal # pyjwt @@ -450,7 +450,7 @@ typing-inspection==0.4.1 # via # pydantic # pydantic-settings -unstructured[md]==0.17.2 +unstructured==0.17.2 # via -r requirements.in unstructured-client==0.36.0 # via unstructured diff --git a/vector_db/__init__.py b/vector_db/__init__.py new file mode 100644 index 0000000..0ec320c --- /dev/null +++ b/vector_db/__init__.py @@ -0,0 +1 @@ +"""Vector database providers package.""" diff --git a/vector_db/db_provider.py b/vector_db/db_provider.py index 968bd5c..e5f9f69 100644 --- a/vector_db/db_provider.py +++ b/vector_db/db_provider.py @@ -1,3 +1,5 @@ +"""Abstract base class for vector database providers.""" + from abc import ABC, abstractmethod from typing import List @@ -51,4 +53,3 @@ def add_documents(self, docs: List[Document]) -> None: Args: docs (List[Document]): A list of LangChain `Document` objects to be embedded and added. """ - pass diff --git a/vector_db/dryrun_provider.py b/vector_db/dryrun_provider.py index fb52346..2aaf4e0 100644 --- a/vector_db/dryrun_provider.py +++ b/vector_db/dryrun_provider.py @@ -1,7 +1,8 @@ +"""Dry run vector database provider for testing and debugging.""" + from typing import List from langchain_core.documents import Document -from langchain_huggingface import HuggingFaceEmbeddings from vector_db.db_provider import DBProvider @@ -15,7 +16,8 @@ class DryRunProvider(DBProvider): to validate chunking, structure, and metadata before pushing to a production vector store. Attributes: - embeddings (HuggingFaceEmbeddings): HuggingFace embedding instance, used for interface consistency. + embeddings (HuggingFaceEmbeddings): HuggingFace embedding instance, used for interface + consistency. embedding_length (int): Dimensionality of embeddings (computed for validation, not used). Args: @@ -31,15 +33,6 @@ class DryRunProvider(DBProvider): >>> provider.add_documents(docs) """ - def __init__(self, embeddings: HuggingFaceEmbeddings): - """ - Initialize the dry run provider with a placeholder embedding model. - - Args: - embeddings (HuggingFaceEmbeddings): A HuggingFace embedding model (used for compatibility). - """ - super().__init__(embeddings) - def add_documents(self, docs: List[Document]) -> None: """ Print chunked documents and metadata to stdout for inspection. 
diff --git a/vector_db/elastic_provider.py b/vector_db/elastic_provider.py index 6be9af3..adf27e3 100644 --- a/vector_db/elastic_provider.py +++ b/vector_db/elastic_provider.py @@ -1,3 +1,5 @@ +"""Elasticsearch vector database provider implementation.""" + import logging from typing import List @@ -43,7 +45,7 @@ class ElasticProvider(DBProvider): >>> provider.add_documents(docs) """ - def __init__( + def __init__( # pylint: disable=too-many-arguments,too-many-positional-arguments self, embeddings: HuggingFaceEmbeddings, url: str, diff --git a/vector_db/mssql_provider.py b/vector_db/mssql_provider.py index 5fc0c1b..0576507 100644 --- a/vector_db/mssql_provider.py +++ b/vector_db/mssql_provider.py @@ -1,3 +1,5 @@ +"""Microsoft SQL Server vector database provider implementation.""" + import logging import re from typing import List, Optional @@ -35,7 +37,11 @@ class MSSQLProvider(DBProvider): >>> embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5") >>> provider = MSSQLProvider( ... embeddings=embeddings, - ... connection_string="Driver={ODBC Driver 18 for SQL Server};Server=localhost,1433;Database=docs;UID=sa;PWD=StrongPassword!;TrustServerCertificate=yes;Encrypt=no;", + ... connection_string=( + ... "Driver={ODBC Driver 18 for SQL Server};" + ... "Server=localhost,1433;Database=docs;UID=sa;" + ... "PWD=StrongPassword!;TrustServerCertificate=yes;Encrypt=no;" + ... ), ... table="embedded_docs", ... ) >>> provider.add_documents(docs) diff --git a/vector_db/pgvector_provider.py b/vector_db/pgvector_provider.py index 14b7073..5a2486a 100644 --- a/vector_db/pgvector_provider.py +++ b/vector_db/pgvector_provider.py @@ -1,3 +1,5 @@ +"""PostgreSQL with pgvector extension vector database provider implementation.""" + import logging from typing import List from urllib.parse import urlparse diff --git a/vector_db/qdrant_provider.py b/vector_db/qdrant_provider.py index d05ff7a..4cca990 100644 --- a/vector_db/qdrant_provider.py +++ b/vector_db/qdrant_provider.py @@ -1,3 +1,5 @@ +"""Qdrant vector database provider implementation.""" + import logging from typing import List, Optional diff --git a/vector_db/redis_provider.py b/vector_db/redis_provider.py index b8e0fee..80489ff 100644 --- a/vector_db/redis_provider.py +++ b/vector_db/redis_provider.py @@ -1,3 +1,5 @@ +"""Redis vector database provider implementation.""" + import logging from typing import List
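With the pieces above in place, one way to smoke-test the image locally is to build it with the new Makefile target and run it against the `DRYRUN` provider, which only prints chunked documents to stdout and needs no database. This is a minimal sketch that assumes podman is available and the Makefile defaults (`REGISTRY=localhost`, `NAME=vector-embedder`, `TAG=latest`) are unchanged:

```bash
# Build localhost/vector-embedder:latest with podman (Makefile default variables assumed)
make build

# Run the embed job without a vector database: DRYRUN prints chunked documents
# and their metadata to stdout, mirroring the CI "Run embed job" step.
podman run --rm --network host \
  -e LOG_LEVEL=debug \
  -e DB_TYPE=DRYRUN \
  localhost/vector-embedder:latest
```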
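Because the push job signs each pushed digest keylessly with cosign via the GitHub OIDC token (the `id-token: write` permission and fulcio comment above), consumers can check the signature before pulling. The sketch below assumes cosign v2.x and guesses at the signing identity; the repository path and workflow ref in the regexp are illustrative assumptions, not values confirmed by this patch:

```bash
# Hypothetical keyless verification of the published image; the identity regexp and
# issuer below are assumptions about how the GitHub Actions OIDC certificate is issued.
cosign verify \
  --certificate-oidc-issuer https://token.actions.githubusercontent.com \
  --certificate-identity-regexp 'https://github\.com/validatedpatterns.*/vector-embedder/\.github/workflows/.*' \
  quay.io/validatedpatterns/vector-embedder:latest
```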