## Test ingestion of embeddings into Qdrant

In [1]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
import sys
import requests
import uuid

from sec_embedder import build_text_docs
from sec_embedder import embed_docs
from sec_embedder import build_table_and_row_docs

from qdrant_client import models


In [5]:
# Connect to the Qdrant server on localhost:6333.
# `client` is the notebook-level handle that ensure_collection() reads.
from qdrant_client import QdrantClient, models
client = QdrantClient(host="localhost", port=6333)

In [3]:
import os
import sys

# Report which interpreter and conda environment the kernel is running in.
for label, value in (
    ("Jupyter sys.executable:", sys.executable),
    ("Jupyter sys.version:", sys.version),
    ("Jupyter CONDA_DEFAULT_ENV:", os.environ.get("CONDA_DEFAULT_ENV")),
):
    print(label, value)


Jupyter sys.executable: /Users/shicheny/miniforge3/envs/finsearch-arm/bin/python
Jupyter sys.version: 3.11.14 | packaged by conda-forge | (main, Oct 22 2025, 22:56:31) [Clang 19.1.7 ]
Jupyter CONDA_DEFAULT_ENV: finsearch-arm


In [6]:
# Embedding endpoint and model name handed to embed_docs.
api_url = "http://localhost:11434/api/embed"
model = "qwen3-embedding:8b"

# Chunked-text JSONL for the filing.
text_path = Path("../data/chunked/AAPL_10-K_2024.text.split.jsonl")

# Filing-level metadata passed to the doc builders as `common_meta`.
# Add "cik", "filing_date", etc. if you have them in your JSONL.
common_meta = {
    "prefix": "AAPL_10-K_2024",
    "ticker": "AAPL",
    "company_name": "Apple Inc.",
    "form_type": "10-K",
    "fiscal_year": 2024,
}

text_docs = build_text_docs(text_path=text_path, common_meta=common_meta)

# Smoke test: embed only the first two docs rather than the whole filing.
text_doc = text_docs[:2]
embedded_text = embed_docs(text_doc, api_url=api_url, model=model)


[EMBED] Embedded docs 0–1 with model=qwen3-embedding:8b


In [7]:
# Table-summary JSONL for the same filing.
table_summaries_path = Path("../data/chunked/table_summaries/AAPL_10-K_2024.tables.summaries.jsonl")

# The builder's result is read as a dict with "tables" and "rows" entries.
table_row_docs_dict = build_table_and_row_docs(
    table_summaries_path=table_summaries_path, common_meta=common_meta
)
table_docs, row_docs = table_row_docs_dict["tables"], table_row_docs_dict["rows"]

# Smoke test: embed the first five table docs with batch_size=1
# (the recorded log shows one doc per batch).
embedded_tables = embed_docs(table_docs[:5], api_url=api_url, model=model, batch_size=1)

[EMBED] Embedded docs 0–0 with model=qwen3-embedding:8b
[EMBED] Embedded docs 1–1 with model=qwen3-embedding:8b
[EMBED] Embedded docs 2–2 with model=qwen3-embedding:8b
[EMBED] Embedded docs 3–3 with model=qwen3-embedding:8b
[EMBED] Embedded docs 4–4 with model=qwen3-embedding:8b


In [8]:
# Vector length of the first embedded table doc (rendered as the cell output).
len(embedded_tables[0]["embedding"])

4096

### Create collections (matching your embedding dimension)

In [15]:
# Take the vector size from an actual embedding so the collection is created
# with the dimension the model really produced.
dim = len(embedded_text[0]["embedding"])
print("Embedding dimension:", dim)

# helper function to create collection
def ensure_collection(name: str, vector_size: int) -> None:
    """
    Guarantee that a Qdrant collection called `name` exists.

    A collection that is already present is left untouched (its vector
    size is not compared against `vector_size`). A missing one is created
    with `vector_size`-dimensional vectors and cosine distance.
    """
    known_names = [existing.name for existing in client.get_collections().collections]

    if name not in known_names:
        vector_params = models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE,
        )
        client.create_collection(collection_name=name, vectors_config=vector_params)
        print(f"[QDRANT] Created collection '{name}' (dim={vector_size}).")
    else:
        print(f"[QDRANT] Collection '{name}' already exists; leaving as-is.")

Embedding dimension: 4096


In [16]:
# Only the text-chunk collection is ensured here; the table and table-row
# collections below are left disabled.
ensure_collection("sec_10k_text_chunks", dim)
# ensure_collection("sec_10k_tables", dim)
# ensure_collection("sec_10k_table_rows", dim)

[QDRANT] Collection 'sec_10k_text_chunks' already exists; leaving as-is.


### Ingest embeddings

In [2]:
import sys

# Make the parent directory importable so the `src` package resolves.
# The membership check keeps sys.path from gaining a duplicate entry each
# time this cell is re-run.
if "../" not in sys.path:
    sys.path.append("../")

from src.ingestion.qdrant_ingester import docs_to_points
from qdrant_client import models

In [3]:
  # Toy document carrying every vector field that docs_to_points is asked to handle below.
  docs = [
      {
          "id": "TEST::0",
          "content": "Apple reported net sales growth driven by iPhone and Services.",
          "metadata": {"ticker": "AAPL", "doc_type": "text_chunk"},
          "embedding": [0.1, 0.2, 0.3],                    # dense
          "bge_m3_dense": [0.01] * 1024,                   # bge-m3 dense
          "bge_m3_sparse": {"indices": [1, 10], "values": [0.2, 0.8]},  # bge-m3 sparse
      }
  ]

  # Convert to Qdrant points, requesting a BM25 entry and naming the doc keys
  # that hold the bge-m3 dense and sparse vectors.
  pts = docs_to_points(
      docs,
      include_bm25=True,
      bge_m3_dense_key="bge_m3_dense",
      bge_m3_sparse_key="bge_m3_sparse",
  )

  # Inspect the first point. Per the recorded output, p.vector is keyed by
  # 'dense', 'bm25', 'bge_m3_dense' and 'bge_m3_sparse'.
  p = pts[0]
  print("point id:", p.id)
  print("vector keys:", list(p.vector.keys()))
  # The lengths mirror the toy inputs: 3 for "embedding", 1024 for "bge_m3_dense".
  print("dense dim:", len(p.vector["dense"]))
  print("bge_m3_dense dim:", len(p.vector["bge_m3_dense"]))
  print("bge_m3_sparse:", type(p.vector["bge_m3_sparse"]), p.vector["bge_m3_sparse"].indices, p.vector["bge_m3_sparse"].values)
  print("bm25:", type(p.vector["bm25"]), "model=", p.vector["bm25"].model)

point id: 0f54ac29-1288-5813-9f6b-e42431bbca65
vector keys: ['dense', 'bm25', 'bge_m3_dense', 'bge_m3_sparse']
dense dim: 3
bge_m3_dense dim: 1024
bge_m3_sparse: <class 'qdrant_client.http.models.models.SparseVector'> [1, 10] [0.2, 0.8]
bm25: <class 'qdrant_client.http.models.models.Document'> model= Qdrant/bm25


In [4]:
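# NOTE: upserts are disabled. In the cells shown here, `upsert_docs` is never imported and
# `embedded_rows` is never defined; provide both before re-enabling these lines.
# upsert_docs(client, "sec_10k_text_chunks", embedded_text)
# upsert_docs(client, "sec_10k_tables", embedded_tables)
# upsert_docs(client, "sec_10k_table_rows", embedded_rows)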

### Check contents in container

In [12]:
from pathlib import Path
import json

def count_jsonl(path: Path) -> int:
    """Return how many lines of the UTF-8 file at `path` are not blank.

    Lines that are empty or contain only whitespace are ignored.
    """
    with path.open("r", encoding="utf-8") as handle:
        return sum(1 for raw_line in handle if raw_line.strip())

# Directory holding the embedded JSONL exports for this filing (adjust as needed).
base = Path("../data/embedding/AAPL_10-K_2024")

# Non-blank line counts per file, reported below as the expected point counts.
expected_text = count_jsonl(base / "AAPL_10-K_2024.text.embedded.jsonl")
expected_tables = count_jsonl(base / "AAPL_10-K_2024.tables.embedded.jsonl")
expected_rows = count_jsonl(base / "AAPL_10-K_2024.tables.rows.embedded.jsonl")

print("Expected points:")
for label, expected in (
    ("  sec_10k_text_chunks   =", expected_text),
    ("  sec_10k_tables        =", expected_tables),
    ("  sec_10k_table_rows    =", expected_rows),
):
    print(label, expected)

Expected points:
  sec_10k_text_chunks   = 110
  sec_10k_tables        = 48
  sec_10k_table_rows    = 450


In [6]:
from qdrant_client import QdrantClient

# Open a client to the local Qdrant server (same host/port as the connection
# cell near the top), so this cell does not rely on `client` from an earlier cell.
client = QdrantClient(host="localhost", port=6333)

def qdrant_count(name: str) -> int:
    """Return the exact point count that Qdrant reports for collection `name`."""
    count_result = client.count(collection_name=name, exact=True)
    return count_result.count

print("Qdrant counts:")
# qdrant_count("sec_docs")
# The bare call below is the cell's last expression, so its value is rendered as
# the cell output instead of being printed next to a label.
# NOTE(review): in the recorded run this is 608, which equals the three expected
# counts summed (110 + 48 + 450). Presumably sec_docs_hybrid holds text, table and
# row points together; confirm.
qdrant_count("sec_docs_hybrid")
# Per-collection counts, currently disabled:
# print("  sec_10k_text_chunks   =", qdrant_count("sec_10k_text_chunks"))
# print("  sec_10k_tables        =", qdrant_count("sec_10k_tables"))
# print("  sec_10k_table_rows    =", qdrant_count("sec_10k_table_rows"))

Qdrant counts:


608

In [13]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
