# Test Retrieval Pipeline

In [1]:
import requests
from __future__ import annotations
from typing import Any, Dict, List, Optional
from collections import defaultdict
import requests
# import qdrant_client
from qdrant_client import QdrantClient, models

In [2]:
import importlib.metadata as im

print(im.version("qdrant-client"))

1.16.1


In [3]:
import qdrant_client
import inspect

print("qdrant_client module:", qdrant_client)
print("module file:", getattr(qdrant_client, "__file__", "NO __file__"))

qdrant_client module: <module 'qdrant_client' from '/Users/shicheny/miniforge3/envs/finsearch-arm/lib/python3.11/site-packages/qdrant_client/__init__.py'>
module file: /Users/shicheny/miniforge3/envs/finsearch-arm/lib/python3.11/site-packages/qdrant_client/__init__.py


In [4]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)

print([m for m in dir(client) if "search" in m.lower()])

['search_matrix_offsets', 'search_matrix_pairs']


In [5]:
import qdrant_client
from qdrant_client import QdrantClient

print("qdrant_client module:", qdrant_client)
print("module file:", getattr(qdrant_client, "__file__", "NO __file__"))

client = QdrantClient(host="localhost", port=6333)

print("client type:", type(client))
print("search-like attributes on client:",
      [m for m in dir(client) if "search" in m.lower()])


qdrant_client module: <module 'qdrant_client' from '/Users/shicheny/miniforge3/envs/finsearch-arm/lib/python3.11/site-packages/qdrant_client/__init__.py'>
module file: /Users/shicheny/miniforge3/envs/finsearch-arm/lib/python3.11/site-packages/qdrant_client/__init__.py
client type: <class 'qdrant_client.qdrant_client.QdrantClient'>
search-like attributes on client: ['search_matrix_offsets', 'search_matrix_pairs']


In [6]:
from dataclasses import dataclass
from typing import Any, Dict, Optional

@dataclass
class SecDocHit:
    score: float
    doc_id: str
    content: str
    doc_type: str
    metadata: Dict[str, Any]

    # Convenience pointers
    ticker: Optional[str] = None
    fiscal_year: Optional[int] = None
    form_type: Optional[str] = None
    table_index: Optional[int] = None
    row_index: Optional[int] = None
    row_label: Optional[str] = None
    section_path: Optional[str] = None

def to_sec_doc_hit(scored_point) -> SecDocHit:
    payload = scored_point.payload or {}
    doc_id = payload.get("doc_id") or payload.get("id") or ""
    content = payload.get("content") or ""
    doc_type = payload.get("doc_type") or payload.get("metadata", {}).get("doc_type") or "unknown"

    # Your ingestion puts metadata at top-level payload, so just reuse payload
    md = payload

    return SecDocHit(
        score=float(scored_point.score),
        doc_id=doc_id,
        content=content,
        doc_type=doc_type,
        metadata=md,
        ticker=md.get("ticker"),
        fiscal_year=md.get("fiscal_year"),
        form_type=md.get("form_type"),
        table_index=md.get("table_index"),
        row_index=md.get("row_index"),
        row_label=md.get("row_label"),
        section_path=md.get("section_path") if isinstance(md.get("section_path"), str) else None,
    )

def group_by_doc_type(hits: List[SecDocHit]) -> Dict[str, List[SecDocHit]]:
    grouped = defaultdict(list)
    for h in hits:
        grouped[h.doc_type].append(h)
    # Keep each group sorted by score
    return {k: sorted(v, key=lambda x: x.score, reverse=True) for k, v in grouped.items()}

def dedupe_hits(hits: List[SecDocHit]) -> List[SecDocHit]:
    seen = set()
    out = []
    for h in hits:
        if h.doc_id and h.doc_id not in seen:
            seen.add(h.doc_id)
            out.append(h)
    return out

In [7]:
# --- 1) Query embedding via qwen3-embedding:8b (Ollama) --------------------

def embed_query_qwen3(
    query: str,
    api_url: str = "http://localhost:11434/api/embed",
    model: str = "qwen3-embedding:8b",
    timeout: int = 60,
) -> List[float]:
    """
    Embed a query string using qwen3-embedding:8b via Ollama.

    Returns a single embedding vector (list[float]).
    """
    payload = {
        "model": model,
        "input": [query],
    }
    resp = requests.post(api_url, json=payload, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # Ollama's /api/embed returns {"embeddings": [[...]]}
    return data["embeddings"][0]


# --- 2) Filter builder -----------------------------------------------------

def build_sec_filter(
    doc_types: Optional[List[str]] = None,  # e.g. ["text_chunk"], ["table"], ["table_row"]
    ticker: Optional[str] = None,          # e.g. "AAPL"
    fiscal_year: Optional[int] = None,     # e.g. 2024
    form_type: Optional[str] = None,       # e.g. "10-K"
) -> Optional[models.Filter]:
    """
    Build a Qdrant filter for the sec_docs collection.
    Returns None if no filter is needed.
    """
    must: List[models.Condition] = []

    if doc_types:
        # MatchAny lets us say "doc_type in [..]"
        must.append(
            models.FieldCondition(
                key="doc_type",
                match=models.MatchAny(any=doc_types),
            )
        )

    if ticker is not None:
        must.append(
            models.FieldCondition(
                key="ticker",
                match=models.MatchValue(value=ticker),
            )
        )

    if fiscal_year is not None:
        must.append(
            models.FieldCondition(
                key="fiscal_year",
                match=models.MatchValue(value=fiscal_year),
            )
        )

    if form_type is not None:
        must.append(
            models.FieldCondition(
                key="form_type",
                match=models.MatchValue(value=form_type),
            )
        )

    if not must:
        return None

    return models.Filter(must=must)


# --- 3) Core dense search over sec_docs ------------------------------------

def dense_search_sec_docs(
    query: str,
    *,
    client: QdrantClient,
    embed_fn,                          # function: str -> List[float]
    collection_name: str = "sec_docs",
    top_k: int = 10,
    doc_types: Optional[List[str]] = None,
    ticker: Optional[str] = None,
    fiscal_year: Optional[int] = None,
    form_type: Optional[str] = None,
) -> List[models.ScoredPoint]:
    """
    Dense search over the sec_docs collection.

    Parameters
    ----------
    query : str
        User query.
    client : QdrantClient
        Active Qdrant client.
    embed_fn : callable
        Function that embeds the query into a vector.
    collection_name : str
        Name of the Qdrant collection ("sec_docs").
    top_k : int
        Number of results to return.
    doc_types : list[str] or None
        Filter by type, e.g. ["text_chunk"], ["table"], ["table_row"], or None for all.
    ticker, fiscal_year, form_type : optional
        Filters for specific filing universe.

    Returns
    -------
    List[ScoredPoint]
    """
    query_vec = embed_fn(query)
    qfilter = build_sec_filter(
        doc_types=doc_types,
        ticker=ticker,
        fiscal_year=fiscal_year,
        form_type=form_type,
    )

    hits = client.query_points(
        collection_name=collection_name,
        query=query_vec,
        query_filter=qfilter,
        limit=top_k,
        with_payload=True,
        with_vectors=False,
    )

    hits_cleaned = list(map(lambda x:to_sec_doc_hit(x), hits.points))
    hits_cleaned = dedupe_hits(hits_cleaned)
    return hits.points


### All docs

In [9]:
query = "Where does Apple discuss share repurchases in the 2024 10-K?"

hits = dense_search_sec_docs(
    query,
    client=client,
    embed_fn=embed_query_qwen3,
    ticker="AAPL",
    fiscal_year=2024,
    form_type="10-K",
    top_k=10,
    doc_types=None,  # search across text + tables + rows
)

In [10]:
for h in hits:
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("content:", (p.get("content") or "")[:250], "...\n")

score=0.6833  doc_type=table  doc_id=AAPL_10-K_2024::table::0
section: Purchases of Equity Securities by the Issuer and Affiliated Purchasers
content: Table summary: The table details Apple Inc.'s purchases of its common equity during specific periods in fiscal year 2024, including the number of shares purchased, average price paid per share, and total purchases under publicly announced plans or pr ...

score=0.6540  doc_type=table_row  doc_id=AAPL_10-K_2024::table::13::row::10
section: CONSOLIDATED STATEMENTS OF SHAREHOLDERS’ EQUITY
content: Common stock repurchased: The total amount spent on repurchasing common stock during the fiscal year ending September 28, 2024. ...

score=0.6451  doc_type=text_chunk  doc_id=AAPL_10-K_2024::text::37
section: Purchases of Equity Securities by the Issuer and Affiliated Purchasers
content: Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities > Purchases of Equity Securities by the Issue

### Text Only

In [12]:
query2 = "What risks does Apple highlight about the App Store?"
q2_hits = dense_search_sec_docs(
    query2,
    client=client,
    embed_fn=embed_query_qwen3,
    ticker="AAPL",
    fiscal_year=2024,
    form_type="10-K",
    top_k=5,
    doc_types=None,  # search across text + tables + rows
)
for h in q2_hits:
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("content:", (p.get("content") or ""), "...\n")

score=0.6248  doc_type=text_chunk  doc_id=AAPL_10-K_2024::text::22
section: Legal and Regulatory Compliance Risks
content: Item 1. Business > Legal and Regulatory Compliance Risks

Expectations relating to environmental, social and governance considerations and related reporting obligations expose the Company to potential liabilities, increased costs, reputational harm, and other adverse effects on the Company’s business.

Many governments, regulators, investors, employees, customers and other stakeholders are increasingly focused on environmental, social and governance considerations relating to businesses, including climate change and greenhouse gas emissions, human and civil rights, and diversity, equity and inclusion. In addition, the Company makes statements about its goals and initiatives through its various non-financial reports, information provided on its website, press statements and other communications. Responding to these environmental, social and governance considerations

### Tables Only

In [13]:
query3 = "what are the segment net sales by region in 2024?"
q3_hits = dense_search_sec_docs(
    query3,
    client=client,
    embed_fn=embed_query_qwen3,
    ticker="AAPL",
    fiscal_year=2024,
    form_type="10-K",
    top_k=5,
    doc_types=None,  # search across text + tables + rows
)
for h in q3_hits:
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("content:", (p.get("content") or "")[:250], "...\n")

score=0.7103  doc_type=table_row  doc_id=AAPL_10-K_2024::table::2::row::5
section: Segment Operating Performance
content: Total net sales – Net sales$: Total net sales across all segments in fiscal year 2024 compared to 2023 with a change percentage. ...

score=0.7015  doc_type=table_row  doc_id=AAPL_10-K_2024::table::2::row::1
section: Segment Operating Performance
content: Europe – Net sales$: Net sales for the Europe segment in fiscal year 2024 compared to 2023 with a change percentage. ...

score=0.6848  doc_type=table_row  doc_id=AAPL_10-K_2024::table::38::row::2
section: Note 13 – Segment Information and Geographic Data
content: Europe – Net sales$: Net sales for the Europe segment across fiscal years 2024, 2023, and 2022. ...

score=0.6845  doc_type=table_row  doc_id=AAPL_10-K_2024::table::2::row::0
section: Segment Operating Performance
content: Americas – Net sales$: Net sales for the Americas segment in fiscal year 2024 compared to 2023 with a change percentage. ...

score=0.

### Table Rows Only

In [121]:
query4 = "operating income in Europe 2024"
q4_hits = dense_search_sec_docs(
    query4,
    client=client,
    embed_fn=embed_query_qwen3,
    ticker="AAPL",
    fiscal_year=2024,
    form_type="10-K",
    top_k=5,
    doc_types=None,  # search across text + tables + rows
)
for h in q4_hits:
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("content:", (p.get("content") or "")[:250], "...\n")

score=0.8101  doc_type=table_row  doc_id=AAPL_10-K_2024::table::38::row::3
section: Note 13 – Segment Information and Geographic Data
content: Europe – Operating income$: Operating income for the Europe segment across fiscal years 2024, 2023, and 2022. ...

score=0.6796  doc_type=table_row  doc_id=AAPL_10-K_2024::table::2::row::1
section: Segment Operating Performance
content: Europe – Net sales$: Net sales for the Europe segment in fiscal year 2024 compared to 2023 with a change percentage. ...

score=0.6558  doc_type=table_row  doc_id=AAPL_10-K_2024::table::38::row::2
section: Note 13 – Segment Information and Geographic Data
content: Europe – Net sales$: Net sales for the Europe segment across fiscal years 2024, 2023, and 2022. ...

score=0.6477  doc_type=table_row  doc_id=AAPL_10-K_2024::table::39::row::0
section: Note 13 – Segment Information and Geographic Data
content: Segment operating income: Operating income for the segment across fiscal years 2024, 2023, and 2022. ...

score

In [18]:
q = "What were iPhone net sales in 2024 and 2023?"
# "What were Apple’s total net sales in fiscal 2024?"
q4_hits = dense_search_sec_docs(
    q,
    client=client,
    embed_fn=embed_query_qwen3,
    ticker="AAPL",
    fiscal_year=2024,
    form_type="10-K",
    top_k=5,
    doc_types=None,  # search across text + tables + rows
)
for h in q4_hits:
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("content:", (p.get("content") or "")[:250], "...\n")

score=0.8437  doc_type=table_row  doc_id=AAPL_10-K_2024::table::15::row::0
section: Note 2 – Revenue
content: iPhone – Net sales$: Net sales for the iPhone product line across fiscal years 2024, 2023, and 2022. ...

score=0.8420  doc_type=table_row  doc_id=AAPL_10-K_2024::table::3::row::0
section: Products and Services Performance
content: iPhone – Net sales$: Net sales for the iPhone product category across fiscal years 2024, 2023, and 2022. ...

score=0.7683  doc_type=text_chunk  doc_id=AAPL_10-K_2024::text::44
section: Products and Services Performance
content: Item 6. [Reserved] > Products and Services Performance

The following table shows net sales by category for 2024, 2023 and 2022 (dollars in millions):

(1)Services net sales include amortization of the deferred value of services bundled in the sales  ...

score=0.7364  doc_type=table  doc_id=AAPL_10-K_2024::table::3
section: Products and Services Performance
content: Table summary: The table provides a breakdown of Apple Inc.

In [39]:
h.payload

{'doc_id': 'AAPL_10-K_2024::table::15::row::2',
 'content': 'iPad – Net sales$: Net sales for the iPad product line across fiscal years 2024, 2023, and 2022.',
 'prefix': 'AAPL_10-K_2024',
 'ticker': 'AAPL',
 'form_type': '10-K',
 'fiscal_year': 2024,
 'doc_type': 'table_row',
 'table_index': 15,
 'row_index': 2,
 'row_label': 'iPad – Net sales$',
 'section_title': 'Note 2 – Revenue',
 'section_path': 'PART II > Item 6. [Reserved] > Note 2 – Revenue',
 'item_id': '6',
 'item_title': 'Item 6. [Reserved]',
 'source_html': 'data/html_filings/AAPL/10-K/10-K_2024.html',
 'source': 'table_row'}

# Run Test Queries

In [8]:
client = QdrantClient(host="localhost", port=6333)

In [9]:
%reload_ext autoreload
%autoreload 2

In [10]:
import sys
sys.path.append("../") 
from src.retrieval_evaluator import (
      embed_query_qwen3,
      dense_search_sec_docs,
      load_table_eval,
      evaluate_table_queries,
      pretty_print_table_example,
      run_table_eval_and_print,
        print_query_retrieval_details,
        hybrid_search_sec_docs_bge_m3,
        hybrid_search_sec_docs_rrf
  )

In [11]:
summary, results = run_table_eval_and_print(
      client=client,
      eval_path="table_eval.jsonl",
      embed_fn=embed_query_qwen3,
      ticker="AAPL",
      fiscal_year=2024,
      form_type="10-K",
      top_k=50,
      max_examples_to_show=10,
  )

=== Summary metrics ===
num_queries: 24
hit_rate@50: 1.0000
precision@50: 0.1417
recall@50: 6.1667
mrr@50: 0.9444

=== Per-query metrics ===
 qid   Hit      P      R    MRR   #rel@k  #rel_total  query
 101  1.00   0.10   5.00   1.00        5           1  How many shares did Apple repurchase in total during the June 30, 2024 to Sep...
 102  1.00   0.10   5.00   1.00        5           1  What was the approximate dollar value of shares that may yet be purchased und...
 103  1.00   0.08   4.00   1.00        4           1  In Apple’s stock performance table (September 2019 = 100), what was Apple Inc...
 104  1.00   0.12   6.00   1.00        6           1  What was total property, plant and equipment, net in 2024 (in millions) in Ap...
 105  1.00   0.44  11.00   1.00       22           2  What were total lease liabilities in 2024 (operating + finance, in millions) ...
 106  1.00   0.06   3.00   1.00        3           1  What was total other current liabilities in 2024 (in millions) in Appl

In [12]:
 print_query_retrieval_details(
      query_id=6,
      eval_path="table_eval.jsonl",
      results=results,
      max_hits=50,
  )

[WARN] No example / result found for query_id=6


# Hybrid Search

### BGE_M3

In [17]:
# query = "What was Apple’s total debt (short-term plus long-term) at year-end 2024?"
# raw_hits = hybrid_search_sec_docs_bge_m3(
#     query,
#     client=client,
#     collection_name="sec_docs_hybrid",
#     top_k=10,
#     ticker='AAPL',
#     fiscal_year=2024,
#     form_type="10-K",
#     # doc_types=["table", "table_row"],
# )

In [18]:
# for h in raw_hits:
#     p = h.payload or {}
#     print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
#     print("section:", p.get("section_title"))
#     print("section:", p.get("table_index"))
#     print("content:", (p.get("content") or "")[:250], "...\n")

### Dense + BM25 with RRF

In [16]:
# raw_hits = hybrid_search_sec_docs_rrf(
#     query,
#     client=client,
#     collection_name="sec_docs_hybrid",
#     embed_fn=embed_query_qwen3,          # same dense embed fn you used before
#     ticker='AAPL',
#     fiscal_year=2024,
#     form_type='10-K',
#     top_k=10,
#     doc_types=["table", "table_row"],
#     using_dense="dense",
#     using_bm25="bm25",
#     rrf_k=60,
#     w_dense=10,
#     w_bm25=1,
#     dense_limit=20,
#     bm25_limit=10,
#     # avg_len=avg_len,  # if you compute it
# )


In [15]:
# for h in raw_hits:
#     p = h.payload or {}
#     print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
#     print("section:", p.get("section_title"))
#     print("section:", p.get("table_index"))
#     print("content:", (p.get("content") or "")[:250], "...\n")

# With Reranker

#### Steps:
- Use BGE-M3 hybrid retrieval to get top k = 50 results
- Dedupe by doc_id
- Return final top K = 20

In [19]:
def dedupe_scored_points(
    points: List[models.ScoredPoint],
    key_fn: Callable[[models.ScoredPoint], str],
) -> List[models.ScoredPoint]:
    best: Dict[str, models.ScoredPoint] = {}
    for p in points:
        k = key_fn(p)
        if k not in best or float(p.score) > float(best[k].score):
            best[k] = p
    # keep stable order by score desc
    return sorted(best.values(), key=lambda x: float(x.score), reverse=True)

def normalize_doc_id_to_table(doc_id: str) -> str:
    """
    Collapse table rows to their parent table id.
    Leaves table-level and text ids as-is.
    """
    if not doc_id:
        return ""
    # If it's a table row, strip the row suffix
    if "::table::" in doc_id and "::row::" in doc_id:
        return doc_id.split("::row::", 1)[0]
    return doc_id

def table_group_key(p: models.ScoredPoint) -> str:
    md = p.payload or {}
    doc_id = md.get("doc_id") or md.get("id") or ""
    return normalize_doc_id_to_table(doc_id)

def cap_per_group(
    points: List[models.ScoredPoint],
    key_fn: Callable[[models.ScoredPoint], str],
    *,
    cap: int = 2,
    max_total: int = 120,
) -> List[models.ScoredPoint]:
    buckets = defaultdict(list)
    for p in sorted(points, key=lambda x: float(x.score), reverse=True):
        k = key_fn(p)
        if len(buckets[k]) < cap:
            buckets[k].append(p)
    # flatten in score order and apply max_total
    out = []
    for p in sorted([p for lst in buckets.values() for p in lst], key=lambda x: float(x.score), reverse=True):
        out.append(p)
        if len(out) >= max_total:
            break
    return out
    
def doc_id_table_key(p: models.ScoredPoint) -> str:
    md = p.payload or {}
    doc_id = md.get("doc_id") or md.get("id") or ""
    return normalize_doc_id_to_table(doc_id)

In [13]:
# query = "What was Apple’s total debt (short-term plus long-term) at year-end 2024?"
# raw_hits = hybrid_search_sec_docs_bge_m3(
#     query,
#     client=client,
#     collection_name="sec_docs_hybrid",
#     top_k=50,
#     ticker='AAPL',
#     fiscal_year=2024,
#     form_type="10-K",
#     # doc_types=["table", "table_row"],
# )
# capped_hits = cap_per_group(raw_hits, key_fn=table_group_key)
# # by_table = dedupe_scored_points(raw_hits, key_fn=doc_id_table_key)[:20]

In [14]:
# print(len(capped_hits), "results.\n")
# for h in capped_hits:
#     p = h.payload or {}
#     print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
#     print("section:", p.get("section_title"))
#     print("section:", p.get("table_index"))
#     print("content:", (p.get("content") or "")[:250], "...\n")

### Reranker Model

In [20]:
%reload_ext autoreload
%autoreload 2
import time
import sys
sys.path.append("../src") 

from retrieval_evaluator import (
      run_table_eval_and_print,
      hybrid_search_sec_docs_bge_m3,
      cap_per_group,
      table_group_key,
      rerank_with_minilm_l6,
      dedupe_scored_points,
      dense_search_points,
      dense_search_sec_docs,
      embed_query_qwen3,
      format_passage_for_rerank,
      rerank_with_bge_reranker_large,
      get_bge_reranker_large_model,
      get_gte_multilingual_reranker_base,
      rerank_with_gte_multilingual_reranker_base,
      get_granite_reranker_english_r2_model,
      rerank_with_granite_english_r2,
      get_qwen3_reranker_model,
      rerank_with_qwen3_reranker,
  )

In [21]:
client = QdrantClient(host="localhost", port=6333)
# client = QdrantClient(host="localhost", port=6333)
query = "What was Apple’s total debt (short-term plus long-term) at year-end 2024?"
raw = dense_search_sec_docs(
  query,
  client=client,
  embed_fn=embed_query_qwen3,
  collection_name="sec_docs_hybrid",
  using_dense="dense",
  top_k=20,
  ticker="AAPL",
  fiscal_year=2024,
  form_type="10-K",
  doc_types=["table", "table_row"],
)

# pairs = [
#         [query, format_passage_for_rerank(p, max_chars=2000)]
#         for p in raw
#     ]

i = 1
for h in raw[:10]:
    print("rank: ", i)
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("section:", p.get("table_index"))
    print("content:", (p.get("content") or "")[:250], "...\n")
    i += 1

rank:  1
score=0.6737  doc_type=table  doc_id=AAPL_10-K_2024::table::32
section: Term Debt
section: 32
content: Table summary: The table details term debt maturities and related financial information for Apple Inc., covering the years 2023 and 2024, including fixed-rate notes and their effective interest rates.
Rows: 2013 – 2023 debt issuances: – Fixed-rate 0. ...

rank:  2
score=0.6484  doc_type=table_row  doc_id=AAPL_10-K_2024::table::32::row::6
section: Term Debt
section: 32
content: Total term debt principal: Total principal amount of term debt for the fiscal years 2024 and 2023. ...

rank:  3
score=0.6349  doc_type=table  doc_id=AAPL_10-K_2024::table::23
section: Other Current Liabilities
section: 23
content: Table summary: The table provides details on other current liabilities for Apple Inc., specifically focusing on income taxes payable and total other current liabilities for the fiscal years 2024 and 2023.
Rows: Income taxes payable: Amount of income  ...

rank:  4
score=0.633

In [22]:
model = get_bge_reranker_large_model(model_name="BAAI/bge-reranker-large")
# get_qwen3_reranker_model(model_name="Qwen/Qwen3-Reranker-8B")
# get_granite_reranker_english_r2_model(model_name="ibm-granite/granite-embedding-reranker-english-r2") 
# model = get_gte_multilingual_reranker_base()
# scores = model.compute_score(pairs)

# jina = get_jina_reranker_v3_model()
reranked = rerank_with_bge_reranker_large(query, raw, top_k=10, model=model, max_passage_chars=4000)
# rerank_with_qwen3_reranker(query, raw, top_k=20, model_name="Qwen/Qwen3-Reranker-4B")
# rerank_with_granite_english_r2(query, raw, top_k=20, max_passage_chars=4000, model=model) 
# rerank_with_gte_multilingual_reranker_base(query, raw, top_k=10, 
#                                                     model = model, max_passage_chars=5000)
# rerank_with_bge_reranker_large(query, raw, top_k=10, model=model)
# reranked = dedupe_scored_points(reranked, key_fn=table_group_key)
i = 1
for h in reranked:
    print("rank: ", i)
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("section:", p.get("table_index"))
    print("content:", (p.get("content") or "")[:250], "...\n")
    i += 1
# [(float(p.score), (p.payload or {}).get("doc_id"), (p.payload or {}).get("doc_type")) for p in reranked]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


rank:  1
score=-0.3342  doc_type=table  doc_id=AAPL_10-K_2024::table::32
section: Term Debt
section: 32
content: Table summary: The table details term debt maturities and related financial information for Apple Inc., covering the years 2023 and 2024, including fixed-rate notes and their effective interest rates.
Rows: 2013 – 2023 debt issuances: – Fixed-rate 0. ...

rank:  2
score=-1.0771  doc_type=table  doc_id=AAPL_10-K_2024::table::23
section: Other Current Liabilities
section: 23
content: Table summary: The table provides details on other current liabilities for Apple Inc., specifically focusing on income taxes payable and total other current liabilities for the fiscal years 2024 and 2023.
Rows: Income taxes payable: Amount of income  ...

rank:  3
score=-1.8857  doc_type=table  doc_id=AAPL_10-K_2024::table::24
section: Other Non-Current Liabilities
section: 24
content: Table summary: The table provides details on other non-current liabilities for Apple Inc., including income taxes

In [None]:
# query = "What was Apple’s total debt (short-term plus long-term) at year-end 2024?"
# raw = hybrid_search_sec_docs_bge_m3(query, client=client, top_k=100)
# cands = cap_per_group(raw, key_fn=table_group_key, cap=2, max_total=80)
# reranked = rerank_with_minilm_l6(query, cands, top_k=50)
# final = dedupe_scored_points(reranked, key_fn=table_group_key)[:15]

In [None]:
# i = 1
# for h in final:
#     print("rank: ", i)
#     p = h.payload or {}
#     print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
#     print("section:", p.get("section_title"))
#     print("section:", p.get("table_index"))
#     print("content:", (p.get("content") or "")[:250], "...\n")
#     i += 1

In [23]:
def search_fn(q: str):
    raw = dense_search_sec_docs(
                          q,
                          client=client,
                          embed_fn=embed_query_qwen3,
                          collection_name="sec_docs_hybrid",
                          using_dense="dense",
                          top_k=20,
                          ticker="AAPL",
                          fiscal_year=2024,
                          form_type="10-K",
                          doc_types=["table", "table_row"],
                        )
    model = get_bge_reranker_large_model() 
    reranked = rerank_with_bge_reranker_large(query, raw, top_k=10, model=model)
    final = dedupe_scored_points(reranked, key_fn=table_group_key)[:10]
    return final

t0 = time.perf_counter()
summary, results = run_table_eval_and_print(
      client=client,
      eval_path="table_eval_v1.jsonl",
      ticker="AAPL",
      fiscal_year=2024,
      form_type="10-K",
      top_k=15,
      search_fn=search_fn,
  )
dt = time.perf_counter() - t0
print(f"Elapsed: {dt:.3f}s")

=== Summary metrics ===
num_queries: 12
hit_rate@15: 1.0000
precision@15: 0.3653
recall@15: 0.9028
mrr@15: 0.6716

=== Per-query metrics ===
 qid   Hit      P      R    MRR   #rel@k  #rel_total  query
   1  1.00   0.62   1.00   1.00        5           5  What were Apple’s total net sales in fiscal 2024?
   2  1.00   0.33   1.00   1.00        3           3  What were Apple’s net sales from Services in 2024?
   3  1.00   0.20   0.67   0.33        2           3  What was Apple’s research and development expense in 2024?
   4  1.00   0.67   1.00   1.00        2           2  What was diluted earnings per share in 2024?
   5  1.00   0.12   0.50   0.25        1           2  How much cash and cash equivalents did Apple report at September 28, 2024?
   6  1.00   0.10   1.00   0.14        1           1  What was Apple’s total debt (short-term plus long-term) at year-end 2024?
   7  1.00   0.33   1.00   0.50        2           2  What were iPhone net sales in 2024 and 2023?
   8  1.00   0.50   1.

In [49]:
def search_fn_dense(q: str):
    raw = dense_search_sec_docs(
                          q,
                          client=client,
                          embed_fn=embed_query_qwen3,
                          collection_name="sec_docs_hybrid",
                          using_dense="dense",
                          top_k=20,
                          ticker="AAPL",
                          fiscal_year=2024,
                          form_type="10-K",
                          doc_types=["table", "table_row"],
                        )
    final = dedupe_scored_points(raw, key_fn=table_group_key)[:10]
    return final
dense_summary, dense_results = run_table_eval_and_print(
      client=client,
      eval_path="table_eval_v1.jsonl",
      ticker="AAPL",
      fiscal_year=2024,
      form_type="10-K",
      top_k=10,
      embed_fn=embed_query_qwen3,
      search_fn=search_fn_dense,
      # IMPORTANT if you’re using the hybrid collection:
      # collection_name="sec_docs_hybrid",  # if your wrapper exposes it; otherwise use search_fn
  )

=== Summary metrics ===
num_queries: 12
hit_rate@10: 1.0000
precision@10: 0.3231
recall@10: 0.9583
mrr@10: 0.9306

=== Per-query metrics ===
 qid   Hit      P      R    MRR   #rel@k  #rel_total  query
   1  1.00   0.62   1.00   1.00        5           5  What were Apple’s total net sales in fiscal 2024?
   2  1.00   0.33   1.00   1.00        3           3  What were Apple’s net sales from Services in 2024?
   3  1.00   0.30   1.00   1.00        3           3  What was Apple’s research and development expense in 2024?
   4  1.00   0.40   1.00   1.00        2           2  What was diluted earnings per share in 2024?
   5  1.00   0.11   0.50   1.00        1           2  How much cash and cash equivalents did Apple report at September 28, 2024?
   6  1.00   0.10   1.00   0.17        1           1  What was Apple’s total debt (short-term plus long-term) at year-end 2024?
   7  1.00   0.33   1.00   1.00        2           2  What were iPhone net sales in 2024 and 2023?
   8  1.00   0.40   1.

In [24]:
def search_fn(q: str):
    raw = dense_search_sec_docs(
                          q,
                          client=client,
                          embed_fn=embed_query_qwen3,
                          collection_name="sec_docs_hybrid",
                          using_dense="dense",
                          top_k=20,
                          ticker="AAPL",
                          fiscal_year=2024,
                          form_type="10-K",
                          doc_types=["table", "table_row"],
                        )
    model = get_bge_reranker_large_model() 
    reranked = rerank_with_bge_reranker_large(query, raw, top_k=10, model=model)
    final = dedupe_scored_points(reranked, key_fn=table_group_key)[:10]
    return final

t0 = time.perf_counter()
summary, results = run_table_eval_and_print(
      client=client,
      eval_path="table_eval_v1.jsonl",
      ticker="AAPL",
      fiscal_year=2024,
      form_type="10-K",
      top_k=15,
      search_fn=search_fn,
  )
dt = time.perf_counter() - t0
print(f"Elapsed: {dt:.3f}s")

=== Summary metrics ===
num_queries: 12
hit_rate@15: 1.0000
precision@15: 0.3653
recall@15: 0.9028
mrr@15: 0.6716

=== Per-query metrics ===
 qid   Hit      P      R    MRR   #rel@k  #rel_total  query
   1  1.00   0.62   1.00   1.00        5           5  What were Apple’s total net sales in fiscal 2024?
   2  1.00   0.33   1.00   1.00        3           3  What were Apple’s net sales from Services in 2024?
   3  1.00   0.20   0.67   0.33        2           3  What was Apple’s research and development expense in 2024?
   4  1.00   0.67   1.00   1.00        2           2  What was diluted earnings per share in 2024?
   5  1.00   0.12   0.50   0.25        1           2  How much cash and cash equivalents did Apple report at September 28, 2024?
   6  1.00   0.10   1.00   0.14        1           1  What was Apple’s total debt (short-term plus long-term) at year-end 2024?
   7  1.00   0.33   1.00   0.50        2           2  What were iPhone net sales in 2024 and 2023?
   8  1.00   0.50   1.

In [51]:
q = "How many shares did Apple repurchase in total during the June 30, 2024 to September 28, 2024 periods shown in the share repurchase table?"
r = search_fn(q)
i = 1
for h in r:
    print("rank: ", i)
    p = h.payload or {}
    print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
    print("section:", p.get("section_title"))
    print("section:", p.get("table_index"))
    print("content:", (p.get("content") or "")[:250], "...\n")
    i += 1

rank:  1
score=-2.2266  doc_type=table  doc_id=AAPL_10-K_2024::table::12
section: CONSOLIDATED BALANCE SHEETS
section: 12
content: Table summary: The table presents consolidated balance sheet information for Apple Inc. as of September 28, 2024 and September 30, 2023, including assets, liabilities, and shareholders’ equity.
Rows: September 28, 2024: Date for the balance sheet as  ...

rank:  2
score=-3.7578  doc_type=table  doc_id=AAPL_10-K_2024::table::10
section: CONSOLIDATED STATEMENTS OF OPERATIONS
section: 10
content: Table summary: The table presents consolidated statements of operations for Apple Inc., showing various financial metrics such as net sales, cost of sales, gross margin, operating expenses, and net income for the fiscal years ending September 28, 202 ...

rank:  3
score=-4.3203  doc_type=table  doc_id=AAPL_10-K_2024::table::13
section: CONSOLIDATED STATEMENTS OF SHAREHOLDERS’ EQUITY
section: 13
content: Table summary: The table presents the consolidated statements of 