## Lexical Selector

In [1]:
import requests
from __future__ import annotations
from typing import Any, Dict, List, Optional, Iterable
from pathlib import Path
from collections import defaultdict
import requests
import uuid


# import qdrant_client
from qdrant_client import QdrantClient, models
import importlib.metadata as im
import qdrant_client
import inspect
import time
import sys
import json

from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from huggingface_hub import snapshot_download
from FlagEmbedding import BGEM3FlagModel

%reload_ext autoreload
%autoreload 2

In [13]:
sys.path.append("../src") 

from rag10kq.retrieval_evaluator import (
      run_table_eval_and_print,
      hybrid_search_sec_docs_bge_m3,
      cap_per_group,
      table_group_key,
      rerank_with_minilm_l6,
      dedupe_scored_points,
      dense_search_points,
      dense_search_sec_docs,
      embed_query_qwen3,
      format_passage_for_rerank,
      rerank_with_bge_reranker_large,
      get_bge_reranker_large_model,
      get_gte_multilingual_reranker_base,
      rerank_with_gte_multilingual_reranker_base,
      get_granite_reranker_english_r2_model,
      rerank_with_granite_english_r2,
      get_qwen3_reranker_model,
      rerank_with_qwen3_reranker,
    build_sec_filter,
      embed_query_bge_m3,
      rrf_fuse,
      _point_key,
      _scored_point_with_score,
      dedupe_scored_points,
      doc_id_table_key,
      table_group_key,
      multi_query_hybrid_search_bge_m3, 
      get_bge_m3_model,
      score_and_select_tables,
      tables_to_llm_texts
  )

from rag10kq.utils import accounting_terms_file_to_llm_digest
from rag10kq.utils import chat_with_ollama
from rag10kq.query_expansion_helper import expand_query_with_ollama
from rag10kq.rerank_context_enricher import (
    fetch_table_summaries_for_candidates, 
    enrich_point_for_rerank, 
    enrich_candidates_with_table_summaries
    )

In [3]:
def print_retrieved_results(results: list):
    i = 1
    for h in results:
        print("rank: ", i)
        p = h.payload or {}
        print(f"score={h.score:.4f}  doc_type={p.get('doc_type')}  doc_id={p.get('doc_id')}")
        print("section:", p.get("section_title"))
        print("section:", p.get("table_index"))
        print("content:", (p.get("content") or ""), "...\n")
        i += 1
    return

In [4]:
user_query = "What was Apple’s total debt (short-term plus long-term) at year-end 2024?"
line_items = accounting_terms_file_to_llm_digest("../data/config/SEC_accounting_terms.json")
# response = chat_with_ollama(build_query_expansion_prompt(user_query, line_items), 
#                             model="qwen3:4b-instruct")
expanded_queries = expand_query_with_ollama(
                        user_query, 
                         allowed_line_items=line_items,
                         model = "qwen3:4b-instruct",
                         include_original=True,
                         dedupe=True,
                        )
print(expanded_queries)

['What was Apple’s total debt (short-term plus long-term) at year-end 2024?', 'Total liabilities', 'Total term debt', 'Commercial paper', 'Less: Current portion of term debt', 'Total non-current liabilities']


In [5]:
client = QdrantClient(host="localhost", port=6333)
queries = expanded_queries

### Hybrid Retrieval and Reranking

In [6]:
bge = get_bge_m3_model(model_name="BAAI/bge-m3", use_fp16=False)  # or True on CUDA
fused, hits_by_query = multi_query_hybrid_search_bge_m3(
      queries,
      client=client,
      collection_name="sec_docs_hybrid",
      top_k=20,
      ticker="AAPL",
      fiscal_year=2024,
      form_type="10-K",
      doc_types=["table", "table_row"],
      bge_model=bge,
      fuse=True,
      rrf_k=60,
  )
deduped = dedupe_scored_points(fused, key_fn=doc_id_table_key)
enriched_cands = enrich_candidates_with_table_summaries(
    deduped,
    client=client,
    collection_name="sec_docs_hybrid",        
)
query_for_reranking = queries[0] + " (" + ", ".join(queries[1:]) + ")"
print(query_for_reranking)
model = get_bge_reranker_large_model(model_name="BAAI/bge-reranker-v2-m3")
reranked = rerank_with_bge_reranker_large(
  query_for_reranking,
  enriched_cands,
  top_k=5,
  model=model,
  max_passage_chars=4000,
)
print_retrieved_results(reranked)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


What was Apple’s total debt (short-term plus long-term) at year-end 2024? (Total liabilities, Total term debt, Commercial paper, Less: Current portion of term debt, Total non-current liabilities)


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


rank:  1
score=2.6562  doc_type=table_row  doc_id=AAPL_10-K_2024::table::32::row::10
section: Term Debt
section: 32
content: Table summary: The table details term debt maturities and related financial information for Apple Inc., covering the years 2023 and 2024, including fixed-rate notes and their effective interest rates.
Rows: 2013 – 2023 debt issuances: – Fixed-rate 0.000% – 4.850% notes: Details of fixed-rate term debt issued between 2013 and 2023, including the maturity period from 2024 to 2062. 2013 – 2023 debt issuances: – Fixed-rate 0.000% – 4.850% notes: Amount of fixed-rate term debt issued between 2013 and 2023, with maturities from 2024 to 2062. 2013 – 2023 debt issuances: – Fixed-rate 0.000% – 4.850% notes: Effective interest rates for the fixed-rate term debt issued between 2013 and 2023, with maturities from 2024 to 2062. 2013 – 2023 debt issuances: – Fixed-rate 0.000% – 4.850% notes: Details of fixed-rate term debt issued between 2013 and 2023, including the maturity p

### Applying Lexical Rules

In [7]:
scored_tables = score_and_select_tables(
        reranked,
        queries,
        '2024'
)
print(scored_tables)

[{'table': ScoredPoint(id='6acb1a3b-f486-54e4-b05e-3e141c0597b4', version=1, score=1.5908203125, payload={'doc_id': 'AAPL_10-K_2024::table::12::row::24', 'content': 'Table summary: The table presents consolidated balance sheet information for Apple Inc. as of September 28, 2024 and September 30, 2023, including assets, liabilities, and shareholders’ equity.\nRows: September 28, 2024: Date for the balance sheet as of September 28, 2024. September 30, 2023: Date for the balance sheet as of September 30, 2023. ASSETS: – Current assets: Section header indicating current assets for both fiscal years. Cash and cash equivalents – Current assets: Current assets consisting of cash and cash equivalents as of September 28, 2024 and September 30, 2023.\n\nTerm debt – Non-current liabilities: Non-current liabilities representing term debt as of September 28, 2024 and September 30, 2023.', 'prefix': 'AAPL_10-K_2024', 'ticker': 'AAPL', 'form_type': '10-K', 'fiscal_year': 2024, 'doc_type': 'table_row'

## Context Selector Agent

The Problem: The correct table is at Rank #2. Feeding all Top 5 tables into the generation prompt creates "Context Pollution," where the model might hallucinate by grabbing a number from a similar but irrelevant table (e.g., Rank #1 might be 2023 data). '
The Solution:
- Step: Insert a lightweight "Judge" or "Selector" step before the final answer generation.
  
- Prompt: Send the User Query + Top 5 Raw Tables (or detailed headers) to a fast LLM. Ask it: "Identify the single table that contains the detailed balance sheet or debt breakdown for the fiscal year ended 2024. Return only the Table ID."

Outcome: This isolates the Rank #2 table as the sole context for the reasoning engine, eliminating noise.

### Why Agent?
1. 
An "Agent" (Active):

- Goal: Decision Making & Routing.

- Logic: "Here are 5 potential sources. Evaluate them against the user's constraints. Choose the best one. Discard the rest. If none are good, Stop or Retry."

- Flow: The pipeline is dynamic. The agent controls what data (if any) enters the reasoning engine.
2. 
If you use a LLM Layer: You might concatenate the Top 5 tables and feed them to the final generator. The model sees "Short-term debt" in Table 1 (2023) and Table 2 (2024). LLMs often suffer from __Primacy Bias__ (favoring Rank #1) or simply get confused by seeing the same line item twice, leading to "merged" hallucinated numbers.

If you use an Agent: The agent performs __Listwise Reasoning__. It looks at the candidates simultaneously and reasons: "Table 1 is 2023 data. Table 2 is 2024 data. Table 3 is a 'Fair Value' estimate (irrelevant). Therefore, I will isolate Table 2 and block the others."

This creates a "Clean Room" for the next step. Your Python calculator (Step 3) receives only the correct data, eliminating the possibility of it accidentally grabbing a number from the wrong year.

3. The "Auditor" Role (Finance Specifics)

In finance, semantic relevance (Vector Search) 
!= logical relevance.

Vector Search sees "Debt" and "Table" and thinks "Fair Value of Debt Instruments" is a great match.

The Agent acts as an Auditor. You prompt it with specific constraints: "Reject 'Fair Value' tables. Only accept 'Carrying Value' or 'Consolidated Balance Sheet' tables." It actively judges the validity of the data, not just the similarity.

# Context Generation

In [45]:
from rag10kq.context_generator import load_table_jsonl, build_generator_prompt
tables_dir = "../data/chunked"
# doc_prefix = "AAPL_10-K_2024"
# path = Path(tables_dir) / f"{doc_prefix}.tables.jsonl"
# tables = []
# for idx, line in enumerate(path.open()):
#             tables.append(json.loads(line))
# bs_table = tables[12]    
USER_QUERY = user_query
tables = tables_to_llm_texts(scored_tables, tables_dir=tables_dir)
RETRIEVED_TABLE = tables
# print(tables)

In [46]:
print(tables)

['| 1                                                                                                                                                                    | 2                                     | 3                                     |\n|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------|:--------------------------------------|\n|                                                                                                                                                                      | September 28, 2024                    | September 30, 2023                    |\n| ASSETS:                                                                                                                                                              | ASSETS:                               | ASSETS:                               |

In [35]:
# def build_generator_prompt(
#     USER_QUERY: str,
#     RETRIEVED_TABLE: str
# ):
#     return f'''
#         You are a senior financial analyst. Your task is to answer the user's question using ONLY the provided financial table context.
        
#         **Instructions:**
#         1. Identify the specific column in the table that matches the fiscal year/date requested. (Be careful not to mix data from the prior year's column).
#         2. Identify the relevant rows or line items from the table. 
#         3. If the answer is directly available from the table, no calculation is needed. Otherwise, perform calculations as needed. 
#         3. **Output Format:**
#            - **Part 1 [Scratchpad]:** List the extracted values and show your math: "Value A + Value B = Sum".
#            - **Part 2 [Final Answer]:** Answer the user query with your calculation and your analysis in English. 
        
#         **Context:**
#         User Query: "{USER_QUERY}"
#         Retrieved Table:
#         """
#         {RETRIEVED_TABLE}
#         """

#         **Required Output Format:**
#         [Scratchpad]
#         (Show extraction and math)

#         [Final Answer]
#         '''

In [39]:
Generator_Prompt = build_generator_prompt(USER_QUERY, RETRIEVED_TABLE)
print(Generator_Prompt)


        You are a senior financial analyst. Your task is to answer the user's question using ONLY the provided financial table context.
        
        **Instructions:**
        1. Identify the specific column in the table that matches the fiscal year/date requested. (Be careful not to mix data from the prior year's column).
        2. Identify the relevant rows or line items from the table. 
        3. If the answer is directly available from the table, no calculation is needed. Otherwise, perform calculations as needed. 
        4. **Output Format:**
           - **Part 1 [Scratchpad]:** List the extracted values and show your math: "Value A + Value B = Sum".
           - **Part 2 [Final Answer]:** Answer the user query with your calculation and your analysis in English. 
        
        **Context:**
        User Query: "What was Apple’s total debt (short-term plus long-term) at year-end 2024?"
        Retrieved Table:
        """
        ['| 1                                     

In [44]:
response = chat_with_ollama(Generator_Prompt,
                            model="deepseek-r1:14b",
                           as_list=False,
                           options={'num_predct':1024, 'temperature':0.1})
print(response)

**Scratchpad:**

- **Extracted Values:**
  - Short-term (current) term debt: $10,912 million
  - Long-term (non-current) term debt: $85,750 million

- **Calculation:**
  - Total Debt = $10,912 + $85,750 = $96,662 million

**Final Answer:**

Apple's total debt at year-end 2024 was calculated by summing the short-term and long-term term debts. The short-term term debt was $10,912 million, and the long-term term debt was $85,750 million. Adding these together gives a total debt of $96,662 million. This comprehensive figure represents Apple's combined obligations across both time frames.
