## Test Embedding

In [31]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
import sys
import requests

from sec_embedder import build_text_docs
from sec_embedder import embed_docs
from sec_embedder import build_table_and_row_docs


In [2]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell executes; edits to local modules are then
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Make the project's src/ directory importable.
# Assumes the kernel's working directory is notebooks/, so the project root
# is its parent directory.
# NOTE(review): if sec_embedder lives under src/, this cell has to run before
# the import cell at the top of the notebook — confirm and reorder if so.
root = Path.cwd().parent
src_dir = str(root / "src")
if src_dir not in sys.path:
    # Guard keeps the cell idempotent: re-running it no longer stacks
    # duplicate entries onto sys.path.
    sys.path.append(src_dir)

In [6]:
def build_all_aapl_10k_embeddings(
    text_path: Path,
    table_summaries_path: Path,
    api_url: str = "http://localhost:11434/api/embed",
    model: str = "qwen3-embedding:8b",
    common_meta: Optional[Dict[str, Any]] = None,
) -> Dict[str, List[Dict[str, Any]]]:
    """
    High-level driver that builds and embeds every doc group for one filing.

    Steps:
      - builds text_chunk docs via ``build_text_docs``
      - builds table + table_row docs via ``build_table_and_row_docs``
      - embeds each group with ``embed_docs`` using ``model`` at ``api_url``

    Args:
        text_path: Path handed to ``build_text_docs`` as ``text_path``.
        table_summaries_path: Path handed to ``build_table_and_row_docs``
            as ``table_summaries_path``.
        api_url: Embedding endpoint forwarded to ``embed_docs``.
        model: Embedding model name forwarded to ``embed_docs``.
        common_meta: Optional filing-level metadata shared by all docs.
            When omitted, the AAPL 10-K 2024 defaults below are used, which
            matches the previous hard-coded behavior.

    Returns:
        A dict:
          {
            "text_chunks": [...],
            "tables": [...],
            "table_rows": [...]
          }
        Each element is a doc with {id, content, metadata, embedding}.
    """
    if common_meta is None:
        # Fill in with actual metadata you know for this filing
        common_meta = {
            "prefix": "AAPL_10-K_2024",
            "ticker": "AAPL",
            "company_name": "Apple Inc.",
            "form_type": "10-K",
            "fiscal_year": 2024,
            # add "cik", "filing_date", etc. if you have them in your JSONL
        }
    else:
        # Work on a shallow copy so the caller's dict is never modified.
        common_meta = dict(common_meta)

    # Build docs (no embeddings yet)
    text_docs = build_text_docs(text_path=text_path, common_meta=common_meta)
    table_row_docs_dict = build_table_and_row_docs(
        table_summaries_path=table_summaries_path,
        common_meta=common_meta,
    )
    table_docs = table_row_docs_dict["tables"]
    row_docs = table_row_docs_dict["rows"]

    print(
        f"[INFO] Prepared {len(text_docs)} text docs, "
        f"{len(table_docs)} table docs, {len(row_docs)} row docs."
    )

    # Embed each group separately (so you can ingest into separate collections
    # if desired). `embed_docs` is the helper imported from sec_embedder; the
    # previous name `embed_docs_with_qwen3` was never imported or defined in
    # this notebook and raised NameError.
    embedded_text = embed_docs(text_docs, api_url=api_url, model=model)
    embedded_tables = embed_docs(table_docs, api_url=api_url, model=model)
    embedded_rows = embed_docs(row_docs, api_url=api_url, model=model)

    print(
        f"[DONE] Embedded text={len(embedded_text)}, "
        f"tables={len(embedded_tables)}, rows={len(embedded_rows)}."
    )

    return {
        "text_chunks": embedded_text,
        "tables": embedded_tables,
        "table_rows": embedded_rows,
    }

In [27]:
# Embedding endpoint and model used by the cells below.
# Local alternative for the endpoint: "http://localhost:11434/api/embed"
model = "qwen3-embedding:8b"
api_url = "http://192.168.1.237:11434/api/embed"

# Input file holding the split text of the filing.
text_path = Path("../data/chunked/AAPL_10-K_2024.text.split.jsonl")

In [None]:
# NOTE(review): in the visible notebook, text_docs is first assigned in the
# cell below, so on a fresh kernel this cell raises NameError — run it after
# that cell. It also displays the whole list; a slice such as text_docs[:2]
# gives a lighter preview.
text_docs

In [22]:
# Filing-level metadata passed to the doc builders.
# add "cik", "filing_date", etc. if you have them in your JSONL
common_meta = {
    "prefix": "AAPL_10-K_2024",
    "ticker": "AAPL",
    "company_name": "Apple Inc.",
    "form_type": "10-K",
    "fiscal_year": 2024,
}

# Build the text docs (no embeddings yet), then keep only the first two as a
# small sample for the embedding call further down.
text_docs = build_text_docs(common_meta=common_meta, text_path=text_path)
text_doc = text_docs[:2]
print(text_doc)

[{'id': 'AAPL_10-K_2024::text::0', 'content': 'The Company designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services. The Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September.', 'metadata': {'prefix': 'AAPL_10-K_2024', 'ticker': 'AAPL', 'company_name': 'Apple Inc.', 'form_type': '10-K', 'fiscal_year': 2024, 'doc_type': 'text_chunk', 'chunk_index': 0, 'section_title': None, 'section_path': None, 'source': 'text'}}, {'id': 'AAPL_10-K_2024::text::1', 'content': 'iPhone® is the Company’s line of smartphones based on its iOS operating system. The iPhone line includes iPhone 16 Pro, iPhone 16, iPhone 15, iPhone 14 and iPhone SE®.\n\nMac® is the Company’s line of personal computers based on its macOS® operating system. The Mac line includes laptops MacBook Air® and MacBook Pro®, as well as desktops iMac®, Mac mini®, Mac Studio® and Mac Pro®.\n\niPad® is the Company’s 

In [28]:
# Embed the two-doc text sample against the configured endpoint and model.
embedded_text = embed_docs(text_doc, model=model, api_url=api_url)

[EMBED] Embedded docs 0–1 with model=qwen3-embedding:8b


### Tables

In [32]:
table_summaries_path = Path("../data/chunked/table_summaries/AAPL_10-K_2024.tables.summaries.jsonl")

# A single builder call returns a dict holding both the table-level docs
# ("tables") and the row-level docs ("rows"); split it into two names.
table_row_docs_dict = build_table_and_row_docs(
    common_meta=common_meta,
    table_summaries_path=table_summaries_path,
)
table_docs, row_docs = (table_row_docs_dict[key] for key in ("tables", "rows"))

In [42]:
# From here on the local endpoint is used; this rebinds the api_url set in
# the earlier configuration cell.
api_url = "http://localhost:11434/api/embed"

# Each doc group is embedded on its own (so the groups can be ingested into
# separate collections if desired). Only the first five table docs are sent,
# with batch_size=1.
embedded_tables = embed_docs(table_docs[:5], model=model, api_url=api_url, batch_size=1)

[EMBED] Embedded docs 0–0 with model=qwen3-embedding:8b
[EMBED] Embedded docs 1–1 with model=qwen3-embedding:8b
[EMBED] Embedded docs 2–2 with model=qwen3-embedding:8b
[EMBED] Embedded docs 3–3 with model=qwen3-embedding:8b
[EMBED] Embedded docs 4–4 with model=qwen3-embedding:8b


In [49]:
# Inspect the first embedded table doc.
embedded_tables[0]

{'id': 'AAPL_10-K_2024::table::0',
 'content': "Table summary: The table details Apple Inc.'s purchases of its common equity during specific periods in fiscal year 2024, including the number of shares purchased, average price paid per share, and total purchases under publicly announced plans or programs.\nRows: June 30, 2024 to August 3, 2024 – Open market and privately negotiated purchases: Purchases of shares in the open market and through private negotiations during the period from June 30, 2024, to August 3, 2024. August 4, 2024 to August 31, 2024 – Open market and privately negotiated purchases: Purchases of shares in the open market and through private negotiations during the period from August 4, 2024, to August 31, 2024. September 1, 2024 to September 28, 2024 – Open market and privately negotiated purchases: Purchases of shares in the open market and through private negotiations during the period from September 1, 2024, to September 28, 2024. Total – Open market and privately 

In [44]:
# Embed the first fifteen row-level docs; no batch_size is passed here, so
# embed_docs applies its own default.
embedded_rows = embed_docs(row_docs[:15], model=model, api_url=api_url)

[EMBED] Embedded docs 0–14 with model=qwen3-embedding:8b


In [48]:
# Inspect the first embedded row doc (id, content, metadata and embedding).
embedded_rows[0]

{'id': 'AAPL_10-K_2024::table::0::row::0',
 'content': 'June 30, 2024 to August 3, 2024 – Open market and privately negotiated purchases: Purchases of shares in the open market and through private negotiations during the period from June 30, 2024, to August 3, 2024.',
 'metadata': {'prefix': 'AAPL_10-K_2024',
  'ticker': 'AAPL',
  'company_name': 'Apple Inc.',
  'form_type': '10-K',
  'fiscal_year': 2024,
  'doc_type': 'table_row',
  'table_index': 0,
  'row_index': 0,
  'row_label': 'June 30, 2024 to August 3, 2024 – Open market and privately negotiated purchases',
  'section_title': 'Purchases of Equity Securities by the Issuer and Affiliated Purchasers',
  'section_path': None,
  'source': 'table_row'},
 'embedding': [-0.0015352812,
  0.0029663113,
  -0.014464433,
  -0.01767238,
  -0.0067325816,
  0.0468936,
  -0.01067392,
  0.024938513,
  -0.0148923835,
  -0.012626129,
  -0.0013699192,
  -0.0144192865,
  0.04556597,
  -0.031305756,
  0.013099084,
  -0.01696861,
  -0.032487012,
  -0