# Compare Two Models' Row Summaries Side by Side

In [10]:
from table_summarization_eval import annotate_table_with_ollama
from table_summarization_eval import evaluate_models_on_tables
from chunks_loader import load_filing_chunks
import pandas as pd
from IPython.display import HTML

import json
from typing import List, Dict, Any
import numpy as np
import pandas as pd
import ollama  # pip install ollama

# Local HTTP generate endpoint.
# NOTE(review): API_URL is not referenced by any of the visible cells — confirm it is still needed.
API_URL = "http://localhost:11434/api/generate" 

# Widen pandas display limits so long summary strings are shown in full.
pd.set_option("display.max_colwidth", None)     # show full column text
pd.set_option("display.max_rows", None)         # optional
pd.set_option("display.width", 2000)            # prevent wrapping


# Load the filing's chunks; the helper returns the text chunks and table chunks separately.
text_chunks, table_chunks = load_filing_chunks("AAPL_10-K_2025", out_dir="./chunks")

# Cell output: how many chunks of each kind were loaded.
len(text_chunks), len(table_chunks)

(110, 47)

In [8]:
from typing import List, Dict, Any

def get_test_tables(table_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Pick the three table chunks used as fixtures for LLM summarization tests.

    The returned list is ordered as:
      0: Consolidated Statements of Operations
      1: Note 2 – Revenue
      2: Note 13 – Segment Information and Geographic Data (geography table)

    For each slot the first matching chunk wins. Raises ValueError naming
    every slot for which no chunk was found.
    """
    statement_chunk = None
    revenue_chunk = None
    geo_chunk = None

    for chunk in table_chunks:
        title = chunk.get("section_title")

        # Slot 0: income statement, matched on the exact section title.
        if title == "CONSOLIDATED STATEMENTS OF OPERATIONS":
            if not statement_chunk:
                statement_chunk = chunk
            continue

        # Slot 1: revenue note, matched on the exact section title.
        if title == "Note 2 – Revenue":
            if not revenue_chunk:
                revenue_chunk = chunk
            continue

        # Everything that is not the segment note is irrelevant here.
        if title != "Note 13 – Segment Information and Geographic Data":
            continue

        # Slot 2: only the segment-note table whose second row (index 1)
        # holds an "Americas" cell is treated as the geography table.
        grid = (chunk.get("table_dict") or {}).get("data") or []
        has_region_header = len(grid) >= 2 and any(
            cell == "Americas" for cell in grid[1]
        )
        if has_region_header and not geo_chunk:
            geo_chunk = chunk

    slots = [
        (statement_chunk, "CONSOLIDATED STATEMENTS OF OPERATIONS"),
        (revenue_chunk, "Note 2 – Revenue"),
        (geo_chunk, "Note 13 – Segment Information and Geographic Data (geo)"),
    ]
    missing = [label for found, label in slots if found is None]
    if missing:
        raise ValueError(f"Could not find tables: {', '.join(missing)}")

    return [statement_chunk, revenue_chunk, geo_chunk]


# Example usage: select the three fixture tables from the loaded table chunks
# (raises ValueError if any of them cannot be found).
test_tables = get_test_tables(table_chunks)

In [1]:
import json
import pandas as pd
from IPython.display import display, HTML

def load_jsonl(filepath: str) -> list[dict]:
    """Read a JSON Lines file and return one parsed object per non-blank line.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being passed to the JSON parser. The file is decoded
    as UTF-8 regardless of the platform default.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(filepath, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def get_row_summaries(record: dict) -> dict:
    """Map each row index in a record's annotation to {'label', 'desc'}.

    Returns an empty dict when the record has no annotation or the
    annotation carries no row summaries.
    """
    annotation = record.get('annotation') or {}
    row_entries = annotation.get('row_summaries')
    if not row_entries:
        return {}

    summaries = {}
    for entry in row_entries:
        summaries[entry['row_index']] = {
            'label': entry['row_label'],
            'desc': entry['description'],
        }
    return summaries

def compare_two_models(rec_a: dict, rec_b: dict) -> pd.DataFrame:
    """Build a side-by-side DataFrame of two records' row descriptions.

    Args:
        rec_a: First record; must contain a 'model' key.
        rec_b: Second record; must contain a 'model' key.

    Returns:
        A DataFrame with columns 'idx', 'row_label' and one description
        column per record, with one row for every row index present in
        either record, sorted by index. A description missing on one side
        is shown as '—'. The columns are present even when neither record
        has any row summaries.

    Raises:
        KeyError: if either record lacks the 'model' key.
    """
    sum_a = get_row_summaries(rec_a)
    sum_b = get_row_summaries(rec_b)

    col_a, col_b = rec_a['model'], rec_b['model']
    if col_a == col_b:
        # Two records from the same model would otherwise share one dict key
        # and one of the two description columns would silently disappear.
        col_a, col_b = f"{col_a} (A)", f"{col_b} (B)"

    rows = []
    for idx in sorted(sum_a.keys() | sum_b.keys()):
        a = sum_a.get(idx, {})
        b = sum_b.get(idx, {})
        rows.append({
            'idx': idx,
            # Prefer the first record's label; never leave the cell as None.
            'row_label': a.get('label') or b.get('label') or '',
            col_a: a.get('desc', '—'),
            col_b: b.get('desc', '—'),
        })

    # Explicit columns keep the frame's shape stable when `rows` is empty.
    return pd.DataFrame(rows, columns=['idx', 'row_label', col_a, col_b])

In [2]:
# Load every record from the evaluation log (one JSON object per line).
filepath = "cloud_eval_log.jsonl"
records = load_jsonl(filepath)

# List the models with their position in `records`; the mark reflects the
# record's 'json_valid' flag (✓ when truthy, ✗ when falsy or absent).
for i, rec in enumerate(records):
    status = "✓" if rec.get('json_valid') else "✗"
    print(f"{i}: {rec['model']} [{status}]")

0: glm-4.6:cloud [✓]
1: kimi-k2-thinking:cloud [✗]
2: kimi-k2:1t-cloud [✓]
3: minimax-m2:cloud [✓]
4: deepseek-v3.1:671b-cloud [✓]
5: gpt-oss:120b-cloud [✓]
6: gpt-oss:20b-cloud [✓]


In [9]:
# Pick two models to compare (indices refer to the listing printed by the previous cell)
model_a_index = 2  # kimi-k2:1t-cloud
model_b_index = 3  # minimax-m2:cloud

# One row per row index, one description column per model.
df = compare_two_models(records[model_a_index], records[model_b_index])

# Style for better readability: left-align, wrap long text, top-align cells
styled = df.style.set_properties(**{
    'text-align': 'left',
    'white-space': 'pre-wrap',
    'vertical-align': 'top'
}).set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'left')]}
])

display(styled)

Unnamed: 0,idx,row_label,kimi-k2:1t-cloud,minimax-m2:cloud
0,0,Net sales: Products,Revenue from product sales for each fiscal year.,Products net sales for each of the years ended.
1,1,Net sales: Services,Revenue from services for each fiscal year.,Services net sales for each of the years ended.
2,2,Total net sales,Combined product and services revenue for each fiscal year.,Total net sales for each of the years ended.
3,3,Cost of sales: Products,Direct costs attributable to product sales for each fiscal year.,Products cost of sales for each of the years ended.
4,4,Cost of sales: Services,Direct costs attributable to services for each fiscal year.,Services cost of sales for each of the years ended.
5,5,Total cost of sales,Combined cost of products and services for each fiscal year.,Total cost of sales for each of the years ended.
6,6,Gross margin,Total net sales less total cost of sales for each fiscal year.,Gross margin for each of the years ended.
7,7,Operating expenses: Research and development,R&D expenses for each fiscal year.,Research and development operating expenses for each of the years ended.
8,8,"Operating expenses: Selling, general and administrative",SG&A expenses for each fiscal year.,"Selling, general and administrative operating expenses for each of the years ended."
9,9,Total operating expenses,Combined R&D and SG&A expenses for each fiscal year.,Total operating expenses for each of the years ended.


In [7]:
# Show the "table_df" entry of the first fixture table (slot 0 of get_test_tables).
test_tables[0]["table_df"]

Unnamed: 0,1,2,3,4
0,,Years ended,Years ended,Years ended
1,,"September 27, 2025","September 28, 2024","September 30, 2023"
2,Net sales:,,,
3,Products,$307003,$294866,$298085
4,Services,109158,96169,85200
5,Total net sales,416161,391035,383285
6,Cost of sales:,,,
7,Products,194116,185233,189282
8,Services,26844,25119,24855
9,Total cost of sales,220960,210352,214137


In [11]:

def _cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors."""
    denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-8
    return float(np.dot(a, b) / denom)


def _embed(text: str, model: str) -> np.ndarray:
    """Embed `text` with the given Ollama model and return a float32 vector."""
    response = ollama.embeddings(prompt=text, model=model)
    vector = response["embedding"]
    return np.array(vector, dtype="float32")


def evaluate_table_summaries_with_embeddings(
    jsonl_path: str,
    query: str,
    embed_model: str = "nomic-embed-text:latest",
    summary_field: str = "table_summary",
) -> pd.DataFrame:
    """
    Rank the summaries stored in a JSONL log by similarity to a query.

    Each line of the file is expected to be a JSON object with a "model"
    name and an "annotation" object; the summary is read from
    annotation[summary_field]. The query and every summary are embedded
    with `embed_model` and compared by cosine similarity.

    Lines are skipped (never fatal) when they are blank, are not valid
    JSON, are not JSON objects, have a non-object annotation, or lack a
    model name or summary.

    Args:
        jsonl_path: Path to the JSONL log, read as UTF-8.
        query: Text the summaries are compared against.
        embed_model: Name of the embedding model passed to `_embed`.
        summary_field: Key inside "annotation" that holds the summary text.

    Returns:
        A DataFrame with columns model, summary_text, similarity,
        sorted by similarity (desc) with a fresh 0..n-1 index.

    Raises:
        ValueError: if the file yields no usable summary.
    """
    records: List[Dict[str, Any]] = []

    # 1) Load JSONL and collect summaries
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                print(f"[WARN] bad JSON at line {line_no}, skipping")
                continue

            # A valid JSON line may still be a list, string or number;
            # only objects can carry the fields read below.
            if not isinstance(obj, dict):
                print(f"[WARN] non-object JSON at line {line_no}, skipping")
                continue

            ann = obj.get("annotation") or {}
            if not isinstance(ann, dict):
                print(f"[WARN] non-object annotation at line {line_no}, skipping")
                continue

            model_name = obj.get("model")
            summary_text = ann.get(summary_field)

            if not model_name or not summary_text:
                # skip entries with missing model or summary
                continue

            records.append(
                {
                    "model": model_name,
                    "summary_text": summary_text,
                }
            )

    if not records:
        raise ValueError("No valid summaries found in JSONL file.")

    df = pd.DataFrame(records)

    # 2) Embed query once
    query_emb = _embed(query, embed_model)

    # 3) Embed each summary and compute similarity
    df["similarity"] = [
        _cosine_sim(query_emb, _embed(text, embed_model))
        for text in df["summary_text"]
    ]

    # 4) Sort by similarity (higher = better match)
    return df.sort_values("similarity", ascending=False).reset_index(drop=True)


In [13]:
# Inputs for the embedding-similarity experiment below.
jsonl_path = "cloud_eval_log.jsonl"   # or gemma_eval_log.jsonl, etc.
query = "How does gross margin change over the last three years?"
embed_model = "qwen3-embedding:8b"

# Full-log ranking, currently disabled; uncomment to score every model's summary.
# results = evaluate_table_summaries_with_embeddings(
#     jsonl_path=jsonl_path,
#     query=query,
#     embed_model="qwen3-embedding:8b",
#     summary_field="table_summary",  # or "row_summaries_text" if you pre-join rows
# )

# results.head()


In [14]:
# Embed the query once so it can be compared against several summaries.
query_embedding = _embed(query, embed_model)

In [15]:
# Embed the two models' descriptions of the "Gross margin" row (idx 6 in the comparison table).
kimi_embedding    = _embed("Total net sales less total cost of sales for each fiscal year.", embed_model)
minimax_embedding = _embed("Gross margin for each of the years ended.", embed_model)

In [16]:
# Query-vs-description cosine similarity, as (kimi, minimax).
_cosine_sim(query_embedding, kimi_embedding), _cosine_sim(query_embedding, minimax_embedding)

(0.48777180422338673, 0.7291222737833012)

In [43]:
# import voyageai

# vo = voyageai.Client()  # API key redacted: never commit a literal key; revoke and rotate any key that was committed
# voyage_model = "voyage-3-large"
# # With no argument the client automatically uses the environment variable VOYAGE_API_KEY.
# # Alternatively, you can use vo = voyageai.Client(api_key="<your secret key>")

# query_voyage_embedding = np.array(vo.embed(query, model= voyage_model).embeddings[0])

In [44]:
# kimi_voyage_embedding = np.array(vo.embed("Total net sales less total cost of sales for each fiscal year.", model = voyage_model).embeddings[0])
# minimax_voyage_embedding = np.array(vo.embed("Gross margin for each of the years ended.", model = voyage_model).embeddings[0])
# _cosine_sim(query_voyage_embedding, kimi_voyage_embedding), _cosine_sim(query_voyage_embedding, minimax_voyage_embedding)

(0.38230271810214955, 0.342454762358308)