In [5]:
%load_ext autoreload
%autoreload 2

In [None]:
from table_summarization_eval import annotate_table_with_ollama
from table_summarization_eval import evaluate_models_on_tables
from chunks_loader import load_filing_chunks
import pandas as pd
from IPython.display import HTML

# Generate endpoint of the Ollama server on localhost (port 11434); passed as
# `api_url` to the evaluation calls further down.
API_URL = "http://localhost:11434/api/generate" 

# Relax pandas display limits so long text cells can be read in full.
pd.set_option("display.max_colwidth", None)     # never truncate the text inside a cell
pd.set_option("display.max_rows", None)         # never truncate rows (optional; large frames print in full)
pd.set_option("display.width", 2000)            # wide display width to prevent wrapping


# Load the chunks for the "AAPL_10-K_2025" filing. The loader's result is
# unpacked into text chunks and table chunks.
# NOTE(review): presumably reads previously saved chunks from ./chunks — confirm in chunks_loader.
text_chunks, table_chunks = load_filing_chunks("AAPL_10-K_2025", out_dir="./chunks")

# Cell output: number of text chunks and number of table chunks.
len(text_chunks), len(table_chunks)

In [12]:
# Peek at the fields carried by one table chunk (index 5 is just a sample).
table_chunks[5].keys()

dict_keys(['chunk_type', 'item_id', 'item_title', 'section_title', 'heading_path', 'text', 'table_html', 'table_dict', 'meta', 'table_df'])

In [None]:
from typing import List, Dict, Any

def get_test_tables(table_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Pick the three reference tables used for the LLM summarization tests.

    The chunks are scanned in order and the first chunk matching each target
    is kept. The result is always ordered as:
      0: Consolidated Statements of Operations
      1: Note 2 – Revenue
      2: Note 13 – Segment Information and Geographic Data (geography table)

    Args:
        table_chunks: Table chunk dicts. Only "section_title" and, for the
            Note 13 candidates, "table_dict" -> "data" are consulted.

    Returns:
        A three-element list: [operations, note 2, note 13 geography].

    Raises:
        ValueError: If one or more of the three tables cannot be found; the
            message lists every missing table.
    """
    ops_title = "CONSOLIDATED STATEMENTS OF OPERATIONS"
    note2_title = "Note 2 – Revenue"
    note13_title = "Note 13 – Segment Information and Geographic Data"

    def has_region_header(chunk):
        # Heuristic for the geography table: its second row (index 1) holds
        # the region headers, one of which is exactly "Americas".
        grid = (chunk.get("table_dict") or {}).get("data") or []
        if len(grid) < 2:
            return False
        return any(cell == "Americas" for cell in grid[1])

    operations = revenue = geography = None

    for chunk in table_chunks:
        title = chunk.get("section_title")

        if title == ops_title:
            if not operations:
                operations = chunk  # keep the first match only
        elif title == note2_title:
            if not revenue:
                revenue = chunk
        elif title == note13_title:
            # Several tables share this section title; only the one that
            # passes the region-header heuristic is wanted.
            if has_region_header(chunk) and not geography:
                geography = chunk

    found = [operations, revenue, geography]
    labels = [
        ops_title,
        note2_title,
        "Note 13 – Segment Information and Geographic Data (geo)",
    ]

    # Report every table that is still unset, in the same order as the result.
    missing = [label for label, tbl in zip(labels, found) if tbl is None]
    if missing:
        raise ValueError(f"Could not find tables: {', '.join(missing)}")

    return found


# Select the three reference tables (operations, Note 2, Note 13 geography).
test_tables = get_test_tables(table_chunks)
# Optional sanity check of what was selected (title and DataFrame shape):
# for i, t in enumerate(test_tables):
#     print(i, t["section_title"], t["table_df"].shape)


# Display the first selected table, i.e. the Consolidated Statements of Operations.
test_tables[0]["table_df"]

In [26]:
# gemma3 model sizes to compare; the commented-out tags are excluded from this run.
candidate_models = [
    # "deepseek-r1:1.5b",
    # "qwen3:latest",
    "gemma3:1b",
    "gemma3:4b",
    "gemma3:12b",
    "gemma3:27b",
    # "mistral:latest"
]

API_URL = "http://localhost:11434/api/generate"   # or your ngrok endpoint

results_df = evaluate_models_on_tables(
    models=candidate_models,
    table_chunks=test_tables[:1],          # only the FIRST test table (Consolidated Statements of Operations), not all three
    api_url=API_URL,
    temperature=0.0,
    save_jsonl_path="eval_log.jsonl"   # optional; this log is read back with json in a later cell
)

# Metrics frame returned by the evaluation call.
results_df


gemma3:1b CONSOLIDATED STATEMENTS OF OPERATIONS
gemma3:4b CONSOLIDATED STATEMENTS OF OPERATIONS
gemma3:12b CONSOLIDATED STATEMENTS OF OPERATIONS
gemma3:27b CONSOLIDATED STATEMENTS OF OPERATIONS


Unnamed: 0,model,table_index,table_title,elapsed_sec,error,json_valid,n_table_rows,n_row_summaries,row_index_valid_fraction,row_index_coverage_fraction,rows_with_non_year_number_fraction,has_any_non_year_numbers,table_summary_has_non_year_numbers
0,gemma3:1b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,6.123678,,True,26,3,1.0,0.115385,1.0,True,True
1,gemma3:4b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,23.690001,,True,26,11,1.0,0.423077,0.0,False,True
2,gemma3:12b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,126.29367,,True,26,25,1.0,0.961538,0.04,True,True
3,gemma3:27b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,326.009366,,True,26,25,1.0,0.961538,0.0,False,True


In [30]:
import json

# Read the evaluation log back in: one JSON record per line (JSON Lines).
# Blank or whitespace-only lines (e.g. a trailing newline at end of file) are
# skipped, because json.loads raises JSONDecodeError on them.
# NOTE(review): no explicit encoding is given, so the platform default is
# used — confirm it matches the encoding the log was written with.
with open("eval_log.jsonl", "r") as f:
    data = [json.loads(line) for line in f if line.strip()]



[{'model': 'gemma3:1b', 'table_index': 0, 'table_title': 'CONSOLIDATED STATEMENTS OF OPERATIONS', 'elapsed_sec': 6.123677968978882, 'error': None, 'json_valid': True, 'n_table_rows': 26, 'n_row_summaries': 3, 'row_index_valid_fraction': 1.0, 'row_index_coverage_fraction': 0.11538461538461539, 'rows_with_non_year_number_fraction': 1.0, 'has_any_non_year_numbers': True, 'table_summary_has_non_year_numbers': True, 'annotation': {'table_summary': 'This table presents the financial performance of Apple Inc. for the fiscal year 2025, focusing on key financial metrics including net sales, cost of sales, gross margin, operating expenses, and net income. It details the company’s financial results across the years ended September 27, 2025, and the years ended September 28, 2024, and September 30, 2023.', 'row_summaries': [{'row_index': 0, 'row_label': 'Years ended September 27, 2025', 'description': 'This row provides the financial results for the period ending September 27, 2025.'}, {'row_index

In [40]:
# One row per logged run: the model name and its table-level summary.
# A record whose "annotation" is missing or is not a dict yields None for the
# summary instead of raising, so a failed run does not abort the whole frame.
# NOTE(review): presumably failed runs (non-empty "error") carry no annotation
# dict — confirm against what the evaluation helper writes to the log.
df = pd.DataFrame([
    {
        "model": item["model"],
        "table_summary": (
            item["annotation"].get("table_summary")
            if isinstance(item.get("annotation"), dict)
            else None
        ),
    }
    for item in data
])

In [45]:
# Side-by-side view of each model's table-level summary.
df

Unnamed: 0,model,table_summary
0,gemma3:1b,"This table presents the financial performance of Apple Inc. for the fiscal year 2025, focusing on key financial metrics including net sales, cost of sales, gross margin, operating expenses, and net income. It details the company’s financial results across the years ended September 27, 2025, and the years ended September 28, 2024, and September 30, 2023."
1,gemma3:4b,"This table presents the consolidated statements of operations for Apple Inc. for the years ended September 27, 2025, September 28, 2024, and September 30, 2023. It details net sales, cost of sales, gross margin, operating expenses, and net income."
2,gemma3:12b,"This table presents a consolidated statement of operations for Apple Inc. for the years ended September 27, 2025, September 28, 2024, and September 30, 2023."
3,gemma3:27b,"This table presents the consolidated statements of operations for Apple Inc. for the fiscal years ending September 27, 2025, September 28, 2024, and September 30, 2023."


In [37]:
# Model name stored on the first log record.
data[0]['model']

'gemma3:1b'

In [48]:
# Row-level summaries produced for the second log record.
data[1]['annotation']['row_summaries']

[{'row_index': 0,
  'row_label': 'Net sales:',
  'description': 'This row details the breakdown of net sales by product and service categories for the specified years.'},
 {'row_index': 1,
  'row_label': 'Cost of sales:',
  'description': 'This row presents the cost of sales, categorized by product and service, for the specified years.'},
 {'row_index': 2,
  'row_label': 'Gross margin:',
  'description': 'This row indicates the gross margin for the specified years.'},
 {'row_index': 3,
  'row_label': 'Operating expenses:',
  'description': 'This row outlines the total operating expenses, broken down into research and development and selling, general and administrative expenses, for the specified years.'},
 {'row_index': 4,
  'row_label': 'Operating income:',
  'description': 'This row shows the operating income for the specified years.'},
 {'row_index': 5,
  'row_label': 'Other income/(expense), net:',
  'description': 'This row indicates other income or expense, net, for the specified

In [52]:
import pandas as pd

# Model whose row summaries define which row indices get compared.
CANON_MODEL = "gemma3:27b"   # or gemma3:12b, whichever you trust more

# model -> list of row_summaries
# A record without a usable annotation dict maps to an empty list instead of
# raising. If the log holds several records for the same model, the last one
# read wins (dict keys are unique).
rows_by_model = {
    item["model"]: (
        (item["annotation"].get("row_summaries") or [])
        if isinstance(item.get("annotation"), dict)
        else []
    )
    for item in data
}

# Fail with a descriptive message (still a KeyError, as before) when the
# chosen canonical model is not present in the loaded log.
if CANON_MODEL not in rows_by_model:
    raise KeyError(
        f"Canonical model {CANON_MODEL!r} not found in log; "
        f"available models: {sorted(rows_by_model)}"
    )

# row_index -> row summary, as produced by the canonical model.
canonical_rows = {
    row["row_index"]: row
    for row in rows_by_model[CANON_MODEL]
}


In [55]:
# Build one comparison record per canonical row: the row index plus every
# model's description of that row (None when the model has no entry for it).
rows = []

for row_idx in canonical_rows:
    record = {"row_index": row_idx}

    for model, mrows in rows_by_model.items():
        # First entry this model produced for the row, if any. The entries come
        # from LLM-generated JSON, so tolerate a missing list, non-dict items
        # and items without a "row_index" key instead of raising.
        match = next(
            (
                r
                for r in (mrows or [])
                if isinstance(r, dict)
                and "row_index" in r
                and r["row_index"] == row_idx
            ),
            None,
        )

        # A matched entry without a "description" key also yields None.
        record[f"{model}_desc"] = match.get("description") if match else None

    rows.append(record)

comparison_df = pd.DataFrame(rows).sort_values("row_index")

pd.set_option("display.max_colwidth", None)  # show the full description text
comparison_df

Unnamed: 0,row_index,gemma3:1b_desc,gemma3:4b_desc,gemma3:12b_desc,gemma3:27b_desc
0,0,"This row provides the financial results for the period ending September 27, 2025.",This row details the breakdown of net sales by product and service categories for the specified years.,,This row indicates the periods covered by the financial data.
1,1,"This row presents the financial results for the period ending September 28, 2024.","This row presents the cost of sales, categorized by product and service, for the specified years.","This row labels the columns as years ended September 27, 2025, September 28, 2024, and September 30, 2023.",This row represents net sales data.
2,2,"This row presents the financial results for the period ending September 30, 2023.",This row indicates the gross margin for the specified years.,This row represents the line item 'Net sales'.,"This row shows net sales from products for the years 2025, 2024, and 2023."
3,3,,"This row outlines the total operating expenses, broken down into research and development and selling, general and administrative expenses, for the specified years.",This row represents net sales from Products.,"This row shows net sales from services for the years 2025, 2024, and 2023."
4,4,,This row shows the operating income for the specified years.,This row represents net sales from Services.,"This row shows the total net sales for the years 2025, 2024, and 2023."
5,5,,"This row indicates other income or expense, net, for the specified years.",This row represents the total net sales.,This row represents cost of sales data.
6,6,,This row shows the income before provision for income taxes for the specified years.,This row represents the line item 'Cost of sales'.,"This row shows the cost of sales for products for the years 2025, 2024, and 2023."
7,7,,This row indicates the provision for income taxes for the specified years.,This row represents cost of sales from Products.,"This row shows the cost of sales for services for the years 2025, 2024, and 2023."
8,8,,This row shows the net income for the specified years.,This row represents cost of sales from Services.,"This row shows the total cost of sales for the years 2025, 2024, and 2023."
9,9,,This row details the earnings per share for both basic and diluted shares for the specified years.,This row represents the total cost of sales.,"This row shows the gross margin for the years 2025, 2024, and 2023."


In [50]:
# qwen3 model tags to compare.
candidate_models = [
    "qwen3:0.6b",
    "qwen3:1.7b",
    "qwen3:4b",
    "qwen3:latest",
    "qwen3:14b",
    "qwen3:30b",
    "qwen3:32b",
]

API_URL = "http://localhost:11434/api/generate"   # or your ngrok endpoint

# NOTE: this reassigns results_df, replacing the metrics frame of the gemma3 run.
results_df = evaluate_models_on_tables(
    models=candidate_models,
    table_chunks=test_tables[:1],          # only the FIRST test table, not all three
    api_url=API_URL,
    temperature=0.0,
    save_jsonl_path="qwen_eval_log.jsonl"   # optional; separate log file for this model family
)

qwen3:0.6b CONSOLIDATED STATEMENTS OF OPERATIONS
qwen3:1.7b CONSOLIDATED STATEMENTS OF OPERATIONS
qwen3:4b CONSOLIDATED STATEMENTS OF OPERATIONS
qwen3:latest CONSOLIDATED STATEMENTS OF OPERATIONS
qwen3:14b CONSOLIDATED STATEMENTS OF OPERATIONS
qwen3:30b CONSOLIDATED STATEMENTS OF OPERATIONS
qwen3:32b CONSOLIDATED STATEMENTS OF OPERATIONS


In [56]:
# deepseek-r1 model tags to compare.
candidate_models = [
    "deepseek-r1:1.5b",
    "deepseek-r1:7b",
    "deepseek-r1:8b",
    "deepseek-r1:14b",
    "deepseek-r1:32b",
]

API_URL = "http://localhost:11434/api/generate"   # or your ngrok endpoint

r1_df = evaluate_models_on_tables(
    models=candidate_models,
    table_chunks=test_tables[:1],          # only the FIRST test table, not all three
    api_url=API_URL,
    temperature=0.0,
    save_jsonl_path="r1_eval_log.jsonl"   # optional; separate log file for this model family
)

deepseek-r1:1.5b CONSOLIDATED STATEMENTS OF OPERATIONS
deepseek-r1:7b CONSOLIDATED STATEMENTS OF OPERATIONS
deepseek-r1:8b CONSOLIDATED STATEMENTS OF OPERATIONS
deepseek-r1:14b CONSOLIDATED STATEMENTS OF OPERATIONS
deepseek-r1:32b CONSOLIDATED STATEMENTS OF OPERATIONS


In [67]:
!ollama list

NAME                        ID              SIZE      MODIFIED          
kimi-k2-thinking:cloud      9752ffb77f53    -         40 minutes ago       
kimi-k2:1t-cloud            20dc43ca06d7    -         40 minutes ago       
minimax-m2:cloud            698ab6d56142    -         41 minutes ago       
deepseek-v3.1:671b-cloud    d3749919e45f    -         41 minutes ago       
gpt-oss:120b-cloud          569662207105    -         42 minutes ago       
gpt-oss:20b-cloud           875e8e3a629a    -         42 minutes ago       
glm-4.6:cloud               05277b76269f    -         54 minutes ago       
deepseek-r1:32b             edba8017331d    19 GB     About an hour ago    
deepseek-r1:14b             c333b7232bdb    9.0 GB    About an hour ago    
deepseek-r1:8b              6995872bfe4c    5.2 GB    About an hour ago    
deepseek-r1:7b              755ced02ce7b    4.7 GB    About an hour ago    
qwen3:4b                    359d7dd4bcda    2.5 GB    2 hours ago          
qwen3:1.7b     

In [68]:
# Model tags with a "cloud" suffix, as listed by `ollama list`.
candidate_models = [
    "glm-4.6:cloud",
    "kimi-k2-thinking:cloud",
    "kimi-k2:1t-cloud",
    "minimax-m2:cloud",
    "deepseek-v3.1:671b-cloud",
    "gpt-oss:120b-cloud",
    "gpt-oss:20b-cloud"
]

API_URL = "http://localhost:11434/api/generate"   # or your ngrok endpoint

cloud_df = evaluate_models_on_tables(
    models=candidate_models,
    table_chunks=test_tables[:1],          # only the FIRST test table, not all three
    api_url=API_URL,
    temperature=0.0,
    save_jsonl_path="cloud_eval_log.jsonl"   # optional; separate log file for these models
)

glm-4.6:cloud CONSOLIDATED STATEMENTS OF OPERATIONS
kimi-k2-thinking:cloud CONSOLIDATED STATEMENTS OF OPERATIONS
kimi-k2:1t-cloud CONSOLIDATED STATEMENTS OF OPERATIONS
minimax-m2:cloud CONSOLIDATED STATEMENTS OF OPERATIONS
deepseek-v3.1:671b-cloud CONSOLIDATED STATEMENTS OF OPERATIONS
gpt-oss:120b-cloud CONSOLIDATED STATEMENTS OF OPERATIONS
gpt-oss:20b-cloud CONSOLIDATED STATEMENTS OF OPERATIONS


In [69]:
# Metrics for the cloud-tagged models.
cloud_df

Unnamed: 0,model,table_index,table_title,elapsed_sec,error,json_valid,n_table_rows,n_row_summaries,row_index_valid_fraction,row_index_coverage_fraction,rows_with_non_year_number_fraction,has_any_non_year_numbers,table_summary_has_non_year_numbers
0,glm-4.6:cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,14.956853,,True,26,24,1.0,0.923077,0.0,False,True
1,kimi-k2-thinking:cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,64.231987,Expecting value: line 1 column 1 (char 0),False,26,0,0.0,0.0,0.0,False,False
2,kimi-k2:1t-cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,19.207029,,True,26,19,1.0,0.730769,0.0,False,True
3,minimax-m2:cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,15.350703,,True,26,19,1.0,0.730769,0.0,False,True
4,deepseek-v3.1:671b-cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,14.425715,,True,26,19,1.0,0.730769,0.0,False,True
5,gpt-oss:120b-cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,14.734773,,True,26,24,1.0,0.923077,0.0,False,True
6,gpt-oss:20b-cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,33.386565,,True,26,24,1.0,0.923077,1.0,True,True


In [65]:
# NOTE(review): glm_df is not assigned in any cell visible in this notebook —
# presumably left over from a separate glm-4.6:cloud run. Unless it is defined
# elsewhere, this cell raises NameError on a fresh kernel.
glm_df

Unnamed: 0,model,table_index,table_title,elapsed_sec,error,json_valid,n_table_rows,n_row_summaries,row_index_valid_fraction,row_index_coverage_fraction,rows_with_non_year_number_fraction,has_any_non_year_numbers,table_summary_has_non_year_numbers
0,glm-4.6:cloud,0,CONSOLIDATED STATEMENTS OF OPERATIONS,12.654039,,True,26,19,1.0,0.730769,0.0,False,True


In [57]:
# Metrics for the deepseek-r1 models.
r1_df

Unnamed: 0,model,table_index,table_title,elapsed_sec,error,json_valid,n_table_rows,n_row_summaries,row_index_valid_fraction,row_index_coverage_fraction,rows_with_non_year_number_fraction,has_any_non_year_numbers,table_summary_has_non_year_numbers
0,deepseek-r1:1.5b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,5.341539,'list' object has no attribute 'get',False,26,0,0.0,0.0,0.0,False,False
1,deepseek-r1:7b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,50.800211,,True,26,20,1.0,0.423077,0.0,False,True
2,deepseek-r1:8b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,84.336688,,True,26,26,1.0,1.0,0.0,False,True
3,deepseek-r1:14b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,147.141991,,True,26,23,1.0,0.884615,0.043478,True,False
4,deepseek-r1:32b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,224.823438,,True,26,24,1.0,0.923077,0.0,False,True


In [51]:
# results_df was reassigned by the qwen3 evaluation, so this displays the
# qwen3 metrics rather than the earlier gemma3 ones.
results_df

Unnamed: 0,model,table_index,table_title,elapsed_sec,error,json_valid,n_table_rows,n_row_summaries,row_index_valid_fraction,row_index_coverage_fraction,rows_with_non_year_number_fraction,has_any_non_year_numbers,table_summary_has_non_year_numbers
0,qwen3:0.6b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,11.2413,,True,26,20,1.0,0.769231,0.0,False,True
1,qwen3:1.7b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,23.907061,,True,26,12,1.0,0.461538,1.0,True,True
2,qwen3:4b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,69.855961,Expecting value: line 1 column 1 (char 0),False,26,0,0.0,0.0,0.0,False,False
3,qwen3:latest,0,CONSOLIDATED STATEMENTS OF OPERATIONS,76.829833,,True,26,26,1.0,1.0,0.923077,True,True
4,qwen3:14b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,19.322945,,False,26,0,0.0,0.0,0.0,False,False
5,qwen3:30b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,54.006747,Expecting value: line 1 column 1 (char 0),False,26,0,0.0,0.0,0.0,False,False
6,qwen3:32b,0,CONSOLIDATED STATEMENTS OF OPERATIONS,268.948101,,True,26,25,1.0,0.961538,0.0,False,False
