# Explore Chunks and Tables

This notebook allows you to explore the chunks and tables extracted from 10-K/10-Q filings.

In [5]:
import json
import pandas as pd
from pathlib import Path

# Load chunks
def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

chunks = load_jsonl('chunks/AAPL_10-K_2025.text.jsonl')
tables = load_jsonl('chunks/AAPL_10-K_2025.text.jsonl')

print(f"Loaded {len(chunks)} chunks and {len(tables)} tables")

Loaded 110 chunks and 110 tables


## Basic Statistics

In [None]:
chunk

In [6]:
# Convert chunks to DataFrame for easier analysis
chunks_df = pd.DataFrame([
    {
        'chunk_id': c['chunk_id'],
        'text': c['text'],
        'text_length': len(c['text']),
        'word_count': len(c['text'].split()),
        'form_type': c['filing']['form_type'],
        'ticker': c['filing']['ticker'],
        'fiscal_year': c['filing']['fiscal_year'],
        'section_depth': len(c['section_path']),
        'has_section_path': len(c['section_path']) > 0,
        'source_path': c['filing']['source_path']
    }
    for c in chunks
])

print("\nChunks by Form Type:")
print(chunks_df['form_type'].value_counts())

print("\nWord Count Statistics:")
print(chunks_df['word_count'].describe())

print("\nChunks with Section Paths:")
print(chunks_df['has_section_path'].value_counts())

KeyError: 'chunk_id'

## Filter Chunks by Criteria

In [2]:
# Filter by form type
form_type = '10-K'  # Change to '10-Q' for quarterly reports
filtered = chunks_df[chunks_df['form_type'] == form_type]

print(f"\nFiltered to {len(filtered)} chunks from {form_type}")
filtered.head()

NameError: name 'chunks_df' is not defined

In [None]:
# Filter by word count (e.g., longer chunks with substantial content)
min_words = 100
substantial_chunks = chunks_df[chunks_df['word_count'] >= min_words]

print(f"\nChunks with at least {min_words} words: {len(substantial_chunks)}")
substantial_chunks[['chunk_id', 'word_count', 'form_type']].head(10)

## Search Chunks by Keyword

In [None]:
# Search for chunks containing specific keywords
keyword = 'revenue'  # Change this to search for different terms

matches = chunks_df[chunks_df['text'].str.contains(keyword, case=False, na=False)]
print(f"\nFound {len(matches)} chunks containing '{keyword}'")

# Display first few matches
for idx, row in matches.head(5).iterrows():
    print(f"\n{'='*80}")
    print(f"Chunk ID: {row['chunk_id']}")
    print(f"Form: {row['form_type']} | Words: {row['word_count']}")
    print(f"Text:\n{row['text'][:300]}...")  # Show first 300 chars

## View Full Chunk Content

In [None]:
# View a specific chunk by index or ID
chunk_idx = 100  # Change this to view different chunks

chunk = chunks[chunk_idx]
print(f"Chunk ID: {chunk['chunk_id']}")
print(f"Form Type: {chunk['filing']['form_type']}")
print(f"Source: {chunk['filing']['source_path']}")
print(f"Section Path: {chunk['section_path']}")
print(f"\nText:")
print(chunk['text'])

## Explore Tables

In [None]:
# Table statistics
tables_df = pd.DataFrame([
    {
        'table_id': t['table_id'],
        'title': t['title'],
        'statement_type': t['statement_type'],
        'form_type': t['filing']['form_type'],
        'num_rows': len(t['rows']),
        'num_cols': len(t['columns']),
        'currency': t['currency'],
        'scale': t['scale']
    }
    for t in tables
])

print("\nTables by Statement Type:")
print(tables_df['statement_type'].value_counts())

print("\nTables by Form Type:")
print(tables_df['form_type'].value_counts())

tables_df.head(10)

In [None]:
# Filter tables by statement type
stmt_type = 'income_statement'  # Try: 'balance_sheet', 'cash_flow', or None

filtered_tables = tables_df[tables_df['statement_type'] == stmt_type]
print(f"\nFound {len(filtered_tables)} {stmt_type} tables")
filtered_tables

In [None]:
# View a specific table
table_idx = 0  # Change this to view different tables

table = tables[table_idx]
print(f"Table ID: {table['table_id']}")
print(f"Title: {table['title']}")
print(f"Statement Type: {table['statement_type']}")
print(f"Form: {table['filing']['form_type']}")
print(f"Currency: {table['currency']} (Scale: {table['scale']})")
print(f"\nColumns: {[col['label'] for col in table['columns']]}")
print(f"\nFirst 5 rows:")
for i, row in enumerate(table['rows'][:5]):
    print(f"  {row['label']}: {row['values']}")

## Custom Filters

In [None]:
# Example: Find all chunks from 10-K with more than 200 words
custom_filter = chunks_df[
    (chunks_df['form_type'] == '10-K') & 
    (chunks_df['word_count'] > 200)
]

print(f"Found {len(custom_filter)} chunks matching criteria")

# Show a sample
if len(custom_filter) > 0:
    sample_idx = custom_filter.index[0]
    sample = chunks[sample_idx]
    print(f"\nSample chunk (ID: {sample['chunk_id']}):")
    print(sample['text'])

In [None]:
# Advanced: Search for specific metrics in tables
metric_search = 'net sales'  # Change to search for different metrics

matching_tables = []
for table in tables:
    for row in table['rows']:
        if metric_search.lower() in row['label'].lower():
            matching_tables.append({
                'table_id': table['table_id'],
                'title': table['title'],
                'form_type': table['filing']['form_type'],
                'row_label': row['label'],
                'values': row['values']
            })
            break

print(f"\nFound {len(matching_tables)} tables with '{metric_search}'")
pd.DataFrame(matching_tables)