# Download raw data

In [None]:
%load_ext autoreload
%autoreload 2

# Import necessary modules
import sys
from pathlib import Path

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.ingestion.sec_downloader import SECDownloader

# Create our downloader instance
downloader = SECDownloader()

# Test the connection by getting basic company info
# company_info = downloader.get_company_info('MSTR')
# print("Company Info for MSTR:")
# print(company_info)

In [None]:
# # Import required modules
# import json
# from pathlib import Path

# # Download and process filings
# filings = downloader.download_company_filings(
#     ticker='RDDT',
#     filing_types=['10-K'],  # Just annual reports
#     num_filings=2  # Get the most recent filings
# )

# print("\nDownloaded Filing Metadata:")
# for filing in filings:
#     print(f"\nFiling Type: {filing.get('type')}")
#     print(f"Filing Date: {filing.get('period_of_report', 'N/A')}")
#     print(f"Accession Number: {filing.get('accession_number', 'N/A')}")
#     print(f"File Path: {filing.get('file_path')}")
    
#     # Get metadata file path
#     doc_dir = Path(filing['file_path']).parent
#     metadata_path = doc_dir / "metadata.json"
    
#     # Read and display the saved metadata
#     if metadata_path.exists():
#         with open(metadata_path, 'r') as f:
#             metadata = json.load(f)
#             print("\nStored Metadata:")
#             print(json.dumps(metadata, indent=2))


## Download SEC data in bulk

In [None]:
%load_ext autoreload
%autoreload 2

# Import necessary modules
import sys
from pathlib import Path

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))


from knowledge_base.src.ingestion.sec_downloader import SECDownloader

downloader = SECDownloader()
tickers = ["NVDA", "META", "AMZN", "APPl", "MSFT", "ORCL", "GOOG", "PLTR"]
# tickers = ["NVDA", "META", "AMZN"]
filing_types = ["8-K", "10-K", "10-Q"]
num_filings = 12

# Option 1: Bulk download
downloader.bulk_download_companies(tickers, filing_types, num_filings)

# Option 2: Loop with granular control
# for ticker in tickers:
#     for filing_type in filing_types:
#         downloader.download_company_filings(ticker, [filing_type], num_filings=3)

# SQL Database

## Processing for SQL Database data

In [None]:

TEST_TICKER = 'RDDT'

%load_ext autoreload
%autoreload 2

from datetime import datetime
from pathlib import Path
import sys

# Add the project root to Python path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.ingestion.sec_downloader import SECDownloader
from knowledge_base.src.ingestion.sec_sql_extractor import SECDataExtractor
from knowledge_base.src.storage.sql_manager import FinancialMetricsManager


# Force reload the updated module
import importlib
import knowledge_base.src.ingestion.sql_extractor
importlib.reload(knowledge_base.src.ingestion.sec_downloader)
importlib.reload(knowledge_base.src.ingestion.sql_extractor)
importlib.reload(knowledge_base.src.storage.sql_manager)

import os
print(f"Current working directory: {os.getcwd()}")
print(f"Project root: {Path.cwd().parent}")

# Then import the extractor
from knowledge_base.src.ingestion.sql_extractor import SECDataExtractor

# Initialize components
downloader = SECDownloader()
extractor = SECDataExtractor()
sql_manager = FinancialMetricsManager()

# Step 1: Download SEC filing
filings = downloader.download_company_filings(
    ticker=TEST_TICKER,
    filing_types=['10-K'],
    num_filings=5
)

print("len(filings)", len(filings))

# Step 2: Process the filing
if filings:

    print("if filings called")
    
    filing = filings[0]
    print("filing", filing)
    file_dir = Path(filing['file_path']).parent
    
    # Look for XBRL or HTML version
    xbrl_file = next(file_dir.glob("*.xml"), None)
    html_file = next(file_dir.glob("*.htm*"), None)


    # Add client to the database before adding the document
    client_data = {
        "id": filing['ticker'],
        "company_name": filing.get('company_name', ''),
        "cik": filing.get('cik', ''),
        "industry": "",  # Fill if available
        "sector": "",    # Fill if available
        "market_cap": None  # Fill if available
    }
    sql_manager.sql_store.add_client(client_data)
    
    # Create document in SQL store first
    doc_data = {
        "document_id": filing['accession_number'],
        "client_id": filing['ticker'],
        "filing_type": filing['type'],
        "filing_date": filing['period_of_report'],
        "file_path": filing['file_path'],
        "file_size": filing['file_size'],
        "download_date": datetime.fromisoformat(filing['downloaded_at']),
        "has_revenue_data": filing['has_revenue_data'],
        "has_profit_data": filing['has_profit_data'],
        "has_balance_sheet": filing['has_balance_sheet'],
        "has_cash_flow": filing['has_cash_flow']
    }
    
    # Add document to get SQL document_id
    document_id = sql_manager.sql_store.add_document(doc_data)
    
    if document_id:

        print("IF document_id CALLED")

        # Try XBRL first, then HTML, then full submission
        if xbrl_file:
            print(f"Processing XBRL file: {xbrl_file}")
            metrics = extractor.process_document(str(xbrl_file), TEST_TICKER)
        elif html_file:
            print(f"Processing HTML file: {html_file}")
            metrics = extractor.process_document(str(html_file), TEST_TICKER)
        else:
            print(f"Processing full submission: {filing['file_path']}")
            metrics = extractor.process_document(filing['file_path'], TEST_TICKER)
            
        print(f"Extracted {len(metrics)} metrics")
        if metrics:
            sql_manager.save_extracted_metrics(metrics, document_id)
            print("Metrics saved to database")
            print("metrics", metrics)

        # Validate the extractions
        validation = sql_manager.validate_client_metrics(TEST_TICKER, 2024)
        print("\nValidation results:", validation)

        # Get comparative metrics
        comparative = sql_manager.get_comparative_metrics(
            ["AAPL", "MSFT", "GOOGL"],
            ["revenue", "net_income"],
            2024
        )
        print("\nComparative metrics:", comparative)


## Quick test for SQL DB

In [None]:
%load_ext autoreload
%autoreload 2

# Simple SQL Database Test
import sqlite3
import pandas as pd
from pathlib import Path

# Import necessary modules
import sys
from pathlib import Path

# # Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Force reload the updated module
import importlib
import knowledge_base.src.ingestion.sql_extractor
importlib.reload(knowledge_base.src.ingestion.sec_downloader)
importlib.reload(knowledge_base.src.ingestion.sql_extractor)
importlib.reload(knowledge_base.src.storage.sql_manager)

# # Database path
db_path = Path("../knowledge_base/data/financial_kb.db")

print("\n" + "="*60)
print("DETAILED DATABASE ANALYSIS")
print("="*60)

# Connect to database for detailed analysis
conn = sqlite3.connect(db_path)

try:
    # Get all tables
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
    print(f"Tables found: {tables['name'].tolist()}")
    
    # Analyze each table
    for table_name in tables['name']:
        print(f"\n--- {table_name.upper()} TABLE ---")
        
        # Get table schema
        schema = pd.read_sql_query(f"PRAGMA table_info({table_name})", conn)
        print(f"Columns: {schema['name'].tolist()}")
        
        # Get record count
        count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table_name}", conn)
        record_count = count['count'].iloc[0]
        print(f"Records: {record_count}")
        
        # Show sample data if records exist
        if record_count > 0:
            sample = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 200", conn)
            print("Sample data:")
            print(sample.to_string(index=False))
        else:
            print("(No data)")
    
    # Additional analysis
    print(f"\n=== ADDITIONAL ANALYSIS ===")
    
    # Check for any financial metrics
    metrics_count = pd.read_sql_query("SELECT COUNT(*) as count FROM financial_metrics", conn)['count'].iloc[0]
    print(f"Financial metrics extracted: {metrics_count}")
    
    # Check for any clients
    clients_count = pd.read_sql_query("SELECT COUNT(*) as count FROM clients", conn)['count'].iloc[0]
    print(f"Clients registered: {clients_count}")
    
    # Check document processing status
    docs = pd.read_sql_query("""
        SELECT 
            client_id,
            filing_type,
            filing_date,
            has_revenue_data,
            has_profit_data,
            has_balance_sheet,
            has_cash_flow,
            financial_density
        FROM documents
    """, conn)
    
    if not docs.empty:
        print(f"\nDocument processing status:")
        print(docs.to_string(index=False))
    
    # Check for any chunks
    chunks_count = pd.read_sql_query("SELECT COUNT(*) as count FROM document_chunks", conn)['count'].iloc[0]
    print(f"Document chunks created: {chunks_count}")
    
    print(f"\n✅ Database analysis complete!")
    
except Exception as e:
    print(f"❌ Error analyzing database: {e}")

finally:
    conn.close()

# Vector Database

## Create chunks for embeddings

In [None]:
# import sec_parser
# print(dir(sec_parser))
# import sec_parser.filing
# print(dir(sec_parser.filing))
# import sec_parser
# print(sec_parser.__version__)


In [1]:
%load_ext autoreload
%autoreload 2

# Import necessary modules
import sys
from pathlib import Path

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.ingestion.vector_db_chunk_raw_docs import DocumentProcessor

processor = DocumentProcessor()

# Process all docs and save chunks per document
chunks = processor.process_all()


2025-08-10 15:11:34,832 - knowledge_base.src.ingestion.vector_db_chunk_raw_docs - INFO - Loaded 0 previously processed documents
2025-08-10 15:11:34,876 - knowledge_base.src.ingestion.vector_db_chunk_raw_docs - INFO - Found 49 document folders
2025-08-10 15:11:34,877 - knowledge_base.src.ingestion.vector_db_chunk_raw_docs - INFO - Processing SEC document (HTML): /Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/raw/AMZN/10-Q/000101872424000083/000101872424000083.html
2025-08-10 15:11:34,877 - knowledge_base.src.ingestion.vector_db_chunk_raw_docs - INFO - Processing SEC document (HTML): /Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/raw/AMZN/10-Q/000101872425000036/000101872425000036.html
2025-08-10 15:11:34,877 - knowledge_base.src.ingestion.vector_db_chunk_raw_docs - INFO - Processing SEC document (HTML): /Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/raw/AMZN/10-Q/000101872423000008/000101872423000008.html
2025-08-10 15:11:34,878 - k

In [2]:
print(type(chunks))
print(len(chunks))
print(chunks)

<class 'list'>
4002
[{'text': 'Financial Instruments | 3 Months Ended\nMar. 31, 2023\nInvestments, Debt and Equity Securities [Abstract] | \nFinancial Instruments | FINANCIAL INSTRUMENTSCash, Cash Equivalents, Restricted Cash, and Marketable SecuritiesAs of December\xa031, 2022 and March\xa031, 2023, our cash, cash equivalents, restricted cash, and marketable securities primarily consisted of cash, AAA-rated money market funds, U.S. and foreign government and agency securities, other investment grade securities, and marketable equity securities. Cash equivalents and marketable securities are recorded at fair value. Fair value is defined as the price that would be received to sell an asset or paid to transfer a liability in an orderly transaction between market participants at the measurement date. To increase the comparability of fair value measures, the following hierarchy prioritizes the inputs to valuation methodologies used to measure fair value:Level\xa01—Valuations based on quote

Name: sec-parser
Version: 0.58.1
Summary: Parse SEC EDGAR HTML documents into a tree of elements that correspond to the visual structure of the document.
Home-page: https://github.com/alphanome-ai/sec-parser
Author: Alphanome.AI
Author-email: info@alphanome.ai
License: MIT
Location: /Users/daylight/Desktop/Financial Insight AI/venv/lib/python3.9/site-packages
Requires: beautifulsoup4, cssutils, frozendict, loguru, lxml, pandas, sec-downloader, tabulate, xxhash
Required-by: 


In [None]:
import sec_parser as sp
from sec_downloader import Downloader

# Initialize the downloader with your company name and email
dl = Downloader("MyCompanyName", "email@example.com")

html = dl.get_filing_html(ticker="AAPL", form="10-Q")

# html = 'knowledge_base/data/raw/AMZN/10-Q/000101872422000019/000101872422000019.html'
elements: list = sp.Edgar10QParser().parse(html)


# Utility function to make the example code a bit more compact
def print_first_n_lines(text: str, *, n: int):
    print("\n".join(text.split("\n")[:n]), "...", sep="\n")

demo_output: str = sp.render(elements)
print(elements)
print(demo_output)
# print_first_n_lines(demo_output, n=7)
# clean_text = extract_clean_text("knowledge_base/data/raw/AMZN/10-K/000101872423000004/000101872423000004.html")

In [None]:
import warnings
from sec_downloader import Downloader
import sec_parser as sp
from typing import Union

class SECFilingsParser:
    def __init__(self):
        self.downloader = Downloader("YourCompany", "your@email.com")
        
    def parse_filing(self, ticker: str, form_type: str) -> Union[sp.SemanticTree, str]:
        """Universal parser for all SEC form types"""
        html = self.downloader.get_filing_html(ticker=ticker, form=form_type)
        
        if form_type == "10-Q":
            # Use optimized 10-Q parser
            parser = sp.Edgar10QParser()
            elements = parser.parse(html)
        else:
            # Universal parsing for other forms
            parser = sp.Edgar10QParser()
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="Invalid section type for")
                elements = parser.parse(html)
        
        return sp.TreeBuilder().build(elements)

    def get_structured_text(self, tree: sp.SemanticTree) -> list[dict]:
        """Convert semantic tree to clean structured data"""
        results = []
        
        for node in tree:
            if not node.text.strip():
                continue
                
            results.append({
                "text": node.text,
                "type": node.__class__.__name__,
                "section": self._get_parent_section(node),
                "form_type": tree.metadata.get("form_type", "unknown")
            })
        
        return results
    
    def _get_parent_section(self, node):
        """Helper to find parent section title"""
        while node.parent:
            if isinstance(node, sp.TitleElement):
                return node.text
            node = node.parent
        return "Document"

In [3]:
%load_ext autoreload
%autoreload 2

from knowledge_base.config.settings import DataSettings


data_settings = DataSettings()
chunk_path = data_settings.processed_text_chunk_path

# Save chunks using the correct path from settings
# This will create one JSON file per document in knowledge_base/data/processed/vector_chunks/
processor.save_chunks(chunks, f"{chunk_path}")  # Will use settings.data.processed_text_chunk_path by default

2025-08-10 15:12:58,027 - knowledge_base.src.ingestion.vector_db_chunk_raw_docs - INFO - Saved 4002 chunks across 49 documents to /Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/processed/vector_chunks


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## New method for chunking

In [10]:
import re
import json
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
import html
from pathlib import Path

@dataclass
class Chunk:
    """Represents a text chunk with metadata"""
    content: str
    chunk_id: str
    section_type: str
    section_title: str
    chunk_index: int
    token_count: int
    contains_table: bool
    filing_type: str
    company_name: str
    filing_date: str
    cik: str
    metadata: Dict[str, Any]

class SECDocumentProcessor:
    """Process SEC full-submission.txt files into structured chunks"""
    
    def __init__(self, max_chunk_tokens: int = 600, overlap_tokens: int = 50):
        self.max_chunk_tokens = max_chunk_tokens
        self.overlap_tokens = overlap_tokens
        
        # Common SEC section patterns
        self.section_patterns = {
            'business': r'(?:item\s+1\.?\s*business|part\s+i.*?item\s+1)',
            'risk_factors': r'(?:item\s+1a\.?\s*risk\s+factors|risk\s+factors)',
            'legal_proceedings': r'(?:item\s+3\.?\s*legal\s+proceedings)',
            'financial_statements': r'(?:item\s+8\.?\s*financial\s+statements|consolidated\s+(?:balance\s+sheets?|statements?))',
            'md_a': r'(?:item\s+7\.?\s*management.{0,50}discussion|md&a)',
            'controls': r'(?:item\s+9a\.?\s*controls\s+and\s+procedures)',
            'exhibits': r'(?:item\s+15\.?\s*exhibits|exhibit\s+index)',
            'signatures': r'signatures',
            'xbrl': r'(?:<xbrl|xbrl\s+instance\s+document)'
        }
        
        # Table detection patterns
        self.table_patterns = [
            r'<table[^>]*>.*?</table>',
            r'\n\s*(?:[|\+\-\s]+\n){2,}',  # ASCII tables
            r'(?:\$\s*\([^)]+\)|\$\s*[\d,]+).*(?:\$\s*\([^)]+\)|\$\s*[\d,]+)',  # Financial data rows
        ]
    
    def extract_document_metadata(self, text: str) -> Dict[str, str]:
        """Extract basic document metadata from SEC filing"""
        metadata = {}
        
        # Extract filing type (10-K, 10-Q, etc.)
        filing_match = re.search(r'<TYPE>([^<\n]+)', text, re.IGNORECASE)
        metadata['filing_type'] = filing_match.group(1).strip() if filing_match else 'Unknown'
        
        # Extract company name
        company_match = re.search(r'COMPANY\s+CONFORMED\s+NAME:\s*([^\n]+)', text, re.IGNORECASE)
        if not company_match:
            company_match = re.search(r'<COMPANY-NAME>([^<\n]+)', text, re.IGNORECASE)
        metadata['company_name'] = company_match.group(1).strip() if company_match else 'Unknown'
        
        # Extract CIK
        cik_match = re.search(r'CENTRAL\s+INDEX\s+KEY:\s*([^\n]+)', text, re.IGNORECASE)
        if not cik_match:
            cik_match = re.search(r'<CIK>([^<\n]+)', text, re.IGNORECASE)
        metadata['cik'] = cik_match.group(1).strip() if cik_match else 'Unknown'
        
        # Extract filing date
        date_match = re.search(r'FILED\s+AS\s+OF\s+DATE:\s*([^\n]+)', text, re.IGNORECASE)
        if not date_match:
            date_match = re.search(r'<FILING-DATE>([^<\n]+)', text, re.IGNORECASE)
        metadata['filing_date'] = date_match.group(1).strip() if date_match else 'Unknown'
        
        return metadata
    
    def clean_xbrl_tags(self, text: str) -> str:
        """Convert XBRL tags to readable labels"""
        # Common XBRL namespace mappings
        xbrl_mappings = {
            'us-gaap:Revenues': 'Revenue',
            'us-gaap:CostOfRevenue': 'Cost of Revenue',
            'us-gaap:GrossProfit': 'Gross Profit',
            'us-gaap:ResearchAndDevelopmentExpense': 'R&D Expense',
            'us-gaap:SellingGeneralAndAdministrativeExpense': 'SG&A Expense',
            'us-gaap:OperatingIncomeLoss': 'Operating Income',
            'us-gaap:NetIncomeLoss': 'Net Income',
            'us-gaap:Assets': 'Total Assets',
            'us-gaap:Liabilities': 'Total Liabilities',
            'us-gaap:StockholdersEquity': 'Stockholders Equity',
            'us-gaap:Cash': 'Cash and Cash Equivalents',
        }
        
        # Replace specific mappings
        for xbrl_tag, readable_label in xbrl_mappings.items():
            text = re.sub(f'<{xbrl_tag}[^>]*>', f'{readable_label}: ', text, flags=re.IGNORECASE)
            text = re.sub(f'</{xbrl_tag.split(":")[1]}>', '', text, flags=re.IGNORECASE)
        
        # Clean remaining XBRL tags
        text = re.sub(r'</?(?:xbrl:|us-gaap:|dei:)[^>]*>', '', text, flags=re.IGNORECASE)
        text = re.sub(r'</?(?:ix:|xsi:)[^>]*>', '', text, flags=re.IGNORECASE)
        
        return text
    
    def clean_html_content(self, text: str) -> str:
        """Clean HTML tags and decode entities with better table handling"""
        # Better table cleaning - convert to readable format
        text = re.sub(r'<table[^>]*>.*?</table>', self._format_table_content, text, flags=re.IGNORECASE | re.DOTALL)
        
        # Remove other HTML tags but preserve some structure
        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'</p>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', ' ', text)
        
        # Decode HTML entities
        text = html.unescape(text)
        
        # Clean up excessive whitespace and malformed table remnants
        text = re.sub(r'\|\s*\|\s*\|', '|', text)  # Remove empty table cells
        text = re.sub(r'\n\s*\|\s*\n', '\n', text)  # Remove standalone pipes
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Normalize line breaks
        text = re.sub(r' +', ' ', text)  # Normalize spaces
        
        return text.strip()
    
    def _format_table_content(self, match) -> str:
        """Convert HTML table to readable text format"""
        table_html = match.group(0)
        
        # Extract text content from table cells
        rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
        formatted_rows = []
        
        for row in rows:
            cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
            if cells:
                # Clean cell content and join
                clean_cells = []
                for cell in cells:
                    clean_cell = re.sub(r'<[^>]+>', ' ', cell).strip()
                    clean_cell = html.unescape(clean_cell)
                    if clean_cell:  # Only include non-empty cells
                        clean_cells.append(clean_cell)
                
                if clean_cells:
                    formatted_rows.append(' | '.join(clean_cells))
        
        if formatted_rows:
            return '\n[TABLE]\n' + '\n'.join(formatted_rows) + '\n[/TABLE]\n'
        else:
            return '\n'
    
    def identify_section_type(self, text: str) -> Tuple[str, str]:
        """Identify the section type and extract title"""
        text_lower = text.lower()
        
        for section_type, pattern in self.section_patterns.items():
            if re.search(pattern, text_lower, re.IGNORECASE):
                # Extract more specific title
                title_match = re.search(r'item\s+\d+[a-z]?\.?\s*([^\n]{1,100})', text, re.IGNORECASE)
                if title_match:
                    return section_type, title_match.group(1).strip()
                else:
                    return section_type, section_type.replace('_', ' ').title()
        
        # Check for part/section headers
        part_match = re.search(r'(part\s+[ivx]+[^a-z\n]{0,50})', text, re.IGNORECASE)
        if part_match:
            return 'part', part_match.group(1).strip()
        
        return 'other', 'Document Section'
    
    def contains_table(self, text: str) -> bool:
        """Check if text contains tabular data"""
        # Check for our cleaned table markers
        if '[TABLE]' in text and '[/TABLE]' in text:
            return True
            
        # Check for financial data patterns
        dollar_lines = re.findall(r'.*\$.*\$.*', text)
        if len(dollar_lines) >= 2:  # Reduced threshold
            return True
        
        # Check for multiple pipe-separated values (table remnants)
        pipe_lines = re.findall(r'.*\|.*\|.*', text)
        if len(pipe_lines) >= 2:
            return True
            
        return False
    
    def estimate_tokens(self, text: str) -> int:
        """More accurate token estimation (3.3 characters per token average)"""
        return max(1, int(len(text) / 3.3))
    
    def split_into_chunks(self, text: str, section_type: str, section_title: str) -> List[str]:
        """Split text into appropriate chunks based on content type"""
        
        # If it's a table-heavy section, try to keep tables intact
        if self.contains_table(text):
            chunks = self._split_preserving_tables(text)
        else:
            chunks = self._split_by_paragraphs(text)
        
        return chunks
    
    def _split_preserving_tables(self, text: str) -> List[str]:
        """Split text while trying to preserve table integrity"""
        chunks = []
        
        # Split by table boundaries
        table_sections = re.split(r'(\[TABLE\].*?\[/TABLE\])', text, flags=re.DOTALL)
        
        current_chunk = ""
        
        for section in table_sections:
            if '[TABLE]' in section:
                # This is a table - try to keep it as one chunk if possible
                table_tokens = self.estimate_tokens(section)
                
                if table_tokens <= self.max_chunk_tokens:
                    # Table fits - add to current chunk or start new one
                    if current_chunk and self.estimate_tokens(current_chunk + section) > self.max_chunk_tokens:
                        chunks.append(current_chunk.strip())
                        current_chunk = section
                    else:
                        current_chunk += section
                else:
                    # Table is too large - save current chunk and split table
                    if current_chunk.strip():
                        chunks.append(current_chunk.strip())
                        current_chunk = ""
                    
                    # Split large table
                    table_chunks = self._split_large_table(section)
                    if table_chunks:
                        chunks.extend(table_chunks[:-1])
                        current_chunk = table_chunks[-1] if table_chunks else ""
            else:
                # Regular text - use paragraph splitting
                if section.strip():
                    section_chunks = self._split_by_paragraphs(section)
                    
                    for chunk in section_chunks:
                        if self.estimate_tokens(current_chunk + chunk) <= self.max_chunk_tokens:
                            current_chunk += chunk + '\n\n'
                        else:
                            if current_chunk.strip():
                                chunks.append(current_chunk.strip())
                            current_chunk = chunk + '\n\n'
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks
    
    def _split_large_table(self, table_text: str) -> List[str]:
        """Split a large table into smaller chunks"""
        chunks = []
        lines = table_text.split('\n')
        
        current_chunk = ""
        header_lines = []
        
        # Try to identify header lines (usually the first few lines)
        for i, line in enumerate(lines[:5]):
            if '|' in line or any(keyword in line.lower() for keyword in ['year', 'period', 'months', 'ended']):
                header_lines.append(line)
        
        for line in lines:
            if self.estimate_tokens(current_chunk + line) <= self.max_chunk_tokens:
                current_chunk += line + '\n'
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                
                # Start new chunk with headers if available
                current_chunk = '\n'.join(header_lines) + '\n' + line + '\n' if header_lines else line + '\n'
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks
    
    def _split_by_paragraphs(self, text: str) -> List[str]:
        """Split text by paragraphs with sentence-level fallback"""
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        chunks = []
        current_chunk = ""
        
        for para in paragraphs:
            # If paragraph alone exceeds token limit, split it by sentences
            if self.estimate_tokens(para) > self.max_chunk_tokens:
                # Finish current chunk if it has content
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
                
                # Split oversized paragraph by sentences
                sentence_chunks = self._split_by_sentences(para)
                chunks.extend(sentence_chunks[:-1])  # Add all but last
                current_chunk = sentence_chunks[-1] + '\n\n' if sentence_chunks else ""
                
            elif self.estimate_tokens(current_chunk + para) <= self.max_chunk_tokens:
                current_chunk += para + '\n\n'
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = para + '\n\n'
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks
    
    def _split_by_sentences(self, text: str) -> List[str]:
        """Split text by sentences as fallback for oversized paragraphs"""
        # Simple sentence splitting (could be improved with NLTK/spaCy)
        sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            if self.estimate_tokens(current_chunk + sentence) <= self.max_chunk_tokens:
                current_chunk += sentence + ' '
            else:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                
                # If single sentence is still too long, split by character limit
                if self.estimate_tokens(sentence) > self.max_chunk_tokens:
                    char_chunks = self._split_by_characters(sentence)
                    chunks.extend(char_chunks[:-1])
                    current_chunk = char_chunks[-1] + ' ' if char_chunks else ""
                else:
                    current_chunk = sentence + ' '
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks
    
    def _split_by_characters(self, text: str) -> List[str]:
        """Last resort: split by character count"""
        target_chars = int(self.max_chunk_tokens * 3.3)  # Convert tokens to chars
        chunks = []
        
        while len(text) > target_chars:
            # Find nearest word boundary
            split_point = text.rfind(' ', 0, target_chars)
            if split_point == -1:  # No word boundary found
                split_point = target_chars
            
            chunks.append(text[:split_point].strip())
            text = text[split_point:].strip()
        
        if text:
            chunks.append(text)
        
        return chunks
    
    def process_document(self, file_path: str) -> List[Dict[str, Any]]:
        """Main method to process an SEC document into chunks"""
        
        # Read the document
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            raw_text = f.read()
        
        # Extract document metadata
        doc_metadata = self.extract_document_metadata(raw_text)
        
        # Clean XBRL tags and HTML
        cleaned_text = self.clean_xbrl_tags(raw_text)
        cleaned_text = self.clean_html_content(cleaned_text)
        
        # Split document into major sections
        sections = self._split_into_sections(cleaned_text)
        
        chunks_data = []
        chunk_id = 0
        
        for section_text in sections:
            if len(section_text.strip()) < 50:  # Skip very short sections
                continue
            
            # Identify section type
            section_type, section_title = self.identify_section_type(section_text)
            
            # Split section into chunks
            section_chunks = self.split_into_chunks(section_text, section_type, section_title)
            
            for chunk_index, chunk_text in enumerate(section_chunks):
                chunk_obj = {
                    'content': chunk_text,
                    'chunk_id': f"{doc_metadata.get('cik', 'unknown')}_{doc_metadata.get('filing_type', 'unknown')}_{chunk_id:04d}",
                    'section_type': section_type,
                    'section_title': section_title,
                    'chunk_index': chunk_index,
                    'token_count': self.estimate_tokens(chunk_text),
                    'contains_table': self.contains_table(chunk_text),
                    'filing_type': doc_metadata.get('filing_type', 'Unknown'),
                    'company_name': doc_metadata.get('company_name', 'Unknown'),
                    'filing_date': doc_metadata.get('filing_date', 'Unknown'),
                    'cik': doc_metadata.get('cik', 'Unknown'),
                    'metadata': {
                        'source_file': Path(file_path).name,
                        'section_order': len(chunks_data),
                        'has_financial_data': bool(re.search(r'\$[\d,]+|\(\$[\d,]+\)', chunk_text)),
                        'word_count': len(chunk_text.split())
                    }
                }
                
                chunks_data.append(chunk_obj)
                chunk_id += 1
        
        return chunks_data
    
    def _split_into_sections(self, text: str) -> List[str]:
        """Split document into major sections"""
        # Split by common SEC section markers
        section_markers = [
            r'\n\s*PART\s+[IVX]+',
            r'\n\s*Item\s+\d+[A-Za-z]?\.',
            r'\n\s*ITEM\s+\d+[A-Za-z]?\.',
            r'\n\s*(?:TABLE\s+OF\s+CONTENTS|SIGNATURES)',
        ]
        
        # Combine all patterns
        combined_pattern = '|'.join(f'({pattern})' for pattern in section_markers)
        
        # Split text
        sections = re.split(combined_pattern, text, flags=re.MULTILINE | re.IGNORECASE)
        
        # Recombine sections with their headers
        result_sections = []
        current_section = ""
        
        for part in sections:
            if part and re.match(combined_pattern, part, re.IGNORECASE | re.MULTILINE):
                if current_section.strip():
                    result_sections.append(current_section)
                current_section = part
            else:
                current_section += part or ""
        
        if current_section.strip():
            result_sections.append(current_section)
        
        return [s for s in result_sections if len(s.strip()) > 100]  # Filter very short sections

# Usage example
def process_sec_file(input_file_path: str, output_file_path: str = None):
    """Process a single SEC file and save chunks as JSON"""
    
    processor = SECDocumentProcessor(max_chunk_tokens=600, overlap_tokens=50)
    chunks = processor.process_document(input_file_path)
    
    # Generate output filename if not provided
    if output_file_path is None:
        input_path = Path(input_file_path)
        output_file_path = input_path.parent / f"{input_path.stem}_chunks.json"
    
    # Save to JSON
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, indent=2, ensure_ascii=False)
    
    print(f"Processed {len(chunks)} chunks from {input_file_path}")
    print(f"Output saved to {output_file_path}")
    
    # Print summary statistics
    total_tokens = sum(chunk['token_count'] for chunk in chunks)
    table_chunks = sum(1 for chunk in chunks if chunk['contains_table'])
    sections = set(chunk['section_type'] for chunk in chunks)
    
    print(f"\nSummary:")
    print(f"- Total chunks: {len(chunks)}")
    print(f"- Total tokens: {total_tokens:,}")
    print(f"- Chunks with tables: {table_chunks}")
    print(f"- Unique sections: {len(sections)}")
    print(f"- Sections found: {', '.join(sections)}")
    
    return chunks

# Example usage:

full_submission_file_path = '../knowledge_base/data/raw/AMZN/10-K/000101872422000005/full-submission.txt'
output_path = '../knowledge_base/data/processed/test.json'

chunks = process_sec_file(full_submission_file_path, output_path)

Processed 2549 chunks from ../knowledge_base/data/raw/AMZN/10-K/000101872422000005/full-submission.txt
Output saved to ../knowledge_base/data/processed/test.json

Summary:
- Total chunks: 2549
- Total tokens: 1,359,363
- Chunks with tables: 753
- Unique sections: 6
- Sections found: other, business, financial_statements, risk_factors, signatures, part


## Agentic parsing

### Clean Text

In [15]:
# Clean Document

import re

# sec_filing_cleaner_notebook.py
#
# Description:
# This script is adapted for use in a Jupyter Notebook or similar environment.
# It takes a raw SEC filing HTML file, cleans it by removing unwanted tags 
# (like XBRL, style, script), converts tables to Markdown format, normalizes
# whitespace, and prepares the content for sectioning and chunking.
#
# Dependencies:
# You need to install the following libraries in your notebook environment:
# !pip install beautifulsoup4 lxml markdownify
#
# Usage:
# 1. Place this code in a notebook cell.
# 2. Change the `input_path` and `output_path` variables to match your file locations.
# 3. Run the cell.

import re
from bs4 import BeautifulSoup
from markdownify import markdownify as md

def clean_and_sectionize_filing(html_content: str) -> str:
    """
    Cleans the raw HTML of an SEC filing and prepares it for sectioning.

    Args:
        html_content: A string containing the raw HTML of the filing.

    Returns:
        A string containing the cleaned and sectionized text.
    """
    print("--- Starting HTML Cleaning ---")

    # 1. Parse the HTML with BeautifulSoup
    # We use the 'lxml' parser for its speed and robustness.
    soup = BeautifulSoup(html_content, 'lxml')

    # 2. Remove all script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
        
    # 3. Remove all XBRL tags (which often start with `ix:`)
    # These tags are for machine-to-machine data exchange and add noise.
    # We find all tags and check if they have a colon in their name.
    for tag in soup.find_all(True):
        if ':' in tag.name:
            # The .unwrap() method removes the tag but keeps its content.
            # This is crucial for keeping the text within XBRL tags.
            tag.unwrap()

    # 4. Convert tables to Markdown format
    # This preserves the tabular structure in a readable text format.
    for table in soup.find_all('table'):
        # markdownify is a great library for converting HTML to Markdown.
        # We replace the table tag with a text representation of the Markdown.
        table_md = md(str(table), heading_style="ATX")
        table.replace_with(table_md)

    # 5. Get the text from the soup, normalizing whitespace
    # .get_text() extracts all the text content.
    # The ' ' separator joins text from different elements with a space.
    # .strip() removes leading/trailing whitespace.
    text = soup.get_text(' ', strip=True)

    # 6. Normalize whitespace and newlines
    # Replace multiple spaces with a single space.
    text = re.sub(r'\s+', ' ', text)
    # Add newlines before "Item" sections for clear separation.
    # This regex looks for "Item" followed by a space and a number/letter,
    # ensuring it's a section header.
    text = re.sub(r'(Item\s(\d{1,2}|[A-Z]{1,2})\.)', r'\n\n\1', text)
    # Replace multiple newlines with just two, creating clean paragraphs.
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    print("--- Cleaning Complete ---")
    
    # The final text is now clean and broadly sectionized.
    return text.strip()

# --- Notebook Usage Example ---
# Define the file paths directly in your notebook.
# Make sure the input file exists at this path.
input_path = "../knowledge_base/data/raw/AMZN/10-K/000101872422000005/full-submission.txt"  # <-- CHANGE THIS to your input file
output_path = '../knowledge_base/data/processed/cleaned_text.txt' # <-- CHANGE THIS to your desired output file

try:
    # Read the raw HTML file
    print(f"Reading HTML file from: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        raw_html = f.read()

    # Process the HTML using the function defined above
    cleaned_text = clean_and_sectionize_filing(raw_html)

    # Save the cleaned text to the output file
    print(f"Saving cleaned text to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    print("\nProcessing finished successfully!")
    # You can now read the 'cleaned_text' variable or the output file
    # in subsequent notebook cells.

except FileNotFoundError:
    print(f"Error: The file '{input_path}' was not found.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



Reading HTML file from: ../knowledge_base/data/raw/AMZN/10-K/000101872422000005/full-submission.txt
--- Starting HTML Cleaning ---
--- Cleaning Complete ---
Saving cleaned text to: ../knowledge_base/data/processed/cleaned_text.txt

Processing finished successfully!


In [7]:
# Post processeing to remove CSS and binary garbage from text
import re

def postprocess_cleaned_text(text: str) -> str:
    lines = text.splitlines()
    cleaned_lines = []
    for line in lines:
        # Remove lines that look like CSS
        if '{' in line and '}' in line:
            continue
        # Remove lines that are mostly non-alphanumeric and very long (likely encoded garbage)
        if len(line) > 120 and sum(c.isalnum() for c in line) / len(line) < 0.3:
            continue
        # Optionally, stop at the first occurrence of a known metadata marker
        if line.strip().startswith("XML") or "FilingSummary.xml" in line:
            break
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()



def is_mostly_metadata(line):
    # Heuristic: skip lines with very few alphabetic characters or mostly codes/urls
    if len(line.strip()) == 0:
        return True
    # Skip lines with mostly numbers, symbols, or URLs
    if re.match(r'^[A-Z0-9\-\:\#\[\]\.\/ ]+$', line) and len(re.findall(r'[A-Za-z]', line)) < 10:
        return True
    # Skip lines with lots of us-gaap/xbrl tags
    if line.count('us-gaap:') > 2 or line.count('xbrl') > 1:
        return True
    return False

def filter_metadata_lines(text):
    return "\n".join([line for line in text.splitlines() if not is_mostly_metadata(line)])


# Read the cleaned text from file
with open('../knowledge_base/data/processed/cleaned_text.txt', 'r', encoding='utf-8') as f:
    cleaned_text = f.read()

# Post-process the cleaned text
post_processed_text = postprocess_cleaned_text(cleaned_text)
post_processed_text = filter_metadata_lines(post_processed_text)

# Save the post-processed text to a new file
post_processed_text_path = '../knowledge_base/data/processed/post_processed_text.txt'

with open(post_processed_text_path, 'w', encoding='utf-8') as f:
    f.write(post_processed_text)

### Download Model

In [None]:
# from huggingface_hub import hf_hub_download
# import os, shutil


# local_path = hf_hub_download(
#     repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF",
#     filename="mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
#     token=os.environ["HF_TOKEN"]
# )

# # Store in your project under models/llama
# model_dir = os.path.join(os.getcwd(), "models", "llama")
# os.makedirs(model_dir, exist_ok=True)

# model_dir = './models/'

# # Gated repo — requires HF login & access approval
# repo_id = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
# filename = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"  # good balance of speed & quality

# local_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=True)
# model_path = os.path.join(model_dir, filename)
# shutil.copy2(local_path, model_path)

# print("Model stored at:", model_path)


mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf:   3%|2         | 765M/26.4G [00:00<?, ?B/s]

Model stored at: ./models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf


### Chunk cleaned text with LLM

In [6]:
import json, re, hashlib, uuid
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# ------------------------
# Config (tuned for sane, small chunks)
# ------------------------
DEFAULT_TEXT_TARGET_TOKENS = 600      # ~400 words target
DEFAULT_TEXT_HARD_TOKENS   = 900      # ~600 words hard cap
DEFAULT_ROW_GROUP_SIZE     = 2        # 1–2 rows per table chunk
DEFAULT_TABLE_TOKEN_BUDGET = 250      # tight budget for row-group payload
EMBED_TABLE_OVERVIEW       = False    # display only; embed summary if used
EMBED_ROW_GROUPS           = True

# crude token estimate: ~4 chars/token

def est_tokens(s: str) -> int:
    return max(1, int(len(s) / 4))

PIPE_TABLE_HEADER_SEP = re.compile(r'^\s*\|?(?:\s*:?-{3,}:?\s*\|)+\s*$', re.MULTILINE)
TOC_PAT = re.compile(r'\b(table of contents|item\s+\d+\.?\s|exhibit[s]?:?)\b', re.I)


def _looks_like_toc(block: str) -> bool:
    head = block[:600].lower()
    return bool(TOC_PAT.search(head))


def _numeric_density(lines):
    nums = sum(len(re.findall(r'[\d\$\(\),.%]', ln)) for ln in lines)
    chars = sum(len(ln) for ln in lines) + 1
    return nums / chars if chars else 0.0


def _hash(text: str) -> str:
    return hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]


def _clean_num(s: str) -> Optional[float]:
    s = s.strip()
    if s in {"—", "–", "-", "N/A", "NA", ""}:
        return None
    neg = s.startswith('(') and s.endswith(')')
    s = s.strip('()').replace(',', '').replace('$', '')
    try:
        return -float(s) if neg else float(s)
    except ValueError:
        if s.endswith('%'):
            try:
                v = float(s[:-1])
                return -v if neg else v
            except ValueError:
                return None
    return None


def _is_probably_table(block: str) -> bool:
    if _looks_like_toc(block):
        return False
    pipe_count = block.count('|')
    lines = block.strip().splitlines()
    wide_lines = sum(1 for ln in lines if ln.count('|') >= 3)
    has_header_sep = bool(PIPE_TABLE_HEADER_SEP.search(block))
    if not has_header_sep:
        if len(lines) < 3 or wide_lines < 3:
            return False
        if _numeric_density(lines) < 0.06:
            return False
    return pipe_count >= 10 and (has_header_sep or wide_lines >= 3)


def _extract_candidate_tables(text: str) -> List[Dict[str, Any]]:
    spans = []
    for m in re.finditer(r'((?:[^\n]*\|[^\n]*\n){2,})', text):
        block = m.group(0)
        if _is_probably_table(block):
            spans.append({"content": block, "start": m.start(), "end": m.end()})
    spans.sort(key=lambda x: x["start"])
    merged = []
    for s in spans:
        if not merged or s["start"] > merged[-1]["end"]:
            merged.append(s)
        else:
            merged[-1]["end"] = max(merged[-1]["end"], s["end"])
            merged[-1]["content"] = text[merged[-1]["start"]:merged[-1]["end"]]
    return merged


def _parse_pipe_table(md: str) -> Dict[str, Any]:
    lines = [ln.rstrip() for ln in md.strip().splitlines() if ln.strip()]
    header_idx = None; sep_idx = None
    for i, ln in enumerate(lines[:5]):
        if ln.count('|') >= 2 and i+1 < len(lines) and PIPE_TABLE_HEADER_SEP.match(lines[i+1]):
            header_idx = i; sep_idx = i+1; break
    if header_idx is None:
        hdr = [c.strip() for c in lines[0].strip('|').split('|')]
        data_lines = lines[1:]
    else:
        hdr = [c.strip() for c in lines[header_idx].strip('|').split('|')]
        data_lines = lines[sep_idx+1:]
    headers = [h if h else f"col_{j}" for j, h in enumerate(hdr)]
    rows = []
    for ln in data_lines:
        if '|' in ln:
            cells = [c.strip() for c in ln.strip('|').split('|')]
            if len(cells) < len(headers):
                cells += [''] * (len(headers)-len(cells))
            elif len(cells) > len(headers):
                cells = cells[:len(headers)]
            rows.append(cells)
    return {"headers": headers, "rows": rows}


def _infer_units_from_headers(headers: List[str]) -> Dict[str, str]:
    units = {}
    for h in headers:
        hl = h.lower()
        if 'percent' in hl or 'interest rate' in hl or hl.endswith('%'):
            units[h] = '%'
        elif any(tok in hl for tok in ['$', 'usd', 'in millions', 'in billions']):
            units[h] = 'USD'
        else:
            units[h] = ''
    return units


def _normalize_row(headers: List[str], row: List[str], units_map: Dict[str, str]) -> Dict[str, Any]:
    out = {}
    for h, v in zip(headers, row):
        val = _clean_num(v)
        out[h] = val if val is not None else (v.strip() if v is not None else None)
    return out


def _readable_row(headers: List[str], row: List[str], units_map: Dict[str, str]) -> str:
    pairs = []
    for h, v in zip(headers, row):
        v = (v or '').strip()
        if v == '':
            continue
        u = units_map.get(h, '')
        pairs.append(f"{h}: {v}{(' ' + u) if u and not v.endswith(u) else ''}".strip())
    return "; ".join(pairs)


def _mask_spans(text: str, spans: List[Dict[str, Any]]) -> str:
    masked = list(text)
    for span in spans:
        for k in range(span["start"], span["end"]):
            if masked[k] != '\n':
                masked[k] = ' '
    return ''.join(masked)

SENT_SPLIT = re.compile(r'(?<=[\.!?])\s+(?=[A-Z0-9])')


def _force_cap(content: str, hard_tokens: int) -> List[Dict[str, str]]:
    max_chars = hard_tokens * 4
    if len(content) <= max_chars:
        return [{"content": content}]
    out: List[Dict[str, str]] = []
    cur: List[str] = []
    cur_len = 0
    for sent in SENT_SPLIT.split(content):
        if cur and (cur_len + len(sent)) > max_chars:
            out.append({"content": " ".join(cur)})
            cur, cur_len = [], 0
        cur.append(sent)
        cur_len += len(sent) + 1
    if cur:
        out.append({"content": " ".join(cur)})
    return out


def _pack_text_blocks(text: str, target_tokens=DEFAULT_TEXT_TARGET_TOKENS, hard_tokens=DEFAULT_TEXT_HARD_TOKENS):
    paras = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    chunks = []
    buf: List[str] = []
    start_offset = 0
    cursor = 0

    def flush():
        nonlocal buf, start_offset
        if not buf:
            return
        joined = "\n\n".join(buf)
        pieces = _force_cap(joined, hard_tokens)
        local_start = start_offset
        for piece in pieces:
            c = piece["content"]
            chunks.append({"content": c, "start": local_start, "end": local_start + len(c)})
            local_start += len(c)
        start_offset = local_start
        buf.clear()

    for p in paras:
        if buf and (est_tokens("\n\n".join(buf)) + est_tokens(p)) > target_tokens:
            flush()
        buf.append(p)
        cursor += len(p) + 2
    flush()
    return chunks


def _group_rows_with_budget(headers: List[str], rows: List[List[str]], units_map: Dict[str, str],
                            row_group_size=DEFAULT_ROW_GROUP_SIZE, token_budget=DEFAULT_TABLE_TOKEN_BUDGET
                           ) -> List[Tuple[int, int, List[Dict[str, Any]], List[str]]]:
    groups = []
    i = 0
    while i < len(rows):
        group = []
        j = i
        tokens = 0
        while j < len(rows) and len(group) < row_group_size:
            norm = _normalize_row(headers, rows[j], units_map)
            readable = _readable_row(headers, rows[j], units_map)
            row_json = json.dumps(norm, ensure_ascii=False)
            row_tokens = est_tokens(readable) + est_tokens(row_json)
            if group and (tokens + row_tokens) > token_budget:
                break
            group.append({"norm": norm, "readable": readable})
            tokens += row_tokens
            j += 1
        groups.append((i, j, group, [r["readable"] for r in group]))
        i = j if j > i else i + 1
    return groups


def _guess_section_hint(text: str) -> str:
    t = text.lower()
    if "management’s discussion and analysis" in t or "management's discussion and analysis" in t or "md&a" in t:
        return "MD&A"
    if "quantitative and qualitative disclosures about market risk" in t:
        return "Market Risk"
    if "liquidity and capital resources" in t:
        return "Liquidity"
    return ""

# -------- Optional LLM hooks (wire to your local Mixtral) --------

def summarize_table_with_llm(markdown_table: str, model_path: Optional[str]) -> str:
    """Return a 1–3 sentence summary; implement via llama.cpp/ollama. Placeholder here."""
    return ""


def alias_headers_with_llm(headers: List[str], model_path: Optional[str]) -> Dict[str, List[str]]:
    """Return synonyms/aliases per header to boost recall. Placeholder here."""
    return {}

# ------------------------
# Main API
# ------------------------

def chunk_document(
    txt_path: str,
    meta_path: str,
    *,
    doc_id: Optional[str] = None,
    model_path: Optional[str] = None,
    embed_table_overview: bool = EMBED_TABLE_OVERVIEW,
    embed_row_groups: bool = EMBED_ROW_GROUPS,
    text_target_tokens: int = DEFAULT_TEXT_TARGET_TOKENS,
    text_hard_tokens: int = DEFAULT_TEXT_HARD_TOKENS,
    row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
    table_token_budget: int = DEFAULT_TABLE_TOKEN_BUDGET
) -> List[Dict[str, Any]]:
    text = Path(txt_path).read_text(encoding='utf-8', errors='ignore')
    metadata = json.loads(Path(meta_path).read_text(encoding='utf-8', errors='ignore'))
    doc_id = doc_id or metadata.get("doc_id") or _hash(text)

    table_spans = _extract_candidate_tables(text)
    masked_text = _mask_spans(text, table_spans)

    # 1) TEXT CHUNKS
    text_chunks = _pack_text_blocks(masked_text, target_tokens=text_target_tokens, hard_tokens=text_hard_tokens)

    out: List[Dict[str, Any]] = []
    chunk_index = 0
    for tc in text_chunks:
        content = tc["content"].strip()
        if not content:
            continue
        out.append({
            "id": str(uuid.uuid4()),
            "doc_id": doc_id,
            "chunk_index": chunk_index,
            "chunk_type": "text_block",
            "content": content,
            "content_format": "text",
            "section_hint": _guess_section_hint(content),
            "source_offsets": {"start": tc["start"], "end": tc["end"]},
            "metadata": metadata,
            "original_hash": _hash(content),
            "should_embed": True
        })
        chunk_index += 1

    # 2) TABLE CHUNKS (overview + row groups)
    for t_idx, span in enumerate(table_spans):
        raw_md = span["content"].strip()
        parsed = _parse_pipe_table(raw_md)
        headers = parsed["headers"]
        units_map = _infer_units_from_headers(headers)
        header_aliases = alias_headers_with_llm(headers, model_path) if model_path else {}
        table_id = f"{doc_id}_table_{t_idx}"
        llm_summary = summarize_table_with_llm(raw_md, model_path) if model_path else ""

        overview = {
            "id": str(uuid.uuid4()),
            "doc_id": doc_id,
            "chunk_index": chunk_index,
            "chunk_type": "table_overview",
            "table_id": table_id,
            "content": raw_md,
            "content_format": "markdown",
            "col_names": headers,
            "units": units_map,
            "header_aliases": header_aliases,
            "n_rows": len(parsed["rows"]),
            "llm_summary": llm_summary,
            "source_offsets": {"start": span["start"], "end": span["end"]},
            "metadata": metadata,
            "original_hash": _hash(raw_md),
            "should_embed": bool(embed_table_overview and llm_summary)
        }
        out.append(overview)
        chunk_index += 1

        groups = _group_rows_with_budget(headers, parsed["rows"], units_map,
                                         row_group_size=row_group_size,
                                         token_budget=table_token_budget)

        KEEP_FIELDS = {
            "Security","Line Item","Category","Instrument",
            "Total","Estimated Fair Value","Weighted average interest rate",
            "2021","2022","2023","2024","2025","2026","Thereafter"
        }
        def _prune_row(headers_local, norm_row, max_fields=6):
            label_key = next((h for h in headers_local if h.lower() in (
                "security","line item","category","instrument","name")), headers_local[0])
            pruned = {}
            if label_key in norm_row:
                pruned[label_key] = norm_row[label_key]
            for h in headers_local:
                if h in KEEP_FIELDS and h in norm_row and len(pruned) < max_fields:
                    pruned[h] = norm_row[h]
            return pruned

        for start_row, end_row, group, _ in groups:
            norm_rows_full = [g["norm"] for g in group]
            norm_rows = [_prune_row(headers, r, max_fields=6) for r in norm_rows_full]
            readable_rows = [", ".join(f"{k}: {v}" for k, v in row.items() if v not in (None, "")) for row in norm_rows]
            payload_json = json.dumps({
                "headers": list(norm_rows[0].keys()) if norm_rows else headers,
                "rows": norm_rows
            }, ensure_ascii=False)
            payload_nl = "\n".join(readable_rows)

            out.append({
                "id": str(uuid.uuid4()),
                "doc_id": doc_id,
                "chunk_index": chunk_index,
                "chunk_type": "table_row_group",
                "table_id": table_id,
                "row_range": [start_row, end_row],
                "content": payload_json,
                "content_readable": payload_nl,
                "content_format": "json+text",
                "col_names": headers,
                "units": units_map,
                "header_aliases": header_aliases,
                "n_rows_in_group": len(group),
                "source_offsets": {"start": span["start"], "end": span["end"]},
                "metadata": metadata,
                "original_hash": _hash(payload_json + '|' + payload_nl),
                "should_embed": bool(embed_row_groups)
            })
            chunk_index += 1

    return out


# Convenience writer + summary

def write_chunks_jsonl(chunks: List[Dict[str, Any]], out_path: str) -> Dict[str, int]:
    with open(out_path, "w", encoding="utf-8") as f:
        for c in chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
    from collections import Counter
    c = Counter(x["chunk_type"] for x in chunks)
    sizes = sorted(len((x.get("content_readable") or x.get("content") or "")) for x in chunks)
    summary = dict(c)
    summary["min_chars"] = sizes[0] if sizes else 0
    summary["median_chars"] = sizes[len(sizes)//2] if sizes else 0
    summary["max_chars"] = sizes[-1] if sizes else 0
    return summary



text_path = '../knowledge_base/data/processed/post_processed_text.txt'
meta_path = '../knowledge_base/data/raw/AMZN/10-K/000101872422000005/metadata.json'
model_path = '../models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf'
doc_id = 1657890

chunks = chunk_document(
    text_path,
    meta_path,
    model_path=model_path,          # enables LLM hooks (optional)
    embed_table_overview=False,     # don’t embed whole tables
    embed_row_groups=True,          # embed row groups
    text_target_tokens=750,
    text_hard_tokens=1000,
    row_group_size=2,
    table_token_budget=250
)

print(len(chunks))
summary = write_chunks_jsonl(chunks, "chunks.jsonl")
print("Chunk summary:", summary)

# Peek at just the row groups:
import json, itertools
with open("chunks.jsonl", encoding="utf-8") as f:
    row_groups = (json.loads(l) for l in f)
    row_groups = [c for c in row_groups if c["chunk_type"]=="table_row_group"][:3]
for c in row_groups:
    print(c["table_id"], c["row_range"], "\n", c["content_readable"], "\n---")


33
Chunk summary: {'text_block': 3, 'table_overview': 1, 'table_row_group': 29, 'min_chars': 16, 'median_chars': 16, 'max_chars': 280664}
7c46f3bd6847b288_table_0 [0, 1] 
 Item 1.: Item 2. 
---
7c46f3bd6847b288_table_0 [1, 2] 
 Item 1.: Item 3. 
---
7c46f3bd6847b288_table_0 [2, 3] 
 Item 1.: Item 4. 
---


In [5]:
print(len(chunks))
summary = write_chunks_jsonl(chunks, "chunks.jsonl")
print("Chunk summary:", summary)

# Peek at just the row groups:
import json, itertools
with open("chunks.jsonl", encoding="utf-8") as f:
    row_groups = (json.loads(l) for l in f)
    row_groups = [c for c in row_groups if c["chunk_type"]=="table_row_group"][:3]
for c in row_groups:
    print(c["table_id"], c["row_range"], "\n", c["content_readable"], "\n---")


31
Chunk summary: {'text_block': 1, 'table_overview': 1, 'table_row_group': 29}
7c46f3bd6847b288_table_0 [0, 1] 
 Item 1.: Item 2.; [Business](#i10ffcc0db5d74ac5a2de7ca2ad731f50_13): [Properties](#i10ffcc0db5d74ac5a2de7ca2ad731f50_22); [3](#i10ffcc0db5d74ac5a2de7ca2ad731f50_13): [16](#i10ffcc0db5d74ac5a2de7ca2ad731f50_22) 
---
7c46f3bd6847b288_table_0 [1, 2] 
 Item 1.: Item 3.; [Business](#i10ffcc0db5d74ac5a2de7ca2ad731f50_13): [Legal Proceedings](#i10ffcc0db5d74ac5a2de7ca2ad731f50_25); [3](#i10ffcc0db5d74ac5a2de7ca2ad731f50_13): [16](#i10ffcc0db5d74ac5a2de7ca2ad731f50_25) 
---
7c46f3bd6847b288_table_0 [2, 3] 
 Item 1.: Item 4.; [Business](#i10ffcc0db5d74ac5a2de7ca2ad731f50_13): [Mine Safety Disclosures](#i10ffcc0db5d74ac5a2de7ca2ad731f50_28); [3](#i10ffcc0db5d74ac5a2de7ca2ad731f50_13): [16](#i10ffcc0db5d74ac5a2de7ca2ad731f50_28); Item 1B.: PART II 
---


In [7]:
print(len(chunks))

# import json
# print(json.dumps(chunks[:3], indent=2, ensure_ascii=False))
print(chunks)


31
[{'id': '4d6a303e-6344-4558-86bd-de935d8ffe90', 'doc_id': '7c46f3bd6847b288', 'chunk_index': 0, 'chunk_type': 'text_block', 'content': 'Item 15. | | | Exhibits, Financial Statement Schedules | | | (a) List of Documents Filed as a Part of This Report: (1) Index to Consolidated Financial Statements: Report of Ernst & Young LLP, Independent Registered Public Accounting Firm Consolidated Statements of Cash Flows for each of the three years ended December 31, 2021 Consolidated Statements of Operations for each of the three years ended December 31, 2021 Consolidated Statements of Comprehensive Income for each of the three years ended December 31, 2021 Consolidated Balance Sheets as of December 31, 2020 and 2021 Consolidated Statements of Stockholders’ Equity for each of the three years ended December 31, 2021 Notes to Consolidated Financial Statements Report of Ernst & Young LLP, Independent Registered Public Accounting Firm (2) Index to Financial Statement Schedules: All schedules have b

## Sec parser

In [2]:
import sec_parser as sp
from sec_downloader import Downloader

# Initialize the downloader with your company name and email
dl = Downloader("MyCompanyName", "email@example.com")

html = dl.get_filing_html(ticker="AAPL", form="10-Q")

# html = 'knowledge_base/data/raw/AMZN/10-Q/000101872422000019/000101872422000019.html'
elements: list = sp.Edgar10QParser().parse(html)


# Utility function to make the example code a bit more compact
def print_first_n_lines(text: str, *, n: int):
    print("\n".join(text.split("\n")[:n]), "...", sep="\n")

demo_output: str = sp.render(elements)
print(elements)
print(demo_output)
# print_first_n_lines(demo_output, n=7)
# clean_text = extract_clean_text("knowledge_base/data/raw/AMZN/10-K/000101872423000004/000101872423000004.html")

[IntroductorySectionElement<table>, TopSectionTitle[L0]<span>, TopSectionTitle[L1]<span>, TitleElement[L0]<div>, SupplementaryText<div>, TableElement<div>, SupplementaryText<div>, TitleElement[L0]<div>, SupplementaryText<div>, TableElement<div>, SupplementaryText<div>, TitleElement[L0]<div>, SupplementaryText<div>, TableElement<div>, SupplementaryText<div>, TitleElement[L0]<div>, SupplementaryText<div>, TableElement<div>, SupplementaryText<div>, TitleElement[L0]<div>, SupplementaryText<div>, TableElement<div>, SupplementaryText<div>, TitleElement[L1]<div>, TitleElement[L2]<div>, TitleElement[L2]<div>, TextElement<div>, TextElement<div>, TitleElement[L2]<div>, TextElement<div>, TableElement<div>, TextElement<sec-parser-merged-text>, TitleElement[L2]<div>, TextElement<div>, TableElement<div>, TitleElement[L2]<div>, TitleElement[L2]<div>, TextElement<div>, TableElement<div>, TableElement<div>, TextElement<sec-parser-merged-text>, TextElement<div>, TitleElement[L2]<div>, TextElement<div>, 

In [None]:
import sec_parser as sp
from sec_downloader import Downloader

# Initialize the downloader with your company name and email
dl = Downloader("MyCompanyName", "email@example.com")

html = dl.get_filing_html(ticker="AAPL", form="10-Q")

# html = 'knowledge_base/data/raw/AMZN/10-Q/000101872422000019/000101872422000019.html'
elements: list = sp.Edgar10QParser().parse(html)


# Utility function to make the example code a bit more compact
def print_first_n_lines(text: str, *, n: int):
    print("\n".join(text.split("\n")[:n]), "...", sep="\n")

demo_output: str = sp.render(elements)
print(elements)
print(demo_output)
# print_first_n_lines(demo_output, n=7)
# clean_text = extract_clean_text("knowledge_base/data/raw/AMZN/10-K/000101872423000004/000101872423000004.html")

## Vector DB: Get Embeddings

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

from datetime import datetime
from pathlib import Path
import sys

# Add the project root to Python path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.utils.embeddings import EmbeddingClient

embedding_client = EmbeddingClient()


test_text = "Hello World!"
embedding = embedding_client.get_embedding(test_text)
print(embedding)


test_text_list = ["Hello World!", "Why hello!"]
embeddings_list = embedding_client.get_embeddings_batch(test_text_list)
print(embeddings_list)

## Test to calculate distances of similar and different embeddings

In [None]:
import numpy as np
from knowledge_base.src.utils.embeddings import EmbeddingClient

client = EmbeddingClient()
texts = [
    "The stock market reached a record high.",  # Similar to next line
    "Penguins habitate Antarctica.",     # Similar
    "Penguins live in Antarctica.",             # Different
    "Penguins live in Antarctica!"           
]
embeddings = [client.get_embedding(text) for text in texts]


# Cosine similarity (using dot product since OpenAI embeddings are normalized)
def cosine_similarity(a, b):
    return np.dot(a, b)  # Equivalent to cosine if vectors are normalized

# Euclidean distance
def euclidean_distance(a, b):
    return np.linalg.norm(np.array(a) - np.array(b))

# Compare all pairs
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        sim = cosine_similarity(embeddings[i], embeddings[j])
        dist = euclidean_distance(embeddings[i], embeddings[j])
        print(f"Text {i+1} & {j+1}: Cosine={sim:.3f}, Euclidean={dist:.3f}")
        print(f"  Text {i+1}: {texts[i]}")
        print(f"  Text {j+1}: {texts[j]}\n")

## Qdrant DB

In [None]:
!docker run -p 6333:6333 -p 6334:6334 \
    -v $(pwd)/qdrant_storage:/qdrant/storage \
    qdrant/qdrant

In [None]:
import os
import docker
from qdrant_client import QdrantClient

def start_qdrant_container():
    client = docker.from_env()
    qdrant_storage = os.path.abspath("knowledge_base/data/qdrant_storage")
    
    # Check if Qdrant container is already running
    containers = client.containers.list(filters={"name": "qdrant"})
    if containers:
        print("Qdrant container is already running.")
        return containers[0]
    
    # Start Qdrant container if not running
    container = client.containers.run(
        image="qdrant/qdrant",
        name="qdrant",
        ports={"6333/tcp": 6333, "6334/tcp": 6334},
        volumes={qdrant_storage: {"bind": "/qdrant/storage", "mode": "rw"}},
        detach=True,
        remove=True,  # Auto-remove container when stopped
    )
    print("Qdrant container started successfully.")
    return container

# Start Qdrant and get a Python client
container = start_qdrant_container()
qdrant_client = QdrantClient(host="localhost", port=6333)

In [None]:
print(qdrant_client.get_collections())