In [7]:
import tabula
import os
import hashlib
import json
from datetime import datetime
from pathlib import Path
import pandas as pd

In [8]:

# Can be local file path or URL
pdf_dir = "../data/raw/MSFT/10-K/PDFs/"

pdf_files = os.listdir(pdf_dir)
pdf_src = os.path.join(pdf_dir, pdf_files[0])

In [9]:
pdf_src

'../data/raw/MSFT/10-K/PDFs/MSFT_10-K_20230727_000095017023035122.pdf'

In [10]:
def detect_table_continuations(tables_metadata):
    """Detect tables that might be continuations across pages."""
    # Group by similar structure (columns, headers)
    continuation_groups = {}
    group_counter = 1
    
    for table_meta in tables_metadata:
        # Simple heuristic: same number of columns and similar headers
        key = f"{table_meta['num_columns']}_{table_meta['num_rows']}"
        
        if key not in continuation_groups:
            continuation_groups[key] = f"group_{group_counter}"
            group_counter += 1
        
        table_meta['continuation_group'] = continuation_groups[key]
    
    return tables_metadata


In [11]:

def compute_file_hash(file_path):
    """Compute SHA-256 hash of file content."""
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            sha256_hash.update(chunk)
    return sha256_hash.hexdigest()

def compute_table_id(file_id, page_number, table_index, mode, params_hash):
    """Compute deterministic table ID."""
    content = f"{file_id}_{page_number}_{table_index}_{mode}_{params_hash}"
    return hashlib.sha256(content.encode()).hexdigest()

def get_tabula_version():
    """Get tabula-py version."""
    try:
        import tabula
        return getattr(tabula, '__version__', 'unknown')
    except:
        return 'unknown'

def create_file_metadata(pdf_path, output_dir, tables_found, tables_valid, status="success"):
    """Create file-level metadata."""
    file_id = compute_file_hash(pdf_path)
    file_size = os.path.getsize(pdf_path)
    
    return {
        "file_id": file_id,
        "source_path": pdf_path,
        "filename": os.path.basename(pdf_path),
        "file_size_bytes": file_size,
        "extraction_timestamp": datetime.utcnow().isoformat() + "Z",
        "tabula_version": get_tabula_version(),
        "mode": "stream",
        "tables_found": tables_found,
        "tables_valid": tables_valid,
        "output_dir": output_dir,
        "processing_status": status
    }

def create_table_metadata(file_id, table_index, csv_path, page_number, table, 
                         is_valid, validation_reason, continuation_group=None, 
                         is_continuation=False, parent_table_id=None):
    """Create table-level metadata."""
    # Compute table ID
    params_hash = hashlib.sha256("stream_mode".encode()).hexdigest()[:8]
    table_id = compute_table_id(file_id, page_number, table_index, "stream", params_hash)
    
    # Compute CSV hash
    csv_hash = compute_file_hash(csv_path) if os.path.exists(csv_path) else None
    
    # Basic structure info
    num_rows = len(table) if table is not None else 0
    num_columns = len(table.columns) if table is not None and not table.empty else 0
    
    # Quality metrics
    numeric_cell_ratio = 0.0
    empty_cell_ratio = 0.0
    
    if table is not None and not table.empty:
        total_cells = num_rows * num_columns
        if total_cells > 0:
            # Count numeric cells
            numeric_cells = table.applymap(
                lambda x: str(x).replace(".", "", 1).isdigit()
            ).sum().sum()
            numeric_cell_ratio = numeric_cells / total_cells
            
            # Count empty cells
            empty_cells = table.isnull().sum().sum()
            empty_cell_ratio = empty_cells / total_cells
    
    return {
        "table_id": table_id,
        "file_id": file_id,
        "table_index": table_index,
        "csv_path": csv_path,
        "csv_sha256": csv_hash,
        "page_number": page_number,
        "num_rows": num_rows,
        "num_columns": num_columns,
        "is_valid_table": is_valid,
        "validation_reason": validation_reason,
        "numeric_cell_ratio": round(numeric_cell_ratio, 3),
        "empty_cell_ratio": round(empty_cell_ratio, 3),
        "continuation_group": continuation_group,
        "is_continuation": is_continuation,
        "parent_table_id": parent_table_id,
        "extraction_timestamp": datetime.utcnow().isoformat() + "Z"
    }

In [12]:
def is_valid_table(df):
    """Heuristic to filter real tables."""
    # Skip empty frames
    if df is None or df.empty:
        return False, "empty_table"
    
    # Skip if only one column (likely a paragraph)
    if df.shape[1] <= 1:
        return False, "single_column"
    
    # Keep if table has any numeric values
    has_numbers = df.map(
        lambda x: str(x).replace(".", "", 1).isdigit()
    ).any().any()
    
    if has_numbers:
        return True, "has_numbers"
    elif df.shape[1] > 1:
        return True, "multi_column"
    else:
        return False, "no_numbers_single_column"

def export_tables_to_csv_with_metadata(pdf_path: str, output_dir: str):
    """
    Extracts all tables from a PDF (stream mode) and saves only real tables as separate CSV files.
    Also generates metadata for files and tables.
    Args:
        pdf_path (str): Path to the input PDF.
        output_dir (str): Directory where CSV files will be saved.
    Returns:
        tuple: (csv_files, file_metadata, tables_metadata)
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Create metadata directory
    metadata_dir = os.path.join(output_dir, "metadata")
    os.makedirs(metadata_dir, exist_ok=True)

    # Use stream mode (better for borderless tables)
    tables = tabula.read_pdf(
        pdf_path,
        pages="all",
        multiple_tables=True,
        stream=True
    )

    # Compute file ID once
    file_id = compute_file_hash(pdf_path)
    
    output_files = []
    tables_metadata = []
    valid_table_count = 0
    
    for i, table in enumerate(tables, start=1):
        # Check if table is valid and get reason
        is_valid, validation_reason = is_valid_table(table)
        
        if is_valid:
            output_file = os.path.join(output_dir, f"table_{i}.csv")
            table.to_csv(output_file, index=False)
            output_files.append(output_file)
            valid_table_count += 1
        
        # Create metadata for all tables (valid and invalid)
        # Note: We don't have page numbers from tabula directly, so we'll use table index as proxy
        # In a more sophisticated version, you'd extract page info from tabula's area detection
        table_meta = create_table_metadata(
            file_id=file_id,
            table_index=i,
            csv_path=os.path.join(output_dir, f"table_{i}.csv") if is_valid else None,
            page_number=i,  # Simplified - tabula doesn't always provide exact page info
            table=table,
            is_valid=is_valid,
            validation_reason=validation_reason
        )
        tables_metadata.append(table_meta)
    
    # Create file-level metadata
    file_metadata = create_file_metadata(
        pdf_path=pdf_path,
        output_dir=output_dir,
        tables_found=len(tables),
        tables_valid=valid_table_count
    )
    
    # Detect continuations
    tables_metadata = detect_table_continuations(tables_metadata)
    
    # Save metadata files
    with open(os.path.join(metadata_dir, "files.jsonl"), "w") as f:
        f.write(json.dumps(file_metadata) + "\n")
    
    with open(os.path.join(metadata_dir, "tables.jsonl"), "w") as f:
        for table_meta in tables_metadata:
            f.write(json.dumps(table_meta) + "\n")
    
    print(f"Metadata saved to: {metadata_dir}")
    print(f"File metadata: {file_metadata['file_id']}")
    print(f"Tables metadata: {len(tables_metadata)} records")
    
    return output_files, file_metadata, tables_metadata

In [13]:
# Example usage with metadata
if __name__ == "__main__":
    pdf_file = pdf_src
    save_dir = "../data/parsed/tabula_output"

    csv_files, file_metadata, tables_metadata = export_tables_to_csv_with_metadata(pdf_file, save_dir)
    
    print("Exported CSV files:")
    for f in csv_files:
        print(f)
    
    print(f"\nFile metadata:")
    print(f"File ID: {file_metadata['file_id']}")
    print(f"Tables found: {file_metadata['tables_found']}")
    print(f"Valid tables: {file_metadata['tables_valid']}")
    
    print(f"\nTable metadata summary:")
    valid_tables = [t for t in tables_metadata if t['is_valid_table']]
    print(f"Valid tables: {len(valid_tables)}")
    print(f"Invalid tables: {len(tables_metadata) - len(valid_tables)}")
    
    # Show validation reasons
    validation_reasons = {}
    for t in tables_metadata:
        reason = t['validation_reason']
        validation_reasons[reason] = validation_reasons.get(reason, 0) + 1
    
    print(f"\nValidation reasons:")
    for reason, count in validation_reasons.items():
        print(f"  {reason}: {count}")


  numeric_cells = table.applymap(
  "extraction_timestamp": datetime.utcnow().isoformat() + "Z"
  numeric_cells = table.applymap(
  "extraction_timestamp": datetime.utcnow().isoformat() + "Z"


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType