In [8]:
# 🎯 XBRL vs TABULA DATA MATCHING
import pandas as pd
import numpy as np
import re
import os
from pathlib import Path
import json
from bs4 import BeautifulSoup
from typing import Dict, List, Any
import warnings
warnings.filterwarnings('ignore')

print("🎯 XBRL vs TABULA DATA Comaprison")
print("=" * 50)
print("Focus: 2023 Microsoft 10-K filing")


🎯 XBRL vs TABULA DATA Comaprison
Focus: 2023 Microsoft 10-K filing


In [9]:
# 📊 LOAD TABULA DATA
print("\n📊 LOADING TABULA DATA")
print("=" * 30)

tabula_dir = "../data/parsed/tabula_output"
tabula_tables = {}

# Load all CSV files
for file in os.listdir(tabula_dir):
    if file.endswith('.csv'):
        try:
            df = pd.read_csv(os.path.join(tabula_dir, file))
            table_id = file.replace('.csv', '')
            tabula_tables[table_id] = df
        except Exception as e:
            print(f"⚠️ Could not load {file}: {e}")

print(f"✅ Loaded {len(tabula_tables)} Tabula tables")

# Show sample table info
for i, (table_id, df) in enumerate(list(tabula_tables.items())[:3]):
    print(f"   {table_id}: {df.shape[0]} rows × {df.shape[1]} columns")
    # Show a sample of numeric values
    numeric_samples = []
    for col in df.columns:
        for val in df[col].dropna()[:5]:
            val_str = str(val).strip()
            if re.search(r'\d{3,}', val_str):  # Has 3+ digits
                numeric_samples.append(val_str)
    if numeric_samples:
        print(f"      Sample values: {numeric_samples[:3]}")



📊 LOADING TABULA DATA
✅ Loaded 57 Tabula tables
   table_50: 73 rows × 5 columns
      Sample values: ['2023', '14,346', '2,409']
   table_44: 8 rows × 6 columns
      Sample values: ['2023', '$52,917', '36,394']
   table_45: 16 rows × 4 columns
      Sample values: ['2023', '2022', '2021']


In [10]:
# 🏦 LOAD XBRL DATA WITH SMART FILTERING
print("\n🏦 LOADING XBRL DATA WITH SMART FILTERING")
print("=" * 45)

# Enhanced XBRL parser that focuses on common financial concepts
class EnhancedXBRLParser:
    def __init__(self):
        # Key financial concepts that commonly appear in both XBRL and PDF tables
        self.target_concepts = {
            'revenue': ['Revenues', 'Revenue', 'RevenueFromContractWithCustomer', 'SalesRevenue'],
            'operating_income': ['OperatingIncomeLoss', 'OperatingIncome', 'IncomeLossFromOperations'],
            'net_income': ['NetIncomeLoss', 'NetIncome', 'ProfitLoss'],
            'total_assets': ['Assets', 'AssetsCurrent', 'AssetsNoncurrent'],
            'cash': ['CashAndCashEquivalentsAtCarryingValue', 'Cash', 'CashAndCashEquivalents'],
            'debt': ['LongTermDebt', 'DebtCurrent', 'DebtNoncurrent'],
            'equity': ['StockholdersEquity', 'ShareholdersEquity'],
            'cost_of_revenue': ['CostOfRevenue', 'CostOfGoodsAndServicesSold'],
            'research_development': ['ResearchAndDevelopmentExpense'],
            'sales_marketing': ['SellingGeneralAndAdministrativeExpense']
        }
    
    def parse_ixbrl_html(self, file_path):
        """Parse iXBRL HTML file focusing on key financial concepts"""
        
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        
        facts = []
        
        # Find all ix:nonfraction tags (these contain numeric data)
        for tag in soup.find_all('ix:nonfraction'):
            try:
                concept = tag.get('name', '')
                context_ref = tag.get('contextref', '')
                unit_ref = tag.get('unitref', '')
                
                # Get the text value
                text_value = tag.get_text(strip=True)
                
                # More flexible numeric parsing
                if text_value:
                    # Remove common non-numeric characters but keep numbers
                    clean_text = re.sub(r'[^\d.,\-]', '', text_value)
                    
                    if clean_text and any(c.isdigit() for c in clean_text):
                        try:
                            # Handle scale attribute
                            scale = tag.get('scale', '0')
                            scale_factor = 10 ** int(scale) if scale.lstrip('-').isdigit() else 1
                            
                            # Parse the numeric value
                            clean_value = clean_text.replace(',', '')
                            if clean_value.startswith('.'):
                                clean_value = '0' + clean_value
                            
                            numeric_value = float(clean_value) * scale_factor
                            
                            # Focus on significant values and key concepts
                            if numeric_value > 100000:  # Lower threshold to catch more values
                                # Check if this concept matches our target concepts
                                concept_category = self._categorize_concept(concept)
                                
                                facts.append({
                                    'concept': concept,
                                    'concept_category': concept_category,
                                    'value': numeric_value,
                                    'text': text_value,
                                    'context': context_ref,
                                    'unit': unit_ref,
                                    'is_key_concept': concept_category != 'other'
                                })
                        except ValueError:
                            continue
            except:
                continue
        
        return facts
    
    def _categorize_concept(self, concept):
        """Categorize XBRL concept into major financial categories"""
        concept_lower = concept.lower()
        
        for category, keywords in self.target_concepts.items():
            if any(keyword.lower() in concept_lower for keyword in keywords):
                return category
        
        return 'other'

# Load XBRL data with enhanced parsing
xbrl_file = "../data/raw/MSFT/10-K/XBRL/MSFT_10-K_20230727_xbrl/msft-20230630.htm"

if os.path.exists(xbrl_file):
    parser = EnhancedXBRLParser()
    xbrl_facts = parser.parse_ixbrl_html(xbrl_file)
    
    # Separate key concepts from others
    key_facts = [f for f in xbrl_facts if f['is_key_concept']]
    other_facts = [f for f in xbrl_facts if not f['is_key_concept']]
    
    print(f"✅ Loaded {len(xbrl_facts)} total XBRL facts")
    print(f"   📊 Key financial concepts: {len(key_facts)}")
    print(f"   📋 Other concepts: {len(other_facts)}")
    
    # Sort by value and show key concepts first
    key_facts.sort(key=lambda x: x['value'], reverse=True)
    
    print(f"\n💰 TOP KEY FINANCIAL CONCEPTS:")
    for i, fact in enumerate(key_facts[:8], 1):
        print(f"   {i}. ${fact['value']/1e9:.1f}B - {fact['concept_category'].upper()}")
        print(f"      Concept: {fact['concept']}")
        print(f"      Text: '{fact['text']}' | Context: {fact['context'][:20]}...")
    
    # Combine all facts for matching
    xbrl_facts = key_facts + other_facts
    
else:
    print(f"❌ XBRL file not found: {xbrl_file}")
    xbrl_facts = []



🏦 LOADING XBRL DATA WITH SMART FILTERING
✅ Loaded 1325 total XBRL facts
   📊 Key financial concepts: 400
   📋 Other concepts: 925

💰 TOP KEY FINANCIAL CONCEPTS:
   1. $412.0B - TOTAL_ASSETS
      Concept: us-gaap:Assets
      Text: '411,976' | Context: C_d4129f7a-27bd-4daf...
   2. $412.0B - EQUITY
      Concept: us-gaap:LiabilitiesAndStockholdersEquity
      Text: '411,976' | Context: C_d4129f7a-27bd-4daf...
   3. $364.8B - TOTAL_ASSETS
      Concept: us-gaap:Assets
      Text: '364,840' | Context: C_c0e12f72-c285-4bcf...
   4. $364.8B - EQUITY
      Concept: us-gaap:LiabilitiesAndStockholdersEquity
      Text: '364,840' | Context: C_c0e12f72-c285-4bcf...
   5. $229.0B - REVENUE
      Concept: us-gaap:RevenueRemainingPerformanceObligation
      Text: '229' | Context: C_d4129f7a-27bd-4daf...
   6. $224.0B - REVENUE
      Concept: us-gaap:RevenueRemainingPerformanceObligation
      Text: '224' | Context: C_d41ed038-94a9-4379...
   7. $211.9B - REVENUE
      Concept: us-gaap:RevenueFrom

In [12]:
# 🔍 SMART TABULA VALUE EXTRACTION
print("\n🔍  TABULA VALUE EXTRACTION")
print("=" * 40)

def extract_tabula_values_smart(tabula_tables):
    """Extract numeric values with intelligent table classification"""
    
    all_values = []
    
    # Keywords that indicate financial statement tables
    financial_keywords = {
        'income_statement': ['revenue', 'income', 'profit', 'loss', 'operating', 'earnings', 'sales'],
        'balance_sheet': ['assets', 'liabilities', 'equity', 'cash', 'debt', 'inventory'],
        'cash_flow': ['cash flow', 'operating activities', 'investing', 'financing']
    }
    
    for table_id, df in tabula_tables.items():
        # Classify table type based on content
        table_text = ' '.join([str(col).lower() for col in df.columns]) + ' '
        table_text += ' '.join([str(val).lower() for val in df.values.flatten() if pd.notna(val)])
        
        table_type = 'unknown'
        for stmt_type, keywords in financial_keywords.items():
            if any(keyword in table_text for keyword in keywords):
                table_type = stmt_type
                break
        
        # Extract values with enhanced parsing
        for col_idx, col in enumerate(df.columns):
            col_name = str(col).lower()
            
            for row_idx, cell in enumerate(df[col].dropna()):
                cell_str = str(cell).strip()
                
                # Enhanced number extraction patterns
                patterns = [
                    r'[\d,]+\.?\d*',  # Standard numbers with commas
                    r'\$[\d,]+\.?\d*',  # Numbers with dollar signs
                    r'\([\d,]+\.?\d*\)',  # Numbers in parentheses (negative)
                    r'[\d,]+\.?\d*%'  # Percentages
                ]
                
                for pattern in patterns:
                    numbers = re.findall(pattern, cell_str)
                    
                    for num_str in numbers:
                        try:
                            # Clean the number
                            clean_str = re.sub(r'[^\d.,\-]', '', num_str)
                            if not clean_str or not any(c.isdigit() for c in clean_str):
                                continue
                                
                            clean_num = float(clean_str.replace(',', ''))
                            
                            # Handle negative values in parentheses
                            if '(' in num_str and ')' in num_str:
                                clean_num = -clean_num
                            
                            if abs(clean_num) >= 100:  # Lower threshold for more matches
                                # Intelligent scale detection based on context
                                likely_scales = []
                                
                                # If it's a large number (>1000), test all scales
                                if abs(clean_num) >= 1000:
                                    likely_scales = [
                                        ('raw', clean_num),
                                        ('thousands', clean_num * 1000),
                                        ('millions', clean_num * 1000000),
                                        ('billions', clean_num * 1000000000)
                                    ]
                                else:
                                    # For smaller numbers, likely already in actual units
                                    likely_scales = [
                                        ('millions', clean_num * 1000000),
                                        ('billions', clean_num * 1000000000),
                                        ('raw', clean_num)
                                    ]
                                
                                for scale_name, scaled_value in likely_scales:
                                    if abs(scaled_value) >= 100000:  # Over $100K
                                        # Determine financial category
                                        financial_category = 'other'
                                        if any(word in col_name for word in ['revenue', 'sales', 'income']):
                                            financial_category = 'revenue_income'
                                        elif any(word in col_name for word in ['asset', 'cash', 'inventory']):
                                            financial_category = 'assets'
                                        elif any(word in col_name for word in ['debt', 'liability']):
                                            financial_category = 'liabilities'
                                        
                                        all_values.append({
                                            'value': scaled_value,
                                            'original': clean_num,
                                            'scale': scale_name,
                                            'table': table_id,
                                            'table_type': table_type,
                                            'column': col,
                                            'column_category': financial_category,
                                            'row': row_idx,
                                            'text': cell_str,
                                            'is_financial': table_type != 'unknown'
                                        })
                        except:
                            continue
    
    return all_values

# Extract values with smart classification
tabula_values = extract_tabula_values_smart(tabula_tables)

print(f"📊  EXTRACTION RESULTS:")
print(f"   XBRL Values: {len(xbrl_facts):,}")
print(f"   Tabula Values (raw): {len(tabula_values):,}")

# Remove duplicates and prioritize financial tables
unique_tabula = []
seen = set()
financial_values = []
other_values = []

for item in tabula_values:
    key = (round(item['value'], 2), item['table'])
    if key not in seen:
        seen.add(key)
        if item['is_financial']:
            financial_values.append(item)
        else:
            other_values.append(item)

# Prioritize financial values
tabula_values = financial_values + other_values[:1000]  # Limit others to reduce noise

print(f"   Financial table values: {len(financial_values):,}")
print(f"   Other table values: {len(other_values):,}")
print(f"   Unique values for matching: {len(tabula_values):,}")

# Show sample of financial values by category
financial_by_type = {}
for item in financial_values:
    table_type = item['table_type']
    if table_type not in financial_by_type:
        financial_by_type[table_type] = []
    financial_by_type[table_type].append(item)

print(f"\n📋 FINANCIAL VALUES BY TABLE TYPE:")
for table_type, values in financial_by_type.items():
    top_values = sorted(values, key=lambda x: abs(x['value']), reverse=True)[:3]
    print(f"   {table_type.upper()}: {len(values)} values")
    for val in top_values:
        print(f"      ${val['value']/1e9:.1f}B ({val['scale']}) - {val['table']}")



🔍  TABULA VALUE EXTRACTION
📊  EXTRACTION RESULTS:
   XBRL Values: 1,325
   Tabula Values (raw): 4,658
   Financial table values: 3,218
   Other table values: 611
   Unique values for matching: 3,829

📋 FINANCIAL VALUES BY TABLE TYPE:
   INCOME_STATEMENT: 2871 values
      $647033115.0B (billions) - table_18
      $137919746.0B (billions) - table_18
      $122371218.0B (billions) - table_18
   BALANCE_SHEET: 347 values
      $55511.0B (billions) - table_42
      $52866.0B (billions) - table_42
      $49781.0B (billions) - table_42


In [None]:
# 🎯 INTELLIGENT MATCHING WITH CONCEPT ALIGNMENT

print("=" * 55)

def find_matches_intelligent(xbrl_facts, tabula_values, tolerance=0.02):
    """Find matches with concept category alignment and multiple tolerance levels"""
    
    matches = []
    
    # Prioritize key financial concepts
    key_xbrl_facts = [f for f in xbrl_facts if f.get('is_key_concept', False)]
    other_xbrl_facts = [f for f in xbrl_facts if not f.get('is_key_concept', False)]
    
    # Try different matching strategies
    print(f"🔍 Strategy 1: Key concept matching...")
    
    for xbrl_fact in key_xbrl_facts:
        xbrl_val = abs(xbrl_fact['value'])  # Use absolute value
        xbrl_category = xbrl_fact.get('concept_category', 'other')
        
        # Look for values in similar categories first
        similar_tabula = []
        other_tabula = []
        
        for tabula_item in tabula_values:
            if tabula_item['is_financial']:
                # Check if categories might align
                if (xbrl_category in ['revenue', 'operating_income', 'net_income'] and 
                    tabula_item['column_category'] == 'revenue_income'):
                    similar_tabula.append(tabula_item)
                elif (xbrl_category in ['total_assets', 'cash'] and 
                      tabula_item['column_category'] == 'assets'):
                    similar_tabula.append(tabula_item)
                else:
                    other_tabula.append(tabula_item)
            else:
                other_tabula.append(tabula_item)
        
        # Check similar category values first
        for tabula_item in similar_tabula + other_tabula:
            tabula_val = abs(tabula_item['value'])
            
            if tabula_val > 0 and xbrl_val > 0:
                rel_diff = abs(tabula_val - xbrl_val) / max(xbrl_val, tabula_val)
                
                if rel_diff <= tolerance:
                    match_confidence = 'high' if tabula_item in similar_tabula else 'medium'
                    
                    matches.append({
                        'xbrl_value': xbrl_fact['value'],
                        'xbrl_concept': xbrl_fact['concept'],
                        'xbrl_category': xbrl_category,
                        'xbrl_text': xbrl_fact['text'],
                        'xbrl_context': xbrl_fact['context'],
                        'tabula_value': tabula_item['value'],
                        'tabula_original': tabula_item['original'],
                        'tabula_scale': tabula_item['scale'],
                        'tabula_table': tabula_item['table'],
                        'tabula_table_type': tabula_item['table_type'],
                        'tabula_column': tabula_item['column'],
                        'tabula_column_category': tabula_item['column_category'],
                        'tabula_text': tabula_item['text'],
                        'difference_pct': rel_diff,
                        'accuracy': 100 - (rel_diff * 100),
                        'match_confidence': match_confidence,
                        'is_key_concept': True
                    })
    
    print(f"   Found {len(matches)} key concept matches")
    
    # If we have few matches, try with other facts and higher tolerance
    if len(matches) < 5:
        print(f"🔍 Strategy 2: Broader matching with {tolerance*2:.1%} tolerance...")
        
        for xbrl_fact in other_xbrl_facts[:50]:  # Limit to avoid too many comparisons
            xbrl_val = abs(xbrl_fact['value'])
            
            for tabula_item in tabula_values[:500]:  # Focus on top values
                tabula_val = abs(tabula_item['value'])
                
                if tabula_val > 0 and xbrl_val > 0:
                    rel_diff = abs(tabula_val - xbrl_val) / max(xbrl_val, tabula_val)
                    
                    if rel_diff <= tolerance * 2:  # Double tolerance for broader search
                        matches.append({
                            'xbrl_value': xbrl_fact['value'],
                            'xbrl_concept': xbrl_fact['concept'],
                            'xbrl_category': xbrl_fact.get('concept_category', 'other'),
                            'xbrl_text': xbrl_fact['text'],
                            'xbrl_context': xbrl_fact['context'],
                            'tabula_value': tabula_item['value'],
                            'tabula_original': tabula_item['original'],
                            'tabula_scale': tabula_item['scale'],
                            'tabula_table': tabula_item['table'],
                            'tabula_table_type': tabula_item['table_type'],
                            'tabula_column': tabula_item['column'],
                            'tabula_column_category': tabula_item['column_category'],
                            'tabula_text': tabula_item['text'],
                            'difference_pct': rel_diff,
                            'accuracy': 100 - (rel_diff * 100),
                            'match_confidence': 'low',
                            'is_key_concept': False
                        })
        
        print(f"   Found {len(matches)} total matches after broader search")
    
    # Remove duplicates and sort by accuracy
    unique_matches = []
    seen_pairs = set()
    
    for match in matches:
        pair_key = (match['xbrl_concept'], match['tabula_table'], round(match['tabula_original'], 2))
        if pair_key not in seen_pairs:
            seen_pairs.add(pair_key)
            unique_matches.append(match)
    
    unique_matches.sort(key=lambda x: (x['difference_pct'], -int(x['is_key_concept'])))
    
    return unique_matches

# Find matches with intelligent strategy
matches = find_matches_intelligent(xbrl_facts, tabula_values, tolerance=0.02)


print(f"   Total Matches Found: {len(matches)}")

if matches:
    # Separate by confidence
    high_confidence = [m for m in matches if m['match_confidence'] == 'high']
    medium_confidence = [m for m in matches if m['match_confidence'] == 'medium']
    low_confidence = [m for m in matches if m['match_confidence'] == 'low']
    
    print(f"   High confidence: {len(high_confidence)}")
    print(f"   Medium confidence: {len(medium_confidence)}")
    print(f"   Low confidence: {len(low_confidence)}")
    
    print(f"\n✅ TOP MATCHES:")
    for i, match in enumerate(matches[:8], 1):
        confidence_icon = "🎯" if match['match_confidence'] == 'high' else "🔍" if match['match_confidence'] == 'medium' else "📊"
        
        print(f"\n   {i}. {confidence_icon} MATCH #{i} - {match['accuracy']:.2f}% accurate")
        print(f"      🏦 XBRL: ${match['xbrl_value']/1e9:.2f}B")
        print(f"         Category: {match['xbrl_category'].upper()}")
        print(f"         Concept: {match['xbrl_concept']}")
        print(f"         Text: '{match['xbrl_text']}'")
        print(f"      📋 TABULA: ${match['tabula_value']/1e9:.2f}B")
        print(f"         Table Type: {match['tabula_table_type']}")
        print(f"         Original: {match['tabula_original']:,.0f} ({match['tabula_scale']})")
        print(f"         Table: {match['tabula_table']}")
        print(f"         Column: {match['tabula_column']}")
        print(f"         Text: '{match['tabula_text']}'")
        print(f"      📊 Difference: {match['difference_pct']:.4f}% | Confidence: {match['match_confidence']}")
        
else:
    print(f"\n❌ No matches found with intelligent strategy")
    print(f"   Let's try even broader tolerance (10%)...")
    
    # Final attempt with very broad tolerance
    broad_matches = find_matches_intelligent(xbrl_facts, tabula_values, tolerance=0.10)
    
    if broad_matches:
        print(f"   Found {len(broad_matches)} matches with 10% tolerance")
        for i, match in enumerate(broad_matches[:3], 1):
            print(f"\n   {i}. ${match['xbrl_value']/1e9:.1f}B ≈ ${match['tabula_value']/1e9:.1f}B")
            print(f"      XBRL: {match['xbrl_concept'][:40]}...")
            print(f"      Tabula: {match['tabula_table']} | {match['tabula_text']}")
            print(f"      Accuracy: {match['accuracy']:.1f}%")
            
        matches = broad_matches  # Use these for export
    else:
        # Show diagnostic information
        print(f"\n📊 DIAGNOSTIC INFORMATION:")
        
        if xbrl_facts:
            key_facts = [f for f in xbrl_facts if f.get('is_key_concept', False)]
            if key_facts:
                print(f"\n   Top XBRL Key Concepts:")
                for fact in key_facts[:5]:
                    print(f"      ${fact['value']/1e9:.1f}B - {fact['concept_category']} ({fact['concept'][:30]}...)")
        
        if tabula_values:
            financial_vals = [t for t in tabula_values if t['is_financial']][:5]
            print(f"\n   Top Tabula Financial Values:")
            for val in financial_vals:
                print(f"      ${val['value']/1e9:.1f}B ({val['scale']}) - {val['table_type']} | {val['table']}")


🔍 Strategy 1: Key concept matching...
   Found 4228 key concept matches

🎯 INTELLIGENT MATCHING RESULTS:
   Total Matches Found: 3122
   High confidence: 0
   Medium confidence: 3122
   Low confidence: 0

✅ TOP MATCHES:

   1. 🔍 MATCH #1 - 100.00% accurate
      🏦 XBRL: $411.98B
         Category: TOTAL_ASSETS
         Concept: us-gaap:Assets
         Text: '411,976'
      📋 TABULA: $411.98B
         Table Type: income_statement
         Original: 411,976 (millions)
         Table: table_22
         Column: Unnamed: 1
         Text: '$411,976'
      📊 Difference: 0.0000% | Confidence: medium

   2. 🔍 MATCH #2 - 100.00% accurate
      🏦 XBRL: $411.98B
         Category: EQUITY
         Concept: us-gaap:LiabilitiesAndStockholdersEquity
         Text: '411,976'
      📋 TABULA: $411.98B
         Table Type: income_statement
         Original: 411,976 (millions)
         Table: table_22
         Column: Unnamed: 1
         Text: '$411,976'
      📊 Difference: 0.0000% | Confidence: medium

 

In [None]:
# 📈 SUMMARY & EXPORT
print("\n📈 SUMMARY & EXPORT")
print("=" * 30)

if 'matches' in locals() and matches:
    # Create results DataFrame
    results = []
    for match in matches:
        results.append({
            'XBRL_Value_B': round(match['xbrl_value']/1e9, 3),
            'XBRL_Concept': match['xbrl_concept'],
            'XBRL_Text': match['xbrl_text'],
            'Tabula_Value_B': round(match['tabula_value']/1e9, 3),
            'Tabula_Original': match['tabula_original'],
            'Tabula_Scale': match['tabula_scale'],
            'Tabula_Table': match['tabula_table'],
            'Tabula_Column': match['tabula_column'],
            'Tabula_Text': match['tabula_text'],
            'Accuracy_%': round(match['accuracy'], 2),
            'Difference_%': round(match['difference_pct'] * 100, 4)
        })
    
    results_df = pd.DataFrame(results)
    
    # Export to CSV
    output_file = "../data/parsed/xbrl_tabula_exact_matches.csv"
    results_df.to_csv(output_file, index=False)
    
    print(f"✅ RESULTS EXPORTED:")
    print(f"   File: {output_file}")
    print(f"   Total Matches: {len(results_df)}")
    print(f"   Average Accuracy: {results_df['Accuracy_%'].mean():.2f}%")
    print(f"   Best Match: {results_df['Accuracy_%'].max():.2f}%")
    
    # Table summary
    table_counts = results_df['Tabula_Table'].value_counts()
    print(f"\n📋 MATCHES BY TABLE:")
    for table, count in table_counts.head(5).items():
        print(f"   {table}: {count} matches")
    
    # Scale summary
    scale_counts = results_df['Tabula_Scale'].value_counts()
    print(f"\n⚖️ MATCHES BY SCALE:")
    for scale, count in scale_counts.items():
        print(f"   {scale}: {count} matches")
    
    print(f"\n🎯 SUCCESS! Found {len(matches)} exact matches between XBRL and Tabula data")
    
else:
    print(f"❌ No exact matches found")
    print(f"\n🔍 DIAGNOSTIC INFO:")
    print(f"   XBRL facts loaded: {len(xbrl_facts) if 'xbrl_facts' in locals() else 0}")
    print(f"   Tabula values extracted: {len(tabula_values) if 'tabula_values' in locals() else 0}")
    print(f"   Tabula tables loaded: {len(tabula_tables) if 'tabula_tables' in locals() else 0}")
    
    if 'xbrl_facts' in locals() and 'tabula_values' in locals():
        print(f"\n📊 VALUE COMPARISON:")
        if xbrl_facts:
            top_xbrl = sorted([f['value'] for f in xbrl_facts], reverse=True)[:3]
            print(f"   Top XBRL: {[f'${v/1e9:.1f}B' for v in top_xbrl]}")
        
        if tabula_values:
            top_tabula = sorted([t['value'] for t in tabula_values], reverse=True)[:3]
            print(f"   Top Tabula: {[f'${v/1e9:.1f}B' for v in top_tabula]}")

print(f"\n" + "=" * 50)




📈 SUMMARY & EXPORT
✅ RESULTS EXPORTED:
   File: ../data/parsed/xbrl_tabula_exact_matches.csv
   Total Matches: 3122
   Average Accuracy: 99.12%
   Best Match: 100.00%

📋 MATCHES BY TABLE:
   table_23: 299 matches
   table_22: 181 matches
   table_42: 154 matches
   table_20: 142 matches
   table_57: 133 matches

⚖️ MATCHES BY SCALE:
   millions: 2584 matches
   thousands: 398 matches
   billions: 136 matches
   raw: 4 matches

🎯 SUCCESS! Found 3122 exact matches between XBRL and Tabula data

🎉 XBRL vs TABULA COMPARISON COMPLETE!
