# PDF Parser Evaluation
This notebook demonstrates the evaluation system for PDF parsing quality and regression detection.


In [17]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np


## Step 1: Load Manual Transcription Data and Create Ground Truth


In [18]:
# Load the manual transcription data
# The CSV is read with pandas' default header handling, so its first row becomes
# the column labels rather than a data row.
# NOTE(review): the recorded output lists the columns as ['PART II', 'Unnamed: 1', ...],
# i.e. the "PART II" line is consumed as the header and never appears as a data row.
# Confirm this is intended before relying on row positions downstream.
manual_data_path = '../data/manual_data/MSFT_10-K_20220728_000156459022026876-pages - MSFT_10-K_20220728_000156459022026876-pages.csv.csv'
manual_df = pd.read_csv(manual_data_path)

# Quick sanity check of what was loaded: dimensions, column labels, leading rows
print("Manual transcription data shape:", manual_df.shape)
print("\nColumns:", manual_df.columns.tolist())
print("\nFirst few rows:")
# Last expression of the cell, so the frame is shown with the notebook's rich display
manual_df.head(10)


Manual transcription data shape: (177, 4)

Columns: ['PART II', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']

First few rows:


Unnamed: 0,PART II,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Item 8,,,
1,STOCKHOLDERS’ EQUITY STATEMENTS,,,
2,"(In millions, except per share amounts)",,,
3,"Year Ended June 30,","$2,022.00","$2,021.00","$2,020.00"
4,Common stock and paid-in capital,,,
5,"Balance, beginning of period","$83,111.00","$80,552.00","$78,520.00"
6,Common stock issued,"$1,841.00","$1,963.00","$1,343.00"
7,Common stock repurchased,"-$5,688.00","-$5,539.00","-$4,599.00"
8,Stock-based compensation expense,"$7,502.00","$6,118.00","$5,289.00"
9,"Other, net",$173.00,$17.00,-$1.00


In [19]:
# Create the ground truth directory (and any missing parents) and write one
# ground truth CSV per page from the manual transcription.
ground_truth_dir = Path('../data/ground_truth')
ground_truth_dir.mkdir(parents=True, exist_ok=True)

# Positional row slices of manual_df that make up each page (pages 61-64).
# An end index of -1 is a sentinel meaning "through the last row".
page_ranges = {
    61: (0, 30),    # Stockholders' equity statements
    62: (30, 85),   # Notes to financial statements part 1
    63: (85, 140),  # Notes to financial statements part 2
    64: (140, -1)   # Notes to financial statements part 3
}

# Fixed column order, so a page with no usable rows is still written with a
# header line and can be read back with pd.read_csv.
gt_columns = ['page_number', 'line_number', 'text', 'is_table_cell', 'original_row']

for page_num, (start_idx, end_idx) in page_ranges.items():
    # Translate the -1 sentinel into an open-ended slice
    stop_idx = None if end_idx == -1 else end_idx
    page_data = manual_df.iloc[start_idx:stop_idx]

    # Destination CSV for this page
    gt_file = ground_truth_dir / f'page_{page_num}_ground_truth.csv'

    ground_truth_rows = []

    for idx, row in page_data.iterrows():
        # Keep the non-missing cells of the row, in column order, dropping
        # anything that is blank once surrounding whitespace is stripped
        text_parts = [
            cell_text
            for cell_text in (str(value).strip() for value in row[row.notna()])
            if cell_text
        ]

        # Rows with no content at all do not produce a ground truth line
        if text_parts:
            ground_truth_rows.append({
                'page_number': page_num,
                # 1-based position among the lines kept for this page
                'line_number': len(ground_truth_rows) + 1,
                'text': ' '.join(text_parts),
                # More than one populated cell is treated as a table row
                'is_table_cell': len(text_parts) > 1,
                # Index label of the source row in manual_df
                'original_row': idx
            })

    gt_df = pd.DataFrame(ground_truth_rows, columns=gt_columns)
    gt_df.to_csv(gt_file, index=False)
    print(f"Created ground truth file for page {page_num}: {len(gt_df)} lines")

print(f"\nGround truth files created in: {ground_truth_dir}")


Created ground truth file for page 61: 29 lines
Created ground truth file for page 62: 38 lines
Created ground truth file for page 63: 41 lines
Created ground truth file for page 64: 30 lines

Ground truth files created in: ../data/ground_truth


## Step 2: Load Simple Text Extraction Results


In [6]:
# Load the parsed text data from simple text extraction
# The whole extraction output is read into memory as one UTF-8 string; the file
# starts with '#'-prefixed metadata lines followed by per-page sections (see the
# recorded output of this cell).
parsed_text_file = '../data/parsed/MSFT/2022/MSFT_10-K_20220728_000156459022026876_extracted.txt'
with open(parsed_text_file, 'r', encoding='utf-8') as f:
    parsed_content = f.read()

# Show the size plus the head and tail of the file as a quick sanity check
print(f"Loaded parsed content: {len(parsed_content)} characters")
print("First 500 characters:")
print(parsed_content[:500])
print("\n...")
print("Last 500 characters:")
print(parsed_content[-500:])


Loaded parsed content: 365682 characters
First 500 characters:
# Extracted Text from MSFT_10-K_20220728_000156459022026876.pdf
# Processing Date: 2025-09-21T14:41:36.073576
# Total Pages: 111
# PDFplumber Pages: 109
# OCR Pages: 2
# Poor Quality Pages: 2


PAGE 1 | Method: pdfplumber | Quality Score: 100

UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
☒ ANNUAL REP

...
Last 500 characters:
 Rodriguez
/s/ CHARLES W. SCHARF Director
Charles W. Scharf
/s/ JOHN W. STANTON Director
John W. Stanton
/s/ JOHN W. THOMPSON Lead Independent Director
John W. Thompson
/s/ EMMA N. WALMSLEY Director
Emma N. Walmsley
/s/ PADMASREE WARRIOR Director
Padmasree Warrior
/s/ AMY E. HOOD Executive Vice President and Chief Financial Officer
Amy E. Hood (Principal Financial Officer)
/s/ ALICE L. JOLLA Corporate Vice President and Chief Accounting Officer (Principal
Alice L. Jolla Accounting Officer)
110




In [20]:
# Extract text for pages 61-64 from the parsed content
import re

# A page header is a "PAGE <n> | ..." line framed by rules of 80+ '=' characters.
# The page number is captured, so re.split returns
# [preamble, number, body, number, body, ...].
page_pattern = r'={80,}\nPAGE (\d+) \|.*?\n={80,}\n'
pages = re.split(page_pattern, parsed_content)

# Pages to pull out for evaluation
target_page_numbers = [61, 62, 63, 64]
target_pages = {}

# Pair each captured page number with the body that follows it by position.
# Guessing from the content (e.g. str.isdigit) would misread a page body made
# only of digits as a page marker.
for page_label, section in zip(pages[1::2], pages[2::2]):
    current_page = int(page_label)
    if current_page in target_page_numbers:
        target_pages[current_page] = section.strip()

# Surface missing pages here instead of letting later cells skip them silently
missing_pages = [p for p in target_page_numbers if p not in target_pages]
if missing_pages:
    print(f"Warning: no extracted text found for pages: {missing_pages}")

print("Extracted pages:", list(target_pages.keys()))
for page_num in sorted(target_pages.keys()):
    print(f"\nPage {page_num}: {len(target_pages[page_num])} characters")
    print(f"First 200 chars: {target_pages[page_num][:200]}...")


Extracted pages: [61, 62, 63, 64]

Page 61: 1104 characters
First 200 chars: PART II
Item 8
STOCKHOLDERS’ EQUITY STATEMENTS
(In millions, except per share amounts)
Year Ended June 30, 2022 2021 2020
Common stock and paid-in capital
Balance, beginning of period $ 83,111 $ 80,55...

Page 62: 3554 characters
First 200 chars: PART II
Item 8
NOTES TO FINANCIAL STATEMENTS
NOTE 1 — ACCOUNTING POLICIES
Accounting Principles
Our consolidated financial statements and accompanying notes are prepared in accordance with accounting ...

Page 63: 4485 characters
First 200 chars: PART II
Item 8
Revenue Recognition
Revenue is recognized upon transfer of control of promised products or services to customers in an amount that reflects the
consideration we expect to receive in exc...

Page 64: 4566 characters
First 200 chars: PART II
Item 8
Judgment is required to determine the SSP for each distinct performance obligation. We use a single amount to estimate SSP for
items that are not sold separately, incl

## Step 3: Implement WER/CER Calculation and Table Metrics


In [21]:
# Standard-library helpers used by the evaluation cells below
import difflib
import subprocess
import sys
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any

# jiwer provides the WER/CER implementations; install it into the running
# interpreter's environment on first use if it is not importable yet.
try:
    from jiwer import wer, cer
except ImportError:
    print("Installing jiwer...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "jiwer"])
    from jiwer import wer, cer
else:
    print("jiwer already installed")


jiwer already installed


In [22]:
@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics.

    A plain record of ten float scores describing one evaluation. All fields
    are required and positional, in the order declared below.

    NOTE(review): nothing in the visible cells instantiates this class, and the
    table_*, extraction_time and quality_score values are not computed there;
    the field meanings below are inferred from the names and should be confirmed.
    """
    # Word / character error rate (presumably as returned by jiwer's wer/cer)
    wer: float
    cer: float
    # Table extraction precision, recall and F1 -- TODO confirm how these are derived
    table_precision: float
    table_recall: float
    table_f1: float
    # Similarity ratio between predicted and ground truth text
    text_similarity: float
    # Agreement of word / line counts between prediction and ground truth
    word_count_accuracy: float
    line_count_accuracy: float
    # Time taken by the extraction -- units not established here, TODO confirm
    extraction_time: float
    # Overall quality score -- scale not established here, TODO confirm
    quality_score: float

@dataclass
class GroundTruthData:
    """Container for ground truth data.

    Holds the reference content for a single page. All fields are required
    and positional, in the order declared below.
    """
    # Page number this ground truth belongs to
    page_number: int
    # Reference text of the page as a single string
    text: str
    # Ground truth frames for the page; may be an empty list
    tables: List[pd.DataFrame]
    # Free-form extra information about the ground truth (e.g. where it was loaded from)
    metadata: Dict[str, Any]

def normalize_text(text: str) -> str:
    """Normalize text for comparison.

    The text is lowercased, every character other than word characters,
    whitespace and ``. , ! ? - $ % ( )`` is removed, and runs of whitespace
    are then collapsed to a single space with leading/trailing whitespace
    trimmed.

    Whitespace is collapsed *after* symbols are stripped: removing a
    standalone symbol (for example an em dash between two words) would
    otherwise leave a double or trailing space behind and distort
    character-level metrics.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The normalized text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Remove common OCR artifacts and formatting characters
    cleaned = re.sub(r'[^\w\s\.\,\!\?\-\$\%\(\)]', '', text.lower())

    # Collapse whitespace last so gaps left by removed symbols disappear too
    return re.sub(r'\s+', ' ', cleaned).strip()

def evaluate_text_extraction(predicted_text: str, ground_truth_text: str) -> Dict[str, float]:
    """
    Evaluate text extraction quality using various metrics.

    Both texts are passed through ``normalize_text`` before the error rates,
    similarity and word counts are computed; line counts use the raw texts.

    Args:
        predicted_text: Text produced by the parser for a page.
        ground_truth_text: Reference text for the same page.

    Returns:
        A dict with the keys ``wer``, ``cer``, ``text_similarity``,
        ``word_count_accuracy``, ``line_count_accuracy`` (both accuracies
        clamped to be >= 0), ``predicted_word_count``,
        ``ground_truth_word_count``, ``predicted_line_count`` and
        ``ground_truth_line_count``.
    """
    # Normalize texts for comparison
    pred_normalized = normalize_text(predicted_text)
    gt_normalized = normalize_text(ground_truth_text)

    if not gt_normalized:
        # An empty reference cannot be scored by an error-rate metric (jiwer is
        # expected to reject it -- see its docs). Score it explicitly instead:
        # a matching empty prediction is perfect, anything else is fully wrong.
        wer_score = cer_score = 0.0 if not pred_normalized else 1.0
    else:
        # Best effort: a failing metric is reported and scored as worst case
        try:
            wer_score = wer(gt_normalized, pred_normalized)
        except Exception as e:
            print(f"Warning: WER calculation failed: {e}")
            wer_score = 1.0

        try:
            cer_score = cer(gt_normalized, pred_normalized)
        except Exception as e:
            print(f"Warning: CER calculation failed: {e}")
            cer_score = 1.0

    # Text similarity using SequenceMatcher
    # NOTE(review): difflib's autojunk heuristic is enabled by default for long
    # sequences; consider autojunk=False if exact ratios matter more than speed.
    similarity = difflib.SequenceMatcher(None, pred_normalized, gt_normalized).ratio()

    # Word count accuracy: 1 minus the count difference relative to the ground truth
    pred_words = len(pred_normalized.split())
    gt_words = len(gt_normalized.split())
    word_count_accuracy = 1 - abs(pred_words - gt_words) / max(gt_words, 1)

    # Line count accuracy on the raw texts; empty/None text has zero lines
    pred_lines = len(predicted_text.split('\n')) if predicted_text else 0
    gt_lines = len(ground_truth_text.split('\n')) if ground_truth_text else 0
    line_count_accuracy = 1 - abs(pred_lines - gt_lines) / max(gt_lines, 1)

    return {
        'wer': wer_score,
        'cer': cer_score,
        'text_similarity': similarity,
        'word_count_accuracy': max(0, word_count_accuracy),
        'line_count_accuracy': max(0, line_count_accuracy),
        'predicted_word_count': pred_words,
        'ground_truth_word_count': gt_words,
        'predicted_line_count': pred_lines,
        'ground_truth_line_count': gt_lines
    }

print("Evaluation functions defined successfully!")


Evaluation functions defined successfully!


## Step 4: Run Evaluation on Pages 61-64


In [23]:
# Build one GroundTruthData record per page whose ground truth CSV exists on disk
ground_truth_data = {}

for page_num in [61, 62, 63, 64]:
    gt_file = ground_truth_dir / f'page_{page_num}_ground_truth.csv'
    if not gt_file.exists():
        # Pages without a ground truth file are simply left out of the mapping
        continue

    df = pd.read_csv(gt_file)

    if 'text' not in df.columns:
        # No dedicated text column: gather the non-null values of every
        # object-dtype column, column by column, into one space-separated string
        text_parts = []
        for col in df.columns:
            if df[col].dtype == 'object':
                text_parts += df[col].dropna().astype(str).tolist()
        text = ' '.join(text_parts)
    else:
        # One ground truth line per row, newline-separated
        text = '\n'.join(df['text'].dropna().astype(str))

    ground_truth_data[page_num] = GroundTruthData(
        page_number=page_num,
        text=text,
        # The whole frame is kept as the page's single "table" unless it is empty
        tables=[] if df.empty else [df],
        metadata={'source_file': str(gt_file)}
    )

print(f"Loaded ground truth for {len(ground_truth_data)} pages")
print("Ground truth pages:", list(ground_truth_data.keys()))


Loaded ground truth for 4 pages
Ground truth pages: [61, 62, 63, 64]


In [24]:
# Score every target page that has both ground truth and extracted text
evaluation_results = []

for page_num in [61, 62, 63, 64]:
    if page_num not in ground_truth_data or page_num not in target_pages:
        # Nothing to compare for this page
        continue

    print(f"\n=== Evaluating Page {page_num} ===")

    gt_text = ground_truth_data[page_num].text
    pred_text = target_pages[page_num]

    # Compare the extracted text against the reference
    text_metrics = evaluate_text_extraction(pred_text, gt_text)

    print(f"WER: {text_metrics['wer']:.3f}")
    print(f"CER: {text_metrics['cer']:.3f}")
    print(f"Text Similarity: {text_metrics['text_similarity']:.3f}")
    print(f"Word Count Accuracy: {text_metrics['word_count_accuracy']:.3f}")
    print(f"Line Count Accuracy: {text_metrics['line_count_accuracy']:.3f}")

    # One flat record per page: the page number followed by every metric
    # returned by evaluate_text_extraction, in the order it returns them
    page_result = {'page_number': page_num, **text_metrics}
    evaluation_results.append(page_result)

print(f"\n=== SUMMARY ===")
print(f"Evaluated {len(evaluation_results)} pages")



=== Evaluating Page 61 ===
WER: 0.453
CER: 0.203
Text Similarity: 0.580
Word Count Accuracy: 0.963
Line Count Accuracy: 0.966

=== Evaluating Page 62 ===
WER: 0.058
CER: 0.047
Text Similarity: 0.976
Word Count Accuracy: 0.957
Line Count Accuracy: 0.955

=== Evaluating Page 63 ===
WER: 0.306
CER: 0.279
Text Similarity: 0.843
Word Count Accuracy: 0.756
Line Count Accuracy: 0.865

=== Evaluating Page 64 ===
WER: 0.534
CER: 0.474
Text Similarity: 0.808
Word Count Accuracy: 0.466
Line Count Accuracy: 0.533

=== SUMMARY ===
Evaluated 4 pages


## Step 5: Aggregate Metrics and Unit Tests with Thresholds


In [25]:
# Summarize the per-page results into notebook-wide aggregate metrics
if not evaluation_results:
    print("No evaluation results available")
else:
    df_results = pd.DataFrame(evaluation_results)

    aggregate_metrics = {
        # Averages across the evaluated pages
        'mean_wer': df_results['wer'].mean(),
        'mean_cer': df_results['cer'].mean(),
        'mean_text_similarity': df_results['text_similarity'].mean(),
        'mean_word_count_accuracy': df_results['word_count_accuracy'].mean(),
        'mean_line_count_accuracy': df_results['line_count_accuracy'].mean(),
        # Spread of the error rates
        'std_wer': df_results['wer'].std(),
        'std_cer': df_results['cer'].std(),
        # Worst and best page by text similarity
        'min_text_similarity': df_results['text_similarity'].min(),
        'max_text_similarity': df_results['text_similarity'].max(),
        # Bookkeeping
        'pages_evaluated': len(evaluation_results),
        'evaluation_timestamp': datetime.now().isoformat()
    }

    print("=== AGGREGATE METRICS ===")
    for metric, value in aggregate_metrics.items():
        # Floats are shown with three decimals; counts and timestamps as-is
        print(f"{metric}: {value:.3f}" if isinstance(value, float) else f"{metric}: {value}")


=== AGGREGATE METRICS ===
mean_wer: 0.338
mean_cer: 0.251
mean_text_similarity: 0.802
mean_word_count_accuracy: 0.786
mean_line_count_accuracy: 0.830
std_wer: 0.209
std_cer: 0.177
min_text_similarity: 0.580
max_text_similarity: 0.976
pages_evaluated: 4
evaluation_timestamp: 2025-09-26T01:34:57.426784


In [26]:
# Unit tests with metric thresholds
def test_parsing_quality_thresholds(metrics: dict) -> dict:
    """
    Test parsing quality against predefined thresholds.
    Returns test results with pass/fail status.
    """
    # Define quality thresholds
    thresholds = {
        'max_acceptable_wer': 0.5,      # WER should be < 0.5 (50% error rate)
        'max_acceptable_cer': 0.3,      # CER should be < 0.3 (30% error rate)
        'min_text_similarity': 0.6,     # Text similarity should be > 0.6
        'min_word_accuracy': 0.7,       # Word count accuracy should be > 0.7
        'min_line_accuracy': 0.5        # Line count accuracy should be > 0.5
    }
    
    test_results = {
        'tests_passed': 0,
        'tests_failed': 0,
        'test_details': {}
    }
    
    # Test WER
    wer_pass = metrics['mean_wer'] < thresholds['max_acceptable_wer']
    test_results['test_details']['wer_test'] = {
        'passed': wer_pass,
        'value': metrics['mean_wer'],
        'threshold': thresholds['max_acceptable_wer'],
        'message': f"WER {metrics['mean_wer']:.3f} {'<' if wer_pass else '>='} {thresholds['max_acceptable_wer']}"
    }
    if wer_pass:
        test_results['tests_passed'] += 1
    else:
        test_results['tests_failed'] += 1
    
    # Test CER
    cer_pass = metrics['mean_cer'] < thresholds['max_acceptable_cer']
    test_results['test_details']['cer_test'] = {
        'passed': cer_pass,
        'value': metrics['mean_cer'],
        'threshold': thresholds['max_acceptable_cer'],
        'message': f"CER {metrics['mean_cer']:.3f} {'<' if cer_pass else '>='} {thresholds['max_acceptable_cer']}"
    }
    if cer_pass:
        test_results['tests_passed'] += 1
    else:
        test_results['tests_failed'] += 1
    
    # Test Text Similarity
    sim_pass = metrics['mean_text_similarity'] > thresholds['min_text_similarity']
    test_results['test_details']['similarity_test'] = {
        'passed': sim_pass,
        'value': metrics['mean_text_similarity'],
        'threshold': thresholds['min_text_similarity'],
        'message': f"Text Similarity {metrics['mean_text_similarity']:.3f} {'>' if sim_pass else '<='} {thresholds['min_text_similarity']}"
    }
    if sim_pass:
        test_results['tests_passed'] += 1
    else:
        test_results['tests_failed'] += 1
    
    # Test Word Count Accuracy
    word_pass = metrics['mean_word_count_accuracy'] > thresholds['min_word_accuracy']
    test_results['test_details']['word_accuracy_test'] = {
        'passed': word_pass,
        'value': metrics['mean_word_count_accuracy'],
        'threshold': thresholds['min_word_accuracy'],
        'message': f"Word Accuracy {metrics['mean_word_count_accuracy']:.3f} {'>' if word_pass else '<='} {thresholds['min_word_accuracy']}"
    }
    if word_pass:
        test_results['tests_passed'] += 1
    else:
        test_results['tests_failed'] += 1
    
    # Test Line Count Accuracy
    line_pass = metrics['mean_line_count_accuracy'] > thresholds['min_line_accuracy']
    test_results['test_details']['line_accuracy_test'] = {
        'passed': line_pass,
        'value': metrics['mean_line_count_accuracy'],
        'threshold': thresholds['min_line_accuracy'],
        'message': f"Line Accuracy {metrics['mean_line_count_accuracy']:.3f} {'>' if line_pass else '<='} {thresholds['min_line_accuracy']}"
    }
    if line_pass:
        test_results['tests_passed'] += 1
    else:
        test_results['tests_failed'] += 1
    
    test_results['overall_pass'] = test_results['tests_failed'] == 0
    
    return test_results

# Run the tests
# Only run when the aggregate metrics cell produced results in this session
if 'aggregate_metrics' in locals():
    test_results = test_parsing_quality_thresholds(aggregate_metrics)

    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
    print("\n=== QUALITY THRESHOLD TESTS ===")
    print(f"Tests Passed: {test_results['tests_passed']}")
    print(f"Tests Failed: {test_results['tests_failed']}")
    print(f"Overall Result: {'PASS' if test_results['overall_pass'] else 'FAIL'}")

    print("\nDetailed Results:")
    for test_name, details in test_results['test_details'].items():
        status = "✅ PASS" if details['passed'] else "❌ FAIL"
        print(f"  {test_name}: {status} - {details['message']}")

    # Call out a failing run explicitly so it is not missed when skimming
    if not test_results['overall_pass']:
        print("\n⚠️  PARSER QUALITY BELOW ACCEPTABLE THRESHOLDS!")
        print("Consider improving the text extraction algorithm.")
else:
    print("No metrics available for testing")


\n=== QUALITY THRESHOLD TESTS ===
Tests Passed: 5
Tests Failed: 0
Overall Result: PASS
\nDetailed Results:
  wer_test: ✅ PASS - WER 0.338 < 0.5
  cer_test: ✅ PASS - CER 0.251 < 0.3
  similarity_test: ✅ PASS - Text Similarity 0.802 > 0.6
  word_accuracy_test: ✅ PASS - Word Accuracy 0.786 > 0.7
  line_accuracy_test: ✅ PASS - Line Accuracy 0.830 > 0.5
