# Step 3: Information Extraction Using YAML

This notebook demonstrates information extraction from classified document sections using AWS Bedrock with **YAML output format** instead of JSON. It showcases the new YAML parsing capability; YAML responses typically use 10-30% fewer tokens than the equivalent JSON.

**Inputs:**
- Document object with classification results from Step 2
- Extraction configuration (modified for YAML output)
- Document classes with attributes definition

**Outputs:**
- Document with extraction results for each section
- Structured data extracted in YAML format and automatically parsed

**Key Differences from JSON Version:**
- Modified prompts to request YAML output format
- Demonstrates automatic YAML detection and parsing
- Shows token efficiency benefits of YAML over JSON

## 1. Load Previous Step Data

In [None]:
import os
import json
import time
import logging
import boto3
from pathlib import Path

# Import IDP libraries
from idp_common.models import Document, Status
from idp_common import extraction

# Logging setup: WARNING is the baseline for everything, while the two IDP
# loggers named below are raised to INFO so their progress messages show up.
logging.basicConfig(level=logging.WARNING)
for verbose_logger_name in ('idp_common.extraction', 'idp_common.bedrock.client'):
    logging.getLogger(verbose_logger_name).setLevel(logging.INFO)

print("Libraries imported successfully")

In [None]:
# Load document from previous step
classification_data_dir = Path(".data/step2_classification")

# Rebuild the Document object from the JSON file found in the Step 2 directory
document_path = classification_data_dir / "document.json"
with open(document_path, 'r') as f:
    document = Document.from_json(f.read())

# Load configuration directly from config files
import yaml
config_dir = Path("config")
CONFIG = {}

# Each file contributes top-level keys that are merged into CONFIG;
# a key defined in a later file overrides the same key from an earlier one.
config_files = [
    "extraction.yaml",
    "classes.yaml"
]

for config_file in config_files:
    config_path = config_dir / config_file
    if config_path.exists():
        # Read YAML as UTF-8 explicitly so prompts containing non-ASCII
        # characters load the same way regardless of the platform locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            file_config = yaml.safe_load(f)
        # yaml.safe_load returns None for an empty file; dict.update(None)
        # would raise TypeError, so treat an empty file as "no settings".
        CONFIG.update(file_config or {})
        print(f"Loaded {config_file}")
    else:
        # A missing file is reported but not fatal; later cells may still
        # fail if they need keys that file would have provided.
        print(f"Warning: {config_file} not found")

# Load environment info
env_path = classification_data_dir / "environment.json"
with open(env_path, 'r') as f:
    env_info = json.load(f)

# Set environment variables
os.environ['AWS_REGION'] = env_info['region']
os.environ['METRIC_NAMESPACE'] = 'IDP-Modular-Pipeline'

print(f"Loaded document: {document.id}")
print(f"Document status: {document.status.value}")
print(f"Number of sections: {len(document.sections) if document.sections else 0}")
print(f"Loaded configuration sections: {list(CONFIG.keys())}")

## 2. Configure Extraction Service for YAML Output

In [None]:
# Modify extraction configuration to request YAML output instead of JSON.
# A shallow copy is edited and then written back into CONFIG below.
extraction_config = CONFIG.get('extraction', {}).copy()

# Update system prompt to request YAML instead of JSON.
# Direct indexing is intentional: a missing prompt raises KeyError here
# rather than producing an empty prompt that fails later.
original_system_prompt = extraction_config['system_prompt']
extraction_config['system_prompt'] = original_system_prompt.replace(
    'Respond only with JSON',
    'Respond only with YAML'
)

# Update task prompt to request YAML format
original_task_prompt = extraction_config['task_prompt']

# (JSON phrase, YAML phrase) pairs, applied to the task prompt in this order.
json_to_yaml_phrases = [
    (
        'convert it into a well-organized table format using JSON',
        'convert it into a well-organized table format using YAML',
    ),
    (
        'use them as keys in the JSON object',
        'use them as keys in the YAML structure',
    ),
    (
        'populate the corresponding values in the JSON object',
        'populate the corresponding values in the YAML structure',
    ),
    (
        'properly formatted within the JSON structure',
        'properly formatted within the YAML structure',
    ),
    (
        'Include double quotes around all keys and values',
        'Use proper YAML syntax with keys followed by colons and values',
    ),
    (
        'return a JSON object',
        'return a YAML document',
    ),
    (
        'Ensure the output is properly formatted JSON with quoted keys and values',
        'Ensure the output is properly formatted YAML with correct indentation and syntax',
    ),
]

yaml_task_prompt = original_task_prompt
for json_phrase, yaml_phrase in json_to_yaml_phrases:
    yaml_task_prompt = yaml_task_prompt.replace(json_phrase, yaml_phrase)

extraction_config['task_prompt'] = yaml_task_prompt

# str.replace does nothing when a phrase is absent, so a config with different
# wording would pass through unchanged. Say so instead of silently labelling
# an unmodified JSON prompt as the "YAML version".
if extraction_config['system_prompt'] == original_system_prompt:
    print("Warning: no JSON instruction was replaced in system_prompt; the model may still be asked for JSON")
if yaml_task_prompt == original_task_prompt:
    print("Warning: no JSON instruction was replaced in task_prompt; the model may still be asked for JSON")

# Update the CONFIG with our modified extraction config
CONFIG['extraction'] = extraction_config

print("Extraction Configuration (Modified for YAML):")
print(f"Model: {extraction_config.get('model')}")
print(f"Temperature: {extraction_config.get('temperature')}")
print(f"Max Tokens: {extraction_config.get('max_tokens')}")
print("*"*50)

print(f"System Prompt (YAML version):\n{extraction_config.get('system_prompt')}")
print("*"*50)
print(f"Task Prompt (YAML version - first 500 chars):\n{extraction_config.get('task_prompt')[:500]}...")
print("*"*50)

In [None]:
# List every configured document class with a short preview of its attributes
classes = CONFIG.get('classes', [])
print(f"\nDocument Classes and Attributes:")
for doc_class in classes:
    class_id = doc_class['$id']
    class_attributes = doc_class.get('attributes', [])
    print(f"\n{class_id} ({len(class_attributes)} attributes):")
    # Preview only the first three attributes to keep the output short
    for attribute in class_attributes[:3]:
        print(f"  - {attribute['$id']}: {attribute['description'][:100]}...")
    hidden_count = len(class_attributes) - 3
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more")

In [None]:
# Create extraction service with Bedrock using YAML-configured prompts.
# CONFIG is passed whole; it carries the 'extraction' section modified above.
extraction_service = extraction.ExtractionService(config=CONFIG)

print("Extraction service initialized with YAML configuration")
# The leading emoji had been corrupted by a UTF-8/Mac Roman mix-up; restored.
print("🔄 The service will now automatically detect and parse YAML responses from the LLM")

## 3. Extract Information from Document Sections Using YAML

In [None]:
# Helper functions to parse S3 URIs and load JSON
def parse_s3_uri(uri):
    """Split an S3 URI into its bucket name and object key.

    Args:
        uri: A string such as ``"s3://bucket/path/to/object.json"``. A value
            without the ``s3://`` scheme is accepted and split the same way.

    Returns:
        A ``(bucket, key)`` tuple. ``key`` is ``""`` when the URI names only
        a bucket.
    """
    scheme = "s3://"
    # Strip the scheme only where it leads the URI. A blanket str.replace
    # would also delete "s3://" appearing inside the object key.
    path = uri[len(scheme):] if uri.startswith(scheme) else uri
    # Everything before the first "/" is the bucket; the rest is the key.
    bucket, _, key = path.partition("/")
    return bucket, key

def load_json_from_s3(uri):
    """Fetch the S3 object addressed by *uri* and return it parsed as JSON.

    The object body is decoded as UTF-8 before parsing. A new boto3 S3
    client is created on every call.
    """
    client = boto3.client('s3')
    bucket_name, object_key = parse_s3_uri(uri)
    body_stream = client.get_object(Bucket=bucket_name, Key=object_key)['Body']
    return json.loads(body_stream.read().decode('utf-8'))

print("Helper functions defined")

In [None]:
print("Extracting information from document sections using YAML prompts...")

if not document.sections:
    print("No sections found in document. Cannot proceed with extraction.")
else:
    # One summary dict per processed section; read again when results are saved
    extraction_results = []

    # Process each section (limit to first 3 to save time in demo)
    n = min(3, len(document.sections))
    print(f"Processing first {n} of {len(document.sections)} sections...")

    # The slice is evaluated once, so rebinding `document` inside the loop
    # does not change which sections are visited.
    for i, section in enumerate(document.sections[:n]):
        print(f"\n--- Processing Section {i+1}/{n} ---")
        print(f"Section ID: {section.section_id}")
        print(f"Classification: {section.classification}")
        print(f"Pages: {section.page_ids}")

        # Process section extraction
        start_time = time.time()
        document = extraction_service.process_document_section(
            document=document,
            section_id=section.section_id
        )
        extraction_time = time.time() - start_time

        print(f"✅ YAML extraction completed in {extraction_time:.2f} seconds")
        print(f"📊 The LLM response was automatically detected and parsed as structured data")

        # Read the result URI from the section inside the *returned* document.
        # `section` came from the document as it was before the call and may
        # be stale if the service returned a new Document object. Fall back
        # to `section` when no matching section_id is found.
        updated_section = next(
            (s for s in (document.sections or []) if s.section_id == section.section_id),
            section,
        )

        # Record results
        extraction_results.append({
            'section_id': section.section_id,
            'classification': section.classification,
            'processing_time': extraction_time,
            'extraction_result_uri': getattr(updated_section, 'extraction_result_uri', None)
        })

    print(f"\n🎉 YAML-based extraction complete for {n} sections.")
    print(f"💡 Token efficiency: YAML typically uses 10-30% fewer tokens than equivalent JSON")

## 4. Display YAML Extraction Results

In [None]:
print("\n=== YAML Extraction Results ===")
print("📝 Note: Even though the LLM generated YAML, the results are automatically")
print("   converted to Python dictionaries for easy programmatic access.")
print("")

if document.sections:
    # `n` is the number of sections handled by the extraction loop; only
    # those sections can have results to show.
    for section in document.sections[:n]:
        print(f"\n--- Section {section.section_id} ({section.classification}) ---")

        result_uri = getattr(section, 'extraction_result_uri', None)
        if result_uri:
            try:
                # Load extraction results from S3
                extraction_data = load_json_from_s3(result_uri)

                print(f"Extraction Result URI: {result_uri}")

                # Display inference results
                if 'inference_result' in extraction_data:
                    inference_result = extraction_data['inference_result']
                    print("📋 Extracted Data (originally from YAML):")
                    for attr_name, attr_value in inference_result.items():
                        if attr_value is not None:
                            # Truncate long values for display
                            value_text = str(attr_value)
                            display_value = value_text[:1000] + "..." if len(value_text) > 1000 else attr_value
                            print(f"  {attr_name}: {display_value}")
                        else:
                            print(f"  {attr_name}: null")
                else:
                    print("No inference results found")

                # Display metadata if available
                if 'metadata' in extraction_data:
                    metadata = extraction_data['metadata']
                    print(f"⏱️  Processing time: {metadata.get('extraction_time_seconds', 'N/A')} seconds")

                    # Show format detection info if available
                    if 'format_detected' in metadata:
                        print(f"🔍 Format detected: {metadata['format_detected']}")

            except Exception as e:
                # Display is best-effort: report the failure for this section
                # and continue with the remaining ones.
                print(f"Error loading extraction results: {e}")
        else:
            print("No extraction results available")
else:
    print("No sections to display")

## 5. Compare YAML vs JSON Benefits

In [None]:
# Static explanatory text comparing YAML and JSON output formats.
# Emitted with one print; each list element is one output line, and the
# trailing empty element yields the final blank line.
comparison_lines = [
    "\n=== YAML vs JSON Comparison ===",
    "",
    "🔤 YAML Benefits:",
    "   • 10-30% fewer tokens than equivalent JSON",
    "   • No quotes required around keys",
    "   • More human-readable format",
    "   • Natural support for multiline strings",
    "   • Cleaner syntax for nested structures",
    "",
    "⚙️  Technical Implementation:",
    "   • Automatic format detection (JSON/YAML/unknown)",
    "   • Robust parsing with multiple extraction strategies",
    "   • Intelligent fallback between formats",
    "   • Full backward compatibility with existing JSON workflows",
    "",
    "💡 Example YAML vs JSON:",
    "",
    "   YAML (more compact):",
    "   vendor_name: ACME Corporation",
    "   invoice_date: 03/15/2024",
    "   total_amount: 1250.00",
    "",
    "   JSON (more verbose):",
    '   {"vendor_name": "ACME Corporation", "invoice_date": "03/15/2024", "total_amount": 1250.00}',
    "",
]
print("\n".join(comparison_lines))

## 6. Save Results for Next Step

In [None]:
# Output directory for this step's artifacts
data_dir = Path(".data/step3_extraction")
data_dir.mkdir(parents=True, exist_ok=True)

# Updated document object, serialized as JSON
document_path = data_dir / "document.json"
document_path.write_text(document.to_json())

# Configuration, including the YAML prompt modifications
config_path = data_dir / "config.json"
config_path.write_text(json.dumps(CONFIG, indent=2))

# Environment info is passed through unchanged
env_path = data_dir / "environment.json"
env_path.write_text(json.dumps(env_info, indent=2))


def _section_summary(sec):
    """Return the per-section entry stored in the extraction summary."""
    result_uri = getattr(sec, 'extraction_result_uri', None)
    return {
        'section_id': sec.section_id,
        'classification': sec.classification,
        'extraction_result_uri': result_uri,
        'has_results': result_uri is not None
    }


# `extraction_results` only exists when the extraction loop ran, so fall back
# to an empty list otherwise.
recorded_results = extraction_results if 'extraction_results' in locals() else []

# Extraction-specific results summary
extraction_summary = {
    'extraction_method': 'YAML-based',
    'model_used': extraction_config.get('model'),
    'sections_processed': len(recorded_results),
    'total_sections': len(document.sections) if document.sections else 0,
    'section_results': recorded_results,
    'sections_with_extractions': [
        _section_summary(sec) for sec in (document.sections or [])
    ],
    'yaml_benefits': {
        'token_efficiency': '10-30% fewer tokens than JSON',
        'format_detection': 'Automatic YAML/JSON detection and parsing',
        'backward_compatibility': 'Full compatibility with existing JSON workflows'
    }
}

extraction_summary_path = data_dir / "extraction_summary.json"
extraction_summary_path.write_text(json.dumps(extraction_summary, indent=2))

for label, saved_path in (
    ("document", document_path),
    ("configuration", config_path),
    ("environment info", env_path),
    ("extraction summary", extraction_summary_path),
):
    print(f"Saved {label} to: {saved_path}")

## 7. Summary

In [None]:
# `extraction_results` exists only if the extraction loop ran; count zero otherwise.
sections_processed = len(extraction_results) if 'extraction_results' in locals() else 0
# A section counts as having results when it carries a non-empty result URI.
sections_with_results = sum(
    1 for section in (document.sections or [])
    if getattr(section, 'extraction_result_uri', None)
)

print("=== Step 3: YAML-Based Extraction Complete ===")
print(f"✅ Document processed: {document.id}")
print(f"✅ Sections processed: {sections_processed} of {len(document.sections) if document.sections else 0}")
print(f"✅ Sections with results: {sections_with_results}")
print(f"✅ Model used: {extraction_config.get('model')}")
print(f"✅ Extraction method: YAML-based prompts with automatic parsing")
# The save cell writes to .data/step3_extraction, so report that directory
# (the message previously named a non-existent "step3_extraction_yaml" folder).
print(f"✅ Data saved to: .data/step3_extraction/")
print("")
print("🎯 Key Achievements:")
print("   • Demonstrated YAML output format for LLM responses")
print("   • Automatic format detection and parsing")
print("   • Token efficiency improvements (10-30% reduction)")
print("   • Seamless integration with existing extraction workflow")
print("   • Full backward compatibility maintained")
print("")
print("📌 Next step: Run step4_assessment.ipynb (works with both JSON and YAML extractions)")