In [None]:
from mistralai import Mistral

def _run_mistral_ocr(pdf_path: str, include_image_base64: bool):
    """Upload *pdf_path* to Mistral, run OCR on it and return the raw response.

    The uploaded copy is deleted again once OCR has finished, whether it
    succeeded or failed, so failed runs do not leave files behind.

    Args:
        pdf_path (str): path of the PDF file
        include_image_base64 (bool): forwarded to ``client.ocr.process``

    Returns:
        The response object returned by ``client.ocr.process``.
    """
    import os  # local import so this block does not depend on file-level imports

    # Prefer a key from the environment; the literal keeps the original
    # placeholder behaviour when the variable is not set.
    client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY", "your api key"))

    # The context manager closes the local file handle as soon as the upload
    # call returns (the original left it open).
    with open(pdf_path, "rb") as pdf_handle:
        uploaded_pdf = client.files.upload(
            file={
                "file_name": pdf_path,
                "content": pdf_handle,
            },
            purpose="ocr",
        )

    try:
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
        # Only keyword arguments accepted by Ocr.process() are passed here.
        # The former `include_image_descriptions` / `image_description_detail`
        # arguments raised "got an unexpected keyword argument" on every call.
        return client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=include_image_base64,
        )
    finally:
        # Best-effort cleanup: a failed delete must not mask the OCR result
        # (or the OCR error), so it is reported instead of raised.
        try:
            client.files.delete(file_id=uploaded_pdf.id)
            print("INFO: Temporary file cleaned up")
        except Exception as cleanup_exp:
            print(f"WARNING: could not delete uploaded file: {cleanup_exp}")


def ocr_scanned_pdf(pdf_path: str) -> str:
    """
    OCR method for scanned PDFs.

    Runs Mistral OCR on the PDF and returns a report made of a processing
    summary followed by, for every page, the page markdown and a listing of
    the images reported for that page. Image attributes (description, type,
    dimensions, position) are included only when the response objects expose
    them. If the first attempt fails for any reason, a second, text-only OCR
    attempt is made.

    Args:
        pdf_path (str): path of the PDF file

    Returns:
        str: summary plus per-page text and image listing; the plain page
        text followed by a note when only the fallback succeeded; an empty
        string when both attempts failed.
    """
    try:
        print(f"INFO: OCR start for file {pdf_path}")
        print("INFO: Processing OCR with image analysis...")
        ocr_response = _run_mistral_ocr(pdf_path, include_image_base64=True)

        rule = "=" * 60
        parts = []  # joined once at the end instead of repeated string +=
        total_images = 0

        for page_num, page in enumerate(ocr_response.pages, 1):
            # Normalise a missing page text to "" so the substring checks
            # below cannot fail with a TypeError.
            markdown = page.markdown or ""
            images = getattr(page, "images", None) or []

            # Page header
            parts.append(f"\n{rule}\nPAGE {page_num}\n{rule}\n\n")

            # Page text content
            if markdown:
                parts.append("TEXT CONTENT:\n")
                parts.append("-" * 20 + "\n")
                parts.append(markdown + "\n\n")

            if images:
                parts.append("IMAGES AND VISUAL ELEMENTS:\n")
                parts.append("-" * 35 + "\n")

                for img_num, image in enumerate(images, 1):
                    total_images += 1
                    parts.append(f"\nImage {img_num} on Page {page_num}:\n")

                    # The attributes below are probed defensively: they are
                    # emitted only if the response object actually has them.
                    description = getattr(image, "description", None)
                    if description:
                        parts.append(f"Description: {description}\n")

                    if hasattr(image, "type"):
                        parts.append(f"Type: {image.type}\n")

                    if hasattr(image, "width") and hasattr(image, "height"):
                        parts.append(f"Dimensions: {image.width}x{image.height}\n")

                    if hasattr(image, "bbox"):
                        parts.append(f"Position: {image.bbox}\n")

                    parts.append("\n")

                parts.append("\n")

            # No image objects, but the markdown itself references images
            elif "![" in markdown or "<img" in markdown:
                parts.append("EMBEDDED VISUAL ELEMENTS DETECTED IN TEXT\n")
                parts.append("-" * 45 + "\n")
                parts.append("Note: Visual elements are embedded within the text content above.\n\n")

        page_count = len(ocr_response.pages)
        summary = (
            f"\n{rule}\n"
            "PROCESSING SUMMARY\n"
            f"{rule}\n"
            f"Total Pages Processed: {page_count}\n"
            f"Total Images Found: {total_images}\n"
            f"File: {pdf_path}\n"
            "OCR Model: mistral-ocr-latest\n"
            "Image Analysis: Enabled\n\n"
        )

        print(f"INFO: OCR completed for file {pdf_path}")
        print(f"INFO: Processed {page_count} pages with {total_images} images")

        return summary + "".join(parts)

    except Exception as exp:
        print(f"ERROR: error in ocr_scanned_pdf: {exp}")

        # Fallback: retry as plain text OCR without requesting image data
        try:
            print("INFO: Attempting basic OCR without image analysis...")
            ocr_response = _run_mistral_ocr(pdf_path, include_image_base64=False)

            text = "".join((page.markdown or "") + "\n" for page in ocr_response.pages)

            print("INFO: Basic OCR completed successfully")
            return text + "\n\nNote: Image descriptions not available due to processing limitations."

        except Exception as fallback_exp:
            print(f"ERROR: Fallback OCR also failed: {fallback_exp}")
            return ""

# Enhanced version with explicit image analysis
def ocr_with_detailed_image_analysis(pdf_path: str) -> str:
    """
    OCR a PDF with Mistral and annotate each page with keyword-based hints.

    For every page the markdown text is emitted, followed by notices when the
    text suggests charts/graphs, tables or images. Detection is a plain
    substring search on the lower-cased page text, so the "Images/Photos" and
    "Tables" figures in the summary are counts of flagged pages, not of
    individual images or tables.

    Args:
        pdf_path (str): path of the PDF file

    Returns:
        str: per-page text with detection notices and a final summary. If
        anything fails, the result of ``ocr_scanned_pdf(pdf_path)`` is
        returned instead.
    """
    import os  # local import so this block does not depend on file-level imports

    try:
        print(f"INFO: Starting enhanced OCR with image analysis for {pdf_path}")
        # Prefer a key from the environment; the literal keeps the original
        # placeholder behaviour when the variable is not set.
        client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY", "your api key"))

        # Upload; the context manager closes the local file handle afterwards
        with open(pdf_path, "rb") as pdf_handle:
            uploaded_pdf = client.files.upload(
                file={
                    "file_name": pdf_path,
                    "content": pdf_handle,
                },
                purpose="ocr",
            )

        try:
            signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

            # `extract_tables`, `extract_images` and `analyze_layout` were
            # removed: they are not parameters of Ocr.process() and made the
            # call fail before any OCR was performed.
            ocr_response = client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": signed_url.url,
                },
                include_image_base64=True,
            )
        finally:
            # Best-effort removal of the uploaded copy; a failure here is
            # reported but must not hide the OCR result or error.
            try:
                client.files.delete(file_id=uploaded_pdf.id)
            except Exception as cleanup_exp:
                print(f"WARNING: could not delete uploaded file: {cleanup_exp}")

        rule = "=" * 70
        parts = []  # joined once at the end instead of repeated string +=
        image_count = 0
        table_count = 0

        for page_num, page in enumerate(ocr_response.pages, 1):
            markdown = page.markdown or ""
            page_content = markdown.lower()

            parts.append(f"\n{rule}\n")
            parts.append(f"PAGE {page_num} - COMPREHENSIVE ANALYSIS\n")
            parts.append(f"{rule}\n\n")

            # Text content
            if markdown:
                parts.append("📄 TEXT CONTENT:\n")
                parts.append(markdown + "\n\n")

            # Detect charts/graphs
            if any(keyword in page_content for keyword in ("chart", "graph", "figure", "diagram", "plot")):
                parts.append("📊 CHARTS/GRAPHS DETECTED:\n")
                parts.append("This page contains visual data representations (charts, graphs, or diagrams).\n\n")

            # Detect tables ('|' is unaffected by lower-casing, so one check
            # on the lower-cased text covers the raw markdown as well)
            if any(keyword in page_content for keyword in ("table", "|", "row", "column")):
                table_count += 1
                parts.append("📋 TABLES DETECTED:\n")
                parts.append("This page contains tabular data structures.\n\n")

            # Detect images
            if any(keyword in page_content for keyword in ("image", "photo", "picture", "fig")):
                image_count += 1
                parts.append("🖼️ IMAGES DETECTED:\n")
                parts.append("This page contains photographic or illustrative content.\n\n")

        # Final summary
        page_count = len(ocr_response.pages)
        parts.append(f"\n{rule}\n")
        parts.append("📋 DOCUMENT ANALYSIS SUMMARY\n")
        parts.append(f"{rule}\n")
        parts.append(f"Total Pages: {page_count}\n")
        parts.append(f"Images/Photos: {image_count}\n")
        parts.append(f"Tables: {table_count}\n")
        parts.append(f"Source: {pdf_path}\n\n")

        print(f"INFO: Enhanced OCR completed - {page_count} pages, {image_count} images, {table_count} tables")
        return "".join(parts)

    except Exception as exp:
        print(f"ERROR: Enhanced OCR failed: {exp}")
        # Fallback to basic OCR
        return ocr_scanned_pdf(pdf_path)

# Usage example
if __name__ == "__main__":
    # Sample document used to exercise both OCR entry points
    pdf_file = "/Users/sameersingh/Documents/DataViz/data/Report on Title 2024-11-18.pdf"

    # First pass: page text plus image listing; show only a 500-character preview
    result1 = ocr_scanned_pdf(pdf_file)
    print("Basic OCR Result:", f"{result1[:500]}...", sep="\n")

    # Second pass: keyword-annotated analysis, previewed the same way
    result2 = ocr_with_detailed_image_analysis(pdf_file)
    print("\nEnhanced OCR Result:", f"{result2[:500]}...", sep="\n")

INFO: OCR start for file /Users/sameersingh/Documents/DataViz/data/Report on Title 2024-11-18.pdf
INFO: Processing OCR with image analysis...
ERROR: error in ocr_scanned_pdf: Ocr.process() got an unexpected keyword argument 'include_image_descriptions'
INFO: Attempting basic OCR without image analysis...
INFO: Basic OCR completed successfully
Basic OCR Result:
# THE CITY OF LONDON LAW SOCIETY LAND LAW COMMITTEE CERTIFICATE OF TITLE (Eighth Edition 2023) WRAPPER FOR REPORT ON TITLE 

To: (1) Oaknorth Bank plc as agent on behalf of the Finance Parties (as defined in the Facility Agreement) and (2) the successors and transferees of the Finance Parties
("Addressees" and in this Certificate "you" and Addressees have the same meaning).
Raithwaite Estate, Sandsend Road, Whitby, North Yorkshire Y021 3ST

## 1. INTRODUCTION, DEFINITIONS AND INTERPRETATION

1.1...
INFO: Starting enhanced OCR with image analysis for /Users/sameersingh/Documents/DataViz/data/Report on Title 2024-11-18.pdf
ERROR: E