In [None]:
import base64
import os
import time
from pathlib import Path
from typing import List, Optional, Tuple

import anthropic


In [None]:

# Configuration
# The API key is taken from the ANTHROPIC_API_KEY environment variable so the
# secret never has to be saved inside the notebook. When the variable is not
# set, the placeholder below is used, which main() treats as "no key set".
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "your-api-key-here")
INPUT_FOLDER = "input_images"  # Folder containing JPG files
OUTPUT_FOLDER = "transcripts"  # Folder for output text files
# NOTE(review): confirm this model ID is still offered by the API before a long run.
MODEL = "claude-3-sonnet-20240229"  # Or "claude-3-opus-20240229" for highest quality
REFERENCE_FOLDER = None  # Folder with reference OCR .txt files; None if no reference OCR is available

In [None]:
# Prompt text that tells the model how to transcribe a page.
# Replace the placeholder with your own (possibly multi-line) instructions.
TRANSCRIPTION_INSTRUCTIONS = """\
Instruction"""



In [None]:
class ClaudeOCRProcessor:
    """Transcribe a folder of JPEG page images through the Anthropic Messages API.

    One request is sent per image. When a reference folder is supplied, the
    text of a matching reference OCR file is appended to the prompt as a hint.
    Running totals are kept on the instance:

    * ``processed_count`` - transcripts successfully written to disk
    * ``reference_used_count`` - written transcripts that had reference OCR
    * ``failed_files`` - image paths that could not be read, transcribed or saved
    """

    # Lower-cased file-name endings that are treated as JPEG images.
    JPEG_SUFFIXES = (".jpg", ".jpeg")
    # Upper bound on the number of tokens requested for each response.
    MAX_TOKENS = 4000
    # Pause between consecutive files, in seconds.
    REQUEST_DELAY_SECONDS = 2
    # Appended to the base instructions when reference OCR text is available.
    REFERENCE_TEMPLATE = (
        "\n\nA reference OCR of this page is provided below. It may contain "
        "errors, so use it only as an aid and trust the image wherever the "
        "two disagree.\n\n"
        "<reference_ocr>\n{reference_text}\n</reference_ocr>"
    )

    def __init__(self, api_key: str):
        """Create the API client and zero the run counters.

        Args:
            api_key: Anthropic API key passed to ``anthropic.Anthropic``.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.processed_count = 0
        self.failed_files = []
        self.reference_used_count = 0

    def encode_image(self, image_path: str) -> Optional[str]:
        """Return the file's bytes as a base64 string, or None if reading fails."""
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            print(f"Error encoding {image_path}: {e}")
            return None

    def get_jpg_files(self, folder_path: str) -> List[str]:
        """Return the sorted paths of all JPEG files directly inside a folder.

        The extension is compared case-insensitively, so ``.jpg``, ``.JPEG``
        and ``.Jpg`` are all accepted, and each file is listed exactly once
        regardless of whether the filesystem is case-sensitive.

        Args:
            folder_path: Directory to scan (not recursive).

        Returns:
            Sorted list of path strings; empty if the folder is missing.
        """
        folder = Path(folder_path)
        if not folder.is_dir():
            print(f"Error: Folder '{folder_path}' does not exist")
            return []

        return sorted(
            str(entry)
            for entry in folder.iterdir()
            if entry.is_file() and entry.name.lower().endswith(self.JPEG_SUFFIXES)
        )

    def find_reference_file(self, image_path: str, reference_folder: str) -> Optional[str]:
        """Find corresponding reference OCR file for an image.

        For an image named ``<base>.jpg`` the candidates ``<base>.txt``,
        ``<base>_ocr.txt``, ``<base>_reference.txt`` and ``<base>_ref.txt`` are
        tried in that order inside ``reference_folder``.

        Returns:
            Path of the first candidate that exists, or None when the folder
            is unset, missing, or contains no candidate.
        """
        if not reference_folder or not os.path.exists(reference_folder):
            return None

        base_name = os.path.splitext(os.path.basename(image_path))[0]
        reference_folder_path = Path(reference_folder)

        # Try various naming conventions for reference files
        possible_names = [
            f"{base_name}.txt",
            f"{base_name}_ocr.txt",
            f"{base_name}_reference.txt",
            f"{base_name}_ref.txt"
        ]

        for name in possible_names:
            ref_path = reference_folder_path / name
            if ref_path.exists():
                return str(ref_path)

        return None

    def load_reference_text(self, reference_path: str) -> Optional[str]:
        """Read a UTF-8 reference OCR file and return its stripped text.

        Returns None (after printing the error) if the file cannot be read.
        """
        try:
            with open(reference_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except Exception as e:
            print(f"Error reading reference file {reference_path}: {e}")
            return None

    def build_instructions(self, reference_text: Optional[str] = None) -> str:
        """Build complete transcription instructions with optional reference.

        The base prompt is the module-level ``TRANSCRIPTION_INSTRUCTIONS``.
        When ``reference_text`` is non-empty, ``REFERENCE_TEMPLATE`` filled
        with that text is appended.
        """
        if not reference_text:
            return TRANSCRIPTION_INSTRUCTIONS
        return TRANSCRIPTION_INSTRUCTIONS + self.REFERENCE_TEMPLATE.format(
            reference_text=reference_text
        )

    def transcribe_image(self, image_path: str, reference_folder: Optional[str] = None,
                        page_number: Optional[int] = None) -> Tuple[Optional[str], bool]:
        """Send image to Claude for transcription, optionally with reference OCR.

        Args:
            image_path: Path of the JPEG image to transcribe.
            reference_folder: Folder searched for a matching reference OCR
                file; skipped when falsy.
            page_number: Page label used in the prompt; the image's base file
                name is used when this is None.

        Returns:
            ``(transcript, reference_used)``. ``transcript`` is None when the
            image could not be read or the request failed; in both cases the
            path is appended to ``failed_files``.
        """
        print(f"Processing: {os.path.basename(image_path)}")

        # Encode image
        base64_image = self.encode_image(image_path)
        if not base64_image:
            # Record unreadable (or empty) images so they show up in the summary.
            self.failed_files.append(image_path)
            return None, False

        # Look for reference file
        reference_text = None
        reference_used = False
        if reference_folder:
            ref_path = self.find_reference_file(image_path, reference_folder)
            if ref_path:
                reference_text = self.load_reference_text(ref_path)
                if reference_text:
                    print(f"  Using reference: {os.path.basename(ref_path)}")
                    reference_used = True
                else:
                    print(f"  Reference file found but couldn't read: {ref_path}")
            else:
                print("  No reference file found")

        try:
            # Prepare the message
            page_id = page_number if page_number is not None else os.path.splitext(os.path.basename(image_path))[0]
            instructions = self.build_instructions(reference_text)

            message = self.client.messages.create(
                model=MODEL,
                max_tokens=self.MAX_TOKENS,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": instructions
                            },
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/jpeg",
                                    "data": base64_image
                                }
                            },
                            {
                                "type": "text",
                                "text": f"Please transcribe this page as page {page_id}."
                            }
                        ]
                    }
                ]
            )

            return message.content[0].text, reference_used

        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            self.failed_files.append(image_path)
            return None, False

    def save_transcript(self, transcript: str, output_path: str) -> bool:
        """Write a transcript to ``output_path`` as UTF-8.

        Missing parent directories are created first.

        Returns:
            True if the file was written, False if an error occurred (the
            error is printed rather than raised).
        """
        try:
            parent_dir = os.path.dirname(output_path)
            # A bare file name has no directory part, and os.makedirs("") raises.
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(transcript)
            print(f"Saved: {output_path}")
            return True
        except Exception as e:
            print(f"Error saving {output_path}: {e}")
            return False

    def process_folder(self, input_folder: str, output_folder: str, reference_folder: Optional[str] = None):
        """Process all JPG files in the input folder.

        Each image is transcribed and written to
        ``<output_folder>/<base name>_transcript.txt``. Images whose transcript
        file already exists are skipped, so an interrupted run can be resumed.
        A summary of the counters is printed at the end.

        Args:
            input_folder: Folder containing the JPEG images.
            output_folder: Folder that receives the transcript files.
            reference_folder: Optional folder with reference OCR text files.
        """
        jpg_files = self.get_jpg_files(input_folder)

        if not jpg_files:
            print(f"No JPG files found in '{input_folder}'")
            return

        total = len(jpg_files)
        print(f"Found {total} JPG files to process")
        if reference_folder:
            print(f"Reference folder: {reference_folder}")
        else:
            print("No reference folder specified - processing without reference OCR")

        for i, image_path in enumerate(jpg_files, 1):
            print(f"\n--- Processing {i}/{total} ---")

            # Generate output filename
            base_name = os.path.splitext(os.path.basename(image_path))[0]
            output_path = os.path.join(output_folder, f"{base_name}_transcript.txt")

            # Skip if already processed
            if os.path.exists(output_path):
                print(f"Skipping {base_name} (transcript already exists)")
                continue

            # Transcribe
            transcript, reference_used = self.transcribe_image(image_path, reference_folder, i)

            if transcript:
                # Count the file only once its transcript is actually on disk.
                if self.save_transcript(transcript, output_path):
                    self.processed_count += 1
                    if reference_used:
                        self.reference_used_count += 1
                else:
                    self.failed_files.append(image_path)

            # Rate limiting - be respectful to the API
            if i < total:  # Don't sleep after the last file
                print(f"Waiting {self.REQUEST_DELAY_SECONDS} seconds before next request...")
                time.sleep(self.REQUEST_DELAY_SECONDS)

        # Summary
        print("\n--- Processing Complete ---")
        print(f"Successfully processed: {self.processed_count} files")
        if reference_folder:
            print(f"Used reference OCR for: {self.reference_used_count} files")
        if self.failed_files:
            print(f"Failed files: {len(self.failed_files)}")
            for failed_file in self.failed_files:
                print(f"  - {failed_file}")



In [None]:

def main():
    """Run the transcription job described by the configuration constants.

    Stops with an error message when ANTHROPIC_API_KEY still holds the
    placeholder value; otherwise processes INPUT_FOLDER into OUTPUT_FOLDER,
    using REFERENCE_FOLDER for reference OCR when it is set.
    """
    key_is_placeholder = ANTHROPIC_API_KEY == "your-api-key-here"
    if key_is_placeholder:
        print("Error: Please set your Anthropic API key in the ANTHROPIC_API_KEY variable")
        return

    # Build the processor and hand it the configured folders.
    ClaudeOCRProcessor(ANTHROPIC_API_KEY).process_folder(
        INPUT_FOLDER,
        OUTPUT_FOLDER,
        REFERENCE_FOLDER,
    )


if __name__ == "__main__":
    main()