In [1]:
import traceback
from io import BytesIO

import numpy as np
import requests
from PIL import Image, ImageEnhance, ImageFilter, UnidentifiedImageError
from transformers import pipeline

In [2]:
# Build the Hugging Face pipeline once; the OCR helper defined later calls this
# module-level `pipe` object. Per the captured output below, the first run
# downloads a ~2.43 GB weights file and the pipeline is placed on the CPU.
pipe = pipeline(model='microsoft/trocr-large-printed')

pytorch_model.bin:   0%|          | 0.00/2.43G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwr

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu


In [None]:
# Default receipt image that the demo call at the end of this cell downloads and OCRs.
image_url = "https://docelf.com/images/free_receipt_template_xs.png"

def trocr_extract(file_path=None, image=None, preprocess=True):
    """Run the module-level TrOCR ``pipe`` on a single image and return its text.

    Args:
        file_path: Path (or file object) passed to ``Image.open``. Only used
            when ``image`` is None.
        image: An already-opened PIL image. Takes precedence over ``file_path``.
        preprocess: When True, upscale images whose width or height is below
            1000 px, double the contrast, convert to grayscale and sharpen
            before OCR. When False, only the grayscale conversion is applied.

    Returns:
        The generated text with surrounding whitespace stripped, or an empty
        string when the pipeline returns no result.

    Raises:
        ValueError: If neither ``image`` nor a non-empty ``file_path`` is given.
    """
    # Test against None explicitly instead of relying on the object's truthiness.
    if image is not None:
        img = image
    elif file_path:
        # Decode inside a context manager so the file handle is released
        # immediately rather than whenever PIL gets around to loading the data.
        with Image.open(file_path) as opened:
            img = opened.convert('RGB')
    else:
        raise ValueError("Either file_path or image must be provided")

    # Make sure the image is in the right mode for processing
    if img.mode != 'RGB':
        img = img.convert('RGB')

    if preprocess:
        # Upscale so that BOTH sides reach at least min_side pixels,
        # preserving the aspect ratio.
        min_side = 1000
        if img.width < min_side or img.height < min_side:
            ratio = max(min_side / img.width, min_side / img.height)
            new_size = (int(img.width * ratio), int(img.height * ratio))
            img = img.resize(new_size, Image.LANCZOS)

        # Increase contrast by a factor of 2
        img = ImageEnhance.Contrast(img).enhance(2.0)

        # Convert to grayscale, then apply slight sharpening
        img = img.convert('L').filter(ImageFilter.SHARPEN)
    else:
        # Just convert to grayscale if no preprocessing
        img = img.convert('L')

    # Run OCR
    print(f"Processing image of size {img.size} and mode {img.mode}")
    lst_raw_text = pipe(img)

    # Guard against an empty result list so the caller's "no text extracted"
    # fallback can run instead of an IndexError escaping from here.
    text_result = lst_raw_text[0]['generated_text'] if lst_raw_text else ""
    print(f"Raw extracted text: '{text_result}'")

    return text_result.strip()

def process_image_from_url(url, display_image=True):
    """Download an image, optionally display it, and OCR it with ``trocr_extract``.

    OCR is attempted with preprocessing first; if that yields no text, it is
    retried once without preprocessing.

    Args:
        url: HTTP(S) address of the image to fetch (10 second timeout).
        display_image: When True, render the downloaded image with
            ``IPython.display.display`` before running OCR.

    Returns:
        The extracted text, or None when downloading, decoding or OCR failed.
        Failures are printed rather than raised.
    """
    try:
        print(f"Downloading image from: {url}")
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        print(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
        print(f"Content length: {len(response.content)} bytes")

        img = Image.open(BytesIO(response.content))
        print(f"Image opened successfully. Size: {img.size}, Format: {img.format}, Mode: {img.mode}")

        if display_image:
            # Imported lazily so the function still works outside a notebook
            # when display_image=False.
            from IPython.display import display
            display(img)

        print("Processing image with TrOCR...")
        # Try with preprocessing
        result = trocr_extract(image=img, preprocess=True)
        print(f"Extracted text (with preprocessing): '{result}'")

        if not result.strip():  # If no text was extracted, try without preprocessing
            print("\nRetrying without preprocessing...")
            result = trocr_extract(image=img, preprocess=False)
            print(f"Extracted text (without preprocessing): '{result}'")

        return result

    except requests.RequestException as e:
        print(f"Request error: {e}")
    except UnidentifiedImageError as e:
        # UnidentifiedImageError must be imported from PIL at the top of the
        # notebook; without it this clause itself raises NameError as soon as
        # any non-request exception reaches it.
        print(f"Image error: {e}")
        print("The URL might not point to a valid image or the image format is not supported.")
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__}, {e}")
        traceback.print_exc()

    # Every handled failure path ends here.
    return None

# Demo entry point: run the download-and-OCR flow on the default image_url.
if __name__ == "__main__":
    process_image_from_url(image_url)

Downloading image from: https://docelf.com/images/free_receipt_template_xs.png
Processing image with TrOCR...
Processing image with TrOCR...
Extracted text: :
Extracted text: :


In [None]:
# Additional receipt image URLs to push through the same download-and-OCR flow.
# NOTE(review): no output is saved for this cell, so how well these images OCR
# is unverified here.
alternative_urls = [
    "https://cdn-blog.adafruit.com/uploads/2015/11/receipt.png",
    "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRD0yltS5oCMwbQ23iQ8Tl9PXyNcXPh0PnhLw&usqp=CAU"
]

# Process each URL in turn; process_image_from_url prints its own progress and errors.
for url in alternative_urls:
    print(f"\n\nTrying alternative image: {url}")
    process_image_from_url(url)

In [None]:
# Create a test image with known text
from PIL import Image, ImageDraw, ImageFont
import io

def create_test_image(text="TEST RECEIPT\nTotal: $25.99", width=500, height=300, bg_color='white', font_size=36):
    """Render black text onto a blank image for use as a known-answer OCR input.

    Args:
        text: Text to draw; ``"\\n"`` separates lines.
        width: Image width in pixels.
        height: Image height in pixels.
        bg_color: Background fill colour accepted by ``Image.new``.
        font_size: Requested font size in pixels (defaults to the previously
            hard-coded 36).

    Returns:
        A new RGB PIL image with each line horizontally centred and the lines
        spaced ``height // (number_of_lines + 2)`` pixels apart.
    """
    img = Image.new('RGB', (width, height), color=bg_color)
    d = ImageDraw.Draw(img)

    # Prefer Arial; if it is not installed, fall back to Pillow's built-in font.
    try:
        font = ImageFont.truetype("Arial", font_size)
    except IOError:
        try:
            # Newer Pillow releases accept a size here and return a scalable
            # font, which keeps the text large enough to be a useful OCR test.
            font = ImageFont.load_default(size=font_size)
        except (TypeError, OSError, ImportError):
            # Older Pillow (no `size` argument) or no FreeType support:
            # fall back to the fixed-size bitmap font.
            font = ImageFont.load_default()

    text_lines = text.split('\n')
    # Leave one line_height of margin above the first line.
    line_height = height // (len(text_lines) + 2)
    y_position = line_height

    for line in text_lines:
        if hasattr(d, 'textbbox'):
            # For newer Pillow versions
            left, _top, right, _bottom = d.textbbox((0, 0), line, font=font)
            text_width = right - left
        else:
            # For older Pillow versions
            left = 0
            text_width, _unused_height = d.textsize(line, font=font)

        # Subtract the bbox's left offset so the visible glyphs, not the
        # drawing origin, are what ends up centred.
        x_position = (width - text_width) // 2 - left

        d.text((x_position, y_position), line, fill='black', font=font)
        y_position += line_height

    return img

# Build the synthetic image with the default text ("TEST RECEIPT" / "Total: $25.99")
# so the OCR output can be compared against a known string.
test_img = create_test_image()

# Render the test image inline in the notebook
from IPython.display import display
display(test_img)

# Run OCR on the in-memory image (default preprocess=True) and show the result
print("\nProcessing test image:")
result = trocr_extract(image=test_img)
print(f"Extracted text from test image: '{result}'")