<a href="https://colab.research.google.com/github/topzson/Project_Hurricane_OCR/blob/main/OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Installing necessary libraries...")
!pip install -U transformers datasets peft
print("Libraries installed successfully.")
!pip install -U bitsandbytes
print("bitsandbytes installed successfully.")

Installing necessary libraries...
Libraries installed successfully.
bitsandbytes installed successfully.


## Load Pre-trained Model and Tokenizer

### Subtask:
Load the `scb10x/typhoon-ocr1.5-2b` model from Hugging Face, together with its processor (which bundles the image processor and the tokenizer).


In [2]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig

# Single source of truth for the Hugging Face Hub repository, so the
# processor and the model weights are always loaded from the same place.
MODEL_ID = "scb10x/typhoon-ocr1.5-2b"

print("Loading processor and tokenizer...")
# NOTE(review): trust_remote_code=True permits Python code shipped in the model
# repository to run locally -- keep it only if that repository is trusted.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = processor.tokenizer  # text tokenizer wrapped by the processor
print("Processor and tokenizer loaded successfully.")

print("Loading model...")

# 8-bit quantization through bitsandbytes.
# There is no `bnb_8bit_compute_dtype` option: a compute dtype can only be
# configured for 4-bit loading (`bnb_4bit_compute_dtype`), and unknown keyword
# arguments are ignored by BitsAndBytesConfig, so none is passed here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # Allow modules that device_map places on the CPU to stay in fp32.
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate decide where each module lives
    trust_remote_code=True,
)
print("Model loaded successfully.")

Loading processor and tokenizer...


preprocessor_config.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

Processor and tokenizer loaded successfully.
Loading model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

Model loaded successfully.


## Prepare Dataset for Fine-tuning



In [3]:
from datasets import Dataset
import torch
from PIL import Image
import numpy as np

# --- Stand-in for a real OCR dataset ---
# Real data would point at image files; here each record pairs a solid-colour
# 224x224 RGB image with a transcript ("text") and a short "label".
dummy_data = [
    {
        "image_data": Image.new('RGB', (224, 224), color=fill_colour),
        "text": transcript,
        "label": tag,
    }
    for fill_colour, transcript, tag in (
        ('red', "This is a sample document for OCR.", "Sample Document"),
        ('blue', "Invoice #123: Total amount is $100.", "Invoice Data"),
        ('green', "Please process payment by 2023-12-31.", "Payment Instruction"),
    )
]

# Wrap the in-memory records in a Hugging Face Dataset
dataset = Dataset.from_list(dummy_data)

# Show the dataset summary followed by the first raw record
print("Dataset created:", dataset, sep="\n")
print("First example (before tokenization):", dataset[0], sep="\n")

# Tokenize the dataset and process images
def tokenize_function(examples):
    """Tokenize a batch of transcripts and preprocess the matching images.

    Written for ``Dataset.map(..., batched=True)``: every value in
    ``examples`` is a list with one entry per row.

    Args:
        examples: Batch mapping containing a ``"text"`` list of strings and
            an ``"image_data"`` list of images.

    Returns:
        The same mapping with four entries added: ``input_ids`` and
        ``attention_mask`` (padded/truncated to 128 tokens), ``pixel_values``
        (one entry per image) and ``labels`` (a copy of ``input_ids`` in which
        padding positions are replaced by -100).
    """
    # Pad/truncate every transcript to a fixed length of 128 tokens.
    text_encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

    # Run the image processor one image at a time so that each row of the
    # dataset gets its own pixel_values entry.
    processed_pixel_values = []
    for img_pil in examples["image_data"]:
        single_image_encoding = processor.image_processor(images=[img_pil], return_tensors="pt")
        # NOTE(review): the [0] assumes pixel_values carries a leading batch
        # dimension of size 1. If this image processor returns flattened
        # patches (num_patches, dim) instead, [0] keeps only the first patch
        # -- TODO confirm the returned shape for this model.
        processed_pixel_values.append(single_image_encoding['pixel_values'][0])

    examples["input_ids"] = text_encoding["input_ids"]
    examples["attention_mask"] = text_encoding["attention_mask"]
    examples["pixel_values"] = processed_pixel_values

    # Labels mirror the input ids, but padded positions (attention_mask == 0)
    # are set to -100, the index ignored by the cross-entropy loss, so padding
    # does not contribute to training. Building a new list also keeps
    # `labels` from aliasing `input_ids`.
    examples["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(text_encoding["input_ids"], text_encoding["attention_mask"])
    ]

    return examples

# Run the batched preprocessing over the whole dataset, then drop every
# source column that is not one of the model-facing features.
tokenized_dataset = dataset.map(tokenize_function, batched=True).remove_columns(
    [
        name
        for name in dataset.column_names
        if name not in ('input_ids', 'attention_mask', 'pixel_values', 'labels')
    ]
)

# Show the resulting schema followed by the first processed row
print("Tokenized dataset created:", tokenized_dataset, sep="\n")
print("First tokenized example:", tokenized_dataset[0], sep="\n")

Dataset created:
Dataset({
    features: ['image_data', 'text', 'label'],
    num_rows: 3
})
First example (before tokenization):
{'image_data': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=224x224 at 0x7B32D1B28920>, 'text': 'This is a sample document for OCR.', 'label': 'Sample Document'}


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Tokenized dataset created:
Dataset({
    features: ['input_ids', 'attention_mask', 'pixel_values', 'labels'],
    num_rows: 3
})
First tokenized example:
{'input_ids': [1986, 374, 264, 6077, 2197, 369, 80577, 13, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151

## Perform OCR Inference

In [4]:
from PIL import Image
import torch

def resize_if_needed(img, max_size):
    """Downscale ``img`` so that neither side is longer than ``max_size``.

    The aspect ratio is preserved: the longer side becomes exactly
    ``max_size`` and the shorter side is scaled by the same factor
    (truncated to an integer, but never below 1 pixel). An image that
    already fits is returned unchanged, as the same object.

    Args:
        img: Image exposing ``.size`` as ``(width, height)`` and a
            ``.resize(size, resample)`` method (a PIL image in this notebook).
        max_size: Maximum allowed length, in pixels, of either side.

    Returns:
        ``img`` itself when no resize is needed, otherwise the resized image.
    """
    width, height = img.size

    # Nothing to do when both sides are already within the limit.
    if width <= max_size and height <= max_size:
        return img

    # Scale by the longer side so that it lands exactly on max_size.
    scale = max_size / float(max(width, height))
    if width >= height:
        # max(1, ...) guards very elongated images, whose short side would
        # otherwise truncate to 0 and make resize() fail.
        new_size = (max_size, max(1, int(height * scale)))
    else:
        new_size = (max(1, int(width * scale)), max_size)

    resized = img.resize(new_size, Image.Resampling.LANCZOS)
    print(f"{width, height}==> {resized.size}")
    return resized

# Location of the document image to transcribe
image_path = "/content/sample_data/DrivingLicence_TEST.jpg"

# Open the file and normalise it to RGB. On any failure, report the problem
# and leave `img` as None so that the OCR step below is skipped.
try:
    img = Image.open(image_path).convert("RGB")
    print(f"Image loaded successfully from {image_path}")
except FileNotFoundError:
    img = None
    print(f"Error: Image file not found at {image_path}")
except Exception as load_error:
    img = None
    print(f"Error loading image: {load_error}")

if not img:
    print("Cannot perform OCR as the image could not be loaded.")
else:
    # Cap the longest side at 1800 px before inference (per the original
    # author's note, the image dimension the model was trained with).
    img = resize_if_needed(img, 1800)

    # Instruction sent to the model alongside the image
    prompt = "Provide the OCR result for this document: "

    # One user turn carrying the image followed by the text instruction
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat template to tensors and move them to the model's device
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate, allowing at most 1000 newly produced tokens
    generated_ids = model.generate(**inputs, max_new_tokens=1000)

    # Strip the prompt tokens from the front of each generated sequence so
    # that only the model's continuation is decoded.
    generated_ids_trimmed = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    print("\n--- OCR Result ---", output_text[0], sep="\n")

Image loaded successfully from /content/sample_data/DrivingLicence_TEST.jpg

--- OCR Result ---
ประเทศไทย Kingdom of Thailand ใบอนุญาตขับขี่รถยนต์ส่วนบุคคล Private Car Driving Licence ฉบับที่ 00000000 No. 00000000 วันอนุญาต 1 มีนาคม 2565 Issue Date 1 March 2022 วันสิ้นอายุ 1 มีนาคม 2568 Expire 1 March 2025 ชื่อ Name XXX XXXXXXXX XXXXXXXXX XX. XXXXXXXX XXXXXXXXX เกิดวันที่ Birth Date 23 พฤษภาคม 2538 23 May 1995 เลขที่บัตรประจำตัว / ID No. 00000 00000 00 0 นายทะเบียนจังหวัด กรุงเทพมหานคร Bangkok


In [6]:
import re

def extract_licence_fields(ocr_text):
    """Pull labelled fields out of the OCR text of a Thai driving licence.

    Each field is located by the Thai/English captions that surround it in
    the OCR output. A field whose pattern does not match is simply left out
    of the result.

    Args:
        ocr_text: OCR output as a single string.

    Returns:
        dict mapping field name to extracted string. Possible keys, in
        insertion order: ``Name``, ``Birth Date``, ``ID No.``,
        ``Issue Date``, ``Expire Date``, ``Province``.
    """
    fields = {}

    # Name: text between the name caption and the birth-date caption.
    name_match = re.search(r'ชื่อ Name\s+([^\n]+?)\s+เกิดวันที่', ocr_text)
    if name_match:
        full_name = name_match.group(1).strip()
        # Drop a leading 'XXX ' token if present.
        if full_name.startswith('XXX '):
            full_name = full_name[4:].strip()
        # Drop the 'XX. XXXXXXXX XXXXXXXXX' pattern if it follows the name.
        full_name = re.sub(r'\s+XX\. XXXXXXXX XXXXXXXXX', '', full_name).strip()
        fields['Name'] = full_name

    # Birth date: everything between its caption and the ID caption
    # (both the Thai and the English rendering are kept).
    birth_date_match = re.search(r'เกิดวันที่ Birth Date\s+([^\n]+?)\s+เลขที่บัตรประจำตัว', ocr_text)
    if birth_date_match:
        fields['Birth Date'] = birth_date_match.group(1).strip()

    # ID number: digits and spaces after the caption. The dot in "No." is
    # escaped so that it matches a literal '.' rather than any character.
    id_no_match = re.search(r'เลขที่บัตรประจำตัว / ID No\.\s+([\d\s]+)', ocr_text)
    if id_no_match:
        fields['ID No.'] = id_no_match.group(1).strip()

    # Issue date: group 1 is the Thai date, group 2 the English one.
    issue_date_match = re.search(r'วันอนุญาต\s+([^\n]+?)\s+Issue Date\s+([^\n]+?)\s+วันสิ้นอายุ', ocr_text)
    if issue_date_match:
        fields['Issue Date'] = issue_date_match.group(2).strip()  # English part

    # Expiry date: same layout as the issue date.
    expire_date_match = re.search(r'วันสิ้นอายุ\s+([^\n]+?)\s+Expire\s+([^\n]+?)\s+ชื่อ', ocr_text)
    if expire_date_match:
        fields['Expire Date'] = expire_date_match.group(2).strip()  # English part

    # Province: the rest of the line after the registrar caption.
    province_match = re.search(r'นายทะเบียนจังหวัด\s+([^\n]+)', ocr_text)
    if province_match:
        fields['Province'] = province_match.group(1).strip()

    return fields


# First (and only) decoded sequence from the OCR inference step
occr_result = output_text[0]

extracted_info = extract_licence_fields(occr_result)

print("--- Extracted Information ---")
for key, value in extracted_info.items():
    print(f"{key}: {value}")


--- Extracted Information ---
Name: XXXXXXXX XXXXXXXXX
Birth Date: 23 พฤษภาคม 2538 23 May 1995
ID No.: 00000 00000 00 0
Issue Date: 1 March 2022
Expire Date: 1 March 2025
Province: กรุงเทพมหานคร Bangkok


In [7]:
import pandas as pd

# Wrap the single record in a list so that pandas lays it out as one row
# with one column per extracted field.
df_extracted = pd.DataFrame(data=[extracted_info])

# Caption first, then the notebook's rich table rendering
print("--- Extracted Information DataFrame ---")
display(df_extracted)

--- Extracted Information DataFrame ---


Unnamed: 0,Name,Birth Date,ID No.,Issue Date,Expire Date,Province
0,XXXXXXXX XXXXXXXXX,23 พฤษภาคม 2538 23 May 1995,00000 00000 00 0,1 March 2022,1 March 2025,กรุงเทพมหานคร Bangkok
