In [None]:
import os
import sys

# Suffix the notebook's working directory must end with; the relative hops
# used for BASE_DIR below are only valid when the kernel runs from there.
ENDSWITH = 'OCR'

NOTEBOOK_DIR = os.getcwd()

# Fail fast rather than silently resolving BASE_DIR to an unrelated location.
if not NOTEBOOK_DIR.endswith(ENDSWITH):
    raise ValueError(f"Not in correct dir, expect end with {ENDSWITH}, but got {NOTEBOOK_DIR} instead")

# The project root is taken to be four levels above the notebook directory.
BASE_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, '..', '..', '..', '..'))
print(f"Base directory: {BASE_DIR}")

# Make the project's `code` folder importable. The membership check keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
CODE_DIR = os.path.join(BASE_DIR, 'code')
if CODE_DIR not in sys.path:
    sys.path.insert(0, CODE_DIR)

In [None]:
# Set before torch is imported; asks PyTorch to fall back to the CPU for
# operators that have no MPS implementation.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import torch

# Preference order: CUDA first, then Apple MPS, otherwise plain CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

In [None]:
from MangaOCREvaluator import MangaOCRDataset, MangaOCREvaluator
from pipeline.SegmentationModels.YoloSeg import YoloSeg
from pipeline.OCRModels.MangaOCRModel import MangaOCRModel

## Example 1: Evaluate OCR on a Single Image

This example shows how to:
1. Use YoloSeg to detect text bubbles
2. Create a dataset from the detection results
3. Evaluate OCR performance against ground truth (placeholder strings in this demo, so the resulting scores are not meaningful)

In [None]:
# Initialize the segmentation model
yolo_model_path = os.path.join(BASE_DIR, 'models', 'bubble-detection', 'best.pt')
yolo_seg = YoloSeg(yolo_model_path)
yolo_seg.load_model()

# Process a sample image
sample_image_path = os.path.join(BASE_DIR, 'data', 'Manga109_released_2023_12_07', 'images', 'AisazuNihaIrarenai', '002.jpg')

# Get segmentation results. The unload sits in `finally` so the model is
# released even when prediction fails (e.g. the sample image is missing).
try:
    img_rgb, boxes, masks = yolo_seg.predict(sample_image_path, print_bbox=False, plot=False)
    print(f"Detected {len(boxes)} text bubbles")
finally:
    yolo_seg.unload_model()

In [None]:
# Placeholder ground truth: one numbered dummy string per detected bubble.
# (In a real scenario, load the texts from annotations instead.)
ground_truth_texts = [f"Sample text {n}" for n in range(1, len(boxes) + 1)]

# Wrap the single page in one-element lists: every dataset field is a
# per-image list.
dataset = MangaOCRDataset(
    images=[img_rgb],
    boxes_list=[boxes],
    masks_list=[masks],
    ground_truth_texts=[ground_truth_texts],
)

print(f"Dataset created with {len(dataset)} image(s)")
print(f"First image has {len(boxes)} text bubbles")

In [None]:
# Set up the OCR model and an evaluator for the selected device
ocr_model = MangaOCRModel()
evaluator = MangaOCREvaluator(device=device)

# Score the single-image dataset; verbose=True is passed so individual
# predictions can be inspected while debugging.
metrics = evaluator.evaluate(
    ocr_model,
    dataset,
    batch_size=1,
    verbose=True,
)

## Example 2: Batch Evaluation on Multiple Images

This example demonstrates batch processing of multiple manga pages.

In [None]:
# Process multiple images
yolo_seg = YoloSeg(yolo_model_path)
yolo_seg.load_model()

image_dir = os.path.join(BASE_DIR, 'data', 'Manga109_released_2023_12_07', 'images', 'AisazuNihaIrarenai')
# os.listdir returns names in arbitrary order, so sort before slicing to make
# "first 3 images" the same pages on every run and every machine.
image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))[:3]

images = []
boxes_list = []
masks_list = []
ground_truth_list = []

# try/finally guarantees the model is unloaded even if a page fails.
try:
    for img_file in image_files:
        img_path = os.path.join(image_dir, img_file)
        # Page-local names keep Example 1's img_rgb / boxes / masks intact,
        # so its cells can still be re-run after this one.
        page_rgb, page_boxes, page_masks = yolo_seg.predict(img_path, print_bbox=False, plot=False)

        images.append(page_rgb)
        boxes_list.append(page_boxes)
        masks_list.append(page_masks)

        # Create dummy ground truth (replace with actual annotations in production)
        gt_texts = [f"Text from {img_file} box {i+1}" for i in range(len(page_boxes))]
        ground_truth_list.append(gt_texts)

        print(f"Processed {img_file}: {len(page_boxes)} bubbles detected")
finally:
    yolo_seg.unload_model()

# Create dataset
batch_dataset = MangaOCRDataset(images, boxes_list, masks_list, ground_truth_list)
print(f"\nBatch dataset created with {len(batch_dataset)} images")

In [None]:
# Fresh OCR model and evaluator for the multi-page run
ocr_model = MangaOCRModel()
evaluator = MangaOCREvaluator(device=device)

# Score every page in the batch dataset, one image per step, with
# verbose output switched off.
batch_metrics = evaluator.evaluate(
    ocr_model,
    batch_dataset,
    batch_size=1,
    verbose=False,
)

## Example 3: Using Real Manga109Dialog Annotations

This example shows how to load real ground truth from XML annotations.

In [None]:
# Locate the XML annotation file for one manga title
xml_path = os.path.join(BASE_DIR, 'data', 'Manga109Dialog', 'AisazuNihaIrarenai.xml')

# Parse the text annotations, then report the count and preview a few IDs
text_dict = MangaOCREvaluator.load_manga109_annotations(xml_path)
print(f"Loaded {len(text_dict)} text annotations from XML")
print(f"Sample text IDs: {[*text_dict.keys()][:5]}")

## Notes on Usage

### Input Format
- **images**: List of numpy arrays (RGB format)
- **boxes_list**: List of lists, each containing bounding boxes [[x_min, y_min, x_max, y_max], ...]
- **masks_list**: List of masks from YoloSeg output
- **ground_truth_texts**: List of lists, each containing text strings for corresponding boxes

### Metrics
- **CER (Character Error Rate)**: Character-level error between predicted and ground-truth text (lower is better)
- **WER (Word Error Rate)**: Word-level error between predicted and ground-truth text (lower is better)

### Best Practices
1. Use batch_size=1 for manga images to avoid memory issues
2. Filter empty ground truth texts before evaluation
3. Load and unload models properly to manage memory
4. Use verbose=True for debugging individual predictions