In [None]:
import os
import sys

# Suffix the working directory is required to end with. BASE_DIR is derived
# from the working directory, so starting the notebook elsewhere would make
# every path below point at the wrong place.
ENDSWITH = 'OCR'

NOTEBOOK_DIR = os.getcwd()

# Fail fast with an explicit message rather than with import errors later on.
if not NOTEBOOK_DIR.endswith(ENDSWITH):
    raise ValueError(f"Not in correct dir, expect end with {ENDSWITH}, but got {NOTEBOOK_DIR} instead")

# The base directory is four levels above the notebook directory.
BASE_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, '..', '..', '..', '..'))
print(f"Base directory: {BASE_DIR}")

# Put the `code` directory on the import path. The membership check keeps the
# cell idempotent: re-running it no longer adds a duplicate entry each time.
CODE_DIR = os.path.join(BASE_DIR, 'code')
if CODE_DIR not in sys.path:
    sys.path.insert(0, CODE_DIR)

In [None]:
# Set ahead of the torch import so the flag is already in the environment
# when PyTorch is loaded.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import torch

# Not every PyTorch build exposes `torch.backends.mps`; look it up defensively
# so such installs fall through to CPU instead of raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

# Preference order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
from MangaOCREvaluator import ParseAnnotation, MangaOCRDataset, MangaOCREvaluator
from pipeline.OCRModels.MangaOCRModel import MangaOCRModel

## Step 1: Parse XML Annotations

Parse Manga109 XML annotations and save to JSON format.

In [None]:
# Manga title to process and the locations of its inputs and outputs,
# all resolved relative to BASE_DIR.
manga_name = "AisazuNihaIrarenai"
manga109_root = os.path.join(BASE_DIR, 'data', 'Manga109_released_2023_12_07')
xml_path = os.path.join(manga109_root, 'annotations', f'{manga_name}.xml')
images_dir = os.path.join(manga109_root, 'images', manga_name)
output_dir = os.path.join(BASE_DIR, 'data', 'MangaOCR', 'jsons_processed')

# Run the parser; the value it returns is passed to later cells as the
# path of the generated JSON file.
parser = ParseAnnotation(xml_path, images_dir, output_dir)
json_output_path = parser.parse_and_save()

## Step 2: Load and Inspect Data

Load the parsed annotations and check what we have.

In [None]:
# Load the parsed annotations, selecting the "text" bounding-box type.
text_data = MangaOCREvaluator.load_manga109_annotations(json_output_path, images_dir, bbox_type="text")

# Overall count first, then a short preview of the first image's entries.
print(f"Number of images with text: {len(text_data['image_paths'])}")

first_image_path = text_data['image_paths'][0]
print(f"\nFirst image: {first_image_path}")

first_boxes = text_data['boxes_list'][0]
print(f"Number of text boxes: {len(first_boxes)}")
print(f"Text boxes: {first_boxes[:3]}")  # only the first 3 boxes

first_texts = text_data['ground_truth_texts'][0]
print(f"Ground truth: {first_texts[:3]}")  # only the first 3 strings

## Step 3: Evaluate with Text BBox Only

Evaluate OCR performance using only text bounding boxes.

In [None]:
# Keep the demo short: only the first few images are evaluated.
demo_limit = 5

# Build the dataset from the same leading slice of each parallel list.
text_dataset = MangaOCRDataset(
    *(
        text_data[field][:demo_limit]
        for field in ("image_paths", "boxes_list", "ground_truth_texts")
    ),
    bbox_type="text",
)

# OCR model under test and the evaluator, bound to the selected device.
ocr_model = MangaOCRModel()
evaluator = MangaOCREvaluator(device=device)

# Run the evaluation on text bounding boxes only.
evaluate_options = dict(batch_size=1, verbose=False, bbox_type="text")
text_metrics = evaluator.evaluate(ocr_model, text_dataset, **evaluate_options)

## Step 4: Compare Text BBox vs Bubble BBox

Use the built-in comparison method to evaluate both bbox types.

In [None]:
# New model and evaluator instances are created for the comparison run
# rather than reusing the ones from the previous cell.
ocr_model = MangaOCRModel()
evaluator = MangaOCREvaluator(device=device)

# Arguments for the comparison; max_images caps the run at the first
# 5 images so the demo stays quick.
comparison_options = {
    "ocr_model": ocr_model,
    "json_path": json_output_path,
    "images_dir": images_dir,
    "batch_size": 1,
    "verbose": False,
    "max_images": 5,
}

# Evaluate both bbox types in a single call.
comparison_results = evaluator.compare_bbox_types(**comparison_options)

## Step 5: Analyze Results

The comparison shows:
- **Text BBox**: Uses the exact text region bounding boxes from annotations
- **Bubble BBox**: Uses speech bubble bounding boxes (if available)

Key insights:
- Lower CER/WER is better
- Text bbox should theoretically perform better as it's more precise
- Bubble bbox includes extra whitespace/background which may affect OCR

In [None]:
def _describe_change(delta):
    """Label a (text bbox - bubble bbox) error difference.

    Lower CER/WER is better, so a negative difference means the text bbox
    did better and a positive one means it did worse. A difference of exactly
    zero is reported as 'same' instead of being lumped in with 'worse'.
    """
    if delta < 0:
        return 'better'
    if delta > 0:
        return 'worse'
    return 'same'


# Print the metrics for each bbox type; the second header starts with a
# newline so the two groups are separated by a blank line.
for header, result_key in (
    ("Text BBox Results:", 'text_bbox'),
    ("\nBubble BBox Results:", 'bubble_bbox'),
):
    metrics = comparison_results[result_key]
    print(header)
    print(f"  CER: {metrics['cer']:.4f}")
    print(f"  WER: {metrics['wer']:.4f}")
    print(f"  Samples: {metrics['num_samples']}")

# Differences are computed as text bbox minus bubble bbox.
cer_diff = comparison_results['text_bbox']['cer'] - comparison_results['bubble_bbox']['cer']
wer_diff = comparison_results['text_bbox']['wer'] - comparison_results['bubble_bbox']['wer']

print("\nImprovement using Text BBox:")
print(f"  CER: {cer_diff:.4f} ({_describe_change(cer_diff)})")
print(f"  WER: {wer_diff:.4f} ({_describe_change(wer_diff)})")

## Notes

### ParseAnnotation Class
- Input: Manga109 XML annotation file
- Output: JSON file in COCO format with text annotations
- Saves to: `data/MangaOCR/jsons_processed/`

### MangaOCRDataset Class
- No longer depends on YoloSeg output
- Works directly with parsed JSON annotations
- Loads images on-the-fly to save memory
- Supports both "text" and "bubble" bbox types

### MangaOCREvaluator
- `load_manga109_annotations()`: Loads and filters data from JSON
- `evaluate()`: Evaluates OCR with specified bbox type
- `compare_bbox_types()`: Automatically compares text vs bubble bboxes

### Workflow
1. Parse XML → JSON (one-time setup)
2. Load JSON annotations
3. Create dataset with desired bbox type
4. Evaluate OCR performance
5. Compare results