In [None]:
import os
import cv2
import pickle
from paddleocr import PaddleOCR
import logging

# Raise the "ppocr" logger's threshold to ERROR so that lower-severity
# messages emitted through that logger are suppressed.
logging.getLogger("ppocr").setLevel(logging.ERROR)

# Translation table built once at import time; maps each listed character to
# its ASCII stand-in so normalization is a single pass over the string.
_CHAR_REPLACEMENTS = str.maketrans({
    'ö': 'o',
    'ü': 'u',
    'ä': 'a',
    'ß': 'ss',
    'é': 'e',
    'è': 'e',
    'ê': 'e',
    'ô': 'o',
    'î': 'i',
    'â': 'a',
    '@': 'a',
})


def normalize_text(text):
    """Return *text* lower-cased with a fixed set of characters folded to ASCII.

    The string is lower-cased first and the replacement table is applied
    afterwards, so upper-case variants of the listed letters (e.g. ``'Ö'``,
    ``'É'``) are folded exactly like their lower-case forms. Characters that
    are not in the table are only lower-cased.

    Args:
        text: The string to normalize.

    Returns:
        The normalized, lower-case string.
    """
    # Lower-casing must happen before the table lookup: the table only
    # contains lower-case keys.
    return text.lower().translate(_CHAR_REPLACEMENTS)

# Module-level PaddleOCR instance, created once and used by process_image().
# use_angle_cls=True goes together with the cls=True argument passed to
# ocr.ocr() there. lang='vi' presumably selects the Vietnamese recognition
# model -- TODO confirm against the PaddleOCR documentation.
ocr = PaddleOCR(use_angle_cls=True, lang='vi')

def process_image(img_path, output_dir, frame_data):
    """Run OCR on one image and store its normalized texts in *frame_data*.

    The frame id is the integer value of the file name without its extension
    (``0042.jpg`` -> ``42``). ``frame_data`` is grown with empty sets until
    that index exists, then the entry at that index is replaced by the set of
    normalized strings recognized in the image (an empty set when nothing was
    recognized).

    Files whose name is not a non-negative integer, and files that OpenCV
    cannot read, are reported on stdout and skipped without touching
    ``frame_data``.

    Args:
        img_path: Path to the image file.
        output_dir: Unused here; kept so existing callers keep working.
        frame_data: List indexed by frame id; mutated in place.

    Returns:
        None.
    """
    stem, _ = os.path.splitext(os.path.basename(img_path))
    try:
        frame_id = int(stem)
    except ValueError:
        frame_id = -1
    if frame_id < 0:
        # A stray file must not abort the whole run, and a negative index
        # would silently address the list from its end.
        print(f"Error: Cannot derive a frame id from {img_path}.")
        return

    # imread is only a readability check; the OCR engine is handed the path
    # and loads the file itself.
    if cv2.imread(img_path) is None:
        print(f"Error: Image not found at {img_path}.")
        return

    result = ocr.ocr(img_path, cls=True)

    if result and result[0]:
        # Each entry is indexed as entry[1][0] for its text -- presumably the
        # (text, confidence) pair of a detected line; confirm against the
        # PaddleOCR result format.
        texts = {normalize_text(entry[1][0]) for entry in result[0]}
    else:
        # Empty *set*, so every slot in frame_data has the same type.
        texts = set()

    # Pad with distinct empty sets up to and including frame_id.
    while len(frame_data) <= frame_id:
        frame_data.append(set())
    frame_data[frame_id] = texts

    print(f"frame_id {frame_id} - {img_path}")

def process_directory(root_path):
    """Walk *root_path* and write one pickled OCR result file per 'L*' folder.

    For every directory under ``root_path`` whose path contains the substring
    ``'keyframes'``, each path component starting with ``'L'`` names an output
    file ``OCR-Objects/<component>.bin`` (relative to the current working
    directory). If that file already exists the component is skipped;
    otherwise every ``.jpg`` file directly inside the directory is passed to
    ``process_image`` and the collected list is pickled to the output file.

    Args:
        root_path: Directory to walk.

    Returns:
        None.
    """
    output_dir = 'OCR-Objects'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for dirpath, _, filenames in os.walk(root_path):
        if 'keyframes' not in dirpath:
            continue

        for part in dirpath.split(os.sep):
            if not part.startswith('L'):
                continue

            output_file_path = os.path.join(output_dir, part + ".bin")
            # An existing output file means this folder was already handled.
            if os.path.exists(output_file_path):
                continue

            frame_data = []
            for filename in filenames:
                if filename.endswith('.jpg'):
                    img_path = os.path.join(dirpath, filename)
                    process_image(img_path, output_dir, frame_data)

            # Write to a temporary file and rename it into place, so an
            # interrupted dump never leaves a truncated .bin that a later
            # run would mistake for a finished result.
            tmp_file_path = output_file_path + ".tmp"
            with open(tmp_file_path, 'wb') as bin_file:
                pickle.dump(frame_data, bin_file)
            os.replace(tmp_file_path, output_file_path)

            print(f"Completed: {output_file_path}")

# Script entry: run the OCR pass over the dataset directory. The path is
# relative, so it is resolved against the current working directory.
root_path = '../../dataset'
process_directory(root_path)
