In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive: every path used later in this notebook
# is rooted there, so mounting anywhere else would leave those paths dangling.
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET
from pathlib import Path

def parse_xml(xml_path):
    """Collect car-plate annotations from an XML annotation file.

    Walks every ``<image>`` element under the document root and keeps the
    ``<box>`` children labelled ``carplate`` that carry a non-empty
    ``plate_number`` attribute element.

    Returns:
        dict mapping each image's ``name`` attribute to a list of
        ``{'text': str, 'coords': {'xtl', 'ytl', 'xbr', 'ybr'}}`` entries
        (coordinates as floats). Images without any usable plate are omitted.
    """
    corner_keys = ('xtl', 'ytl', 'xbr', 'ybr')
    annotations = {}

    for image_node in ET.parse(xml_path).getroot().findall('image'):
        entries = []
        for box_node in image_node.findall('box'):
            # Only car-plate boxes are of interest
            if box_node.get('label') != 'carplate':
                continue

            number_node = box_node.find("attribute[@name='plate_number']")
            # Skip boxes whose plate number is missing or has no text
            if number_node is None or number_node.text is None:
                continue

            corners = {key: float(box_node.get(key)) for key in corner_keys}
            entries.append({
                'text': number_node.text.strip(),
                'coords': corners,
            })

        # Images with no usable plate are left out of the result entirely
        if entries:
            annotations[image_node.get('name')] = entries

    return annotations

def process_coordinates(coord_str, img_width=2560, img_height=1440):
    """Convert YOLO-format label text to absolute pixel boxes.

    Each non-blank line must hold five whitespace-separated numbers,
    ``class_id cx cy w h``, where the centre and size are fractions of the
    image size. Surrounding double quotes on the whole string are ignored.

    Args:
        coord_str: Label text, one box per line.
        img_width: Image width in pixels used to scale x values.
        img_height: Image height in pixels used to scale y values.

    Returns:
        A list of ``{'x1', 'y1', 'x2', 'y2'}`` dicts with integer pixel
        corners, every corner clamped to the image bounds. Lines that cannot
        be parsed are reported on stdout and skipped.
    """
    def clamp(value, upper):
        # Keep a corner inside [0, upper] so a box that lies partly or fully
        # outside the frame never produces negative or oversized coordinates.
        return max(0, min(upper, value))

    boxes = []
    for line in coord_str.strip('"').split('\n'):
        fields = line.split()
        if not fields:
            continue
        try:
            _class_id, cx, cy, w, h = map(float, fields)
            x1 = int((cx - w / 2) * img_width)
            y1 = int((cy - h / 2) * img_height)
            x2 = int((cx + w / 2) * img_width)
            y2 = int((cy + h / 2) * img_height)
        except (ValueError, OverflowError) as e:
            # ValueError: wrong field count, non-numeric text or NaN;
            # OverflowError: int() of an infinite value.
            print(f"Error processing coordinates: {e}")
            continue
        boxes.append({
            'x1': clamp(x1, img_width),
            'y1': clamp(y1, img_height),
            'x2': clamp(x2, img_width),
            'y2': clamp(y2, img_height),
        })
    return boxes

def _find_label_file(labels_dir, stem):
    """Return the path of the label file for an image stem, or None if absent."""
    # Prefer the stem exactly as the image spells it; fall back to the
    # lower-cased stem so label sets saved with lower-case names still match.
    for candidate in (stem, stem.lower()):
        path = os.path.join(labels_dir, f"{candidate}.txt")
        if os.path.exists(path):
            return path
    return None


def _read_label_boxes(txt_file, img_width, img_height):
    """Return ``(box, source_line)`` pairs for each parseable label line."""
    with open(txt_file, 'r') as f:
        coord_str = f.read()

    pairs = []
    for raw_line in coord_str.split('\n'):
        if not raw_line.strip():
            continue
        # Parse one line at a time so every box stays tied to the exact line
        # it came from, even when blank or malformed lines are skipped.
        for box in process_coordinates(raw_line, img_width=img_width,
                                       img_height=img_height):
            pairs.append((box, raw_line.strip()))
    return pairs


def txt_and_xml_to_csv(images_dir, labels_dir, xml_path, output_csv,
                       img_width=2560, img_height=1440):
    """Merge YOLO label files and XML plate annotations into one CSV.

    For every image in ``images_dir`` the boxes from its label file and the
    plates from the XML annotations are paired by position (the i-th box with
    the i-th plate). Where only one source has an entry at a position, the row
    is still written: a box without a plate gets an empty ``plate_text``, and
    a plate without a box uses the XML coordinates and an empty
    ``txt_content``.

    Args:
        images_dir: Directory containing the ``.png``/``.jpg``/``.jpeg`` images.
        labels_dir: Directory containing one ``<image stem>.txt`` per image.
        xml_path: XML annotation file; ignored when empty or missing.
        output_csv: Destination CSV path; its parent directory is created.
        img_width: Image width in pixels used to scale label coordinates.
        img_height: Image height in pixels used to scale label coordinates.

    Returns:
        None. Writes ``output_csv`` and prints a short summary, or prints a
        notice when no rows were produced.
    """
    from itertools import zip_longest

    # dirname is '' for a bare file name, and os.makedirs('') raises
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    columns = ['filename', 'box_id', 'x1', 'y1', 'x2', 'y2', 'txt_content', 'plate_text']
    rows = []
    xml_data = parse_xml(xml_path) if xml_path and os.path.exists(xml_path) else {}
    print(f"🔍 Found {sum(len(v) for v in xml_data.values())} plate annotations in XML")

    # Sorted so the row order does not depend on the filesystem's listing order
    for img_file in tqdm(sorted(os.listdir(images_dir)), desc="Processing images"):
        if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        img_path = os.path.join(images_dir, img_file)
        txt_file = _find_label_file(labels_dir, os.path.splitext(img_file)[0])
        if txt_file:
            labelled = _read_label_boxes(txt_file, img_width, img_height)
        else:
            labelled = []
        xml_plates = xml_data.get(Path(img_file).name, [])

        # Pair by position; zip_longest keeps the surplus entries of whichever
        # source is longer instead of silently dropping them.
        paired = zip_longest(labelled, xml_plates)
        for box_id, (label, plate) in enumerate(paired, start=1):
            if label is not None:
                box, txt_content = label
                x1, y1, x2, y2 = box['x1'], box['y1'], box['x2'], box['y2']
            else:
                # No label box at this position: use the XML coordinates
                coords = plate['coords']
                x1, y1, x2, y2 = (int(coords[key])
                                  for key in ('xtl', 'ytl', 'xbr', 'ybr'))
                txt_content = ''

            rows.append({
                'filename': img_path,
                'box_id': box_id,
                'x1': x1,
                'y1': y1,
                'x2': x2,
                'y2': y2,
                'txt_content': txt_content,
                'plate_text': plate['text'] if plate is not None else '',
            })

    if not rows:
        print("\n❌ No valid data found")
        return

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Saved {len(df)} plate entries from {len(set(df['filename']))} images to {output_csv}")

    # Print statistics
    per_image = df['filename'].value_counts()
    multi_plate_images = (per_image > 1).sum()
    print(f"- {multi_plate_images} images contain multiple plates")
    print("- Sample multi-plate entry:")
    print(df[df['filename'].duplicated(keep=False)].head(4))

# Example usage
# All inputs and the output live under one Drive project folder, so every path
# is derived from a single root to keep the folder spelling consistent.
project_root = "/content/drive/My Drive/cos30018-test"
txt_and_xml_to_csv(
    images_dir=f"{project_root}/data/images/val",
    labels_dir=f"{project_root}/data/labels/val",
    xml_path=f"{project_root}/data/annotations_val.xml",
    output_csv=f"{project_root}/EasyOCR/Result/labels_val.csv",
)

🔍 Found 704 plate annotations in XML


Processing images: 100%|██████████| 340/340 [00:02<00:00, 129.13it/s]



✅ Saved 704 plate entries from 317 images to /content/drive/My Drive/cos30018-test/EasyOCR/Result/labels_val.csv
- 206 images contain multiple plates
- Sample multi-plate entry:
                                            filename  box_id    x1   y1    x2  \
1  /content/drive/My Drive/cos30018-test/data/ima...       1  3539  990  3704   
2  /content/drive/My Drive/cos30018-test/data/ima...       2  1917  825  2111   
4  /content/drive/My Drive/cos30018-test/data/ima...       1  2559  633  2628   
5  /content/drive/My Drive/cos30018-test/data/ima...       2  3055  775  3226   

     y2 txt_content plate_text  
1  1088               QAB7872E  
2   878                QCS6367  
4   659               QAA9318S  
5   881                QKW3113  
