# In [2]:
import os
import yaml

def _remap_label_file(file_path, index_map):
    """
    Compute the rewritten contents of one YOLO label file without touching the disk.

    Args:
        file_path (str): Path of the `.txt` label file to read.
        index_map (dict): Maps an old class index to its new class index; indices
                          that are absent from the mapping are left unchanged.

    Returns:
        list | None: The new lines (blank lines dropped, fields joined by single
        spaces) if at least one class index changed, otherwise None.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    new_lines = []
    changed = False
    for line_no, line in enumerate(lines, start=1):
        parts = line.split()
        if not parts:
            continue
        try:
            cls_idx = int(parts[0])
        except ValueError as err:
            # Report where the bad line is; a bare int() error names neither file nor line.
            raise ValueError(
                f"{file_path}:{line_no}: invalid class index {parts[0]!r}"
            ) from err
        new_cls_idx = index_map.get(cls_idx, cls_idx)
        if new_cls_idx != cls_idx:
            parts[0] = str(new_cls_idx)
            changed = True
        new_lines.append(' '.join(parts) + '\n')

    return new_lines if changed else None


def build_and_clean_yolo_yaml(base_path, class_names):
    """
    Builds a YOLOv5/YOLOv8-style `data.yaml` file and performs class name and label index corrections.

    Args:
        base_path (str): Root path where the YOLO dataset is located.
                         Expects subdirectories `images/train`, `images/val`, `images/test`,
                         and corresponding `labels/train`, `labels/val`, `labels/test`.
                         Missing label directories are skipped.
        class_names (list): List of original class names, potentially with typos.
                            The position of a name is the class index used in the label files.

    Raises:
        ValueError: If a label line does not start with an integer class index.
                    Nothing is written to disk in that case.

    This script:
    - Corrects known typos in class names.
    - Merges names that become identical after correction into a single class
      (keeping the first position), so `names` has no duplicates and `nc` matches.
    - Saves a corrected `data.yaml` in the same directory.
    - Updates class indices in `.txt` label files to reflect the corrected names.

    Note:
        When classes are merged, every later class index shifts down. The function
        must therefore be run exactly once on labels that still use the original
        indexing; running it again would shift the indices a second time.
    """
    # Define known typo fixes
    CLASS_NAME_FIXES = {
        'commeent': 'comment',
        'merge_noode': 'merge_node',
        'control_flowcontrol_flow': 'control_flow'
    }

    # Correct each name and assign it an index in the de-duplicated list.
    # `new_index_of` preserves first-seen order, so it doubles as the final name list.
    new_index_of = {}
    index_map = {}
    for old_idx, name in enumerate(class_names):
        fixed = CLASS_NAME_FIXES.get(name, name)
        new_idx = new_index_of.setdefault(fixed, len(new_index_of))
        if new_idx != old_idx:
            # Only real changes are recorded; everything else keeps its index.
            index_map[old_idx] = new_idx
    corrected_names = list(new_index_of)

    # Split locations are given relative to `path`, as the YOLO loaders expect
    data_yaml = {
        'path': base_path,
        'train': os.path.join('images', 'train'),
        'val': os.path.join('images', 'val'),
        'test': os.path.join('images', 'test'),
        'nc': len(corrected_names),
        'names': corrected_names
    }

    # Parse every label file before writing anything, so a malformed file
    # cannot leave the dataset half-remapped.
    pending = []
    if index_map:
        for subdir in ('train', 'val', 'test'):
            label_dir = os.path.join(base_path, 'labels', subdir)
            if not os.path.isdir(label_dir):
                continue
            for fname in os.listdir(label_dir):
                if not fname.endswith('.txt'):
                    continue
                file_path = os.path.join(label_dir, fname)
                new_lines = _remap_label_file(file_path, index_map)
                if new_lines is not None:
                    pending.append((file_path, new_lines))

    yaml_path = os.path.join(base_path, 'data.yaml')
    with open(yaml_path, 'w') as f:
        yaml.safe_dump(data_yaml, f)
    print(f"Corrected data.yaml written to: {yaml_path}")

    for file_path, new_lines in pending:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(new_lines)

    print("Label file indices updated where needed.")


# Example usage: clean the flowchart dataset when this file is run as a script.
if __name__ == "__main__":
    # Position in this list is the class index expected in the label files.
    original_class_names = [
        'action',                    # 0
        'activity',                  # 1
        'commeent',                  # 2  (typo of 'comment')
        'control_flow',              # 3
        'control_flowcontrol_flow',  # 4  (doubled 'control_flow')
        'decision_node',             # 5
        'exit_node',                 # 6
        'final_flow_node',           # 7
        'final_node',                # 8
        'fork',                      # 9
        'merge',                     # 10
        'merge_noode',               # 11 (typo of 'merge_node')
        'null',                      # 12
        'object',                    # 13
        'object_flow',               # 14
        'signal_recept',             # 15
        'signal_send',               # 16
        'start_node',                # 17
        'text',                      # 18
    ]
    base_dataset_path = "/sfs/ceph/standard/sds_managed_sadewole/DS6050_SP25/group6/kaggle/kaggle_flowchart_yolo"

    build_and_clean_yolo_yaml(base_dataset_path, original_class_names)


# Output:
# Corrected data.yaml written to: /sfs/ceph/standard/sds_managed_sadewole/DS6050_SP25/group6/kaggle/kaggle_flowchart_yolo/data.yaml
# Label file indices updated where needed.
