In [18]:
import json
import os
import sys
import glob
from PIL import Image

def _annotations_from_yolo_labels(images, label_dir):
    """Convert per-image YOLO label files into a list of COCO annotation dicts.

    Args:
        images: COCO image entries; each must provide 'file_name', 'id',
            'width' and 'height'.
        label_dir: Directory searched for '<image base name>.txt' label files.

    Returns:
        List of COCO annotation dicts with ids numbered from 1. Images without
        a label file and lines with fewer than five fields contribute nothing.
    """
    annotations = []
    annotation_id = 1

    for img in images:
        img_file = img['file_name']
        img_id = img['id']
        img_width = img['width']
        img_height = img['height']

        # The label file shares the image's base name, with a .txt extension
        label_file = os.path.join(label_dir, f"{os.path.splitext(img_file)[0]}.txt")
        if not os.path.exists(label_file):
            continue

        with open(label_file, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 5:
                    continue

                class_id = int(parts[0])
                # YOLO format: [x_center, y_center, width, height], normalized
                # by the image size
                x_center = float(parts[1])
                y_center = float(parts[2])
                width = float(parts[3])
                height = float(parts[4])

                # Scale to pixels, then move from centre-based to the
                # top-left-based COCO box [x, y, width, height]
                width_abs = width * img_width
                height_abs = height * img_height
                x_min = x_center * img_width - (width_abs / 2)
                y_min = y_center * img_height - (height_abs / 2)

                annotations.append({
                    # Rectangle polygon that traces the bounding box
                    "segmentation": [[
                        x_min, y_min,
                        x_min + width_abs, y_min,
                        x_min + width_abs, y_min + height_abs,
                        x_min, y_min + height_abs
                    ]],
                    "area": width_abs * height_abs,
                    "iscrowd": 0,
                    "image_id": img_id,
                    "bbox": [x_min, y_min, width_abs, height_abs],
                    "category_id": class_id,
                    "id": annotation_id
                })
                annotation_id += 1

    return annotations


def _make_image_ids_unique(images):
    """Give every image after the first one sharing an id a fresh, unused id.

    The first image carrying an id keeps it, so annotations that reference
    that id stay attached to it. Replacement ids are allocated above the
    largest integer id already present, so they cannot collide with ids that
    are in use. Modifies the image dicts in place.

    Returns:
        Number of images that received a new id.
    """
    used_ids = {img['id'] for img in images}
    next_id = max((i for i in used_ids if isinstance(i, int)), default=0) + 1

    seen_ids = set()
    reassigned = 0
    for img in images:
        if img['id'] not in seen_ids:
            seen_ids.add(img['id'])
            continue

        # Guard against non-integer ids that compare equal to the candidate
        while next_id in used_ids:
            next_id += 1
        img['id'] = next_id
        used_ids.add(next_id)
        next_id += 1
        reassigned += 1

    return reassigned


def fix_coco_annotations(json_path, output_path=None, verify_images=False):
    """
    Fix common issues in COCO annotation files and ensure they work with MMDetection

    The following repairs are applied, in order:
      1. If the file has no annotations, prompt for an image and a YOLO label
         directory and rebuild the annotations from the label files.
      2. Make duplicate image IDs unique.
      3. Replace non-positive bbox widths/heights with 1.
      4. Drop annotations whose category_id is not listed in 'categories'.
      5. Optionally (verify_images) prompt for the image directory and drop
         images whose file is missing, together with their annotations.

    Progress and problems are reported with print(); errors while processing
    are caught and printed rather than raised.

    Args:
        json_path: Path to the original COCO JSON file
        output_path: Path where to save the fixed JSON file (default: original_name_fixed.json)
        verify_images: Whether to check if image files exist (can be slow for large datasets)

    Returns:
        None. The fixed data is written to output_path.
    """
    if not os.path.exists(json_path):
        print(f"Error: File {json_path} does not exist!")
        return

    # Default output path: "<name>_fixed<ext>" next to the input file
    if output_path is None:
        name, ext = os.path.splitext(os.path.basename(json_path))
        output_path = os.path.join(os.path.dirname(json_path), f"{name}_fixed{ext}")

    try:
        with open(json_path, 'r') as f:
            coco_data = json.load(f)

        print(f"Loaded COCO file: {json_path}")
        print(f"- Images: {len(coco_data.get('images', []))}")
        print(f"- Annotations: {len(coco_data.get('annotations', []))}")
        print(f"- Categories: {len(coco_data.get('categories', []))}")

        # Check for required sections
        required_sections = ['images', 'annotations', 'categories']
        missing_sections = [s for s in required_sections if s not in coco_data]

        if missing_sections:
            print(f"Error: Missing required sections: {missing_sections}")
            print("Cannot fix the file - it's missing core components")
            return

        # Check if there are any annotations
        if not coco_data['annotations']:
            print("Warning: No annotations found in the file!")

            # Try to auto-fix by checking for label files.
            # The image directory is only validated here; image sizes are
            # taken from the COCO image entries.
            image_dir = input("Enter the path to the image directory: ")
            if not os.path.isdir(image_dir):
                print(f"Error: {image_dir} is not a valid directory!")
                return

            label_dir = input("Enter the path to the YOLO label directory: ")
            if not os.path.isdir(label_dir):
                print(f"Error: {label_dir} is not a valid directory!")
                return

            print("Attempting to rebuild annotations from YOLO label files...")
            annotations = _annotations_from_yolo_labels(coco_data['images'], label_dir)

            if annotations:
                print(f"Created {len(annotations)} annotations from YOLO label files")
                coco_data['annotations'] = annotations
            else:
                print("Could not create any annotations from YOLO files")

        # Verify image IDs are unique
        image_ids = [img['id'] for img in coco_data['images']]
        if len(image_ids) != len(set(image_ids)):
            print("Warning: Duplicate image IDs found, fixing...")
            _make_image_ids_unique(coco_data['images'])

        # Check bbox format
        fixed_bbox_count = 0
        for ann in coco_data['annotations']:
            if 'bbox' in ann and len(ann['bbox']) == 4:
                x, y, w, h = ann['bbox']

                # Fix non-positive width/height (shouldn't happen in COCO format)
                if w <= 0:
                    ann['bbox'][2] = 1  # Set to minimum width
                    fixed_bbox_count += 1

                if h <= 0:
                    ann['bbox'][3] = 1  # Set to minimum height
                    fixed_bbox_count += 1

        if fixed_bbox_count > 0:
            print(f"Fixed {fixed_bbox_count} invalid bounding box dimensions")

        # Verify category IDs, keeping only annotations with a known category
        valid_cat_ids = set(cat['id'] for cat in coco_data['categories'])
        kept_annotations = []
        removed_count = 0
        for ann in coco_data['annotations']:
            if ann['category_id'] in valid_cat_ids:
                kept_annotations.append(ann)
            else:
                print(f"Warning: Annotation {ann['id']} has invalid category_id {ann['category_id']}")
                removed_count += 1

        if removed_count:
            print(f"Removing {removed_count} annotations with invalid category IDs")
            coco_data['annotations'] = kept_annotations

        # Check for images that exist
        if verify_images:
            print("Verifying image files (this may take a while)...")
            image_dir = input("Enter the path to the image directory: ")
            if not os.path.isdir(image_dir):
                print(f"Warning: {image_dir} is not a valid directory, skipping image verification")
            else:
                # Positions (not ids) of entries whose file is missing
                missing_images = {
                    i for i, img in enumerate(coco_data['images'])
                    if not os.path.exists(os.path.join(image_dir, img['file_name']))
                }

                if missing_images:
                    print(f"Warning: {len(missing_images)} images not found")

                    # Remove missing images and their annotations
                    missing_img_ids = set(coco_data['images'][i]['id'] for i in missing_images)
                    coco_data['images'] = [
                        img for i, img in enumerate(coco_data['images'])
                        if i not in missing_images
                    ]
                    coco_data['annotations'] = [
                        ann for ann in coco_data['annotations']
                        if ann['image_id'] not in missing_img_ids
                    ]

                    print(f"Removed {len(missing_images)} missing images and their annotations")

        # Save the fixed COCO file, creating the target directory if needed
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(coco_data, f, indent=4)

        print(f"Saved fixed COCO file to {output_path}")
        print(f"Final counts: {len(coco_data['images'])} images, {len(coco_data['annotations'])} annotations")

    except json.JSONDecodeError:
        print("Error: Invalid JSON file")
    except Exception as e:
        print(f"Error fixing COCO file: {str(e)}")

def rebuild_coco_from_yolo(yolo_dir, output_path, split='val'):
    """
    Rebuild a COCO format JSON file from YOLO annotations

    Images are read from '<yolo_dir>/<split>' and labels from
    '<yolo_dir>/labels/<split>'. Class names come from '<yolo_dir>/classes.txt'
    when it exists; otherwise they are requested interactively. Category ids
    are the zero-based positions of the class names, matching the class index
    stored in each YOLO label line.

    Images are processed in sorted path order so that the assigned image ids
    are the same on every run. Images that cannot be opened are reported and
    skipped; label lines whose fields are not numeric are reported and skipped.

    Args:
        yolo_dir: Path to the YOLO dataset directory
        output_path: Path where to save the COCO JSON file (parent directories
            are created if missing)
        split: Name of the sub-directory holding the images and labels to
            convert (default: 'val')

    Returns:
        None. The COCO data is written to output_path.
    """
    print(f"Rebuilding COCO annotations from YOLO directory: {yolo_dir}")

    # Define expected directories
    img_dir = os.path.join(yolo_dir, split)
    label_dir = os.path.join(yolo_dir, 'labels', split)

    # Check if directories exist
    if not os.path.isdir(img_dir):
        print(f"Error: Image directory not found: {img_dir}")
        return

    if not os.path.isdir(label_dir):
        print(f"Error: Label directory not found: {label_dir}")
        return

    # Get class names from classes.txt if available
    classes_file = os.path.join(yolo_dir, 'classes.txt')
    classes = []

    if os.path.exists(classes_file):
        with open(classes_file, 'r') as f:
            classes = [line.strip() for line in f if line.strip()]
        print(f"Found {len(classes)} classes: {classes}")
    else:
        # Ask for classes
        class_input = input("Enter class names separated by commas (e.g., face,license_plate): ")
        classes = [c.strip() for c in class_input.split(',') if c.strip()]

    if not classes:
        print("Error: No classes specified!")
        return

    # Initialize COCO format
    coco_format = {
        "images": [],
        "annotations": [],
        "categories": [{"id": i, "name": name} for i, name in enumerate(classes)]
    }

    # Find all image files; sorting makes the id assignment reproducible
    img_extensions = ['*.jpg', '*.jpeg', '*.png']
    img_files = []
    for ext in img_extensions:
        img_files.extend(glob.glob(os.path.join(img_dir, ext)))
    img_files.sort()

    if not img_files:
        print("Error: No image files found!")
        return

    print(f"Found {len(img_files)} image files")

    # Process images and annotations
    image_id = 1
    annotation_id = 1

    for img_file in img_files:
        img_filename = os.path.basename(img_file)
        base_name = os.path.splitext(img_filename)[0]

        # Get image dimensions; the context manager closes the file handle
        try:
            with Image.open(img_file) as img:
                img_width, img_height = img.size
        except Exception as e:
            print(f"Error reading image {img_file}: {e}")
            continue

        # Add image info
        coco_format["images"].append({
            "file_name": img_filename,
            "height": img_height,
            "width": img_width,
            "id": image_id
        })

        # Look for corresponding label file
        label_file = os.path.join(label_dir, f"{base_name}.txt")

        if os.path.exists(label_file):
            with open(label_file, 'r') as f:
                for line_no, line in enumerate(f, start=1):
                    parts = line.strip().split()
                    if len(parts) < 5:
                        continue

                    # One bad line should not abort the whole conversion
                    try:
                        class_id = int(parts[0])
                        x_center = float(parts[1])
                        y_center = float(parts[2])
                        width = float(parts[3])
                        height = float(parts[4])
                    except ValueError:
                        print(f"Warning: Skipping malformed line {line_no} in {label_file}")
                        continue

                    # Convert to absolute pixel values
                    x_center_abs = x_center * img_width
                    y_center_abs = y_center * img_height
                    width_abs = width * img_width
                    height_abs = height * img_height

                    # Convert to COCO format (top-left corner + size)
                    x_min = x_center_abs - (width_abs / 2)
                    y_min = y_center_abs - (height_abs / 2)

                    area = width_abs * height_abs

                    # Rectangle polygon that traces the bounding box
                    segmentation = [[
                        x_min, y_min,
                        x_min + width_abs, y_min,
                        x_min + width_abs, y_min + height_abs,
                        x_min, y_min + height_abs
                    ]]

                    # Create COCO annotation
                    coco_format["annotations"].append({
                        "segmentation": segmentation,
                        "area": area,
                        "iscrowd": 0,
                        "image_id": image_id,
                        "bbox": [x_min, y_min, width_abs, height_abs],
                        "category_id": class_id,
                        "id": annotation_id
                    })

                    annotation_id += 1
        else:
            print(f"No label file found for {img_filename}")

        image_id += 1

    # Save the COCO format JSON, creating the target directory if needed
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(coco_format, f, indent=4)

    print(f"Saved COCO annotations to {output_path}")
    print(f"Created {len(coco_format['images'])} images and {len(coco_format['annotations'])} annotations")

# if __name__ == "__main__":
#     print("COCO Annotation Fixer/Rebuilder Tool")
#     print("------------------------------------")
#     print("1. Fix existing COCO annotation file")
#     print("2. Rebuild COCO annotations from YOLO dataset")
    
#     choice = input("Enter your choice (1 or 2): ")
    
#     if choice == "1":
#         if len(sys.argv) >= 2:
#             json_path = sys.argv[1]
#         else:
#             json_path = input("Enter path to COCO JSON file: ")
        
#         output_path = input("Enter output path (leave empty for default): ")
#         if not output_path:
#             output_path = None
            
#         verify = input("Verify image files? (y/n, default: n): ").lower() == 'y'
#         fix_coco_annotations(json_path, output_path, verify)
        
#     elif choice == "2":
#         yolo_dir = input("Enter path to YOLO dataset directory: ")
#         output_path = input("Enter output path for COCO JSON: ")
#         rebuild_coco_from_yolo(yolo_dir, output_path)
        
#     else:
#         print("Invalid choice!")

In [19]:
# Convert the YOLO labels under ./data/pp4av_dataset/ into a COCO JSON file.
yolo_dir, output_path = "./data/pp4av_dataset/", "./data/annotations/val.json"
rebuild_coco_from_yolo(yolo_dir=yolo_dir, output_path=output_path)

Rebuilding COCO annotations from YOLO directory: ./data/pp4av_dataset/


Enter class names separated by commas (e.g., face,license_plate):  face,license_plate


Found 690 image files
Saved COCO annotations to ./data/annotations/val.json
Created 690 images and 2708 annotations


In [20]:
import json
import os
import sys

def check_coco_json(json_path):
    """Analyze a COCO format JSON file and print any common issues found.

    Checks performed (all results are printed, nothing is raised):
      - the file exists and parses as JSON
      - the 'images', 'annotations' and 'categories' sections are present
      - at least one annotation exists
      - image IDs are unique
      - every annotation references an existing image ID
      - the first annotation is shown as an example and its bbox is checked
      - all annotations that carry a bbox have 4 values with positive
        width and height
      - every annotation uses a category ID listed in 'categories'

    Args:
        json_path: Path to the COCO JSON file to analyze.

    Returns:
        None.
    """

    print(f"Checking COCO annotations file: {json_path}")

    if not os.path.exists(json_path):
        print(f"Error: File {json_path} does not exist")
        return

    try:
        with open(json_path, 'r') as f:
            coco_data = json.load(f)

        # Check required sections
        for key in ('images', 'annotations', 'categories'):
            if key not in coco_data:
                print(f"Error: Missing required section '{key}'")
                return

        images = coco_data['images']
        annotations = coco_data['annotations']
        categories = coco_data['categories']

        # Print basic stats
        print(f"Number of images: {len(images)}")
        print(f"Number of annotations: {len(annotations)}")
        print(f"Number of categories: {len(categories)}")

        # Check if annotations exist
        if len(annotations) == 0:
            print("Error: No annotations found in the file!")
            return

        # Check image IDs - are they unique?
        image_ids = [img['id'] for img in images]
        valid_image_ids = set(image_ids)
        if len(image_ids) != len(valid_image_ids):
            print(f"Warning: Duplicate image IDs found! {len(image_ids)} total, {len(valid_image_ids)} unique")

        # Check annotation image_ids - do they match images?
        # Count the offending annotations and the distinct IDs separately so
        # the message reports both accurately.
        orphan_refs = [ann['image_id'] for ann in annotations if ann['image_id'] not in valid_image_ids]
        if orphan_refs:
            missing_image_ids = set(orphan_refs)
            print(f"Error: {len(orphan_refs)} annotations reference {len(missing_image_ids)} non-existent image IDs")
            print(f"First few missing: {list(missing_image_ids)[:5]}")

        # Check annotation structure of first entry
        print("\nExample annotation:")
        ann = annotations[0]
        print(json.dumps(ann, indent=2))

        # Check bbox format
        if 'bbox' in ann:
            print("\nBounding box format:", end=" ")
            if len(ann['bbox']) == 4:
                print("Valid (4 values)")

                # Check if bbox is [x,y,w,h] (COCO format)
                # by checking if width and height are positive
                if ann['bbox'][2] > 0 and ann['bbox'][3] > 0:
                    print("Appears to be [x,y,width,height] format (correct)")
                else:
                    print("Warning: Possible invalid width/height in bbox")
            else:
                print(f"Invalid (expected 4 values, got {len(ann['bbox'])})")

        # The example above only covers the first entry; scan the rest so a
        # malformed box further down the file is not silently missed.
        bad_bbox_count = sum(
            1 for a in annotations
            if 'bbox' in a
            and not (len(a['bbox']) == 4 and a['bbox'][2] > 0 and a['bbox'][3] > 0)
        )
        if bad_bbox_count:
            print(f"Warning: {bad_bbox_count} of {len(annotations)} annotations have an invalid bbox")

        # Check category IDs
        cat_ids = set(cat['id'] for cat in categories)
        invalid_cats = set(a['category_id'] for a in annotations) - cat_ids
        if invalid_cats:
            print(f"Error: Annotations use {len(invalid_cats)} category IDs that don't exist")
            print(f"Invalid category IDs: {invalid_cats}")

        print("\nCheck completed. If no errors shown above, the COCO format should be valid.")

    except json.JSONDecodeError:
        print("Error: Invalid JSON file")
    except Exception as e:
        print(f"Error analyzing file: {str(e)}")


In [22]:
# NOTE(review): the usage banner presumably comes from a script version of this check - confirm.
print("Usage: python check_coco_annotations.py /path/to/annotations.json")
check_coco_json(json_path="/Ziob/343312/CrossKD/data/annotations/instances_train.json")

Usage: python check_coco_annotations.py /path/to/annotations.json
Checking COCO annotations file: /Ziob/343312/CrossKD/data/annotations/instances_train.json
Number of images: 2757
Number of annotations: 10875
Number of categories: 2

Example annotation:
{
  "segmentation": [
    [
      1267.0003199999999,
      445.00032,
      1279.5007999999998,
      445.00032,
      1279.5007999999998,
      459.20016,
      1267.0003199999999,
      459.20016
    ]
  ],
  "area": 177.5048159232,
  "iscrowd": 0,
  "image_id": 1,
  "bbox": [
    1267.0003199999999,
    445.00032,
    12.50048,
    14.19984
  ],
  "category_id": 0,
  "id": 1
}

Bounding box format: Valid (4 values)
Appears to be [x,y,width,height] format (correct)

Check completed. If no errors shown above, the COCO format should be valid.


In [23]:
import os
import sys
from mmengine.config import Config
from mmengine.utils import mkdir_or_exist
import json

def check_coco_file(ann_file):
    """Quick check of a COCO annotation file

    Prints the image/annotation counts and verifies that the file exists,
    contains at least one annotation, and that at least one annotation
    references an image ID that is actually listed under 'images'.

    Args:
        ann_file: Path to the COCO annotation JSON file.

    Returns:
        True if the file passes all checks; False otherwise, including when
        the file is missing, unreadable or lacks the expected keys (the
        reason is printed).
    """
    if not os.path.exists(ann_file):
        print(f"Error: Annotation file {ann_file} does not exist!")
        return False

    try:
        with open(ann_file, 'r') as f:
            coco_data = json.load(f)

        images = coco_data['images']
        annotations = coco_data['annotations']
        print(f"COCO file stats: {len(images)} images, {len(annotations)} annotations")

        # Check for common issues
        if len(annotations) == 0:
            print("Warning: No annotations found in the COCO file!")
            return False

        # Check if any image has annotations. Comparing against the IDs that
        # exist under 'images' is what makes this check meaningful: the set of
        # annotation image_ids alone is never empty at this point.
        known_image_ids = set(img['id'] for img in images)
        image_ids_with_annotations = set(ann['image_id'] for ann in annotations)
        if not (image_ids_with_annotations & known_image_ids):
            print("Warning: No images have annotations!")
            return False

        return True
    except Exception as e:
        print(f"Error checking COCO file: {str(e)}")
        return False

def fix_config(config_path, output_path=None):
    """Patch an MMDetection config so its train dataset loads a COCO file.

    Loads the config, reports the train dataset settings, validates the
    annotation file with check_coco_file, fills in 'data_prefix' and
    'metainfo' when they are absent, backs up the original config once to
    '<config_path>.backup' and dumps the updated config.

    Args:
        config_path: Path of the config file to read.
        output_path: Where to write the updated config
            (default: 'fixed_<config file name>').

    Returns:
        None. Returns early, after printing the reason, when the config file
        is missing, has no train_dataloader.dataset, or the annotation file
        fails validation.
    """
    if not os.path.exists(config_path):
        print(f"Error: Config file {config_path} does not exist!")
        return

    # Fall back to "fixed_<name>" when no destination was given
    if output_path is None:
        output_path = f"fixed_{os.path.basename(config_path)}"

    cfg = Config.fromfile(config_path)

    # Report what the config currently points at
    print("Current dataset configuration:")
    has_train_dataset = hasattr(cfg, 'train_dataloader') and hasattr(cfg.train_dataloader, 'dataset')
    if not has_train_dataset:
        print("Could not find train_dataloader.dataset in config")
        return

    dataset_cfg = cfg.train_dataloader.dataset
    print(f"Dataset type: {dataset_cfg.type}")
    for attr_name, label in (('ann_file', 'Annotation file'), ('data_root', 'Data root')):
        if hasattr(dataset_cfg, attr_name):
            print(f"{label}: {getattr(dataset_cfg, attr_name)}")

    # Resolve the annotation path: relative paths are joined onto data_root
    data_root = dataset_cfg.get('data_root', '')
    ann_file = dataset_cfg.get('ann_file', '')
    if data_root and not os.path.isabs(ann_file):
        full_ann_path = os.path.join(data_root, ann_file)
    else:
        full_ann_path = ann_file

    print(f"Checking annotation file: {full_ann_path}")
    if not check_coco_file(full_ann_path):
        print("\n".join([
            "\nYour COCO annotation file appears to have issues or cannot be found.",
            "Please check the file path and content.",
            "\nSuggestions:",
            "1. Make sure the annotation file exists at the specified path",
            "2. Check that it contains valid annotations",
            "3. Verify the YOLO to COCO conversion generated valid data",
        ]))
        return

    # Step 1: make sure data_prefix is a dict (MMDetection 3.x style)
    prefix_is_dict = 'data_prefix' in dataset_cfg and isinstance(dataset_cfg.data_prefix, dict)
    if not prefix_is_dict:
        legacy_prefix = dataset_cfg.get('img_prefix', '')
        if legacy_prefix:
            # Carry the old-style img_prefix over into data_prefix
            dataset_cfg.data_prefix = {'img': legacy_prefix}
            print(f"Updated data_prefix: {dataset_cfg.data_prefix}")
        else:
            # Guess the image folder from the annotation folder name
            guessed_dir = os.path.dirname(ann_file).replace('annotations', 'train') or 'train'
            dataset_cfg.data_prefix = {'img': guessed_dir}
            print(f"Set default data_prefix: {dataset_cfg.data_prefix}")

    # Step 2: add class names as metainfo when the config has none
    if 'metainfo' not in dataset_cfg:
        try:
            with open(full_ann_path, 'r') as ann_fh:
                coco_payload = json.load(ann_fh)

            # Category names of the COCO file become the class list
            class_names = [entry['name'] for entry in coco_payload.get('categories', [])]
            if class_names:
                dataset_cfg.metainfo = {'classes': class_names}
                print(f"Added metainfo with classes: {class_names}")
        except Exception as err:
            print(f"Could not extract classes from COCO file: {str(err)}")
            # Fall back to the hard-coded class pair
            dataset_cfg.metainfo = {'classes': ['face', 'license_plate']}
            print("Added default metainfo")

    # Keep a one-time copy of the untouched config before writing the new one
    backup_path = f"{config_path}.backup"
    if not os.path.exists(backup_path):
        import shutil
        shutil.copy(config_path, backup_path)
        print(f"Backed up original config to {backup_path}")

    cfg.dump(output_path)
    print(f"Saved updated config to {output_path}")

    print("\nNext steps:")
    print(f"1. Try running the analysis script with the updated config: python debug_analyze_dataset.py {output_path}")
    print("2. If issues persist, check the COCO annotation file structure")


In [25]:
print("Usage: python fix_mmdet_config.py /path/to/config.py [/path/to/output_config.py]")
# The paths are set explicitly rather than read from sys.argv: inside a
# notebook kernel sys.argv holds the kernel's launch arguments, not
# user-supplied paths, and the values must match what fix_config receives.
config_path = "configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py"
output_path = "./data/fix_config.py"

fix_config(config_path, output_path)

Usage: python fix_mmdet_config.py /path/to/config.py [/path/to/output_config.py]
Current dataset configuration:
Dataset type: CocoDataset
Annotation file: annotations/instances_train.json
Data root: data/pp4av_dataset/
Checking annotation file: data/pp4av_dataset/annotations/instances_train.json
COCO file stats: 2757 images, 10875 annotations
Added metainfo with classes: ['face', 'license_plate']
Saved updated config to ./data/fix_config.py

Next steps:
1. Try running the analysis script with the updated config: python debug_analyze_dataset.py ./data/fix_config.py
2. If issues persist, check the COCO annotation file structure
