# Convert data into coco format
Official data set page: https://humansintheloop.org/resources/datasets/car-parts-and-car-damages-dataset/

The original data contains the next classes to annotate all damage types:
```
classes = ['Cracked', 'Scratch', 'Flaking', 'Broken part', 'Corrosion', 'Dent','Paint chip','Missing part']
```
For the purpose of this pre-processing, we will be using only one class to identify damages -  for binary annotations (damage / no damage):
```
classes = ['damage']
```

In [2]:
import os
import json
import cv2
import numpy as np
from PIL import Image

data_dir = "data/humansintheloop/Carpartsdataset/File1/"

# sample image
_ = """
image_path = os.path.join(data_dir, "img/", "Car damages 321.png")
json_path = os.path.join(data_dir, "ann/", "Car damages 321.png.json")

# Load annotations from the JSON file
with open(json_path, 'r') as f:
    annotations_data = json.load(f)

# Function to draw annotations on the image
def draw_annotations(image, annotations):
    for shape in annotations["objects"]:
        label = shape["classTitle"]
        points = shape["points"]["exterior"]
        pts = [(int(point[0]), int(point[1])) for point in points]
        pts = np.array(pts, np.int32)
        pts = pts.reshape((-1, 1, 2))

        color = (0, 255, 0)  # Green color for bounding boxes
        cv2.polylines(image, [pts], isClosed=True, color=color, thickness=2)
        cv2.putText(image, label, (pts[0][0][0], pts[0][0][1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return image

# load the image
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
print ("Size of the image:",image.shape)
# display the annotations
image_with_annotations = draw_annotations(image, annotations_data)
Image.fromarray(image_with_annotations)
"""

# Check the class distribution

In [233]:
# find all unique classes and their distribution
_ = """
class_dist = {}
file_count = 0
json_dir = os.path.join(data_dir, "ann/")
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        with open(os.path.join(json_dir, filename)) as f:
            annotation_data = json.load(f)
            for shape in annotation_data["objects"]:
                if shape["classTitle"] in class_dist:
                    class_dist[shape["classTitle"]] += 1
                else:
                    class_dist[shape["classTitle"]] = 1
        file_count += 1
"""

In [234]:
#print ("Total annotation-file count:", file_count)
#print ("Distribution of the classes:", class_dist)

# Save the data to COCO JSON format

Now, lets convert the image and annotation data into COCO JSON format.

The COCO (Common Objects in Context) dataset is a widely used dataset for object detection, segmentation, and captioning tasks. The annotations in the COCO dataset are stored in JSON format. Learn more about the COCO dataset.

The COCO JSON format is structured and standardized to represent object annotations and metadata for each image in the dataset. Here's an overview of the key components of the COCO JSON format:

Image Information:

id: A unique identifier for each image.
width: Width of the image in pixels.
height: Height of the image in pixels.
file_name: The filename of the image.
Annotation Information:

id: A unique identifier for each annotation.
image_id: The identifier of the image to which this annotation belongs.
category_id: The identifier of the category (class) label for the annotated object.
bbox: A list of four values representing the bounding box coordinates of the object. The order is [xmin, ymin, width, height].
area: The area of the object bounding box.
segmentation: The segmentation mask of the object (used for semantic segmentation tasks).
iscrowd: A binary flag (0 or 1) indicating whether the annotated object is a single instance (0) or a group or crowd (1).
attributes: Additional attributes associated with the annotation (e.g., color, shape, etc.).
Category Information:

id: A unique identifier for each category (class) label.
name: The name of the category.
The JSON file may also include additional information like licenses, dataset information, and annotations for captions (in case of captioning tasks).

Here's an example of a simplified JSON representation of a COCO annotation for a single image with one annotated object:
```
{
  "images": [
    {
      "id": 1,
      "width": 640,
      "height": 480,
      "file_name": "example.jpg"
    }
  ],
  "annotations": [
    {
      "id": 1,
      "image_id": 1,
      "category_id": 1,
      "bbox": [100, 200, 150, 100],
      "area": 15000,
      "segmentation": [100, 200, 250, 200, 250, 300, 100, 300],
      "iscrowd": 0,
      "attributes": {}
    }
  ],
  "categories": [
    {
      "id": 1,
      "name": "cat"
    }
  ]
}
```
Please note that this is a simplified example, and COCO JSON files may contain annotations for multiple images and objects, as well as additional metadata.

For this exercise, only the top 5 classes i.e., 'Scratch', 'Broken part','Dent','Paint chip','Missing part' are used.

Note: The label id of the classes will be in the order of the classes specified in the below list.

In [4]:
# set the classes required
#classes = ['Cracked', 'Scratch', 'Flaking', 'Broken part', 'Corrosion', 'Dent','Paint chip','Missing part']  # to annotate all damage types
classes = ['damage']  # for binary annotations (damage / no damage)

In [218]:
# debug
_ = 
"""
val_d = ['Car damages 1194.png',
 'Car damages 602.png',
 'Car damages 859.png',
 'Car damages 909.png']
"""

In [None]:
#from pycocotools.coco import COCO
#coco = COCO('../datasets/coco/annotations/instances_train2017.json')


In [266]:
# calculate "area" for annotations
_ = """
from pycocotools import mask, coco
from pycocotools.mask import frPyObjects, area

seg = [
    [
        379,
        216,
        337,
        218,
        300,
        223,
        318,
        238,
        350,
        237,
        369,
        231,
        375,
        227
    ]
]
segmentation = [
    [379, 216],
    [337, 218],
    [300, 223],
    [318, 238],
    [350, 237],
    [369, 231],
    [375, 227]
]
"""
#a = frPyObjects(seg, 1024, 1024)
#area(a)


#coco.annToRLE()


#area = mask.area(segmentation)
#print("Area:", area)

In [267]:
#from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)


Prepare image annotations according to coco format. Store the results in train and val directories

In [5]:
# Iterate over the annotation files and convert the annotations to COCO JSON format.

# Fraction of images for training/validation: 80/20
train_frac = 0.8  # debug: 1-0.005

# Replace these paths with your dataset paths
json_dir = os.path.join(data_dir, "ann/")
#coco_path_damage = f"{data_dir}/coco_annotations_damage.json"    # damage / no damage
#coco_path_damages = f"{data_dir}/coco_annotations_damages.json"  # damage type

# Initialize lists to store annotations and images data
annotations_list_train = []
annotations_list_val = []
images_list_train = []
images_list_val = []

# Create a dictionary to map class names to class IDs

_ ="""
class_names = {}
for i in range(len(classes)):
    class_names[classes[i]] = i
print (class_names)
"""

# create a dict with class ids
class_to_label = {class_name: i for i, class_name in enumerate(classes)}


# convert polygon annotations to COCO-style bounding boxes
def polygon_to_box(polygon) -> list:
    """Convert 1 polygon into a bounding box.

    # Arguments
      polygon: a numpy array of shape (N, 2) representing N vertices
               of the hand segmentation label (polygon); each vertex
               is a point: (x, y)
    """
    if len(polygon) < 3:  # a polygon has at least 3 vertices
        return None
    x_min = np.min(polygon[:, 0])
    x_max = np.max(polygon[:, 0])
    y_min = np.min(polygon[:, 1])
    y_max = np.max(polygon[:, 1])
    H = y_max - y_min
    W = x_max - x_min
    return [x_min, y_min, W, H]

# Iterate through the JSON annotation files
for idx, filename in enumerate(os.listdir(json_dir)):
    if filename.endswith(".json"):
        is_train = np.random.choice([0,1], size=1, p=[1-train_frac, train_frac])[0]  # use image for training
        #is_train = filename.replace(".json", "") not in val_d
        
        with open(os.path.join(json_dir, filename)) as f:
            ann_data = json.load(f)
            image_filename = filename.replace(".json", "")
            image_path = os.path.join(data_dir, "img", image_filename)
            image = {
                "file_name": image_filename,
                "height": ann_data["size"]["height"],
                "width": ann_data["size"]["width"]
                }
            if is_train:
                train_image_id = len(images_list_train)
                #print(f"[train] image_id: {train_image_id}")
                image["id"] = train_image_id# + 1
                images_list_train.append(image)
            else:
                val_image_id = len(images_list_val)
                #print(f"[val] image_id: {val_image_id}")
                image["id"] = val_image_id# + 1
                images_list_val.append(image)
                
            for o in ann_data["objects"]:
                label = o["classTitle"]
                label = "damage"  # overwrite class - for binary annotation (damage / no damage)
                # only use the classes selected
                if label in classes:
                    pass
                else:
                    continue
                    
                class_id = class_to_label[label]  # Use class_ids
                bbox = polygon_to_box(np.array(o["points"]["exterior"]))
                #print("here")
                annotation = {
                    "category_id": class_id+1,
                    "iscrowd": 0,
                    "bbox": bbox,
                    #"bbox_mode": BoxMode.XYWH_ABS, #BoxMode.XYXY_ABS,
                    #"segmentation": [o["points"]["exterior"]],
                    "segmentation": [np.array(o["points"]["exterior"]).ravel().tolist()]
                }
                
                if is_train:
                    annotation.update(
                        {
                        "id": len(annotations_list_train),#+ 1,
                        "image_id": train_image_id,
                        })
                    annotations_list_train.append(annotation)
                else:
                    annotation.update(
                        {
                        "id": len(annotations_list_val), # + 1,
                        "image_id": val_image_id,
                        })
                    annotations_list_val.append(annotation)

# Create the COCO data dictionary
coco_data_train = {
    "annotations": annotations_list_train,
    "images": images_list_train,
    "categories": [{"id": class_id+1, "name": class_name} for class_name, class_id in class_to_label.items()]
}
coco_data_val = {
    "annotations": annotations_list_val,
    "images": images_list_val,
    "categories": [{"id": class_id+1, "name": class_name} for class_name, class_id in class_to_label.items()]
}


In [7]:
# show sample annotation
{'annotations': coco_data_train['annotations'][0],
 'images': coco_data_train['images'][0],
 'categories': coco_data_train['categories']
}

{'annotations': {'category_id': 1,
  'iscrowd': 0,
  'bbox': [266.0, 127.0, 338.0, 112.0],
  'segmentation': [[266.0,
    150.0,
    281.0,
    174.0,
    332.0,
    184.0,
    394.0,
    200.0,
    433.0,
    221.0,
    464.0,
    239.0,
    481.0,
    236.0,
    485.0,
    230.0,
    529.0,
    223.0,
    555.0,
    216.0,
    584.0,
    208.0,
    604.0,
    201.0,
    600.0,
    189.0,
    590.0,
    174.0,
    576.0,
    166.0,
    550.0,
    154.0,
    526.0,
    146.0,
    503.0,
    138.0,
    482.0,
    131.0,
    472.0,
    127.0,
    470.0,
    129.0,
    447.0,
    134.0,
    406.0,
    134.0,
    354.0,
    136.0,
    307.0,
    142.0]],
  'id': 0,
  'image_id': 0},
 'images': {'file_name': 'Car damages 101.png',
  'height': 440,
  'width': 637,
  'id': 0},
 'categories': [{'id': 1, 'name': 'damage'}]}

In [159]:
# multi-class: damage types
# Save the COCO data to a JSON file
_ = """
coco_path_damages_train = f"{data_dir}train/coco_annotations_damages.json"  # damage type

coco_path_damages_val = f"{data_dir}val/coco_annotations_damages.json"  # damage type

# train annotations
with open(coco_path_damages_train, "w") as f:
    json.dump(coco_data_train, f, indent=4)


# val annotations
with open(coco_path_damages_val, "w") as f:
    json.dump(coco_data_val, f, indent=4)
"""

In [269]:
# single class: damage / no damage
# Save the COCO data to a JSON file
coco_path_damage_train = f"{data_dir}train/coco_annotations_damage.json"    # damage / no damage

coco_path_damage_val = f"{data_dir}val/coco_annotations_damage.json"    # damage / no damage


# train annotations
with open(coco_path_damage_train, "w") as f:
    json.dump(coco_data_train, f, indent=4)


# val annotations
with open(coco_path_damage_val, "w") as f:
    json.dump(coco_data_val, f, indent=4)

In [189]:
# copy images to train and val paths
import shutil
# Train
_ = 
"""
with open(coco_path_damage_train, "r") as file:
    i = json.load(file)

f = i["images"]

for i in f:
    shutil.copy(f"{data_dir}/img/{i['file_name']}", f"{data_dir}train/")


# Val
with open(coco_path_damage_val, "r") as file:
    i = json.load(file)

f = i["images"]

for i in f:
    shutil.copy(f"{data_dir}/img/{i['file_name']}", f"{data_dir}val/")
"""