# Generate a dataset in YOLOv5 format from COCO annotations

## deps

In [1]:
import os
import os.path
import yaml
from shutil import copyfile

from pycocotools.coco import COCO
from tqdm import tqdm

import pandas as pd

## config

In [2]:
# NOTE(review): USE_SYMLINK is not read by any cell visible in this notebook
# (save_images always copies files) -- confirm whether symlinking was intended.
USE_SYMLINK = True
# Absolute path of the directory the source images are read from.
IMAGES_PATH = os.path.abspath("../rico2coco/rico/dataset/combined/")
# Absolute path of the directory holding train.csv / val.csv / test.csv.
SPLIT_PATH = os.path.abspath("../rico2coco/notebooks/train-val-test-split/")

In [3]:
ls "{IMAGES_PATH}" | grep ".jpg" | wc -l

66261


## aux functions

In [4]:
def truncate(n, decimals=0):
    """Cut ``n`` down to ``decimals`` decimal places without rounding.

    The value is scaled by ``10 ** decimals``, converted with ``int`` (which
    discards the fractional part, truncating toward zero) and scaled back.
    The result comes from a true division, so it is a float even when ``n``
    is an integer.

    Args:
        n: Number to truncate.
        decimals: How many decimal places to keep (default 0).

    Returns:
        The truncated value as a float.
    """
    scale = 10 ** decimals
    whole_part = int(n * scale)
    return whole_part / scale


def get_coco_info(coco_annotations, image_path):
    """Open a COCO annotation file and collect what the converters need.

    The names of all categories in the file are printed as a side effect.

    Args:
        coco_annotations: Value handed to ``COCO(...)`` (the annotation file).
        image_path: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(coco, catIds, imgIds, imageInfos)``: the loaded ``COCO``
        object, its category ids, its image ids, and the image records
        returned by ``coco.loadImgs(imgIds)``.
    """
    coco = COCO(coco_annotations)

    category_names = " ".join(
        entry["name"] for entry in coco.loadCats(coco.getCatIds())
    )
    print("COCO categories: \n{}\n".format(category_names))

    catIds = coco.getCatIds()
    # getImgIds is called without a category filter on purpose, so images are
    # not restricted to those annotated with one of catIds.
    imgIds = coco.getImgIds()

    return coco, catIds, imgIds, coco.loadImgs(imgIds)


def save_images(src_path, dst_path, imageInfos):
    """Copy every image listed in ``imageInfos`` from ``src_path`` to ``dst_path``.

    Args:
        src_path: Directory that contains the source images.
        dst_path: Destination directory; created if it does not exist.
        imageInfos: Iterable of image records; only ``"file_name"`` is read.

    Raises:
        FileNotFoundError: If a listed image is missing from ``src_path``.
            Images processed before the missing one have already been copied.
    """
    os.makedirs(dst_path, exist_ok=True)

    for image in tqdm(imageInfos):
        file_name = image["file_name"]
        src = f"{src_path}/{file_name}"
        dst = f"{dst_path}/{file_name}"

        # Stop at the first missing source file instead of silently producing
        # an incomplete dataset. FileNotFoundError is still an Exception, so
        # callers catching the previous generic Exception keep working.
        if not os.path.exists(src):
            raise FileNotFoundError(f"does not exist: {src}")

        copyfile(src, dst)

    
def save_annotations(dst_path, coco, catIds, imgIds, imageInfos):
    """Write one YOLO-format label file per image into ``dst_path``.

    Every annotation of an image becomes one line
    ``<class> <x_center> <y_center> <width> <height>`` (single-space
    separated). The box values are divided by the image width/height and
    truncated to 7 decimals. An image without annotations gets an empty file.

    Args:
        dst_path: Output directory; created if it does not exist.
        coco: Object providing ``getAnnIds`` and ``loadAnns``.
        catIds: Category ids used to select annotations. The class index
            written for an annotation is the position of its category id in
            this sequence.
        imgIds: Not used; kept so existing callers keep working.
        imageInfos: Image records providing ``"id"``, ``"width"``,
            ``"height"`` and ``"file_name"``.
    """
    os.makedirs(dst_path, exist_ok=True)

    # Class index = position of the category id in catIds. This equals the
    # former ``category_id - 1`` whenever ids are exactly 1..N in order, but
    # stays correct for non-contiguous or differently ordered ids.
    known_cat_ids = catIds if hasattr(catIds, "__iter__") else [catIds]
    class_index = {cat_id: idx for idx, cat_id in enumerate(known_cat_ids)}

    for im in tqdm(imageInfos):
        dw = 1.0 / im["width"]
        dh = 1.0 / im["height"]

        annIds = coco.getAnnIds(imgIds=im["id"], catIds=catIds, iscrowd=None)
        anns = coco.loadAnns(annIds)

        txt_content = []
        for ann in anns:
            coco_cat_id = ann["category_id"]
            # Categories absent from catIds fall back to the previous
            # ``id - 1`` rule, so inputs that worked before still work
            # (presumably only reachable when catIds is empty and no category
            # filter is applied -- confirm against pycocotools).
            category_id = class_index.get(coco_cat_id, coco_cat_id - 1)

            # bbox is read as [x_min, y_min, width, height].
            xmin = ann["bbox"][0]
            ymin = ann["bbox"][1]
            xmax = ann["bbox"][2] + ann["bbox"][0]
            ymax = ann["bbox"][3] + ann["bbox"][1]

            # Box centre and size, scaled by the image dimensions.
            x = (xmin + xmax) / 2 * dw
            y = (ymin + ymax) / 2 * dh
            w = (xmax - xmin) * dw
            h = (ymax - ymin) * dh

            fields = [
                category_id,
                truncate(x, 7),
                truncate(y, 7),
                truncate(w, 7),
                truncate(h, 7),
            ]
            # Exactly one space between fields (the class id used to be
            # followed by two).
            txt_content.append(" ".join(str(field) for field in fields))

        # Swap whatever extension the image has for ".txt"; a plain
        # ``.replace(".jpg", ...)`` left non-jpg names unchanged.
        label_name = os.path.splitext(im["file_name"])[0] + ".txt"
        with open(f"{dst_path}/{label_name}", "w") as myfile:
            myfile.writelines(line + "\n" for line in txt_content)

In [5]:
def load_split_ids(src_path):
    """Read the image ids of the train, val and test splits.

    Args:
        src_path: Directory containing ``train.csv``, ``val.csv`` and
            ``test.csv``, each with a ``"UI Number"`` column.

    Returns:
        A tuple ``(train_ids, val_ids, test_ids)`` of Python lists holding
        the ``"UI Number"`` values of the respective file.
    """
    def _read_ids(split_name):
        # One CSV per split; only the "UI Number" column is used.
        frame = pd.read_csv(f"{src_path}/{split_name}.csv")
        return frame["UI Number"].values.tolist()

    return tuple(_read_ids(split_name) for split_name in ("train", "val", "test"))

In [6]:
# Load the id lists of the three splits from the CSV files under SPLIT_PATH.
train_ids, val_ids, test_ids = load_split_ids(SPLIT_PATH)

In [7]:
# Sanity check: number of ids listed in train.csv.
len(train_ids)

19678

## run

In [8]:
def coco2yolo_with_data_split(
    yaml_path,
    dst_path,
    coco_annotations,
    image_path,
    split_path,
):
    """Convert a COCO dataset into train/val/test folders plus a dataset YAML.

    For each split, the images whose id is listed in that split are copied to
    ``{dst_path}/images/<split>/`` and their labels are written to
    ``{dst_path}/labels/<split>/``. Finally a YAML file with the keys
    ``path``, ``nc``, ``names``, ``train``, ``val`` and ``test`` is written
    to ``yaml_path``.

    Args:
        yaml_path: Where the dataset YAML is written. Its parent directory is
            created if it does not exist.
        dst_path: Root directory of the generated dataset.
        coco_annotations: COCO annotation file, passed to ``get_coco_info``.
        image_path: Directory the source images are copied from.
        split_path: Directory with the split CSV files, passed to
            ``load_split_ids``.

    Returns:
        A dict mapping ``"train"``, ``"val"`` and ``"test"`` to the list of
        image ids that ended up in that split.
    """
    # Create the YAML's directory before the expensive copy/convert work, so
    # a missing directory cannot make the final write fail after everything
    # else has already been produced.
    yaml_dir = os.path.dirname(yaml_path)
    if yaml_dir:
        os.makedirs(yaml_dir, exist_ok=True)

    coco, catIds, imgIds, imageInfos = get_coco_info(
        coco_annotations, image_path
    )

    train_ids, val_ids, test_ids = load_split_ids(split_path)

    # Sets make the per-image membership tests below cheap.
    split_dict = {
        "train": set(train_ids),
        "val": set(val_ids),
        "test": set(test_ids),
    }

    data_yaml = dict(
        path=dst_path,
        nc=len(catIds),
        names=[cat["name"] for cat in coco.loadCats(catIds)],
    )

    instances_per_split = {}
    for key_name, split_ids in split_dict.items():
        print(f"creating {key_name} dataset...")

        # Keep only the images of the annotation file that belong to this split.
        filtered_image_ids = [im_id for im_id in imgIds if im_id in split_ids]
        filtered_image_infos = [im for im in imageInfos if im["id"] in split_ids]

        save_images(
            image_path,
            f"{dst_path}/images/{key_name}/",
            filtered_image_infos,
        )
        save_annotations(
            f"{dst_path}/labels/{key_name}/",
            coco, catIds, filtered_image_ids, filtered_image_infos,
        )

        # Image folder of the split, given relative to the "path" entry.
        data_yaml[key_name] = f"images/{key_name}/"

        instances_per_split[key_name] = filtered_image_ids

    with open(yaml_path, "w") as outfile:
        yaml.dump(data_yaml, outfile, default_flow_style=True)

    return instances_per_split

In [9]:
!ls ../dataset/

README.md  ricoco.json	ricoco_clickable.json  ricoco_icon_legend.json


In [10]:
# One entry per dataset variant:
# (dataset YAML to write, output directory, COCO annotation file to convert).
dst_paths = [
    (
        "data/rico2coco_clickable.yaml",
        "new_dataset/rico2coco_clickable/",
        "../dataset/ricoco_clickable.json",
    ),
    (
        "data/ricoco_icon_legend.yaml",
        "new_dataset/ricoco_icon_legend/",
        "../dataset/ricoco_icon_legend.json",
    ),
    (
        "data/ricoco.yaml",
        "new_dataset/ricoco/",
        "../dataset/ricoco.json",
    ),
]

for yaml_path, dst_path, ann_path in dst_paths:
    print(f"working on {dst_path} ...")

    instances_per_split = coco2yolo_with_data_split(
        yaml_path, dst_path, ann_path, IMAGES_PATH, SPLIT_PATH
    )

    # Report the share of converted images that landed in each split.
    total_instances = sum(len(ids) for ids in instances_per_split.values())
    for key, ids in instances_per_split.items():
        print(key, len(ids) / total_instances)
    print("\n")

working on new_dataset/rico2coco_clickable/ ...
loading annotations into memory...
Done (t=1.33s)
creating index...
index created!
COCO categories: 
clickable not_clickable

creating train dataset...


100%|██████████| 6734/6734 [00:02<00:00, 2824.78it/s]
100%|██████████| 6734/6734 [00:00<00:00, 7925.14it/s]


creating val dataset...


100%|██████████| 853/853 [00:00<00:00, 3813.61it/s]
100%|██████████| 853/853 [00:00<00:00, 6622.11it/s]


creating test dataset...


100%|██████████| 790/790 [00:00<00:00, 5186.68it/s]
100%|██████████| 790/790 [00:00<00:00, 8003.53it/s]


train 0.8038677330786678
val 0.1018264295093709
test 0.09430583741196132


working on new_dataset/ricoco_icon_legend/ ...
loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
COCO categories: 

creating train dataset...


100%|██████████| 6734/6734 [00:01<00:00, 3731.24it/s]
100%|██████████| 6734/6734 [00:00<00:00, 16981.98it/s]


creating val dataset...


100%|██████████| 853/853 [00:00<00:00, 5404.27it/s]
100%|██████████| 853/853 [00:00<00:00, 18363.21it/s]


creating test dataset...


100%|██████████| 790/790 [00:00<00:00, 4858.26it/s]
100%|██████████| 790/790 [00:00<00:00, 15729.06it/s]


train 0.8038677330786678
val 0.1018264295093709
test 0.09430583741196132


working on new_dataset/ricoco/ ...
loading annotations into memory...
Done (t=0.99s)
creating index...
index created!
COCO categories: 
Web View List Item Multi-Tab Input Text Button Slider Background Image Advertisement Card Bottom Navigation Modal On/Off Switch Button Bar Number Stepper Text Map View Checkbox Date Picker Image Drawer Radio Button Video Toolbar Pager Indicator

creating train dataset...


100%|██████████| 6734/6734 [00:01<00:00, 3768.11it/s]
100%|██████████| 6734/6734 [00:00<00:00, 8457.54it/s]


creating val dataset...


100%|██████████| 853/853 [00:00<00:00, 5262.96it/s]
100%|██████████| 853/853 [00:00<00:00, 9150.16it/s]


creating test dataset...


100%|██████████| 790/790 [00:00<00:00, 4872.63it/s]
100%|██████████| 790/790 [00:00<00:00, 9039.87it/s]


train 0.8038677330786678
val 0.1018264295093709
test 0.09430583741196132




In [11]:
ls new_dataset/ricoco/labels/test/ | head

10185.txt
10188.txt
10199.txt
10206.txt
10209.txt
10210.txt
10213.txt
10367.txt
10369.txt
10378.txt
ls: write error


In [12]:
cat new_dataset/ricoco/labels/test/16129.txt

14  0.3277777 0.0708984 0.3638888 0.0363281
20  0.5 0.1708984 1.0 0.0683593
20  0.5 0.2666015 1.0 0.0683593


In [13]:
cat data/rico2coco_clickable.yaml

{names: [clickable, not_clickable], nc: 2, path: new_dataset/rico2coco_clickable/,
  test: images/test/, train: images/train/, val: images/val/}
