# 生成 COCO 数据集

按照流程 https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data#11-create-datasetyaml

In [4]:
import pandas as pd

In [39]:
# Root directory of the reef dataset (contains train.csv and train_images/).
reef_dataset_root = "/home/featurize/data"
full_df = pd.read_csv(f"{reef_dataset_root}/train.csv")

# Output directory for the generated dataset.
dest_dir = "/home/featurize/full"

# The splits below are set up for training on the full dataset. For
# experiments, point train_df and val_df at the appropriate parts of the data;
# splitting by video_id is the suggested way to do that.
has_annotations = full_df.annotations != '[]'
train_df = full_df.loc[has_annotations]
val_df = full_df.loc[has_annotations & (full_df.video_id == 0)]
test_df = val_df

In [None]:
# Overrides the absolute dest_dir assigned earlier with a path relative to the
# current working directory.
dest_dir = "./full"

In [40]:
# Report the number of rows in the training and validation splits.
n_train = len(train_df)
n_val = len(val_df)
print(f"{n_train} {n_val}")

2776 6708


In [41]:
from pathlib import Path
import shutil
from tqdm.notebook import tqdm

In [42]:
# Create the output directory. parents=True matches the mkdir calls used for
# the images/labels sub-folders and avoids a FileNotFoundError when the parent
# of dest_dir does not exist yet.
Path(dest_dir).mkdir(parents=True, exist_ok=True)

### 1. 创建 dataset.yaml

In [43]:
# Write the dataset description: the dataset root, the image folder of each
# split (relative to that root), the number of classes and the class names.
dataset_yaml_path = Path(dest_dir) / "dataset.yaml"
dataset_yaml_text = f"""path: {dest_dir}
train: images/train2017
val: images/val2017
test: images/test2017

nc: 1
names: ['patric']
"""
# Bind the return value (characters written) so the notebook does not echo it.
_ = dataset_yaml_path.write_text(dataset_yaml_text)

### 2. 复制图片文件到指定目录，同时创建对应的标签文件

将图片复制到 dataset.yaml 中的 train val test 目录下。并且，在 images 同级目录创建一个 labels 目录，然后 labels 目录中为每个 image 文件创建一个对应的 txt 文件，txt 文件格式为：

* 每行一个 object（一个框）
* 每行格式为 `class x_center y_center width height`
* 坐标全部 normalize 为 0 ~ 1 之间的数（x_center、width 除以图片宽度，y_center、height 除以图片高度）
* 类别从 0 开始

如果一个图片没有标签，则不需要创建 txt 文件

In [46]:
def create_txt_file(path: Path, item, img_width: int = 1280, img_height: int = 720):
    """Write the label file for one frame from its annotation string.

    Every box in ``item.annotations`` becomes one line of the form
    ``0 x_center y_center width height``, where the four values are divided
    by the image size. Nothing is written when the annotation string is
    ``"[]"``.

    Args:
        path: Destination ``.txt`` file; overwritten if it already exists.
        item: Object exposing an ``annotations`` attribute that holds the
            string form of a list of dicts with ``x``, ``y``, ``width`` and
            ``height`` keys. ``x``/``y`` are treated as the box's minimum
            corner.
        img_width: Image width used to normalise ``x`` and ``width``.
        img_height: Image height used to normalise ``y`` and ``height``.

    Raises:
        ValueError, SyntaxError: If ``item.annotations`` is not a valid
            Python literal.
    """
    import ast  # function-scope import keeps this definition self-contained

    if item.annotations == "[]":
        return

    # literal_eval accepts only Python literals, so a malformed or hostile CSV
    # cell cannot execute arbitrary code the way eval() would.
    boxes = ast.literal_eval(item.annotations)

    label_lines = []
    for box in boxes:
        x = box['x'] / img_width
        y = box['y'] / img_height
        w = box['width'] / img_width
        h = box['height'] / img_height

        # Some boxes extend past the image boundary; shrink them so that
        # x + w and y + h never exceed 1.
        w = min(1 - x, w)
        h = min(1 - y, h)

        xc = x + w / 2
        yc = y + h / 2
        label_lines.append(f"0 {xc} {yc} {w} {h}")
    path.write_text("\n".join(label_lines))


# Explicit mapping from split name to its DataFrame. This replaces a lookup
# through locals(), which silently yields None for a missing name and stops
# working as soon as the code is moved inside a function.
splits = {"train": train_df, "val": val_df, "test": test_df}

for mode, df in splits.items():
    # Images and labels live in parallel folders named after the split.
    image_folder = Path(dest_dir) / "images" / f"{mode}2017"
    image_folder.mkdir(parents=True, exist_ok=True)

    label_folder = Path(dest_dir) / "labels" / f"{mode}2017"
    label_folder.mkdir(parents=True, exist_ok=True)

    for _, item in tqdm(df.iterrows(), total=len(df)):
        # Prefix the frame number with the video id so that file names stay
        # unique once frames from all videos share one folder.
        file_name = f"{item.video_id}_{item.video_frame}"
        shutil.copy(
            f"{reef_dataset_root}/train_images/video_{item.video_id}/{item.video_frame}.jpg",
            image_folder / f"{file_name}.jpg",
        )
        create_txt_file(label_folder / f"{file_name}.txt", item)

  0%|          | 0/2776 [00:00<?, ?it/s]

  0%|          | 0/6708 [00:00<?, ?it/s]

  0%|          | 0/6708 [00:00<?, ?it/s]