In [2]:
from pathlib import Path
import cv2
import shutil

## Specify the source and destination folders
- `dataset_dir` is the source (input) dataset
- `kitti_dst_dir` is the destination (output) dataset

In [None]:

# Source (input) YOLO dataset and destination (output) KITTI dataset, given relative to "~".
dataset_dir = "~/Documents/datasets/tao-experiments/items/3_itemdetect_data_split/train/3_itemdetect_evenly_divided"
kitti_dst_dir = "~/PycharmProjects/TAO_Toolkit/CV/data/item_detection/training"

# Expand "~" and resolve to absolute paths; these Path objects are what the later cells use.
dataset = Path(dataset_dir).expanduser().resolve()
kitti = Path(kitti_dst_dir).expanduser().resolve()

print(f"Dataset location: {dataset}")
print(f"destination location: {kitti}")

In [None]:
# Collect the YOLO inputs: images and label files sit one directory level below the dataset root.
src_images = [*dataset.glob("*/*.jpg")]
src_labels = [*dataset.glob("*/*.txt")]

# KITTI output layout: <kitti>/image and <kitti>/label, created if they do not exist yet.
dst_images_d = kitti / "image"
dst_labels_d = kitti / "label"
for _out_dir in (dst_images_d, dst_labels_d):
    _out_dir.mkdir(parents=True, exist_ok=True)

print(f"Dataset has {len(src_images)} images")
print(f"Dataset has {len(src_labels)} labels")

### Work on yolo -> kitti format conversion

```text
Values    Name      Description
----------------------------------------------------------------------------
   1    type         Describes the type of object: 'Car', 'Van', 'Truck',
                     'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
                     'Misc' or 'DontCare'
   1    truncated    Float from 0 (non-truncated) to 1 (truncated), where
                     truncated refers to the object leaving image boundaries
   1    occluded     Integer (0,1,2,3) indicating occlusion state:
                     0 = fully visible, 1 = partly occluded
                     2 = largely occluded, 3 = unknown
   1    alpha        Observation angle of object, ranging [-pi..pi]
   4    bbox         2D bounding box of object in the image (0-based index):
                     contains left, top, right, bottom pixel coordinates
   3    dimensions   3D object dimensions: height, width, length (in meters)
   3    location     3D object location x,y,z in camera coordinates (in meters)
   1    rotation_y   Rotation ry around Y-axis in camera coordinates [-pi..pi]
   1    score        Only for results: Float, indicating confidence in
                     detection, needed for p/r curves, higher is better.
```

```text
Example:
Car 0.27 0 2.50 862.65 129.39 1241.00 304.96 1.73 1.74 4.71 5.50 1.30 8.19 3.07
Car 0.68 3 -0.76 1184.97 141.54 1241.00 187.84 1.52 1.60 4.42 22.39 0.48 24.57 -0.03
Car 0.00 1 1.73 346.64 175.63 449.93 248.90 1.58 1.76 4.18 -5.13 1.67 17.86 1.46
Car 0.00 0 1.75 420.44 170.72 540.83 256.12 1.65 1.88 4.45 -2.78 1.64 16.30 1.58
Car 0.00 0 -0.35 815.59 143.96 962.82 198.54 1.90 1.78 4.72 10.19 0.90 26.65 0.01
Car 0.00 1 -2.09 966.10 144.74 1039.76 182.96 1.80 1.65 3.55 19.49 0.49 35.99 -1.59
Van 0.00 2 -2.07 1084.26 132.74 1173.25 177.89 2.11 1.75 4.31 26.02 0.24 36.41 -1.45
Car 0.00 2 -2.13 1004.98 144.16 1087.13 178.96 1.64 1.70 3.91 21.91 0.30 36.47 -1.59
Car 0.00 2 1.77 407.73 178.44 487.07 230.28 1.55 1.71 4.50 -5.35 1.76 24.13 1.55
Car 0.00 1 1.45 657.19 166.33 702.65 198.71 1.50 1.71 4.44 3.39 1.22 35.96 1.55
Car 0.00 1 -1.46 599.30 171.76 631.96 197.12 1.58 1.71 3.75 0.39 1.54 47.31 -1.45
Car 0.00 0 -1.02 557.79 165.74 591.61 181.27 1.66 1.65 4.45 -3.89 0.91 80.12 -1.07
```

### Update the mapping table (class_id --> class_name)
### Start the conversion yolo --> kitti

In [5]:
# Mapping from the YOLO class id (the string found in the label file) to the KITTI class name.
id2name = {'0': 'item'}


def yolo2kitti(class_id, center_x, center_y, w, h, width, height):
    """Convert one normalized YOLO box into a KITTI label line.

    Args:
        class_id: Key into ``id2name`` selecting the KITTI class name.
        center_x, center_y: Box centre as fractions of the image width/height.
        w, h: Box width/height as fractions of the image width/height.
        width, height: Image size in pixels.

    Returns:
        The KITTI line (class name, pixel corners, zeros for every other
        field), or ``""`` after printing a warning when the clamped box is
        empty or inverted.

    Raises:
        KeyError: If the box is valid but ``class_id`` is not in ``id2name``.
    """
    half_w = float(w) * width / 2
    half_h = float(h) * height / 2
    cx = float(center_x) * width
    cy = float(center_y) * height

    # Corners are rounded to two decimals and clamped to [1, size - 1].
    left = max(round(cx - half_w, 2), 1.0)
    top = max(round(cy - half_h, 2), 1.0)
    right = min(round(cx + half_w, 2), width - 1.0)
    bottom = min(round(cy + half_h, 2), height - 1.0)

    box_is_valid = top < bottom <= height and left < right <= width
    if not box_is_valid:
        print(f"warning: x_min: {left}, x_max: {right}, y_min: {top}, y_max: {bottom}")
        return ""
    return f"{id2name[class_id]} 0 0 0 {left} {top} {right} {bottom} 0 0 0 0 0 0 0"

### Option 1: Fixed WIDTH and HEIGHT
1. Convert the labels

In [None]:
# Fixed target resolution in pixels used by the Option 1 cells.
WIDTH = 960
HEIGHT = 544
print(f"Expected images resolution (width x height) is {WIDTH} x {HEIGHT}")

In [None]:
# Option 1, step 1: convert every YOLO label file to KITTI, scaling the
# normalized boxes to the fixed WIDTH x HEIGHT resolution.
for _src_label in src_labels:
    kitti_str_list = []
    with open(_src_label, 'r') as f:
        for _line_no, yolo_str in enumerate(f, start=1):
            # split() with no argument tolerates tabs, repeated spaces and the
            # trailing newline; split(" ") would yield empty fields and crash.
            c = yolo_str.split()
            if not c:
                continue  # blank line
            if len(c) < 5:
                print(f"warning: skipping malformed line {_line_no} in {_src_label}: {yolo_str.strip()!r}")
                continue
            class_id = c[0]
            x_center, y_center, w, h = [float(v) for v in c[1:5]]
            kitti_str = yolo2kitti(class_id, x_center, y_center, w, h, WIDTH, HEIGHT)
            if len(kitti_str) > 0:
                kitti_str_list.append(kitti_str)

    # A source file that yields no valid box produces no KITTI label file.
    if kitti_str_list:
        _dst_label = dst_labels_d.joinpath(_src_label.name)
        with open(_dst_label, 'w') as f:
            kitti_content = "\n".join(kitti_str_list)
            f.write(kitti_content)


dst_labels = list(dst_labels_d.glob("*.txt"))
print(f"destination label counts: {len(dst_labels)}")



2. Convert the images (resize to WIDTH x HEIGHT; images without a converted label are skipped)

In [None]:
# Option 1, step 2: resize every image that has a converted label to WIDTH x HEIGHT.
for _src_image in src_images:
    _dst_image = dst_images_d.joinpath(_src_image.name)
    _dst_label = dst_labels_d.joinpath(f"{_src_image.stem}.txt")
    if not _dst_label.is_file():
        print(f"Warning: cannot find file {_dst_label}")
        continue

    frame = cv2.imread(str(_src_image))
    if frame is None:
        # cv2.imread reports an unreadable/corrupt file by returning None rather than
        # raising; without this check cv2.resize would abort the whole loop.
        print(f"Warning: cannot read image {_src_image}")
        continue

    resized = cv2.resize(frame, (WIDTH, HEIGHT))
    # cv2.imwrite returns False when the file could not be written.
    if not cv2.imwrite(str(_dst_image), resized):
        print(f"Warning: cannot write image {_dst_image}")

dst_images = list(dst_images_d.glob("*.jpg"))

print(f"images: {len(dst_images)}")



### Option 2: Keep the original WIDTH and HEIGHT

In [6]:
# Option 2: copy each source image into the KITTI image folder without resizing.
for _src_image in src_images:
    _dst_image = dst_images_d / _src_image.name
    shutil.copy2(_src_image, _dst_image)

dst_images = [*dst_images_d.glob("*.jpg")]
print(f"destination images counts: {len(dst_images)}")

In [None]:
# Option 2: convert every YOLO label file to KITTI using the resolution of the
# matching image already copied into the destination image folder.
for _src_label in src_labels:
    _dst_image = str(dst_images_d.joinpath(f"{_src_label.stem}.jpg"))
    frame = cv2.imread(_dst_image)
    if frame is None:
        # cv2.imread returns None when the image is missing or unreadable (e.g. a
        # .txt file with no matching .jpg); skip it instead of failing on frame.shape.
        print(f"Warning: cannot read image {_dst_image}, skipping {_src_label}")
        continue
    height, width = frame.shape[:2]

    kitti_str_list = []
    with open(_src_label, 'r') as f:
        for _line_no, yolo_str in enumerate(f, start=1):
            # split() with no argument tolerates tabs, repeated spaces and the
            # trailing newline; split(" ") would yield empty fields and crash.
            c = yolo_str.split()
            if not c:
                continue  # blank line
            if len(c) < 5:
                print(f"warning: skipping malformed line {_line_no} in {_src_label}: {yolo_str.strip()!r}")
                continue
            class_id = c[0]
            x_center, y_center, w, h = [float(v) for v in c[1:5]]
            kitti_str = yolo2kitti(class_id, x_center, y_center, w, h, width, height)
            if len(kitti_str) > 0:
                kitti_str_list.append(kitti_str)

    # A source file that yields no valid box produces no KITTI label file.
    if kitti_str_list:
        _dst_label = dst_labels_d.joinpath(_src_label.name)
        with open(_dst_label, 'w') as f:
            kitti_content = "\n".join(kitti_str_list)
            f.write(kitti_content)


dst_labels = list(dst_labels_d.glob("*.txt"))
print(f"destination label counts: {len(dst_labels)}")

## TODO: Visualize the converted dataset