## Creating the dataset step-by-step
### Author: Vilem Gottwald

#### This file contains the whole process of the dataset creation.


In [1]:
import datetime
import json
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from pyntcloud import PyntCloud

# Project root is the parent of the notebook's start directory
# (`_dh` is IPython's directory history; its first entry is where the kernel started).
PROJECT_ROOT = Path(globals()['_dh'][0]).resolve().parent

# Add project root to path for imports. This has to happen BEFORE any project-local
# import below, otherwise a fresh kernel cannot resolve them. The membership check
# keeps re-running this cell from appending duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Project-local helpers (presumably living under PROJECT_ROOT, like the other
# project packages imported later in the notebook)
from modules import dt2str, listdir_paths

# Path to the directory containing all data
DATA_PATH = PROJECT_ROOT / "data"


### Parse radar data and create pandas Dataframes

In [2]:
from modules.to_dataframe import create_dataframe
from radar_parser import RadarRecordingsParser

# Locations of the cached dataframes and of the raw radar recordings
PANDAS_SAVED_DIR = DATA_PATH / 'parsing' / 'pandas_saved'
POINTS_PKL_PATH = PANDAS_SAVED_DIR / 'radar_points.pkl'
TARGETS_PKL_PATH = PANDAS_SAVED_DIR / 'radar_targets.pkl'
RADAR_RECORDINGS_DIR = DATA_PATH / 'radar_recordings'/ 'parsing' / 'records' / 'binaries'

# Parsing is only needed when at least one of the two pickles is missing
if not (os.path.exists(POINTS_PKL_PATH) and os.path.exists(TARGETS_PKL_PATH)):
    # Parse radar data binaries
    parser = RadarRecordingsParser()
    points_list, targets_list, frame_times = parser.parse(RADAR_RECORDINGS_DIR)

    # Flatten the per-frame lists into one numpy array of points and one of targets
    np_points = np.array([tuple(pt) for frame_points in points_list for pt in frame_points])
    np_targets = np.array([tuple(tgt) for frame_targets in targets_list for tgt in frame_targets])

    # Build the dataframes (the helper also receives the directory of the pickle cache)
    df_points, df_targets = create_dataframe(np_points, np_targets, frame_times, str(PANDAS_SAVED_DIR))
    print('Dataframes created by parsing radar data.')

else:
    # Load dataframes from the previously saved pickle files
    df_points = pd.read_pickle(POINTS_PKL_PATH)
    df_targets = pd.read_pickle(TARGETS_PKL_PATH)

    # Rebuild the frame -> timestamp mapping from the first point of every frame
    frame_times = df_points.groupby('frame')['timestamp'].first().to_dict()
    print('Dataframes loaded from previously created pickle files.')

df_points.head()

Dataframes loaded from previously created pickle files.


Unnamed: 0,range,azimuth,elevation,doppler,targetID,snr,noise,frame,x,y,z,velocity,timestamp,idx,pointID,y_orig,total_seconds
5,50.831078,0.125469,0.047396,19.169559,7,107,535,11012,6.353885,50.350639,3.132035,19.343167,2023-01-19 14:04:04.903396,0,11012000,50.350639,1605844.903
6,51.197514,0.12537,0.02579,19.169559,7,104,535,11012,6.399689,50.638629,2.019823,19.327629,2023-01-19 14:04:04.903396,1,11012001,50.638629,1605844.903
7,50.831078,0.12539,0.031309,19.53125,7,126,532,11012,6.353885,50.297478,2.324974,19.695455,2023-01-19 14:04:04.903396,2,11012002,50.297478,1605844.903
8,51.197514,0.125339,0.013446,19.53125,7,116,534,11012,6.399689,50.585009,1.397035,19.687458,2023-01-19 14:04:04.903396,3,11012003,50.585009,1605844.903
9,50.831078,0.125345,0.016722,19.892939,7,98,543,11012,6.353885,50.237841,1.593992,20.053045,2023-01-19 14:04:04.903396,4,11012004,50.237841,1605844.903


In [6]:
from modules.frame_times import datatime_differences

# Inspect the time differences derived from the frame timestamps; the helper prints
# the summary shown below. `x[0]` is consumed in the next cell as the sequence of
# differences (presumably in seconds, given the x1000 conversion to ms there).
x = datatime_differences(frame_times)

Max difference: 257.792176
Min difference: 0.000318
Mean difference: 0.056750158919483235
[[87 Timestamp('2023-01-19 13:54:58.685702') 88
  Timestamp('2023-01-19 13:54:58.734557')]
 [113 Timestamp('2023-01-19 13:54:59.984832') 114
  Timestamp('2023-01-19 13:55:00.023000')]
 [126 Timestamp('2023-01-19 13:55:00.623193') 127
  Timestamp('2023-01-19 13:55:00.672040')]
 ...
 [45012 Timestamp('2023-01-19 14:32:28.663124') 45013
  Timestamp('2023-01-19 14:32:28.714179')]
 [45022 Timestamp('2023-01-19 14:32:29.163918') 45023
  Timestamp('2023-01-19 14:32:29.215333')]
 [45036 Timestamp('2023-01-19 14:32:29.865777') 45037
  Timestamp('2023-01-19 14:32:29.914737')]]
4045


In [20]:
# Scale every difference by 1000 (to milliseconds) and order them ascending
diffs_ms = [el * 1000 for el in x[0]]
diffs_ms.sort()
diffs_ms

[0.318,
 27.847,
 38.168,
 41.406,
 42.486999999999995,
 46.584,
 46.848,
 47.132,
 47.347,
 47.354,
 47.446000000000005,
 47.471,
 47.480000000000004,
 47.516000000000005,
 47.531,
 47.606,
 47.622,
 47.624,
 47.64,
 47.656,
 47.665,
 47.669999999999995,
 47.673,
 47.692,
 47.714,
 47.727,
 47.739999999999995,
 47.759,
 47.766999999999996,
 47.78,
 47.805,
 47.813,
 47.813,
 47.82,
 47.821000000000005,
 47.835,
 47.835,
 47.84,
 47.841,
 47.841,
 47.845,
 47.859,
 47.861000000000004,
 47.864,
 47.864999999999995,
 47.867,
 47.869,
 47.878,
 47.891999999999996,
 47.906,
 47.909,
 47.925000000000004,
 47.926,
 47.929,
 47.931000000000004,
 47.940999999999995,
 47.944,
 47.955999999999996,
 47.977,
 47.978,
 47.979,
 47.984,
 47.989999999999995,
 47.993,
 48.0,
 48.006,
 48.006,
 48.006,
 48.007,
 48.008,
 48.023,
 48.023999999999994,
 48.027,
 48.033,
 48.034,
 48.04,
 48.043,
 48.044000000000004,
 48.046,
 48.051,
 48.051,
 48.052,
 48.055,
 48.055,
 48.055,
 48.059,
 48.068,
 48.068,


In [None]:
# Same millisecond differences, largest first, to surface the longest gaps
sorted(diffs_ms, reverse=True)

### Generate point cloud .ply files that group frames in 500 ms time window

In [None]:
def generate_pointclouds(save_dir: str, milliseconds_span: int = 500, overwrite_existing: bool = False) -> None:
    """
    Generates pointclouds from grouped radar frames.

    For every timestamp in the notebook-level ``frame_times`` mapping, the points of the
    notebook-level ``df_points`` recorded within the preceding ``milliseconds_span``
    (both ends inclusive) are gathered. Points from earlier timestamps are shifted along
    ``y`` by ``velocity * elapsed seconds``, points outside ``0 < y < 85`` are dropped and
    the remainder is written as a text ``.ply`` file named after the timestamp.
    Frames left without any point are skipped and counted.

    :param save_dir: Path to directory where pointclouds will be saved.
    :param milliseconds_span: Time span in milliseconds for grouping.
    :param overwrite_existing: If False, a non-empty save_dir is rejected; if True, that check is skipped.

    :raises ValueError: If save_dir is a file, or already contains files and overwrite_existing is False.

    :return: None
    """
    # Check if save directory exists
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    elif os.path.isfile(save_dir):
        raise ValueError('Save directory is a file')
    elif not overwrite_existing and os.listdir(save_dir):
        raise ValueError('Save directory already contains files')

    # Column selections are loop-invariant, so they are built once
    in_columns = ['x', 'y', 'z', 'snr', 'noise', 'velocity', 'pointID', 'timestamp', 'y_orig', 'total_seconds']
    out_columns = ['x', 'y', 'z', 'snr', 'noise', 'velocity', 'y_orig', 'total_seconds', 'pointID']

    empty_count = 0
    total_points_after_grouping = 0
    TIME_SPAN = datetime.timedelta(milliseconds=milliseconds_span)
    last_frame_idx = len(frame_times) - 1

    # Iterate over all frames
    for curr_frame_idx, cur_timestamp in enumerate(frame_times.values()):

        # Normalise plain datetime objects to pandas Timestamps
        if isinstance(cur_timestamp, datetime.datetime):
            cur_timestamp = pd.Timestamp(cur_timestamp)

        # Select points inside the given time boundary
        time_boundary = cur_timestamp - TIME_SPAN
        boundary_mask = (df_points['timestamp'] >= time_boundary) & (df_points['timestamp'] <= cur_timestamp)
        frames_view = df_points[boundary_mask][in_columns].copy()

        # Shift points belonging to each earlier timestamp based on the time difference
        prev_timestamps = [f for f in frames_view['timestamp'].unique() if f != cur_timestamp]
        for prev_timestamp in prev_timestamps:
            delta_t_seconds = (cur_timestamp - prev_timestamp).total_seconds()

            # Shift points in y direction by the distance travelled since they were recorded
            frame_selection = frames_view['timestamp'] == prev_timestamp
            frames_view.loc[frame_selection, 'y'] += delta_t_seconds * frames_view.loc[frame_selection, 'velocity']

        # Crop shifted points and select only relevant columns
        y_clip_mask = (frames_view['y'] > 0.0) & (frames_view['y'] < 85.0)
        frames_view = frames_view[y_clip_mask][out_columns]

        if frames_view.empty:
            empty_count += 1
            continue

        total_points_after_grouping += len(frames_view)

        # Store points into ply objects
        cloud = PyntCloud(frames_view)

        # Create files from .ply objects
        save_path = os.path.join(save_dir, dt2str(cur_timestamp) + '.ply')
        cloud.to_file(save_path, as_text=True)

        print(f'\r{dt2str(cur_timestamp)}.ply ... {curr_frame_idx} / {last_frame_idx}', end='')

    nonempty_count = len(frame_times) - empty_count
    print(f'\rGenerated {nonempty_count} pointclouds. Skipped {empty_count} empty frames.')
    print(f'Total points after grouping: {total_points_after_grouping}')
    # The average is undefined when nothing was generated (no frames, or all of them empty)
    if nonempty_count:
        print(f'Average number of points per frame: {total_points_after_grouping / nonempty_count}')

# SAVE_DIR = str(DATA_PATH /'labeling' / 'pointclouds')
# generate_pointclouds(SAVE_DIR, milliseconds_span=500, overwrite_existing=True)

### Generate images for annotation with timestamps corresponding to the generated pointclouds

In [None]:
from modules import VideoFrameExporter

def generate_images(dst_dirpath: str, video_path: str) -> None:
    """
    Generates images from video frames.

    For every timestamp in the notebook-level ``frame_times`` mapping, the exporter is
    asked to save the matching video frame as ``<dt2str(timestamp)>.jpg`` in ``dst_dirpath``.
    A warning is printed when the saved image is more than 50 ms away from the requested
    timestamp. Export stops as soon as the exporter reports that the video has ended.

    :param dst_dirpath: Path to directory where images will be saved (created if missing).
    :param video_path: Path to video file.

    :return: None
    """
    # Create the destination directory up front, like generate_pointclouds does for its output
    os.makedirs(dst_dirpath, exist_ok=True)

    max_allowed_diff = datetime.timedelta(milliseconds=50)
    frame_exporter = VideoFrameExporter(video_path)

    last_frame_idx = len(frame_times) - 1
    successful_frames = 0
    for idx, timestamp in enumerate(frame_times.values()):
        print('\r', end='') # Clear terminal line

        # Get the exported frame image filepath
        frame_name = dt2str(timestamp)
        frame_path = os.path.join(dst_dirpath, frame_name) + ".jpg"

        # Export frame image
        success = frame_exporter.export(timestamp, frame_path)

        if success is None:
            # frame image not found - skipped
            pass

        elif success:
            # frame image found and saved
            successful_frames += 1

            # if delta is too big, print warning
            diff = abs(timestamp - frame_exporter.frame_time)
            if diff > max_allowed_diff:
                # Use the full duration in milliseconds; `.microseconds` alone is only the
                # sub-second component and would drop whole seconds from the report
                diff_ms = diff / datetime.timedelta(milliseconds=1)
                print(f"Warning {timestamp} saved with image {frame_exporter.frame_time}, time diff: {diff_ms} ")
        else:
            # video ended
            print(f"Video ended at {frame_exporter.frame_time}")
            break

        print(f'\r{frame_name}.jpg ... {idx} / {last_frame_idx}', end='')

    print(f'\nSuccessfully exported {successful_frames} images.')

# FRAME_IMAGES_DIR = str(DATA_PATH / 'labeling' / 'frames_images')
# VIDEO_PATH = str(DATA_PATH / 'labeling' / 'records' / 'video_converted' / '20230119T135459128.mp4')
# generate_images(FRAME_IMAGES_DIR, VIDEO_PATH)

### Generate rough bounding boxes using the detecting algorithm to speed up labeling

Bounding boxes are generated for each frame and exported as JSON files in the output format of the labeling tool.
The labeling tool is then used to correct these rough predictions.

In [None]:

from detection.detector import Clusterer
from modules import create_json, get_bounding_box


def generate_cluster_labels(
    points_dirpath, labels_dirpath, start_idx=0, end_idx=None):
    """
    Writes a label file with rough, unannotated cluster boxes for each point cloud file.

    Every selected file is loaded, clustered, and each non-negative cluster label is
    turned into a bounding box carrying the placeholder name "x".

    :param points_dirpath: Path to the directory containing the point cloud files.
    :param labels_dirpath: Path to the directory where the labels will be saved.
    :param start_idx: Index of the first point cloud file to process.
    :param end_idx: Index at which processing stops (exclusive); defaults to the number of files.

    :return: None
    """
    # All files found in the point cloud directory
    all_paths = listdir_paths(points_dirpath)

    # Without an explicit stop index, run through to the last file
    if end_idx is None:
        end_idx = len(all_paths)

    clusterer = Clusterer()

    for i, cloud_path in enumerate(all_paths[start_idx:end_idx], start_idx):

        # Unreadable files are reported and skipped
        try:
            points_df = PyntCloud.from_file(cloud_path).points
        except Exception as e:
            print(f'{cloud_path} resulted in error: {e}')
            continue

        # The last column of the clustering result holds the cluster label
        clustered = clusterer.cluster(points_df)
        found_labels = np.unique(clustered[:, -1])
        found_labels = found_labels[found_labels > -1]

        # One bounding box per cluster, computed from the first three columns
        bboxes = [
            get_bounding_box(clustered[clustered[:, -1] == lbl][:, :3], def_name="x")
            for lbl in found_labels
        ]

        # Write the labels json file for this point cloud
        ply_basename = os.path.basename(cloud_path)
        create_json(ply_basename, bboxes, points_dirpath, labels_dirpath)

        # Report progress
        print(f"\r {ply_basename} created! {i}/{end_idx - 1}", end="")

# PLY_FOLDER = str(DATA_PATH / 'labeling' / 'pointclouds')
# LABELS_FOLDER = str(DATA_PATH / 'labeling' / 'labels2')
# generate_cluster_labels(PLY_FOLDER, LABELS_FOLDER, end_idx=50)

### Labeling using labelCloud
Once labelCloud is installed via `pip install labelCloud`, it can be run from the terminal:

```console
foo@bar:~$ labelcloud
starting labelcloud...
```

### Count how many vehicles were labeled

In [None]:
from collections import Counter

def count_labeled_objects(labels_dirpath):
    """
    Tallies the object names found in the label JSON files and prints the totals.

    Files are visited in the order returned by listdir_paths. Counting stops at the
    first file whose objects all carry the placeholder name 'x'. Files that still
    contain an 'x' or 'New' object are reported but counted anyway.

    :param labels_dirpath: Path to the directory containing the JSON files.

    :return: None
    """
    label_paths = listdir_paths(labels_dirpath)
    totals = Counter()

    for frame_no, label_path in enumerate(label_paths):
        file_name = os.path.basename(label_path)

        # Read the object names stored in this label file
        with open(label_path) as fh:
            label_content = json.load(fh)
        names = [entry['name'] for entry in label_content['objects']]

        # Only placeholder names left: annotation has not reached this file
        if all(name == 'x' for name in names):
            print(f'Frame {frame_no} - {file_name} has no annotated bboxes, ending...')
            break

        # Report files in which some objects still carry a placeholder name
        if any(name in ('x', 'New') for name in names):
            print(f'Frame {frame_no} - {file_name} has unlabeled objects')

        totals.update(names)

    # Summary over all visited files
    print(f"\nTotal number of labeled objects: {sum(totals.values())}")
    for name, count in totals.items():
        print(f"{name}: {count}")

# LABELS_DIRPATH = str(DATA_PATH / 'labeling' / 'labels')
# count_labeled_objects(LABELS_DIRPATH)

### Generate dataset as numpy files from labels and points

In [None]:
from modules import get_contained_points_mask

# index of each column in the numpy array
_IDX = {'x': 0,
        'y': 1,
        'z': 2,
        'snr': 3,
        'noise': 4,
        'velocity': 5,
        'y_orig': 6,
        'total_seconds': 7,
        'point_id': 8,
        'object_id': 9,
        'class_id': 10,
}

# mapping from class name to class id
name2class_id = {'none': 0, 'car': 1, 'van': 2, 'box_truck': 3, 'truck': 4,}

# open(..., 'w') does not create missing parent directories, so make sure the
# dataset directory exists before writing the metadata files into it
(DATA_PATH / 'dataset').mkdir(parents=True, exist_ok=True)

# save the column index to json file for later use in other scripts
with open(str(DATA_PATH / 'dataset' / 'dataset_columns.json'), 'w') as f:
    json.dump(_IDX, f)

# save the class name -> class id mapping alongside it
with open(str(DATA_PATH / 'dataset' / 'class_ids.json'), 'w') as f:
    json.dump(name2class_id, f)

def generate_dataset(labels_dir, points_dir, output_dir, start_idx=0, end_idx=None):
    """
    Generates the dataset from the given labels and points directories.

    For every label JSON file the point cloud with the same base name is loaded, each point
    is tagged with the index and class id of the bounding box containing it, and the result
    is saved as ``<name>.npy`` with the columns described by ``_IDX``. Label files without a
    matching PLY file are skipped. Processing stops at the first frame that is not
    completely annotated (a point outside every box, or a box with an unknown class name).

    :param labels_dir: Path to the directory containing the JSON files.
    :param points_dir: Path to the directory containing the PLY files.
    :param output_dir: Path to the directory where the dataset will be saved (created if missing).
    :param start_idx: Index of the first frame to be processed.
    :param end_idx: Index at which processing stops (exclusive); defaults to the number of label files.

    :return: None
    """

    labels_filepaths = listdir_paths(labels_dir)

    if end_idx is None:
        end_idx = len(labels_filepaths)

    # np.save does not create missing directories
    os.makedirs(output_dir, exist_ok=True)

    point_columns = ['x', 'y', 'z', 'snr', 'noise', 'velocity', 'y_orig', 'total_seconds', 'pointID']

    for frame_idx, label_filepath in enumerate(labels_filepaths[start_idx:end_idx], start_idx):

        # Load the bboxes data from json
        with open(label_filepath) as json_file:
            label_data = json.load(json_file)

        # Get the name of the ply file from the label file
        points_filename = Path(label_filepath).with_suffix('.ply').name
        points_filepath = os.path.join(points_dir, points_filename)

        # Load points from ply file
        try:
            point_cloud = PyntCloud.from_file(points_filepath)
        except FileNotFoundError as e:
            print(f"PLY file for {e.filename} doesn't exist")
            continue

        # Bounding boxes of the frame and its points as a plain numpy array
        bboxes = label_data['objects']
        points = point_cloud.points[point_columns].to_numpy()

        # [object_id, class_id] per point; -1 means "not assigned"
        points_object_info = np.full((points.shape[0], 2), -1)

        for object_idx, bbox in enumerate(bboxes):
            mask = get_contained_points_mask(bbox, points)
            points_object_info[mask, 0] = object_idx
            # class names missing from the mapping (e.g. placeholders) stay -1
            points_object_info[mask, 1] = name2class_id.get(bbox['name'], -1)

        frame_name = Path(label_filepath).stem
        object_ids = points_object_info[:, 0]
        class_ids = points_object_info[:, 1]

        # Check if any points are outside bounding boxes.
        # object_id only stays -1 for a point that no bbox contained, so this has to be
        # evaluated per point rather than per column.
        if np.any(object_ids == -1):
            print(f'\nUnannotated point in {frame_name} - {frame_idx}, ending...')
            return

        # Check if frame is unannotated (every point is boxed, but no box has a known class)
        if np.all(class_ids == -1):
            print(f'\nUnannotated frame in {frame_name} - {frame_idx}, ending...')
            return

        # Check if any bboxes are unannotated
        if np.any(class_ids == -1):
            print(f'\nUnannotated bbox in {frame_name} - {frame_idx}, ending...')
            return

        # Add object info to points
        # [[x, y, z, snr, noise, velocity, y_orig, total_seconds, point_id, object_id, class_id], ... ]
        points_all = np.hstack((points, points_object_info))

        # correct datetime malformed by pyntcloud:
        # point_id // 1000 is the key of the point's frame in frame_times
        frame_ids = points_all[:, _IDX['point_id']] // 1000
        points_all[:, _IDX['total_seconds']] = [frame_times[frame_id].timestamp() for frame_id in frame_ids]

        # save as numpy array file
        save_filename = Path(label_filepath).with_suffix('.npy').name
        save_filepath = os.path.join(output_dir, save_filename)
        np.save(save_filepath, points_all)
        print(f"\r {save_filename} created! {frame_idx}/{end_idx - 1}", end="")


# LABELS_DIR = DATA_PATH / 'labeling' / 'labels'
# POINTS_DIR = DATA_PATH / 'labeling' / 'pointclouds'
# DATASET_DIR = DATA_PATH / 'dataset' / 'dataset_gt'
# generate_dataset(LABELS_DIR, POINTS_DIR, DATASET_DIR, start_idx=0, end_idx=10501)