Skip to content

Commit

Permalink
[DAR-2639][External] Add data & tests for in-platform model training (#…
Browse files Browse the repository at this point in the history
…872)

* Added model tests (WIP)

* Fixed path for video frames pulled with folders

* Added model training data & tests

* Added get_annotations() test (WIP)

* Fixed type issue for get_annotations()

* Re-added convert_xyxy_to_bounding_box()

* get_annotations() fixes

* Finalised test data

* Fixes for splitting video annotations

* Fix for video frames
  • Loading branch information
JBWilkie committed Jun 20, 2024
1 parent fff4f13 commit 6ee45c5
Show file tree
Hide file tree
Showing 11 changed files with 382 additions and 84 deletions.
9 changes: 7 additions & 2 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)
if force_slots:
return _download_all_slots_from_json_annotation(
Expand All @@ -235,6 +236,7 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)

return []
Expand Down Expand Up @@ -302,12 +304,15 @@ def _download_single_slot_from_json_annotation(
parent_path: Path,
annotation_path: Path,
video_frames: bool,
use_folders: bool = True,
) -> Iterable[Callable[[], None]]:
slot = annotation.slots[0]
generator = []

if video_frames and slot.type != "image":
video_path: Path = parent_path / annotation_path.stem
video_path: Path = parent_path / (
annotation_path.stem if not use_folders else Path(annotation.filename).stem
)
video_path.mkdir(exist_ok=True, parents=True)

# Indicates it's a long video and uses the segment and manifest
Expand Down Expand Up @@ -339,7 +344,7 @@ def _download_single_slot_from_json_annotation(
image_url = image["url"]
image_filename = image["file_name"]
suffix = Path(image_filename).suffix
stem = annotation_path.stem
stem = Path(annotation.filename).stem
filename = str(Path(stem + suffix))
image_path = parent_path / sanitize_filename(
filename or annotation.filename
Expand Down
14 changes: 14 additions & 0 deletions darwin/dataset/remote_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,20 @@ def split_video_annotations(self, release_name: str = "latest") -> None:
for frame_annotation in frame_annotations:
annotation = self._build_image_annotation(frame_annotation, self.team)

# When splitting into frames, we need to read each frame individually
# Because we use the source name suffix, we need to adjust this to .png here
current_stem = Path(
annotation["item"]["slots"][0]["source_files"][0]["file_name"]
).stem
annotation["item"]["slots"][0]["source_files"][0]["file_name"] = (
current_stem + ".png"
)
# We also need to account for the folder that this function creates
item_name = annotation["item"]["name"].split("/")[0]
if annotation["item"]["path"] == "/":
annotation["item"]["path"] += item_name
else:
annotation["item"]["path"] += "/" + item_name
video_frame_annotations_path = annotations_path / annotation_file.stem
video_frame_annotations_path.mkdir(exist_ok=True, parents=True)

Expand Down
94 changes: 48 additions & 46 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,11 @@ def create_bbox_object(obj, box_mode, classes=None):
def get_annotations(
dataset_path: PathLike,
partition: Optional[str] = None,
split_type: Optional[str] = "random",
annotation_format: str = "coco",
split: Optional[str] = "default",
split_type: Optional[str] = None,
annotation_type: str = "polygon",
release_name: Optional[str] = None,
annotation_format: str = "coco",
ignore_inconsistent_examples: bool = False,
) -> Iterator[Dict[str, Any]]:
"""
Expand All @@ -415,17 +415,17 @@ def get_annotations(
dataset_path : PathLike
Path to the location of the dataset on the file system.
partition : Optional[str], default: None
Selects one of the partitions ``[train, val, test]``.
Selects one of the partitions ``[train, val, test, None]``. If not specified, all annotations are returned.
split_type : Optional[str], default: "random"
Heuristic used to do the split ``[random, stratified]``. If not specified, random is used.
annotation_format : str
Re-formatting of the annotation when loaded ``[coco, darwin]``..
split : Optional[str], default: "default"
Selects the split that defines the percentages used (use 'default' to select the default split).
split_type : Optional[str], default: None
Heuristic used to do the split ``[random, stratified, None]``.
annotation_type : str, default: "polygon"
The type of annotation classes ``[tag, bounding_box, polygon]``.
release_name : Optional[str], default: None
Version of the dataset.
annotation_format : Optional[str], default: "coco"
Re-formatting of the annotation when loaded ``[coco, darwin]``.
ignore_inconsistent_examples : bool, default: False
Ignore examples for which we have annotations, but either images are missing,
or more than one images exist for the same annotation.
Expand Down Expand Up @@ -467,18 +467,18 @@ def get_annotations(
)

if partition:
stems = _get_stems_from_split(
release_path, split, split_type, annotation_type, partition
annotation_filepaths = _get_annotation_filepaths_from_split(
release_path, annotation_type, partition, split_type, split=split
)
else:
stems = get_annotation_files_from_dir(annotations_dir)
annotation_filepaths = get_annotation_files_from_dir(annotations_dir)

(
images_paths,
annotations_paths,
invalid_annotation_paths,
) = _map_annotations_to_images(
stems, annotations_dir, images_dir, ignore_inconsistent_examples
annotation_filepaths, images_dir, ignore_inconsistent_examples
)

print(f"Found {len(invalid_annotation_paths)} invalid annotations")
Expand All @@ -505,55 +505,57 @@ def _validate_inputs(
Validates the input parameters for partition, split_type, and annotation_type.
Args:
partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None.
split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None.
partition (str, None): Dataset partition. Should be 'train', 'val', 'test', or None.
split_type (str, None): Type of dataset split. Can be 'random' or 'stratified'.
annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
Raises:
ValueError: If the input parameters do not match the expected values.
"""
if partition not in ["train", "val", "test", None]:
raise ValueError("partition should be either 'train', 'val', 'test', or None")
if split_type not in ["random", "stratified", None]:
raise ValueError("split_type should be either 'random', 'stratified', or None")
raise ValueError("partition should be either 'train', 'val', 'test', or 'None'")
if split_type not in ["random", "stratified"]:
raise ValueError("split_type should be either 'random', or 'stratified'")
if annotation_type not in ["tag", "polygon", "bounding_box"]:
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)


def _get_stems_from_split(
def _get_annotation_filepaths_from_split(
release_path: Path,
split: str,
split_type: Union[str, None],
annotation_type: str,
partition: Union[str, None],
) -> Generator:
partition: str,
split_type: str,
split: Optional[str] = "default",
) -> Generator[str, None, None]:
"""
Determines the file stems based on the dataset split and other parameters.
Determines the filpaths based on the dataset split and other parameters.
Args:
release_path (Path): Path to the dataset release.
split (str): Dataset split identifier.
split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None.
annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None.
release_path : Path
Path to the dataset release.
annotation_type : str
Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
partition : str
Dataset partition. Should be 'train', 'val', 'test'.
split_type : str
Type of dataset split. Can be 'random' or 'stratified'.
split : Optional[str]
Dataset split identifier.
Returns:
Generator[str]: File stems for the dataset.
Generator: [str, None, None]
Filepaths for the dataset.
Raises:
ValueError: If the split_type is invalid.
FileNotFoundError: If the dataset partition file is not found.
"""
if split_type is None:
split_file = f"{partition}.txt"
elif split_type == "random":
if split_type == "random":
split_file = f"{split_type}_{partition}.txt"
elif split_type == "stratified":
split_file = f"{split_type}_{annotation_type}_{partition}.txt"
else:
raise ValueError(f"Invalid split_type ({split_type})")

split_path: Path = release_path / "lists" / str(split) / split_file

Expand All @@ -567,16 +569,15 @@ def _get_stems_from_split(


def _map_annotations_to_images(
stems: List[str],
annotations_dir: Path,
annotation_filepaths: Generator[str, None, None],
images_dir: Path,
ignore_inconsistent_examples: bool,
) -> Tuple[List[Path], List[Path], List[Path]]:
"""
Maps annotations to their corresponding images based on the file stems.
Args:
stems (List[str]): List of file stems.
annotation_filepaths (Generator[str, None, None]): List of annotation filepaths.
annotations_dir (Path): Directory containing annotation files.
images_dir (Path): Directory containing image files.
ignore_inconsistent_examples (bool): Flag to determine if inconsistent examples should be ignored.
Expand All @@ -591,14 +592,14 @@ def _map_annotations_to_images(
annotations_paths = []
invalid_annotation_paths = []
with_folders = any(item.is_dir() for item in images_dir.iterdir())
for annotation_path in get_annotation_files_from_dir(annotations_dir):
for annotation_path in annotation_filepaths:
darwin_json = stream_darwin_json(Path(annotation_path))
image_path = get_image_path_from_stream(
darwin_json, images_dir, Path(annotation_path), with_folders
)
if image_path.exists():
images_paths.append(image_path)
annotations_paths.append(annotation_path)
annotations_paths.append(Path(annotation_path))
continue
else:
if ignore_inconsistent_examples:
Expand All @@ -618,7 +619,7 @@ def _load_and_format_annotations(
annotation_format: str,
annotation_type: str,
classes: List[str],
) -> Generator:
) -> Generator[str, None, None]:
"""
Loads and formats annotations based on the specified format and type.
Expand Down Expand Up @@ -654,7 +655,7 @@ def _load_and_format_annotations(
)
elif annotation_format == "darwin":
for annotation_path in annotations_paths:
record = attempt_decode(annotation_path)
record = attempt_decode(Path(annotation_path))
yield record


Expand Down Expand Up @@ -795,16 +796,17 @@ def compute_distributions(
)
if not split_file.exists():
split_file = split_path / f"random_{partition}.txt"
stems: List[str] = [e.rstrip("\n\r") for e in split_file.open()]

for stem in stems:
if not stem.endswith(".json"):
stem = f"{stem}.json"
annotation_path: Path = annotations_dir / stem
annotation_filepaths: List[str] = [
e.rstrip("\n\r") for e in split_file.open()
]
for annotation_filepath in annotation_filepaths:
if not annotation_filepath.endswith(".json"):
annotation_filepath = f"{annotation_filepath}.json"
annotation_path: Path = annotations_dir / annotation_filepath
annotation_file: Optional[dt.AnnotationFile] = parse_path(
annotation_path
)

if annotation_file is None:
continue

Expand Down
38 changes: 36 additions & 2 deletions darwin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,18 @@ def get_image_path_from_stream(
Path to the image file.
"""
try:
item_name_stem = Path(darwin_json["item"]["name"]).stem
source_name_suffix = Path(
darwin_json["item"]["slots"][0]["source_files"][0]["file_name"]
).suffix
local_file_name = Path(item_name_stem + source_name_suffix)
if not with_folders:
return images_dir / Path(darwin_json["item"]["name"])
return images_dir / local_file_name
else:
return (
images_dir
/ (Path(darwin_json["item"]["path"].lstrip("/\\")))
/ Path(darwin_json["item"]["name"])
/ local_file_name
)
except OSError:
# Load in the JSON as normal
Expand Down Expand Up @@ -1300,6 +1305,35 @@ def convert_polygons_to_sequences(
return sequences


def convert_xyxy_to_bounding_box(box: List[Union[int, float]]) -> "dt.BoundingBox":
    """
    Converts a list of xy coordinates representing a bounding box into a dictionary.

    This is used by in-platform model training.

    Parameters
    ----------
    box : List[Union[int, float]]
        List of coordinates in the format ``[x1, y1, x2, y2]``.

    Returns
    -------
    BoundingBox
        Bounding box in the format ``{x: x1, y: y1, w: width, h: height}``.

    Raises
    ------
    ValueError
        If ``box`` does not contain exactly 4 numeric values.
    """
    # Validate every coordinate, not only the first one, so malformed input
    # such as [1, "a", 2, 3] raises the documented ValueError instead of a
    # TypeError from the subtraction below. A wrong-length list previously
    # raised ValueError via tuple unpacking, so the explicit length check
    # keeps the exception type callers already handle.
    if len(box) != 4 or any(not isinstance(coord, (int, float)) for coord in box):
        raise ValueError("Unknown input format")

    x1, y1, x2, y2 = box
    return {"x": x1, "y": y1, "w": x2 - x1, "h": y2 - y1}


def convert_polygons_to_mask(
polygons: List, height: int, width: int, value: Optional[int] = 1
) -> np.ndarray:
Expand Down
Loading

0 comments on commit 6ee45c5

Please sign in to comment.