Skip to content

Commit

Permalink
[DAR-2639][External] Add data & tests for in-platform model training (#…
Browse files Browse the repository at this point in the history
…872)

* Added model tests (WIP)

* Fixed path for video frames pulled with folders

* Added model training data & tests

* Added get_annotations() test (WIP)

* Fixed type issue for get_annotations()

* Re-added convert_xyxy_to_bounding_box()

* get_annotations() fixes

* Finalised test data

* Fixes for splitting video annotations

* Fix for video frames
  • Loading branch information
JBWilkie committed Jun 20, 2024
1 parent fff4f13 commit 6ee45c5
Show file tree
Hide file tree
Showing 11 changed files with 382 additions and 84 deletions.
9 changes: 7 additions & 2 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)
if force_slots:
return _download_all_slots_from_json_annotation(
Expand All @@ -235,6 +236,7 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)

return []
Expand Down Expand Up @@ -302,12 +304,15 @@ def _download_single_slot_from_json_annotation(
parent_path: Path,
annotation_path: Path,
video_frames: bool,
use_folders: bool = True,
) -> Iterable[Callable[[], None]]:
slot = annotation.slots[0]
generator = []

if video_frames and slot.type != "image":
video_path: Path = parent_path / annotation_path.stem
video_path: Path = parent_path / (
annotation_path.stem if not use_folders else Path(annotation.filename).stem
)
video_path.mkdir(exist_ok=True, parents=True)

# Indicates it's a long video and uses the segment and manifest
Expand Down Expand Up @@ -339,7 +344,7 @@ def _download_single_slot_from_json_annotation(
image_url = image["url"]
image_filename = image["file_name"]
suffix = Path(image_filename).suffix
stem = annotation_path.stem
stem = Path(annotation.filename).stem
filename = str(Path(stem + suffix))
image_path = parent_path / sanitize_filename(
filename or annotation.filename
Expand Down
14 changes: 14 additions & 0 deletions darwin/dataset/remote_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,20 @@ def split_video_annotations(self, release_name: str = "latest") -> None:
for frame_annotation in frame_annotations:
annotation = self._build_image_annotation(frame_annotation, self.team)

# When splitting into frames, we need to read each frame individually
# Because we use the source name suffix, we need to adjust this to .png here
current_stem = Path(
annotation["item"]["slots"][0]["source_files"][0]["file_name"]
).stem
annotation["item"]["slots"][0]["source_files"][0]["file_name"] = (
current_stem + ".png"
)
# We also need to account for the folder that this function creates
item_name = annotation["item"]["name"].split("/")[0]
if annotation["item"]["path"] == "/":
annotation["item"]["path"] += item_name
else:
annotation["item"]["path"] += "/" + item_name
video_frame_annotations_path = annotations_path / annotation_file.stem
video_frame_annotations_path.mkdir(exist_ok=True, parents=True)

Expand Down
94 changes: 48 additions & 46 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,11 @@ def create_bbox_object(obj, box_mode, classes=None):
def get_annotations(
dataset_path: PathLike,
partition: Optional[str] = None,
split_type: Optional[str] = "random",
annotation_format: str = "coco",
split: Optional[str] = "default",
split_type: Optional[str] = None,
annotation_type: str = "polygon",
release_name: Optional[str] = None,
annotation_format: str = "coco",
ignore_inconsistent_examples: bool = False,
) -> Iterator[Dict[str, Any]]:
"""
Expand All @@ -415,17 +415,17 @@ def get_annotations(
dataset_path : PathLike
Path to the location of the dataset on the file system.
partition : Optional[str], default: None
Selects one of the partitions ``[train, val, test]``.
Selects one of the partitions ``[train, val, test, None]``. If not specified, all annotations are returned.
split_type : Optional[str], default: "random"
Heuristic used to do the split ``[random, stratified]``. If not specified, random is used.
annotation_format : str
Re-formatting of the annotation when loaded ``[coco, darwin]``..
split : Optional[str], default: "default"
Selects the split that defines the percentages used (use 'default' to select the default split).
split_type : Optional[str], default: None
Heuristic used to do the split ``[random, stratified, None]``.
annotation_type : str, default: "polygon"
The type of annotation classes ``[tag, bounding_box, polygon]``.
release_name : Optional[str], default: None
Version of the dataset.
annotation_format : Optional[str], default: "coco"
Re-formatting of the annotation when loaded ``[coco, darwin]``.
ignore_inconsistent_examples : bool, default: False
Ignore examples for which we have annotations, but either images are missing,
or more than one images exist for the same annotation.
Expand Down Expand Up @@ -467,18 +467,18 @@ def get_annotations(
)

if partition:
stems = _get_stems_from_split(
release_path, split, split_type, annotation_type, partition
annotation_filepaths = _get_annotation_filepaths_from_split(
release_path, annotation_type, partition, split_type, split=split
)
else:
stems = get_annotation_files_from_dir(annotations_dir)
annotation_filepaths = get_annotation_files_from_dir(annotations_dir)

(
images_paths,
annotations_paths,
invalid_annotation_paths,
) = _map_annotations_to_images(
stems, annotations_dir, images_dir, ignore_inconsistent_examples
annotation_filepaths, images_dir, ignore_inconsistent_examples
)

print(f"Found {len(invalid_annotation_paths)} invalid annotations")
Expand All @@ -505,55 +505,57 @@ def _validate_inputs(
Validates the input parameters for partition, split_type, and annotation_type.
Args:
partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None.
split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None.
partition (str, None): Dataset partition. Should be 'train', 'val', 'test', or None.
split_type (str, None): Type of dataset split. Can be 'random' or 'stratified'.
annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
Raises:
ValueError: If the input parameters do not match the expected values.
"""
if partition not in ["train", "val", "test", None]:
raise ValueError("partition should be either 'train', 'val', 'test', or None")
if split_type not in ["random", "stratified", None]:
raise ValueError("split_type should be either 'random', 'stratified', or None")
raise ValueError("partition should be either 'train', 'val', 'test', or 'None'")
if split_type not in ["random", "stratified"]:
raise ValueError("split_type should be either 'random', or 'stratified'")
if annotation_type not in ["tag", "polygon", "bounding_box"]:
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)


def _get_stems_from_split(
def _get_annotation_filepaths_from_split(
release_path: Path,
split: str,
split_type: Union[str, None],
annotation_type: str,
partition: Union[str, None],
) -> Generator:
partition: str,
split_type: str,
split: Optional[str] = "default",
) -> Generator[str, None, None]:
"""
Determines the file stems based on the dataset split and other parameters.
Determines the filpaths based on the dataset split and other parameters.
Args:
release_path (Path): Path to the dataset release.
split (str): Dataset split identifier.
split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None.
annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None.
release_path : Path
Path to the dataset release.
annotation_type : str
Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
partition : str
Dataset partition. Should be 'train', 'val', 'test'.
split_type : str
Type of dataset split. Can be 'random' or 'stratified'.
split : Optional[str]
Dataset split identifier.
Returns:
Generator[str]: File stems for the dataset.
Generator: [str, None, None]
Filepaths for the dataset.
Raises:
ValueError: If the split_type is invalid.
FileNotFoundError: If the dataset partition file is not found.
"""
if split_type is None:
split_file = f"{partition}.txt"
elif split_type == "random":
if split_type == "random":
split_file = f"{split_type}_{partition}.txt"
elif split_type == "stratified":
split_file = f"{split_type}_{annotation_type}_{partition}.txt"
else:
raise ValueError(f"Invalid split_type ({split_type})")

split_path: Path = release_path / "lists" / str(split) / split_file

Expand All @@ -567,16 +569,15 @@ def _get_stems_from_split(


def _map_annotations_to_images(
stems: List[str],
annotations_dir: Path,
annotation_filepaths: Generator[str, None, None],
images_dir: Path,
ignore_inconsistent_examples: bool,
) -> Tuple[List[Path], List[Path], List[Path]]:
"""
Maps annotations to their corresponding images based on the file stems.
Args:
stems (List[str]): List of file stems.
annotation_filepaths (Generator[str, None, None]): List of annotation filepaths.
annotations_dir (Path): Directory containing annotation files.
images_dir (Path): Directory containing image files.
ignore_inconsistent_examples (bool): Flag to determine if inconsistent examples should be ignored.
Expand All @@ -591,14 +592,14 @@ def _map_annotations_to_images(
annotations_paths = []
invalid_annotation_paths = []
with_folders = any(item.is_dir() for item in images_dir.iterdir())
for annotation_path in get_annotation_files_from_dir(annotations_dir):
for annotation_path in annotation_filepaths:
darwin_json = stream_darwin_json(Path(annotation_path))
image_path = get_image_path_from_stream(
darwin_json, images_dir, Path(annotation_path), with_folders
)
if image_path.exists():
images_paths.append(image_path)
annotations_paths.append(annotation_path)
annotations_paths.append(Path(annotation_path))
continue
else:
if ignore_inconsistent_examples:
Expand All @@ -618,7 +619,7 @@ def _load_and_format_annotations(
annotation_format: str,
annotation_type: str,
classes: List[str],
) -> Generator:
) -> Generator[str, None, None]:
"""
Loads and formats annotations based on the specified format and type.
Expand Down Expand Up @@ -654,7 +655,7 @@ def _load_and_format_annotations(
)
elif annotation_format == "darwin":
for annotation_path in annotations_paths:
record = attempt_decode(annotation_path)
record = attempt_decode(Path(annotation_path))
yield record


Expand Down Expand Up @@ -795,16 +796,17 @@ def compute_distributions(
)
if not split_file.exists():
split_file = split_path / f"random_{partition}.txt"
stems: List[str] = [e.rstrip("\n\r") for e in split_file.open()]

for stem in stems:
if not stem.endswith(".json"):
stem = f"{stem}.json"
annotation_path: Path = annotations_dir / stem
annotation_filepaths: List[str] = [
e.rstrip("\n\r") for e in split_file.open()
]
for annotation_filepath in annotation_filepaths:
if not annotation_filepath.endswith(".json"):
annotation_filepath = f"{annotation_filepath}.json"
annotation_path: Path = annotations_dir / annotation_filepath
annotation_file: Optional[dt.AnnotationFile] = parse_path(
annotation_path
)

if annotation_file is None:
continue

Expand Down
38 changes: 36 additions & 2 deletions darwin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,18 @@ def get_image_path_from_stream(
Path to the image file.
"""
try:
item_name_stem = Path(darwin_json["item"]["name"]).stem
source_name_suffix = Path(
darwin_json["item"]["slots"][0]["source_files"][0]["file_name"]
).suffix
local_file_name = Path(item_name_stem + source_name_suffix)
if not with_folders:
return images_dir / Path(darwin_json["item"]["name"])
return images_dir / local_file_name
else:
return (
images_dir
/ (Path(darwin_json["item"]["path"].lstrip("/\\")))
/ Path(darwin_json["item"]["name"])
/ local_file_name
)
except OSError:
# Load in the JSON as normal
Expand Down Expand Up @@ -1300,6 +1305,35 @@ def convert_polygons_to_sequences(
return sequences


def convert_xyxy_to_bounding_box(box: List[Union[int, float]]) -> "dt.BoundingBox":
    """
    Converts a list of xy coordinates representing a bounding box into a dictionary.

    This is used by in-platform model training.

    Parameters
    ----------
    box : List[Union[int, float]]
        List of coordinates in the format ``[x1, y1, x2, y2]``.

    Returns
    -------
    BoundingBox
        Bounding box in the format ``{x: x1, y: y1, w: width, h: height}``.

    Raises
    ------
    ValueError
        If ``box`` does not contain exactly 4 numeric values.
    """
    # Validate every coordinate, not only the first one, so malformed input
    # such as [1, "a", 2, 3] raises the documented ValueError instead of a
    # TypeError from the subtraction below. A wrong-length list previously
    # raised ValueError via tuple unpacking, so the explicit length check
    # keeps the exception type callers already handle.
    if len(box) != 4 or any(not isinstance(coord, (int, float)) for coord in box):
        raise ValueError("Unknown input format")

    x1, y1, x2, y2 = box
    return {"x": x1, "y": y1, "w": x2 - x1, "h": y2 - y1}


def convert_polygons_to_mask(
polygons: List, height: int, width: int, value: Optional[int] = 1
) -> np.ndarray:
Expand Down
Loading

0 comments on commit 6ee45c5

Please sign in to comment.