21 changes: 17 additions & 4 deletions darwin/torch/dataset.py
@@ -15,7 +15,7 @@
ConvertPolygonsToInstanceMasks,
ConvertPolygonsToSemanticMask,
)
from darwin.torch.utils import polygon_area
from darwin.torch.utils import clamp_bbox_to_image_size, polygon_area
from darwin.utils import convert_polygons_to_sequences


@@ -333,8 +333,19 @@ def get_target(self, index: int) -> Dict[str, Any]:
min_y: float = np.min([np.min(y_coord) for y_coord in y_coords])
max_x: float = np.max([np.max(x_coord) for x_coord in x_coords])
max_y: float = np.max([np.max(y_coord) for y_coord in y_coords])
w: float = max_x - min_x + 1
h: float = max_y - min_y + 1

# Clamp the coordinates to the image dimensions
min_x: float = max(0, min_x)
min_y: float = max(0, min_y)
max_x: float = min(target["width"] - 1, max_x)
max_y: float = min(target["height"] - 1, max_y)

assert min_x < max_x and min_y < max_y

# Convert to XYWH
w: float = max_x - min_x
h: float = max_y - min_y

# Compute the area of the polygon
# TODO fix with additive/subtractive paths in complex polygons
poly_area: float = np.sum([polygon_area(x_coord, y_coord) for x_coord, y_coord in zip(x_coords, y_coords)])
@@ -390,7 +401,6 @@ class SemanticSegmentationDataset(LocalDataset):
"""

def __init__(self, transform: Optional[Union[List[Callable], Callable]] = None, **kwargs):

super().__init__(annotation_type="polygon", **kwargs)
if not "__background__" in self.classes:
self.classes.insert(0, "__background__")
@@ -546,6 +556,9 @@ def __getitem__(self, index: int):
img: PILImage.Image = self.get_image(index)
target: Dict[str, Any] = self.get_target(index)

width, height = img.size
target = clamp_bbox_to_image_size(target, width, height)

if self.transform is not None:
img_tensor, target = self.transform(img, target)
else:
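
For reference, a minimal standalone sketch of the clamp-then-convert step added to get_target above. The helper name and the flat coordinate lists are illustrative only, not part of this PR:

import numpy as np

def clamped_xywh(x_coords, y_coords, width, height):
    # Clamp the polygon extremes to the image, then convert the XYXY
    # extremes to an XYWH box (note: no "+ 1" any more).
    min_x, min_y = max(0, np.min(x_coords)), max(0, np.min(y_coords))
    max_x, max_y = min(width - 1, np.max(x_coords)), min(height - 1, np.max(y_coords))
    assert min_x < max_x and min_y < max_y
    return [min_x, min_y, max_x - min_x, max_y - min_y]

# A polygon spilling past the right edge of a 50x50 image:
clamped_xywh([4, 60, 20], [33, 40, 48], 50, 50)  # -> [4, 33, 45, 15]
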
136 changes: 125 additions & 11 deletions darwin/torch/transforms.py
@@ -1,14 +1,33 @@
import random
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import torch
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
from PIL import Image as PILImage

from darwin.torch.utils import convert_segmentation_to_mask, flatten_masks_by_category
# Optional dependency
try:
import albumentations as A
from albumentations import Compose
except ImportError:
A = None

from typing import TYPE_CHECKING, Type

if TYPE_CHECKING:
from albumentations.pytorch import ToTensorV2

AType = Type[ToTensorV2]
else:
AType = Type[None]
Compose = Type[None]


from darwin.torch.utils import convert_segmentation_to_mask, flatten_masks_by_category

TargetKey = Union["boxes", "labels", "mask", "masks", "image_id", "area", "iscrowd"]
TargetType = Dict[TargetKey, torch.Tensor]

@@ -191,9 +210,6 @@ def __call__(self, image: PILImage.Image, target: TargetType) -> Tuple[PILImage.
boxes = [obj["bbox"] for obj in annotations]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)

classes = [obj["category_id"] for obj in annotations]
classes = torch.tensor(classes, dtype=torch.int64)
@@ -209,20 +225,21 @@ def __call__(self, image: PILImage.Image, target: TargetType) -> Tuple[PILImage.
if num_keypoints:
keypoints = keypoints.view(num_keypoints, -1, 3)

keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
classes = classes[keep]
masks = masks[keep]
if keypoints is not None:
keypoints = keypoints[keep]

target["boxes"] = boxes
target["labels"] = classes
target["masks"] = masks
target["image_id"] = image_id
if keypoints is not None:
target["keypoints"] = keypoints

# Remove boxes with width or height zero
keep = (boxes[:, 3] > 0) & (boxes[:, 2] > 0)
boxes = boxes[keep]
classes = classes[keep]
masks = masks[keep]
if keypoints is not None:
keypoints = keypoints[keep]

# conversion to coco api
area = torch.tensor([obj["area"] for obj in annotations])
iscrowd = torch.tensor([obj.get("iscrowd", 0) for obj in annotations])
@@ -278,3 +295,100 @@ def __call__(self, image: PILImage.Image, annotation: Dict[str, Any]) -> Tuple[P
target = torch.zeros((h, w), dtype=torch.uint8)
target = PILImage.fromarray(target.numpy())
return image, target


class AlbumentationsTransform:
"""
Wrapper class for Albumentations augmentations.
"""

def __init__(self, transform: Compose):
self._check_albumentaion_dependency()
self.transform = transform

@classmethod
def from_path(cls, config_path: str) -> "AlbumentationsTransform":
config_path = Path(config_path)
try:
transform = A.load(str(config_path))
return cls(transform)
except Exception as e:
raise ValueError(f"Invalid config path: {config_path}. Error: {e}")

@classmethod
def from_dict(cls, alb_dict: dict) -> "AlbumentationsTransform":
try:
transform = A.from_dict(alb_dict)
return cls(transform)
except Exception as e:
raise ValueError(f"Invalid albumentations dictionary. Error: {e}")

def __call__(self, image, annotation: dict = None) -> tuple:
np_image = np.array(image)
if annotation is None:
annotation = {}
albu_data = self._pre_process(np_image, annotation)
transformed_data = self.transform(**albu_data)
image, transformed_annotation = self._post_process(transformed_data, annotation)

return image, transformed_annotation

def _pre_process(self, image: np.ndarray, annotation: dict) -> dict:
"""
Prepare image and annotation for albumentations transformation.
"""
albumentation_dict = {"image": image}

boxes = annotation.get("boxes")
if boxes is not None:
albumentation_dict["bboxes"] = boxes.numpy().tolist()

labels = annotation.get("labels")
if labels is not None:
albumentation_dict["labels"] = labels.tolist()

masks = annotation.get("masks")
if masks is not None:
albumentation_dict["masks"] = masks.numpy()

return albumentation_dict

def _post_process(self, albumentation_output: dict, annotation: dict) -> tuple:
"""
Process the output of albumentations transformation back to desired format.
"""
output_annotation = {}
image = albumentation_output["image"]

bboxes = albumentation_output.get("bboxes")
if bboxes is not None:
output_annotation["boxes"] = torch.tensor(bboxes)
if "area" in annotation and "masks" not in albumentation_output:
output_annotation["area"] = output_annotation["boxes"][:, 2] * output_annotation["boxes"][:, 3]

labels = albumentation_output.get("labels")
if labels is not None:
output_annotation["labels"] = torch.tensor(labels)

masks = albumentation_output.get("masks")
if masks is not None:
if isinstance(masks[0], np.ndarray):
output_annotation["masks"] = torch.tensor(np.array(masks))
else:
output_annotation["masks"] = torch.stack(masks)
if "area" in annotation:
output_annotation["area"] = torch.sum(output_annotation["masks"], dim=[1, 2])

# Copy other metadata from original annotation
for key, value in annotation.items():
output_annotation.setdefault(key, value)

return image, output_annotation

def _check_albumentaion_dependency(self):
if A is None:
raise ImportError(
"The albumentations library is not installed. "
"To use this function, install it with pip install albumentations, "
"or install the ml extras of this package."
)
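
A minimal usage sketch for the new wrapper (not part of the PR): the wrapped Compose has to declare bbox_params so that the "bboxes" and "labels" keys emitted by _pre_process are transformed together with the image; the "coco" (XYWH) box format is assumed here to match the dataset targets.

import albumentations as A
from darwin.torch.transforms import AlbumentationsTransform

pipeline = A.Compose(
    [A.HorizontalFlip(p=0.5), A.RandomBrightnessContrast(p=0.2)],
    bbox_params=A.BboxParams(format="coco", label_fields=["labels"]),
)
tfm = AlbumentationsTransform(pipeline)
# image is a PIL image and target the dict produced by the dataset:
# image_aug, target_aug = tfm(image, target)
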
33 changes: 33 additions & 0 deletions darwin/torch/utils.py
@@ -192,3 +192,36 @@ def detectron2_register_dataset(
if evaluator_type:
MetadataCatalog.get(catalog_name).set(evaluator_type=evaluator_type)
return catalog_name


def clamp_bbox_to_image_size(annotations, img_width, img_height, format="xywh"):
"""
Clamps bounding boxes in annotations to the given image dimensions.

:param annotations: Dictionary containing bounding box coordinates in 'boxes' key.
:param img_width: Width of the image.
:param img_height: Height of the image.
:param format: Format of the bounding boxes, either "xywh" or "xyxy".
:return: Annotations with clamped bounding boxes.

The function modifies the input annotations dictionary to clamp the bounding box coordinates
based on the specified format, ensuring they lie within the image dimensions.
"""
boxes = annotations["boxes"]

if format == "xyxy":
boxes[:, 0::2].clamp_(min=0, max=img_width - 1)
boxes[:, 1::2].clamp_(min=0, max=img_height - 1)

elif format == "xywh":
# First, clamp the x and y coordinates
boxes[:, 0].clamp_(min=0, max=img_width - 1)
boxes[:, 1].clamp_(min=0, max=img_height - 1)
# Then, clamp the width and height
boxes[:, 2].clamp_(min=torch.tensor(0), max=img_width - boxes[:, 0] - 1)  # -1 since images are zero-indexed
boxes[:, 3].clamp_(min=torch.tensor(0), max=img_height - boxes[:, 1] - 1)  # -1 since images are zero-indexed
else:
raise ValueError(f"Unsupported bounding box format: {format}")

annotations["boxes"] = boxes
return annotations
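
A quick usage sketch for clamp_bbox_to_image_size (illustrative values, assuming the default XYWH layout used by the torch datasets):

import torch
from darwin.torch.utils import clamp_bbox_to_image_size

# Two XYWH boxes on a 100x100 image; the second one runs past the right edge.
target = {"boxes": torch.tensor([[10.0, 10.0, 20.0, 20.0], [90.0, 5.0, 30.0, 10.0]])}
target = clamp_bbox_to_image_size(target, 100, 100, format="xywh")
# target["boxes"] -> [[10., 10., 20., 20.], [90., 5., 9., 10.]]
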
4 changes: 2 additions & 2 deletions darwin/utils/utils.py
@@ -919,8 +919,8 @@ def convert_polygons_to_sequences(
path: List[Union[int, float]] = []
for point in polygon:
# Clip coordinates to the image size
x = max(min(point["x"], width - 1) if width else point["x"], 0)
y = max(min(point["y"], height - 1) if height else point["y"], 0)
x = max(min(point["x"], width -1) if width else point["x"], 0)
y = max(min(point["y"], height -1) if height else point["y"], 0)
if rounding:
path.append(round(x))
path.append(round(y))
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -107,7 +107,7 @@ dev = [
"flake8-pyproject",
]
test = ["responses", "pytest", "flake8-pyproject"]
ml = ["torch", "torchvision", "scikit-learn"]
ml = ["torch", "torchvision", "scikit-learn", "albumentations"]
medical = ["nibabel", "connected-components-3d"]
ocv = ["opencv-python-headless"]

11 changes: 6 additions & 5 deletions tests/darwin/torch/dataset_test.py
@@ -129,8 +129,9 @@ def test_loads_object_detection_dataset_from_bounding_box_annotations(
assert image.size() == (3, 50, 50)

label = {k: v.numpy().tolist() for k, v in label.items()}

assert label == {
"boxes": [[4, 33, 17, 36]],
"boxes": [[4, 33, 17, 16]], # we need to account for xywh format and clamping
"area": [612],
"labels": [1],
"image_id": [0],
@@ -149,7 +150,7 @@ def test_loads_object_detection_dataset_from_polygon_annotations(

label = {k: v.numpy().tolist() for k, v in label.items()}
assert label == {
"boxes": [[4, 33, 17, 36]],
"boxes": [[4, 33, 17, 16]], # we need to account for xywh format and clamping
"area": [612],
"labels": [1],
"image_id": [0],
@@ -168,7 +169,7 @@ def test_loads_object_detection_dataset_from_complex_polygon_annotations(

label = {k: v.numpy().tolist() for k, v in label.items()}
assert label == {
"boxes": [[1, 1, 39, 49]],
"boxes": [[1, 1, 39, 48]],
"area": [1911],
"labels": [1],
"image_id": [0],
@@ -210,7 +211,7 @@ def test_loads_instance_segmentation_dataset_from_polygon_annotations(

label = {k: _maybe_tensor_to_list(v) for k, v in label.items()}

assert label["boxes"] == [[4.0, 33.0, 41.0, 50.0]]
assert label["boxes"] == [[4.0, 33.0, 36.0, 16.0]]
assert label["area"] == [576.0]
assert label["labels"] == [1]
assert label["image_id"] == [0]
@@ -231,7 +232,7 @@ def test_loads_instance_segmentation_dataset_from_complex_polygon_annotations(

label = {k: _maybe_tensor_to_list(v) for k, v in label.items()}

assert label["boxes"] == [[1.0, 1.0, 41.0, 50.0]]
assert label["boxes"] == [[1.0, 1.0, 39.0, 48.0]]
assert label["area"] == [592.0]
assert label["labels"] == [1]
assert label["image_id"] == [0]
23 changes: 22 additions & 1 deletion tests/darwin/torch/utils_test.py
@@ -3,7 +3,7 @@
import numpy as np
import torch

from darwin.torch.utils import flatten_masks_by_category
from darwin.torch.utils import clamp_bbox_to_image_size, flatten_masks_by_category
from tests.fixtures import *


@@ -67,3 +67,24 @@ def test_should_handle_multiple_overlaps(self, multiple_overlap_masks) -> None:
expected_counts = torch.as_tensor([7, 2], dtype=torch.uint8)
assert torch.equal(unique, expected_unique)
assert torch.equal(counts, expected_counts)

class TestClampBboxToImageSize:
def test_clamp_bbox_xyxy(self):
annotations = {'boxes': torch.tensor([[5.0, 5.0, 15.0, 15.0], [-5.0, -5.0, 25.0, 25.0]])}
width = 20
height = 20

clamped_annotations = clamp_bbox_to_image_size(annotations, width, height, format="xyxy")
expected_boxes = torch.tensor([[5.0, 5.0, 15.0, 15.0], [0.0, 0.0, 19.0, 19.0]])

assert torch.equal(clamped_annotations['boxes'], expected_boxes)

def test_clamp_bbox_xywh(self):
annotations = {'boxes': torch.tensor([[5.0, 5.0, 15.0, 15.0], [-5.0, -5.0, 30.0, 30.0]])}
width = 20
height = 20

clamped_annotations = clamp_bbox_to_image_size(annotations, width, height, format="xywh")
expected_boxes = torch.tensor([[5.0, 5.0, 14.0, 14.0], [0.0, 0.0, 19.0, 19.0]])

assert torch.equal(clamped_annotations['boxes'], expected_boxes)