Commit

[PY-645][externa] Improved tolerance for dots in filenames & test linting (#746)

* Improved tolerance for dots in filenames & test linting

* Fixed broken tests

* Added case-insensitivity

* Case insensitivity for dataset item name

* Do not set image_id to lowercase
JBWilkie committed Dec 19, 2023
1 parent e3165e4 commit 79859ab
Showing 7 changed files with 267 additions and 119 deletions.
47 changes: 35 additions & 12 deletions darwin/dataset/local_dataset.py
@@ -80,7 +80,9 @@ def __init__(
         self.original_annotations_path: Optional[List[Path]] = None
         self.keep_empty_annotations = keep_empty_annotations

-        release_path, annotations_dir, images_dir = self._initial_setup(dataset_path, release_name)
+        release_path, annotations_dir, images_dir = self._initial_setup(
+            dataset_path, release_name
+        )
         self._validate_inputs(partition, split_type, annotation_type)
         # Get the list of classes

@@ -120,7 +122,9 @@ def _validate_inputs(self, partition, split_type, annotation_type):
         if split_type not in ["random", "stratified"]:
             raise ValueError("split_type should be either 'random', 'stratified'")
         if annotation_type not in ["tag", "polygon", "bounding_box"]:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )

     def _setup_annotations_and_images(
         self,
@@ -148,7 +152,9 @@ def _setup_annotations_and_images(
                 darwin_json, images_dir, with_folders, json_version, annotation_filepath
             )
             if image_path.exists():
-                if not keep_empty_annotations and is_stream_list_empty(darwin_json["annotations"]):
+                if not keep_empty_annotations and is_stream_list_empty(
+                    darwin_json["annotations"]
+                ):
                     continue
                 self.images_path.append(image_path)
                 self.annotations_path.append(annotation_filepath)
@@ -215,7 +221,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]:
         parsed = parse_darwin_json(self.annotations_path[index], index)
         return parsed.image_height, parsed.image_width

-    def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset":
+    def extend(
+        self, dataset: "LocalDataset", extend_classes: bool = False
+    ) -> "LocalDataset":
         """
         Extends the current dataset with another one.
@@ -310,7 +318,10 @@ def parse_json(self, index: int) -> Dict[str, Any]:
         # Filter out unused classes and annotations of a different type
         if self.classes is not None:
             annotations = [
-                a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)
+                a
+                for a in annotations
+                if a.annotation_class.name in self.classes
+                and self.annotation_type_supported(a)
             ]
         return {
             "image_id": index,
@@ -327,15 +338,20 @@ def annotation_type_supported(self, annotation) -> bool:
         elif self.annotation_type == "bounding_box":
             is_bounding_box = annotation_type == "bounding_box"
             is_supported_polygon = (
-                annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data
+                annotation_type in ["polygon", "complex_polygon"]
+                and "bounding_box" in annotation.data
             )
             return is_bounding_box or is_supported_polygon
         elif self.annotation_type == "polygon":
             return annotation_type in ["polygon", "complex_polygon"]
         else:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )

-    def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
+    def measure_mean_std(
+        self, multi_threaded: bool = True
+    ) -> Tuple[np.ndarray, np.ndarray]:
         """
         Computes mean and std of trained images, given the train loader.
@@ -358,7 +374,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.
             results = pool.map(self._return_mean, self.images_path)
             mean = np.sum(np.array(results), axis=0) / len(self.images_path)
             # Online image_classification deviation
-            results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path])
+            results = pool.starmap(
+                self._return_std, [[item, mean] for item in self.images_path]
+            )
             std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
             total_pixel_count = np.sum(np.array([item[1] for item in results]))
             std = np.sqrt(std_sum / total_pixel_count)
@@ -404,14 +422,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray:
     @staticmethod
     def _return_mean(image_path: Path) -> np.ndarray:
         img = np.array(load_pil_image(image_path))
-        mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])])
+        mean = np.array(
+            [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]
+        )
         return mean / 255.0

     # Loads an image with OpenCV and returns the channel wise std of the image.
     @staticmethod
     def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]:
         img = np.array(load_pil_image(image_path)) / 255.0
-        m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]))
+        m2 = np.square(
+            np.array(
+                [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]
+            )
+        )
         return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0

     def __getitem__(self, index: int):
@@ -482,7 +506,6 @@ def get_annotation_filepaths(

     if partition is None:
         return (str(e) for e in sorted(annotations_dir.glob("**/*.json")))
-
     if split_type == "random":
         split_filename = f"{split_type}_{partition}.txt"
     elif split_type == "stratified":
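For context on the measure_mean_std hunks above (which this commit only re-wraps): the statistics are computed in two parallel passes, where _return_mean contributes one per-channel mean per image and _return_std contributes per-channel sums of squared deviations plus a per-channel pixel count. A condensed single-process sketch of the same computation, assuming 8-bit RGB inputs; the helper name and loading via PIL are illustrative, not part of the commit:

from pathlib import Path
from typing import List, Tuple

import numpy as np
from PIL import Image

def mean_std(image_paths: List[Path]) -> Tuple[np.ndarray, np.ndarray]:
    # Pass 1: average the per-image channel means, scaled to [0, 1].
    per_image_means = [
        np.array(Image.open(p).convert("RGB")).reshape(-1, 3).mean(axis=0)
        for p in image_paths
    ]
    mean = np.sum(per_image_means, axis=0) / (len(image_paths) * 255.0)

    # Pass 2: accumulate per-channel squared deviations and the pixel count,
    # mirroring the (std_sum, pixel_count) pairs that _return_std contributes.
    std_sum = np.zeros(3)
    pixel_count = 0.0
    for p in image_paths:
        img = np.array(Image.open(p).convert("RGB")).reshape(-1, 3) / 255.0
        std_sum += np.square(img - mean).sum(axis=0)
        pixel_count += img.shape[0]
    return mean, np.sqrt(std_sum / pixel_count)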
42 changes: 32 additions & 10 deletions darwin/exporter/formats/darwin_1_0.py
@@ -45,17 +45,23 @@ def _export_file(annotation_file: AnnotationFile, _: int, output_dir: Path) -> N
     try:
         output: DictFreeForm = _build_json(annotation_file)
     except Exception as e:
-        raise ExportException_CouldNotBuildOutput(f"Could not build output for {annotation_file.path}") from e
+        raise ExportException_CouldNotBuildOutput(
+            f"Could not build output for {annotation_file.path}"
+        ) from e

     try:
         with open(output_file_path, "w") as f:
             op = json.dumps(
                 output,
-                option=json.OPT_INDENT_2 | json.OPT_SERIALIZE_NUMPY | json.OPT_NON_STR_KEYS,
+                option=json.OPT_INDENT_2
+                | json.OPT_SERIALIZE_NUMPY
+                | json.OPT_NON_STR_KEYS,
             ).decode("utf-8")
             f.write(op)
     except Exception as e:
-        raise ExportException_CouldNotWriteFile(f"Could not write output for {annotation_file.path}") from e
+        raise ExportException_CouldNotWriteFile(
+            f"Could not write output for {annotation_file.path}"
+        ) from e


 def _build_json(annotation_file: AnnotationFile) -> DictFreeForm:
@@ -130,11 +136,17 @@ def _build_sub_annotation(sub: SubAnnotation) -> DictFreeForm:
 def _build_authorship(annotation: Union[VideoAnnotation, Annotation]) -> DictFreeForm:
     annotators = {}
     if annotation.annotators:
-        annotators = {"annotators": [_build_author(annotator) for annotator in annotation.annotators]}
+        annotators = {
+            "annotators": [
+                _build_author(annotator) for annotator in annotation.annotators
+            ]
+        }

     reviewers = {}
     if annotation.reviewers:
-        reviewers = {"annotators": [_build_author(reviewer) for reviewer in annotation.reviewers]}
+        reviewers = {
+            "annotators": [_build_author(reviewer) for reviewer in annotation.reviewers]
+        }

     return {**annotators, **reviewers}

@@ -143,15 +155,19 @@ def _build_video_annotation(annotation: VideoAnnotation) -> DictFreeForm:
     return {
         **annotation.get_data(
             only_keyframes=False,
-            post_processing=lambda annotation, _: _build_image_annotation(annotation, skip_slots=True),
+            post_processing=lambda annotation, _: _build_image_annotation(
+                annotation, skip_slots=True
+            ),
         ),
         "name": annotation.annotation_class.name,
         "slot_names": annotation.slot_names,
         **_build_authorship(annotation),
     }


-def _build_image_annotation(annotation: Annotation, skip_slots: bool = False) -> DictFreeForm:
+def _build_image_annotation(
+    annotation: Annotation, skip_slots: bool = False
+) -> DictFreeForm:
     json_subs = {}
     for sub in annotation.subs:
         json_subs.update(_build_sub_annotation(sub))
@@ -169,7 +185,9 @@ def _build_image_annotation(annotation: Annotation, skip_slots: bool = False) ->
     return {**base_json, "slot_names": annotation.slot_names}


-def _build_legacy_annotation_data(annotation_class: AnnotationClass, data: DictFreeForm) -> DictFreeForm:
+def _build_legacy_annotation_data(
+    annotation_class: AnnotationClass, data: DictFreeForm
+) -> DictFreeForm:
     v1_data = {}
     polygon_annotation_mappings = {"complex_polygon": "paths", "polygon": "path"}

@@ -232,7 +250,9 @@ def build_image_annotation(annotation_file: AnnotationFile) -> Dict[str, Any]:
     annotations: List[Dict[str, Any]] = []
     for annotation in annotation_file.annotations:
         payload = {
-            annotation.annotation_class.annotation_type: _build_annotation_data(annotation),
+            annotation.annotation_class.annotation_type: _build_annotation_data(
+                annotation
+            ),
             "name": annotation.annotation_class.name,
         }

@@ -260,6 +280,8 @@ def _build_annotation_data(annotation: Annotation) -> Dict[str, Any]:
         return {"path": annotation.data["paths"]}

     if annotation.annotation_class.annotation_type == "polygon":
-        return dict(filter(lambda item: item[0] != "bounding_box", annotation.data.items()))
+        return dict(
+            filter(lambda item: item[0] != "bounding_box", annotation.data.items())
+        )

     return dict(annotation.data)
49 changes: 14 additions & 35 deletions darwin/exporter/formats/nifti.py
@@ -1,5 +1,6 @@
 import ast
 import json as native_json
+import re
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Tuple, Union
@@ -137,50 +138,28 @@ def check_for_error_and_return_imageid(
     image_id : str
     """
-    # check if all item slots have the correct file-extension
+    # Check if all item slots have the correct file-extension
     for slot in video_annotation.slots:
         for source_file in slot.source_files:
             filename = Path(source_file["file_name"])
-
-            try:
-                suffixes = filename.suffixes[-2:]
-            except IndexError:
-                suffixes = filename.suffixes
-            if len(suffixes) == 2:
-                if suffixes[0] == ".nii" and suffixes[1] == ".gz":
-                    image_id = str(filename).rstrip("".join(suffixes))
-                else:
-                    return create_error_message_json(
-                        "Two suffixes found but not ending in .nii.gz",
-                        output_dir,
-                        str(filename),
-                    )
-            elif len(suffixes) == 1:
-                if suffixes[0] == ".nii" or suffixes[0] == ".dcm":
-                    image_id = filename.stem
-                else:
-                    return create_error_message_json(
-                        "Misconfigured filename, not ending in .nii or .dcm. Are you sure this is medical data?",
-                        output_dir,
-                        str(filename),
-                    )
-            else:
+            if not (
+                filename.name.lower().endswith(".nii.gz")
+                or filename.name.lower().endswith(".nii")
+                or filename.name.lower().endswith(".dcm")
+            ):
                 return create_error_message_json(
-                    "You are trying to export to nifti. Filename should contain either .nii, .nii.gz or .dcm extension."
-                    "Are you sure this is medical data?",
+                    "Misconfigured filename, not ending in .nii, .nii.gz or .dcm. Are you sure this is medical data?",
                     output_dir,
                     str(filename),
                 )

     filename = Path(video_annotation.filename)
-    try:
-        suffixes = filename.suffixes[-2:]
-    except IndexError:
-        suffixes = filename.suffixes
-    if len(suffixes) == 2:
-        image_id = str(filename).rstrip("".join(suffixes))
-    elif len(suffixes) == 1:
-        image_id = str(filename.stem)
+    if filename.name.lower().endswith(".nii.gz"):
+        image_id = re.sub(r"(?i)\.nii\.gz$", "", str(filename))
+    elif filename.name.lower().endswith(".nii"):
+        image_id = re.sub(r"(?i)\.nii$", "", str(filename))
+    elif filename.name.lower().endswith(".dcm"):
+        image_id = re.sub(r"(?i)\.dcm$", "", str(filename))
    else:
        image_id = str(filename)
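The nifti.py hunk above is the substantive change in this commit. Previously the exporter derived extensions via Path.suffixes, so an extra dot in a name such as my.file.dcm registered as a second suffix and the file was rejected; the image_id was then stripped with str.rstrip, which removes a trailing run of characters from a set rather than a literal extension. The new code checks extensions case-insensitively with endswith and strips them with an anchored, case-insensitive regex. A minimal sketch of the new derivation and of the old pitfall, with hypothetical file names:

import re
from pathlib import Path

def derive_image_id(filename: Path) -> str:
    # Mirrors the new logic: case-insensitive extension check, literal
    # (case-preserving) suffix removal, dots elsewhere left intact.
    name = filename.name.lower()
    if name.endswith(".nii.gz"):
        return re.sub(r"(?i)\.nii\.gz$", "", str(filename))
    elif name.endswith(".nii"):
        return re.sub(r"(?i)\.nii$", "", str(filename))
    elif name.endswith(".dcm"):
        return re.sub(r"(?i)\.dcm$", "", str(filename))
    return str(filename)

assert derive_image_id(Path("scan.nii.gz")) == "scan"
assert derive_image_id(Path("my.file.v2.nii.gz")) == "my.file.v2"  # extra dots tolerated
assert derive_image_id(Path("SCAN.NII.GZ")) == "SCAN"  # case-insensitive, case preserved

# The old str.rstrip strips a trailing *character set*, not a literal
# suffix, so even simple names could be mangled:
assert "scan.nii.gz".rstrip(".nii.gz") == "sca"  # 'n' is in the strip set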
4 changes: 1 addition & 3 deletions darwin/importer/formats/csv_tags_video.py
@@ -51,9 +51,7 @@ def parse_path(path: Path) -> Optional[List[dt.AnnotationFile]]:
         file_annotation_map[filename].append(annotation)
     for filename in file_annotation_map:
         annotations = file_annotation_map[filename]
-        annotation_classes = {
-            annotation.annotation_class for annotation in annotations
-        }
+        annotation_classes = {annotation.annotation_class for annotation in annotations}
         filename_path = Path(filename)
         remote_path = str(filename_path.parent)
         if not remote_path.startswith("/"):
4 changes: 3 additions & 1 deletion darwin/importer/importer.py
@@ -603,7 +603,9 @@ def _warn_unsupported_annotations(parsed_files: List[AnnotationFile]) -> None:
             if annotation.annotation_class.annotation_type in UNSUPPORTED_CLASSES:
                 skipped_annotations.append(annotation)
         if len(skipped_annotations) > 0:
-            types = {c.annotation_class.annotation_type for c in skipped_annotations}  # noqa: C417
+            types = {
+                c.annotation_class.annotation_type for c in skipped_annotations
+            }  # noqa: C417
             console.print(
                 f"Import of annotation class types '{', '.join(types)}' is not yet supported. Skipping {len(skipped_annotations)} "
                 + "annotations from '{parsed_file.full_path}'.\n",
31 changes: 25 additions & 6 deletions darwin/torch/dataset.py
@@ -99,7 +99,9 @@ class ClassificationDataset(LocalDataset):
     be composed via torchvision.
     """

-    def __init__(self, transform: Optional[Union[Callable, List]] = None, **kwargs) -> None:
+    def __init__(
+        self, transform: Optional[Union[Callable, List]] = None, **kwargs
+    ) -> None:
         super().__init__(annotation_type="tag", **kwargs)

         if transform is not None and isinstance(transform, list):
@@ -152,7 +154,11 @@ def get_target(self, index: int) -> Tensor:

         data = self.parse_json(index)
         annotations = data.pop("annotations")
-        tags = [a.annotation_class.name for a in annotations if a.annotation_class.annotation_type == "tag"]
+        tags = [
+            a.annotation_class.name
+            for a in annotations
+            if a.annotation_class.annotation_type == "tag"
+        ]

         if not self.is_multi_label:
             # Binary or multiclass must have a label per image
@@ -176,7 +182,11 @@ def check_if_multi_label(self) -> None:
         for idx in range(len(self)):
             target = self.parse_json(idx)
             annotations = target.pop("annotations")
-            tags = [a.annotation_class.name for a in annotations if a.annotation_class.annotation_type == "tag"]
+            tags = [
+                a.annotation_class.name
+                for a in annotations
+                if a.annotation_class.annotation_type == "tag"
+            ]

             if len(tags) > 1:
                 self.is_multi_label = True
@@ -324,7 +334,9 @@ def get_target(self, index: int) -> Dict[str, Any]:
                 path_key = "paths"

             if path_key not in annotation.data:
-                print(f"Warning: missing polygon in annotation {self.annotations_path[index]}")
+                print(
+                    f"Warning: missing polygon in annotation {self.annotations_path[index]}"
+                )
             # Extract the sequences of coordinates from the polygon annotation
             sequences = convert_polygons_to_sequences(
                 annotation.data[path_key],
@@ -353,7 +365,12 @@ def get_target(self, index: int) -> Dict[str, Any]:

             # Compute the area of the polygon
             # TODO fix with addictive/subtractive paths in complex polygons
-            poly_area: float = np.sum([polygon_area(x_coord, y_coord) for x_coord, y_coord in zip(x_coords, y_coords)])
+            poly_area: float = np.sum(
+                [
+                    polygon_area(x_coord, y_coord)
+                    for x_coord, y_coord in zip(x_coords, y_coords)
+                ]
+            )

             # Create and append the new entry for this annotation
             annotations.append(
@@ -405,7 +422,9 @@ class SemanticSegmentationDataset(LocalDataset):
         Object used to convert polygons to semantic masks.
     """

-    def __init__(self, transform: Optional[Union[List[Callable], Callable]] = None, **kwargs):
+    def __init__(
+        self, transform: Optional[Union[List[Callable], Callable]] = None, **kwargs
+    ):
         super().__init__(annotation_type="polygon", **kwargs)
         if "__background__" not in self.classes:
             self.classes.insert(0, "__background__")
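On the poly_area hunk just above (another re-wrap): polygon_area reduces each ring of a polygon to a scalar area, and summing the rings is what the adjacent TODO flags as wrong for complex polygons whose inner paths are holes. A sketch of the usual shoelace computation behind such a helper; this implementation is an assumption for illustration, not a copy of darwin.torch.utils:

import numpy as np

def shoelace_area(x: np.ndarray, y: np.ndarray) -> float:
    # Shoelace formula: signed area of a simple polygon, made positive.
    return 0.5 * float(np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))))

# Unit square -> 1.0. Summing rings this way counts holes as positive
# area, which is the over-count the TODO in get_target refers to.
assert shoelace_area(np.array([0, 1, 1, 0]), np.array([0, 0, 1, 1])) == 1.0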
