[PY-645][external] Improved tolerance for dots in filenames & test linting #746

Merged · 6 commits · Dec 19, 2023 · Changes from 4 commits
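
Context for the title (an editor's sketch, not text from the PR): pathlib.Path.suffixes splits on every dot, so a basename containing dots reports spurious "extensions", which is why suffix-based checks in the diffs below are replaced with endswith() checks on the lowered name. Illustrative only:

    from pathlib import Path

    filename = Path("my.scan.v2.nii.gz")
    print(filename.suffixes)  # ['.scan', '.v2', '.nii', '.gz'] -- every dot counts
    print(filename.suffix)    # '.gz' -- only the last piece

    # An endswith() check on the lowered name tolerates extra dots:
    print(filename.name.lower().endswith(".nii.gz"))  # True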
47 changes: 35 additions & 12 deletions darwin/dataset/local_dataset.py
@@ -80,7 +80,9 @@ def __init__(
         self.original_annotations_path: Optional[List[Path]] = None
         self.keep_empty_annotations = keep_empty_annotations

-        release_path, annotations_dir, images_dir = self._initial_setup(dataset_path, release_name)
+        release_path, annotations_dir, images_dir = self._initial_setup(
+            dataset_path, release_name
+        )
         self._validate_inputs(partition, split_type, annotation_type)
         # Get the list of classes
@@ -120,7 +122,9 @@ def _validate_inputs(self, partition, split_type, annotation_type):
        if split_type not in ["random", "stratified"]:
            raise ValueError("split_type should be either 'random', 'stratified'")
        if annotation_type not in ["tag", "polygon", "bounding_box"]:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )

    def _setup_annotations_and_images(
        self,
@@ -148,7 +152,9 @@ def _setup_annotations_and_images(
                darwin_json, images_dir, with_folders, json_version, annotation_filepath
            )
            if image_path.exists():
-                if not keep_empty_annotations and is_stream_list_empty(darwin_json["annotations"]):
+                if not keep_empty_annotations and is_stream_list_empty(
+                    darwin_json["annotations"]
+                ):
                    continue
                self.images_path.append(image_path)
                self.annotations_path.append(annotation_filepath)
@@ -215,7 +221,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]:
        parsed = parse_darwin_json(self.annotations_path[index], index)
        return parsed.image_height, parsed.image_width

-    def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset":
+    def extend(
+        self, dataset: "LocalDataset", extend_classes: bool = False
+    ) -> "LocalDataset":
        """
        Extends the current dataset with another one.

@@ -310,7 +318,10 @@ def parse_json(self, index: int) -> Dict[str, Any]:
        # Filter out unused classes and annotations of a different type
        if self.classes is not None:
            annotations = [
-                a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)
+                a
+                for a in annotations
+                if a.annotation_class.name in self.classes
+                and self.annotation_type_supported(a)
            ]
        return {
            "image_id": index,
@@ -327,15 +338,20 @@ def annotation_type_supported(self, annotation) -> bool:
        elif self.annotation_type == "bounding_box":
            is_bounding_box = annotation_type == "bounding_box"
            is_supported_polygon = (
-                annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data
+                annotation_type in ["polygon", "complex_polygon"]
+                and "bounding_box" in annotation.data
            )
            return is_bounding_box or is_supported_polygon
        elif self.annotation_type == "polygon":
            return annotation_type in ["polygon", "complex_polygon"]
        else:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )

-    def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
+    def measure_mean_std(
+        self, multi_threaded: bool = True
+    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Computes mean and std of trained images, given the train loader.

@@ -358,7 +374,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
            results = pool.map(self._return_mean, self.images_path)
            mean = np.sum(np.array(results), axis=0) / len(self.images_path)
            # Online image_classification deviation
-            results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path])
+            results = pool.starmap(
+                self._return_std, [[item, mean] for item in self.images_path]
+            )
            std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
            total_pixel_count = np.sum(np.array([item[1] for item in results]))
            std = np.sqrt(std_sum / total_pixel_count)
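
Aside on the hunk above (editor's sketch, not part of the diff): measure_mean_std is a two-pass scheme: pass one averages per-channel image means, pass two accumulates squared deviations from that mean plus a pixel count, then takes the square root. A single-process sketch of the same idea (dataset_mean_std is an illustrative name, not darwin-py API):

    import numpy as np

    def dataset_mean_std(images):
        # Pass 1: average the per-image channel means (assumes same-sized images,
        # matching the pool.map(_return_mean, ...) step above).
        mean = np.mean([img.mean(axis=(0, 1)) / 255.0 for img in images], axis=0)
        # Pass 2: accumulate squared deviations from the global mean, per channel.
        sq_sum = np.zeros(3)
        pixel_count = 0
        for img in images:
            diff = img / 255.0 - mean  # broadcasts over H x W x 3
            sq_sum += np.sum(diff ** 2, axis=(0, 1))
            pixel_count += img.shape[0] * img.shape[1]
        return mean, np.sqrt(sq_sum / pixel_count)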
@@ -404,14 +422,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray:
    @staticmethod
    def _return_mean(image_path: Path) -> np.ndarray:
        img = np.array(load_pil_image(image_path))
-        mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])])
+        mean = np.array(
+            [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]
+        )
        return mean / 255.0

    # Loads an image with OpenCV and returns the channel wise std of the image.
    @staticmethod
    def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]:
        img = np.array(load_pil_image(image_path)) / 255.0
-        m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]))
+        m2 = np.square(
+            np.array(
+                [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]
+            )
+        )
        return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0

    def __getitem__(self, index: int):
@@ -482,7 +506,6 @@ def get_annotation_filepaths(

    if partition is None:
        return (str(e) for e in sorted(annotations_dir.glob("**/*.json")))
-
    if split_type == "random":
        split_filename = f"{split_type}_{partition}.txt"
    elif split_type == "stratified":
42 changes: 32 additions & 10 deletions darwin/exporter/formats/darwin_1_0.py
@@ -45,17 +45,23 @@ def _export_file(annotation_file: AnnotationFile, _: int, output_dir: Path) -> None:
    try:
        output: DictFreeForm = _build_json(annotation_file)
    except Exception as e:
-        raise ExportException_CouldNotBuildOutput(f"Could not build output for {annotation_file.path}") from e
+        raise ExportException_CouldNotBuildOutput(
+            f"Could not build output for {annotation_file.path}"
+        ) from e

    try:
        with open(output_file_path, "w") as f:
            op = json.dumps(
                output,
-                option=json.OPT_INDENT_2 | json.OPT_SERIALIZE_NUMPY | json.OPT_NON_STR_KEYS,
+                option=json.OPT_INDENT_2
+                | json.OPT_SERIALIZE_NUMPY
+                | json.OPT_NON_STR_KEYS,
            ).decode("utf-8")
            f.write(op)
    except Exception as e:
-        raise ExportException_CouldNotWriteFile(f"Could not write output for {annotation_file.path}") from e
+        raise ExportException_CouldNotWriteFile(
+            f"Could not write output for {annotation_file.path}"
+        ) from e


def _build_json(annotation_file: AnnotationFile) -> DictFreeForm:
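
A note on the json calls in the hunk above (editor's observation): dumps returning bytes, the option= flags, and the trailing .decode("utf-8") match orjson's API, so json here is presumably orjson imported under that alias. A standalone sketch of the same call:

    import orjson

    payload = {"image_id": 1, 2: "non-string key", "score": 0.97}
    op = orjson.dumps(
        payload,
        option=orjson.OPT_INDENT_2 | orjson.OPT_SERIALIZE_NUMPY | orjson.OPT_NON_STR_KEYS,
    ).decode("utf-8")  # orjson.dumps returns bytes, hence the decode
    print(op)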
@@ -130,11 +136,17 @@ def _build_sub_annotation(sub: SubAnnotation) -> DictFreeForm:
def _build_authorship(annotation: Union[VideoAnnotation, Annotation]) -> DictFreeForm:
    annotators = {}
    if annotation.annotators:
-        annotators = {"annotators": [_build_author(annotator) for annotator in annotation.annotators]}
+        annotators = {
+            "annotators": [
+                _build_author(annotator) for annotator in annotation.annotators
+            ]
+        }

    reviewers = {}
    if annotation.reviewers:
-        reviewers = {"annotators": [_build_author(reviewer) for reviewer in annotation.reviewers]}
+        reviewers = {
+            "annotators": [_build_author(reviewer) for reviewer in annotation.reviewers]
+        }

    return {**annotators, **reviewers}

@@ -143,15 +155,19 @@ def _build_video_annotation(annotation: VideoAnnotation) -> DictFreeForm:
    return {
        **annotation.get_data(
            only_keyframes=False,
-            post_processing=lambda annotation, _: _build_image_annotation(annotation, skip_slots=True),
+            post_processing=lambda annotation, _: _build_image_annotation(
+                annotation, skip_slots=True
+            ),
        ),
        "name": annotation.annotation_class.name,
        "slot_names": annotation.slot_names,
        **_build_authorship(annotation),
    }


-def _build_image_annotation(annotation: Annotation, skip_slots: bool = False) -> DictFreeForm:
+def _build_image_annotation(
+    annotation: Annotation, skip_slots: bool = False
+) -> DictFreeForm:
    json_subs = {}
    for sub in annotation.subs:
        json_subs.update(_build_sub_annotation(sub))
@@ -169,7 +185,9 @@ def _build_image_annotation(annotation: Annotation, skip_slots: bool = False) -> DictFreeForm:
    return {**base_json, "slot_names": annotation.slot_names}


-def _build_legacy_annotation_data(annotation_class: AnnotationClass, data: DictFreeForm) -> DictFreeForm:
+def _build_legacy_annotation_data(
+    annotation_class: AnnotationClass, data: DictFreeForm
+) -> DictFreeForm:
    v1_data = {}
    polygon_annotation_mappings = {"complex_polygon": "paths", "polygon": "path"}
@@ -232,7 +250,9 @@ def build_image_annotation(annotation_file: AnnotationFile) -> Dict[str, Any]:
    annotations: List[Dict[str, Any]] = []
    for annotation in annotation_file.annotations:
        payload = {
-            annotation.annotation_class.annotation_type: _build_annotation_data(annotation),
+            annotation.annotation_class.annotation_type: _build_annotation_data(
+                annotation
+            ),
            "name": annotation.annotation_class.name,
        }
@@ -260,6 +280,8 @@ def _build_annotation_data(annotation: Annotation) -> Dict[str, Any]:
        return {"path": annotation.data["paths"]}

    if annotation.annotation_class.annotation_type == "polygon":
-        return dict(filter(lambda item: item[0] != "bounding_box", annotation.data.items()))
+        return dict(
+            filter(lambda item: item[0] != "bounding_box", annotation.data.items())
+        )

    return dict(annotation.data)
48 changes: 13 additions & 35 deletions darwin/exporter/formats/nifti.py
@@ -137,50 +137,28 @@ def check_for_error_and_return_imageid(
    image_id : str

    """
-    # check if all item slots have the correct file-extension
+    # Check if all item slots have the correct file-extension
    for slot in video_annotation.slots:
        for source_file in slot.source_files:
            filename = Path(source_file["file_name"])

-            try:
-                suffixes = filename.suffixes[-2:]
-            except IndexError:
-                suffixes = filename.suffixes
-            if len(suffixes) == 2:
-                if suffixes[0] == ".nii" and suffixes[1] == ".gz":
-                    image_id = str(filename).rstrip("".join(suffixes))
-                else:
-                    return create_error_message_json(
-                        "Two suffixes found but not ending in .nii.gz",
-                        output_dir,
-                        str(filename),
-                    )
-            elif len(suffixes) == 1:
-                if suffixes[0] == ".nii" or suffixes[0] == ".dcm":
-                    image_id = filename.stem
-                else:
-                    return create_error_message_json(
-                        "Misconfigured filename, not ending in .nii or .dcm. Are you sure this is medical data?",
-                        output_dir,
-                        str(filename),
-                    )
-            else:
+            if not (
+                filename.name.lower().endswith(".nii.gz")
+                or filename.name.lower().endswith(".nii")
+                or filename.name.lower().endswith(".dcm")
+            ):
                return create_error_message_json(
-                    "You are trying to export to nifti. Filename should contain either .nii, .nii.gz or .dcm extension."
-                    "Are you sure this is medical data?",
+                    "Misconfigured filename, not ending in .nii, .nii.gz or .dcm. Are you sure this is medical data?",
                    output_dir,
                    str(filename),
                )

    filename = Path(video_annotation.filename)
-    try:
-        suffixes = filename.suffixes[-2:]
-    except IndexError:
-        suffixes = filename.suffixes
-    if len(suffixes) == 2:
-        image_id = str(filename).rstrip("".join(suffixes))
-    elif len(suffixes) == 1:
-        image_id = str(filename.stem)

Comment on lines -180 to -183:

JBWilkie (Collaborator, Author): I'm a little unsure about this. Currently, if .nii.gz then we include this in the image_id. Otherwise, if .dcm or .nii, we don't include this in the image_id. Why is this?

Contributor: > Currently, if .nii.gz then we include this in the image_id

Is that actually true? image_id = str(filename).rstrip("".join(suffixes)) would strip out .nii.gz 🤔

JBWilkie (Collaborator, Author, Dec 12, 2023): Actually yes, that is correct.

Just tested, and the behaviour now is:

  • If any slot has a name ending in anything other than .nii, .dcm, or .nii.gz, the exporter fails (we want this)
  • The filename itself can be anything, and if it ends with any of the three above we strip it away

I believe this is correct.

+    if filename.name.lower().endswith(".nii.gz"):
+        image_id = str(filename).rstrip(".nii.gz")
+    elif filename.name.lower().endswith(".nii"):
+        image_id = str(filename).rstrip(".nii")

Contributor: Will this work for the case where we have upper cases? The condition will trigger, but the strip won't find the upper-case extension, right?

JBWilkie (Collaborator, Author): God I need to multitask less...

Updated. Now we check the lowered version of the filename, so we're guaranteed to pick up any of the three matches.

+    elif filename.name.lower().endswith(".dcm"):
+        image_id = str(filename).rstrip(".dcm")
+    else:
+        image_id = str(filename)
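
One caveat worth noting alongside this thread (editor's sketch, not code from the PR): str.rstrip(chars) treats its argument as a set of characters rather than a literal suffix, so stems ending in those characters can be over-stripped; for example, "brain_n.nii".rstrip(".nii") yields "brain_". A suffix-safe, case-insensitive variant (strip_medical_suffix is an illustrative name):

    from pathlib import Path

    MEDICAL_SUFFIXES = (".nii.gz", ".nii", ".dcm")

    def strip_medical_suffix(filename: Path) -> str:
        # Match case-insensitively, but cut the literal suffix by length
        # so the stem's own trailing characters survive.
        lowered = filename.name.lower()
        for suffix in MEDICAL_SUFFIXES:
            if lowered.endswith(suffix):
                return str(filename)[: -len(suffix)]
        return str(filename)

    print(strip_medical_suffix(Path("brain_n.nii")))  # brain_n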

4 changes: 1 addition & 3 deletions darwin/importer/formats/csv_tags_video.py
@@ -51,9 +51,7 @@ def parse_path(path: Path) -> Optional[List[dt.AnnotationFile]]:
        file_annotation_map[filename].append(annotation)
    for filename in file_annotation_map:
        annotations = file_annotation_map[filename]
-        annotation_classes = {
-            annotation.annotation_class for annotation in annotations
-        }
+        annotation_classes = {annotation.annotation_class for annotation in annotations}
        filename_path = Path(filename)
        remote_path = str(filename_path.parent)
        if not remote_path.startswith("/"):
4 changes: 3 additions & 1 deletion darwin/importer/importer.py
@@ -603,7 +603,9 @@ def _warn_unsupported_annotations(parsed_files: List[AnnotationFile]) -> None:
            if annotation.annotation_class.annotation_type in UNSUPPORTED_CLASSES:
                skipped_annotations.append(annotation)
        if len(skipped_annotations) > 0:
-            types = {c.annotation_class.annotation_type for c in skipped_annotations}  # noqa: C417
+            types = {
+                c.annotation_class.annotation_type for c in skipped_annotations
+            }  # noqa: C417
            console.print(
                f"Import of annotation class types '{', '.join(types)}' is not yet supported. Skipping {len(skipped_annotations)} "
                + "annotations from '{parsed_file.full_path}'.\n",
31 changes: 25 additions & 6 deletions darwin/torch/dataset.py
@@ -99,7 +99,9 @@ class ClassificationDataset(LocalDataset):
    be composed via torchvision.
    """

-    def __init__(self, transform: Optional[Union[Callable, List]] = None, **kwargs) -> None:
+    def __init__(
+        self, transform: Optional[Union[Callable, List]] = None, **kwargs
+    ) -> None:
        super().__init__(annotation_type="tag", **kwargs)

        if transform is not None and isinstance(transform, list):
@@ -152,7 +154,11 @@ def get_target(self, index: int) -> Tensor:

        data = self.parse_json(index)
        annotations = data.pop("annotations")
-        tags = [a.annotation_class.name for a in annotations if a.annotation_class.annotation_type == "tag"]
+        tags = [
+            a.annotation_class.name
+            for a in annotations
+            if a.annotation_class.annotation_type == "tag"
+        ]

        if not self.is_multi_label:
            # Binary or multiclass must have a label per image
@@ -176,7 +182,11 @@ def check_if_multi_label(self) -> None:
        for idx in range(len(self)):
            target = self.parse_json(idx)
            annotations = target.pop("annotations")
-            tags = [a.annotation_class.name for a in annotations if a.annotation_class.annotation_type == "tag"]
+            tags = [
+                a.annotation_class.name
+                for a in annotations
+                if a.annotation_class.annotation_type == "tag"
+            ]

            if len(tags) > 1:
                self.is_multi_label = True
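
For orientation (an illustrative aside, not from the diff): when is_multi_label is set, the per-image target is typically a multi-hot vector over the class list rather than a single class index. Roughly:

    import torch

    classes = ["cat", "dog", "person"]  # illustrative class list
    tags = ["cat", "person"]            # tags found on one image
    target = torch.tensor([1.0 if c in tags else 0.0 for c in classes])
    # tensor([1., 0., 1.]) -- one slot per class, several can be active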
@@ -324,7 +334,9 @@ def get_target(self, index: int) -> Dict[str, Any]:
                path_key = "paths"

            if path_key not in annotation.data:
-                print(f"Warning: missing polygon in annotation {self.annotations_path[index]}")
+                print(
+                    f"Warning: missing polygon in annotation {self.annotations_path[index]}"
+                )
            # Extract the sequences of coordinates from the polygon annotation
            sequences = convert_polygons_to_sequences(
                annotation.data[path_key],
@@ -353,7 +365,12 @@ def get_target(self, index: int) -> Dict[str, Any]:

            # Compute the area of the polygon
            # TODO fix with addictive/subtractive paths in complex polygons
-            poly_area: float = np.sum([polygon_area(x_coord, y_coord) for x_coord, y_coord in zip(x_coords, y_coords)])
+            poly_area: float = np.sum(
+                [
+                    polygon_area(x_coord, y_coord)
+                    for x_coord, y_coord in zip(x_coords, y_coords)
+                ]
+            )

            # Create and append the new entry for this annotation
            annotations.append(
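
For reference (editor's sketch): polygon_area here presumably computes the shoelace formula, A = 0.5 * |sum_i (x_i * y_{i+1} - x_{i+1} * y_i)|, which is also why the additive/subtractive paths in the TODO above would need signed areas combined rather than absolute values summed. A minimal version:

    import numpy as np

    def shoelace_area(xs, ys):
        # Absolute polygon area via the shoelace formula; np.roll pairs each
        # vertex with its successor (wrapping around to the first vertex).
        xs, ys = np.asarray(xs), np.asarray(ys)
        return 0.5 * abs(np.sum(xs * np.roll(ys, -1) - np.roll(xs, -1) * ys))

    print(shoelace_area([0, 4, 4, 0], [0, 0, 3, 3]))  # 12.0, a 4 x 3 rectangle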
@@ -405,7 +422,9 @@ class SemanticSegmentationDataset(LocalDataset):
        Object used to convert polygons to semantic masks.
    """

-    def __init__(self, transform: Optional[Union[List[Callable], Callable]] = None, **kwargs):
+    def __init__(
+        self, transform: Optional[Union[List[Callable], Callable]] = None, **kwargs
+    ):
        super().__init__(annotation_type="polygon", **kwargs)
        if "__background__" not in self.classes:
            self.classes.insert(0, "__background__")