From 4c863abb477ff3c84d01345bafafae9e2c3d8f3a Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 25 Sep 2023 13:43:38 +0100 Subject: [PATCH 1/4] Changes to RemoteDataset.pull() to account for identically named files in different releases --- darwin/dataset/download_manager.py | 17 +++++------------ darwin/dataset/remote_dataset.py | 21 +++++++++++++++++++-- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py index d37d24302..48acbff8e 100644 --- a/darwin/dataset/download_manager.py +++ b/darwin/dataset/download_manager.py @@ -94,7 +94,7 @@ def download_all_images_from_annotations( # Verify that there is not already image in the images folder unfiltered_files = images_path.rglob(f"*") if use_folders else images_path.glob(f"*") - existing_images = {image.stem: image for image in unfiltered_files if is_image_extension_allowed(image.suffix)} + existing_images = [image for image in unfiltered_files if is_image_extension_allowed(image.suffix)] annotations_to_download_path = [] for annotation_path in annotations_path.glob(f"*.{annotation_format}"): @@ -103,11 +103,11 @@ def download_all_images_from_annotations( continue if not force_replace: - # Check collisions on image filename and json filename on the system - if annotation.filename in existing_images: - continue - if sanitize_filename(annotation_path.stem) in existing_images: + # Check the planned path for the image against the existing images + planned_image_path = images_path / Path(annotation.remote_path).relative_to('/') / Path(annotation.filename) + if planned_image_path in existing_images: continue + annotations_to_download_path.append(annotation_path) if len(annotation.slots) > 1: force_slots = True @@ -116,13 +116,6 @@ def download_all_images_from_annotations( if len(slot.source_files) > 1: force_slots = True - if remove_extra: - # Removes existing images for which there is not corresponding annotation - annotations_downloaded_stem = [a.stem for a in annotations_path.glob(f"*.{annotation_format}")] - for existing_image in existing_images.values(): - if existing_image.stem not in annotations_downloaded_stem: - print(f"Removing {existing_image} as there is no corresponding annotation") - existing_image.unlink() # Create the generator with the partial functions download_functions: List = [] for annotation_path in annotations_to_download_path: diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 5d89ad233..1f7fab356 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -43,7 +43,7 @@ from darwin.exporter.formats.darwin import build_image_annotation from darwin.item import DatasetItem from darwin.item_sorter import ItemSorter -from darwin.utils import parse_darwin_json, split_video_annotation, urljoin +from darwin.utils import parse_darwin_json, split_video_annotation, urljoin, is_image_extension_allowed if TYPE_CHECKING: from darwin.client import Client @@ -351,7 +351,24 @@ def pull( for error in errors: self.console.print(f"\t - {error}") - downloaded_file_count = len([f for f in self.local_images_path.rglob("*") if f.is_file()]) + # Remove images that don't have a corresponding annotation file in the release + if remove_extra: + annotations_paths = [annotation_path for annotation_path in annotations_dir.glob("*.json")] + unfiltered_files = self.local_images_path.rglob(f"*") if use_folders else self.local_images_path.glob(f"*") + existing_images = [image for image in unfiltered_files if is_image_extension_allowed(image.suffix)] + local_paths = [] + + for annotation_path in annotations_dir.glob(f"*.json"): + annotation = parse_darwin_json(annotation_path, count=0) + local_paths.append(Path(annotation.slots[0].source_files[0]['local_path'])) + + for existing_image in existing_images: + if existing_image not in local_paths: + print(f"Removing {existing_image} as there is no corresponding annotation in this release") + existing_image.unlink() + + downloaded_file_count = len([f for f in self.local_images_path.rglob("*") if f.is_file() and not f.name.startswith('.')]) + console.print(f"Total file count after download completed {str(downloaded_file_count)}.") return None, count From 0a485874ff97c40c9800f7cd854cf9ed460180ae Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 3 Oct 2023 11:32:26 +0100 Subject: [PATCH 2/4] Change existing_images from list to dict & restore original remove_extra flag function --- darwin/dataset/download_manager.py | 10 +++++++++- darwin/dataset/remote_dataset.py | 18 +----------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py index 48acbff8e..b56c8f6c9 100644 --- a/darwin/dataset/download_manager.py +++ b/darwin/dataset/download_manager.py @@ -94,7 +94,7 @@ def download_all_images_from_annotations( # Verify that there is not already image in the images folder unfiltered_files = images_path.rglob(f"*") if use_folders else images_path.glob(f"*") - existing_images = [image for image in unfiltered_files if is_image_extension_allowed(image.suffix)] + existing_images = {image for image in unfiltered_files if is_image_extension_allowed(image.suffix)} annotations_to_download_path = [] for annotation_path in annotations_path.glob(f"*.{annotation_format}"): @@ -116,6 +116,14 @@ def download_all_images_from_annotations( if len(slot.source_files) > 1: force_slots = True + if remove_extra: + # Removes existing images for which there is not corresponding annotation + annotations_downloaded_stem = [a.stem for a in annotations_path.glob(f"*.{annotation_format}")] + for existing_image in existing_images: + if existing_image.stem not in annotations_downloaded_stem: + print(f"Removing {existing_image} as there is no corresponding annotation") + existing_image.unlink() + # Create the generator with the partial functions download_functions: List = [] for annotation_path in annotations_to_download_path: diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 1f7fab356..77971730c 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -43,7 +43,7 @@ from darwin.exporter.formats.darwin import build_image_annotation from darwin.item import DatasetItem from darwin.item_sorter import ItemSorter -from darwin.utils import parse_darwin_json, split_video_annotation, urljoin, is_image_extension_allowed +from darwin.utils import parse_darwin_json, split_video_annotation, urljoin if TYPE_CHECKING: from darwin.client import Client @@ -351,22 +351,6 @@ def pull( for error in errors: self.console.print(f"\t - {error}") - # Remove images that don't have a corresponding annotation file in the release - if remove_extra: - annotations_paths = [annotation_path for annotation_path in annotations_dir.glob("*.json")] - unfiltered_files = self.local_images_path.rglob(f"*") if use_folders else self.local_images_path.glob(f"*") - existing_images = [image for image in unfiltered_files if is_image_extension_allowed(image.suffix)] - local_paths = [] - - for annotation_path in annotations_dir.glob(f"*.json"): - annotation = parse_darwin_json(annotation_path, count=0) - local_paths.append(Path(annotation.slots[0].source_files[0]['local_path'])) - - for existing_image in existing_images: - if existing_image not in local_paths: - print(f"Removing {existing_image} as there is no corresponding annotation in this release") - existing_image.unlink() - downloaded_file_count = len([f for f in self.local_images_path.rglob("*") if f.is_file() and not f.name.startswith('.')]) console.print(f"Total file count after download completed {str(downloaded_file_count)}.") From 57962383d1f494e937141d399b7c7067ef15f960 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 3 Oct 2023 11:58:54 +0100 Subject: [PATCH 3/4] Adjusted planned_image_path to be non POSIX-specific --- darwin/dataset/download_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py index b56c8f6c9..fcb5300df 100644 --- a/darwin/dataset/download_manager.py +++ b/darwin/dataset/download_manager.py @@ -104,7 +104,7 @@ def download_all_images_from_annotations( if not force_replace: # Check the planned path for the image against the existing images - planned_image_path = images_path / Path(annotation.remote_path).relative_to('/') / Path(annotation.filename) + planned_image_path = images_path / Path(annotation.remote_path.lstrip('/')).resolve().absolute() / Path(annotation.filename) if planned_image_path in existing_images: continue From f3652c418fbd92a3e6498987c2a17d6a8cfeef6b Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 3 Oct 2023 17:11:34 +0100 Subject: [PATCH 4/4] Adjusted planned_image_path so that Windows is accounted for with backslashes --- darwin/dataset/download_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py index fcb5300df..9eb5c9f68 100644 --- a/darwin/dataset/download_manager.py +++ b/darwin/dataset/download_manager.py @@ -104,7 +104,7 @@ def download_all_images_from_annotations( if not force_replace: # Check the planned path for the image against the existing images - planned_image_path = images_path / Path(annotation.remote_path.lstrip('/')).resolve().absolute() / Path(annotation.filename) + planned_image_path = images_path / Path(annotation.remote_path.lstrip('/\\')).resolve().absolute() / Path(annotation.filename) if planned_image_path in existing_images: continue