From a9b67aec38f8d9f3830a0956514cd0efc8e59bc4 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 6 May 2024 17:44:13 +0100 Subject: [PATCH 1/6] Import warning for overwriting --- darwin/backend_v2.py | 17 +++++++++++++++ darwin/importer/importer.py | 41 +++++++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/darwin/backend_v2.py b/darwin/backend_v2.py index ea0d2cfdd..9e4e84d7a 100644 --- a/darwin/backend_v2.py +++ b/darwin/backend_v2.py @@ -253,3 +253,20 @@ def register_items(self, payload: Dict[str, Any], team_slug: str) -> None: return self._client._post_raw( f"/v2/teams/{team_slug}/items/register_existing", payload ) + + def _get_remote_annotations( + self, + item_id: str, + team_slug: str, + ) -> List: + """ + Returns the annotations currently present on a remote dataset item. + + Parameters + ---------- + item_id: str + The UUID of the item. + team_slug: str + The team slug. + """ + return self._client._get(f"v2/teams/{team_slug}/items/{item_id}/annotations") diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index 62a4d2e82..f855cfab3 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -182,7 +182,7 @@ def _build_attribute_lookup(dataset: "RemoteDataset") -> Dict[str, Unknown]: def _get_remote_files( dataset: "RemoteDataset", filenames: List[str], chunk_size: int = 100 -) -> Dict[str, Tuple[int, str]]: +) -> Dict[str, Tuple[str, str]]: """ Fetches remote files from the datasets in chunks; by default 100 filenames at a time. @@ -790,7 +790,7 @@ def import_annotations( # noqa: C901 console.print("Fetching remote file list...", style="info") # This call will only filter by filename; so can return a superset of matched files across different paths # There is logic in this function to then include paths to narrow down to the single correct matching file - remote_files: Dict[str, Tuple[int, str]] = {} + remote_files: Dict[str, Tuple[str, str]] = {} # Try to fetch files in large chunks; in case the filenames are too large and exceed the url size # retry in smaller chunks @@ -939,6 +939,14 @@ def import_annotations( # noqa: C901 file for file in parsed_files if file not in files_to_not_track ] if files_to_track: + if not append: + # Remember to add a flag that can bypass this warning! + # Add unit test(s) for this functionality at the end + continue_to_overwrite = _overwrite_warning( + dataset.client, dataset, files_to_track, remote_files, console + ) + if not continue_to_overwrite: + return _warn_unsupported_annotations(files_to_track) for parsed_file in track(files_to_track): image_id, default_slot_name = remote_files[parsed_file.full_path] @@ -1355,3 +1363,32 @@ def _console_theme() -> Theme: "info": "bold deep_sky_blue1", } ) + + +def _overwrite_warning( + client: "Client", + dataset: "RemoteDataset", + files: List[dt.AnnotationFile], + remote_files: Dict[str, Tuple[str, str]], + console: Console, +) -> bool: + files_to_overwrite = [] + for file in files: + item_id = remote_files[file.full_path][0] + remote_annotations = client.api_v2._get_remote_annotations( + item_id, + dataset.team, + ) + if remote_annotations: + files_to_overwrite.append(file) + if files_to_overwrite: + console.print( + "The following dataset items already have annotations that will be overwritten by this import:", + style="warning", + ) + for file in files_to_overwrite: + console.print(f"- {file.full_path}", style="warning") + proceed = input("Do you want to proceed with the import? 
[y/N] ") + if proceed.lower() != "y": + return False + return True From 6e0148a4c2fbd11d23eb76e0801a8c3848c09e8a Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 6 May 2024 17:48:47 +0100 Subject: [PATCH 2/6] Removed duplicate prints when there are multiple imports to the same file --- darwin/importer/importer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index f855cfab3..87161897b 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -1379,15 +1379,15 @@ def _overwrite_warning( item_id, dataset.team, ) - if remote_annotations: - files_to_overwrite.append(file) + if remote_annotations and file.full_path not in files_to_overwrite: + files_to_overwrite.append(file.full_path) if files_to_overwrite: console.print( - "The following dataset items already have annotations that will be overwritten by this import:", + f"The following {len(files_to_overwrite)} dataset items already have annotations that will be overwritten by this import:", style="warning", ) for file in files_to_overwrite: - console.print(f"- {file.full_path}", style="warning") + console.print(f"- {file}", style="warning") proceed = input("Do you want to proceed with the import? [y/N] ") if proceed.lower() != "y": return False From 0f6046cf3db583ced8108eb5a34b136d178061f0 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 6 May 2024 17:59:10 +0100 Subject: [PATCH 3/6] Added SDK & CLI options to bypass the overwrite warning --- darwin/cli.py | 1 + darwin/cli_functions.py | 5 +++++ darwin/importer/importer.py | 6 +++++- darwin/options.py | 5 +++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/darwin/cli.py b/darwin/cli.py index cafb80cbf..003f20802 100644 --- a/darwin/cli.py +++ b/darwin/cli.py @@ -166,6 +166,7 @@ def _run(args: Namespace, parser: ArgumentParser) -> None: args.delete_for_empty, args.import_annotators, args.import_reviewers, + args.overwrite, cpu_limit=args.cpu_limit, ) elif args.action == "convert": diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index f814217dd..07624518e 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -851,6 +851,7 @@ def dataset_import( delete_for_empty: bool = False, import_annotators: bool = False, import_reviewers: bool = False, + overwrite: bool = False, use_multi_cpu: bool = False, cpu_limit: Optional[int] = None, ) -> None: @@ -881,6 +882,9 @@ def dataset_import( import_reviewers : bool, default: False If ``True`` it will import the reviewers from the files to the dataset, if . If ``False`` it will not import the reviewers. + overwrite : bool, default: False + If ``True`` it will bypass a warning that the import will overwrite the current annotations if any are present. + If ``False`` this warning will be skipped and the import will overwrite the current annotations. use_multi_cpu : bool, default: False If ``True`` it will use all multiple CPUs to speed up the import process. 
cpu_limit : Optional[int], default: Core count - 2 @@ -904,6 +908,7 @@ def dataset_import( delete_for_empty, import_annotators, import_reviewers, + overwrite, use_multi_cpu, cpu_limit, ) diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index 87161897b..5e094277f 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -673,6 +673,7 @@ def import_annotations( # noqa: C901 delete_for_empty: bool = False, import_annotators: bool = False, import_reviewers: bool = False, + overwrite: bool = False, use_multi_cpu: bool = False, # Set to False to give time to resolve MP behaviours cpu_limit: Optional[int] = None, # 0 because it's set later in logic ) -> None: @@ -704,6 +705,9 @@ def import_annotations( # noqa: C901 import_reviewers : bool, default: False If ``True`` it will import the reviewers from the files to the dataset, if . If ``False`` it will not import the reviewers. + overwrite : bool, default: False + If ``True`` it will bypass a warning that the import will overwrite the current annotations if any are present. + If ``False`` this warning will be skipped and the import will overwrite the current annotations. use_multi_cpu : bool, default: True If ``True`` will use multiple available CPU cores to parse the annotation files. If ``False`` will use only the current Python process, which runs in one core. @@ -939,7 +943,7 @@ def import_annotations( # noqa: C901 file for file in parsed_files if file not in files_to_not_track ] if files_to_track: - if not append: + if not append and not overwrite: # Remember to add a flag that can bypass this warning! # Add unit test(s) for this functionality at the end continue_to_overwrite = _overwrite_warning( diff --git a/darwin/options.py b/darwin/options.py index 0a9b5151f..1411a8385 100644 --- a/darwin/options.py +++ b/darwin/options.py @@ -333,6 +333,11 @@ def __init__(self) -> None: action="store_true", help="Import reviewers metadata from the annotation files, where available", ) + parser_import.add_argument( + "--overwrite", + action="store_true", + help="Bypass warnings about overwiting existing annotations.", + ) # Cpu limit for multiprocessing tasks def cpu_default_types(input: Any) -> Optional[int]: # type: ignore From 998e5e85c876f30cb03839d93cc09a357712473f Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 6 May 2024 18:02:00 +0100 Subject: [PATCH 4/6] Code documentation --- darwin/importer/importer.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index 5e094277f..facf0816c 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -944,8 +944,6 @@ def import_annotations( # noqa: C901 ] if files_to_track: if not append and not overwrite: - # Remember to add a flag that can bypass this warning! - # Add unit test(s) for this functionality at the end continue_to_overwrite = _overwrite_warning( dataset.client, dataset, files_to_track, remote_files, console ) @@ -1376,6 +1374,28 @@ def _overwrite_warning( remote_files: Dict[str, Tuple[str, str]], console: Console, ) -> bool: + """ + Determines if any dataset items targeted for import already have annotations that will be overwritten. + If they do, a warning is displayed to the user and they are prompted to confirm if they want to proceed with the import. + + Parameters + ---------- + client : Client + The Darwin Client object. + dataset : RemoteDataset + The dataset where the annotations will be imported. 
+ files : List[dt.AnnotationFile] + The list of annotation files that will be imported. + remote_files : Dict[str, Tuple[str, str]] + A dictionary of the remote files in the dataset. + console : Console + The console object. + + Returns + ------- + bool + True if the user wants to proceed with the import, False otherwise. + """ files_to_overwrite = [] for file in files: item_id = remote_files[file.full_path][0] From d4a07223086c6470e726703e07a158be8b9d52f0 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 6 May 2024 18:27:50 +0100 Subject: [PATCH 5/6] Unit tests --- darwin/cli_functions.py | 2 +- darwin/importer/importer.py | 2 +- tests/darwin/importer/importer_test.py | 92 +++++++++++++++++++++++++- 3 files changed, 92 insertions(+), 4 deletions(-) diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index 07624518e..efde7456f 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -884,7 +884,7 @@ def dataset_import( If ``False`` it will not import the reviewers. overwrite : bool, default: False If ``True`` it will bypass a warning that the import will overwrite the current annotations if any are present. - If ``False`` this warning will be skipped and the import will overwrite the current annotations. + If ``False`` this warning will be skipped and the import will overwrite the current annotations without warning. use_multi_cpu : bool, default: False If ``True`` it will use all multiple CPUs to speed up the import process. cpu_limit : Optional[int], default: Core count - 2 diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index facf0816c..66ae30056 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -707,7 +707,7 @@ def import_annotations( # noqa: C901 If ``False`` it will not import the reviewers. overwrite : bool, default: False If ``True`` it will bypass a warning that the import will overwrite the current annotations if any are present. - If ``False`` this warning will be skipped and the import will overwrite the current annotations. + If ``False`` this warning will be skipped and the import will overwrite the current annotations without warning. use_multi_cpu : bool, default: True If ``True`` will use multiple available CPU cores to parse the annotation files. If ``False`` will use only the current Python process, which runs in one core. 
diff --git a/tests/darwin/importer/importer_test.py b/tests/darwin/importer/importer_test.py index 5e2534a30..d86a552c1 100644 --- a/tests/darwin/importer/importer_test.py +++ b/tests/darwin/importer/importer_test.py @@ -1,13 +1,13 @@ import json from pathlib import Path from typing import List, Tuple -from unittest.mock import Mock, _patch, patch +from unittest.mock import MagicMock, Mock, _patch, patch import pytest from rich.theme import Theme from darwin import datatypes as dt -from darwin.importer.importer import _parse_empty_masks +from darwin.importer.importer import _overwrite_warning, _parse_empty_masks def root_path(x: str) -> str: @@ -593,3 +593,91 @@ def test_console_theme() -> None: from darwin.importer.importer import _console_theme assert isinstance(_console_theme(), Theme) + + +def test_overwrite_warning_proceeds_with_import(): + annotations: List[dt.AnnotationLike] = [ + dt.Annotation( + dt.AnnotationClass("cat1", "polygon"), + { + "paths": [ + [ + {"x": -1, "y": -1}, + {"x": -1, "y": 1}, + {"x": 1, "y": 1}, + {"x": 1, "y": -1}, + {"x": -1, "y": -1}, + ] + ], + "bounding_box": {"x": -1, "y": -1, "w": 2, "h": 2}, + }, + ) + ] + client = MagicMock() + dataset = MagicMock() + files = [ + dt.AnnotationFile( + path=Path("/"), + filename="file1", + annotation_classes={a.annotation_class for a in annotations}, + annotations=annotations, + remote_path="/", + ), + dt.AnnotationFile( + path=Path("/"), + filename="file2", + annotation_classes={a.annotation_class for a in annotations}, + annotations=annotations, + remote_path="/", + ), + ] + remote_files = {"/file1": ("id1", "path1"), "/file2": ("id2", "path2")} + console = MagicMock() + + with patch("builtins.input", return_value="y"): + result = _overwrite_warning(client, dataset, files, remote_files, console) + assert result is True + + +def test_overwrite_warning_aborts_import(): + annotations: List[dt.AnnotationLike] = [ + dt.Annotation( + dt.AnnotationClass("cat1", "polygon"), + { + "paths": [ + [ + {"x": -1, "y": -1}, + {"x": -1, "y": 1}, + {"x": 1, "y": 1}, + {"x": 1, "y": -1}, + {"x": -1, "y": -1}, + ] + ], + "bounding_box": {"x": -1, "y": -1, "w": 2, "h": 2}, + }, + ) + ] + client = MagicMock() + dataset = MagicMock() + files = [ + dt.AnnotationFile( + path=Path("/"), + filename="file1", + annotation_classes={a.annotation_class for a in annotations}, + annotations=annotations, + remote_path="/", + ), + dt.AnnotationFile( + path=Path("/"), + filename="file2", + annotation_classes={a.annotation_class for a in annotations}, + annotations=annotations, + remote_path="/", + ), + ] + remote_files = {"/file1": ("id1", "path1"), "/file2": ("id2", "path2")} + console = MagicMock() + + with patch("builtins.input", return_value="n"): + result = _overwrite_warning(client, dataset, files, remote_files, console) + assert result is False From 6661ac2888083ebd0fdac855a1f032b4eab6d68b Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 6 May 2024 22:06:59 +0100 Subject: [PATCH 6/6] Fix for multiple files --- darwin/importer/importer.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index 66ae30056..463a35791 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -905,6 +905,13 @@ def import_annotations( # noqa: C901 style="info", ) + if not append and not overwrite: + continue_to_overwrite = _overwrite_warning( + dataset.client, dataset, local_files, remote_files, console + ) + if not continue_to_overwrite: + 
return + # Need to re parse the files since we didn't save the annotations in memory for local_path in set(local_file.path for local_file in local_files): # noqa: C401 imported_files: Union[List[dt.AnnotationFile], dt.AnnotationFile, None] = ( @@ -943,12 +950,6 @@ def import_annotations( # noqa: C901 file for file in parsed_files if file not in files_to_not_track ] if files_to_track: - if not append and not overwrite: - continue_to_overwrite = _overwrite_warning( - dataset.client, dataset, files_to_track, remote_files, console - ) - if not continue_to_overwrite: - return _warn_unsupported_annotations(files_to_track) for parsed_file in track(files_to_track): image_id, default_slot_name = remote_files[parsed_file.full_path] @@ -1370,7 +1371,7 @@ def _console_theme() -> Theme: def _overwrite_warning( client: "Client", dataset: "RemoteDataset", - files: List[dt.AnnotationFile], + local_files: List[dt.AnnotationFile], remote_files: Dict[str, Tuple[str, str]], console: Console, ) -> bool: @@ -1385,7 +1386,7 @@ def _overwrite_warning( dataset : RemoteDataset The dataset where the annotations will be imported. files : List[dt.AnnotationFile] - The list of annotation files that will be imported. + The list of local annotation files to will be imported. remote_files : Dict[str, Tuple[str, str]] A dictionary of the remote files in the dataset. console : Console @@ -1397,14 +1398,13 @@ def _overwrite_warning( True if the user wants to proceed with the import, False otherwise. """ files_to_overwrite = [] - for file in files: - item_id = remote_files[file.full_path][0] + for local_file in local_files: remote_annotations = client.api_v2._get_remote_annotations( - item_id, + local_file.item_id, dataset.team, ) - if remote_annotations and file.full_path not in files_to_overwrite: - files_to_overwrite.append(file.full_path) + if remote_annotations and local_file.full_path not in files_to_overwrite: + files_to_overwrite.append(local_file.full_path) if files_to_overwrite: console.print( f"The following {len(files_to_overwrite)} dataset items already have annotations that will be overwritten by this import:",
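
A minimal usage sketch of the feature added by this series, assuming the usual darwin-py import flow (Client.local(), get_importer, import_annotations); the team slug, dataset slug, and annotation paths below are placeholders, not values taken from the patches.

    from darwin.client import Client
    from darwin.importer import get_importer, import_annotations

    client = Client.local()  # reads the API key from the local darwin config
    dataset = client.get_remote_dataset("my-team/my-dataset")
    parser = get_importer("darwin")  # parser for Darwin JSON annotation files

    # With append=False and overwrite=False (the default), the import now pauses
    # and asks for confirmation whenever a target item already has annotations.
    # Passing overwrite=True bypasses that prompt, e.g. in non-interactive scripts.
    import_annotations(
        dataset,
        parser,
        ["annotations/image1.json", "annotations/image2.json"],
        append=False,
        overwrite=True,
    )

The equivalent CLI invocation would be along the lines of:

    darwin dataset import my-team/my-dataset darwin annotations/ --overwrite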