Skip to content

Commit

Permalink
[DAR-2246][External] Unify darwin-py naming convention (#868)
Browse files Browse the repository at this point in the history
* Made downloading with folders the default behaviour

* Added placeholder tests

* Pull naming convention tests & data

* Updated pull naming convention

* Remove unused codepath

* Aligned single-slotted multi-source file behaviour with multi-slotted multi-source file behaviour

* Added tests for reading files locally

* Allow get_annotations() to work with flat dataset exports

* Pull with folders by default, remove unused argument & fix planned image path issue

* Updated test data

* Restored planned_image_path construction types

* Remvoed testing assert

* Long videos tests
  • Loading branch information
JBWilkie committed Jun 19, 2024
1 parent 60c89b8 commit fff4f13
Show file tree
Hide file tree
Showing 6 changed files with 280 additions and 21 deletions.
29 changes: 12 additions & 17 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def download_all_images_from_annotations(
force_replace: bool = False,
remove_extra: bool = False,
annotation_format: str = "json",
use_folders: bool = False,
use_folders: bool = True,
video_frames: bool = False,
force_slots: bool = False,
ignore_slots: bool = False,
Expand Down Expand Up @@ -95,11 +95,13 @@ def download_all_images_from_annotations(

if not force_replace:
# Check the planned path for the image against the existing images
planned_image_path = (
images_path
/ Path(annotation.remote_path.lstrip("/\\")).resolve().absolute()
/ Path(annotation.filename)
)
filename = Path(annotation.filename)
if use_folders and annotation.remote_path != "/":
planned_image_path = (
images_path / Path(annotation.remote_path.lstrip("/\\"))
) / filename
else:
planned_image_path = images_path / filename
if planned_image_path in existing_images:
continue

Expand Down Expand Up @@ -221,7 +223,6 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)
if force_slots:
return _download_all_slots_from_json_annotation(
Expand All @@ -234,7 +235,6 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)

return []
Expand All @@ -255,7 +255,7 @@ def _download_all_slots_from_json_annotation(
slot_path.mkdir(exist_ok=True, parents=True)

if video_frames and slot.type != "image":
video_path: Path = slot_path / "sections"
video_path: Path = slot_path
video_path.mkdir(exist_ok=True, parents=True)
if not slot.frame_urls:
segment_manifests = get_segment_manifests(slot, slot_path, api_key)
Expand Down Expand Up @@ -302,7 +302,6 @@ def _download_single_slot_from_json_annotation(
parent_path: Path,
annotation_path: Path,
video_frames: bool,
use_folders: bool = False,
) -> Iterable[Callable[[], None]]:
slot = annotation.slots[0]
generator = []
Expand Down Expand Up @@ -339,13 +338,9 @@ def _download_single_slot_from_json_annotation(
image = slot.source_files[0]
image_url = image["url"]
image_filename = image["file_name"]

if not use_folders:
suffix = Path(image_filename).suffix
stem = annotation_path.stem
filename = str(Path(stem + suffix))
else:
filename = slot.source_files[0]["file_name"]
suffix = Path(image_filename).suffix
stem = annotation_path.stem
filename = str(Path(stem + suffix))
image_path = parent_path / sanitize_filename(
filename or annotation.filename
)
Expand Down
4 changes: 2 additions & 2 deletions darwin/dataset/remote_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def pull(
remove_extra: bool = False,
subset_filter_annotations_function: Optional[Callable] = None,
subset_folder_name: Optional[str] = None,
use_folders: bool = False,
use_folders: bool = True,
video_frames: bool = False,
force_slots: bool = False,
ignore_slots: bool = False,
Expand Down Expand Up @@ -217,7 +217,7 @@ def pull(
If it needs to receive other parameters is advised to use functools.partial() for it.
subset_folder_name: Optional[str], default: None
Name of the folder with the subset of the dataset. If not provided a timestamp is used.
use_folders : bool, default: False
use_folders : bool, default: True
Recreates folders from the dataset.
video_frames : bool, default: False
Pulls video frames images instead of video files.
Expand Down
3 changes: 2 additions & 1 deletion darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,10 +590,11 @@ def _map_annotations_to_images(
images_paths = []
annotations_paths = []
invalid_annotation_paths = []
with_folders = any(item.is_dir() for item in images_dir.iterdir())
for annotation_path in get_annotation_files_from_dir(annotations_dir):
darwin_json = stream_darwin_json(Path(annotation_path))
image_path = get_image_path_from_stream(
darwin_json, images_dir, Path(annotation_path)
darwin_json, images_dir, Path(annotation_path), with_folders
)
if image_path.exists():
images_paths.append(image_path)
Expand Down
240 changes: 239 additions & 1 deletion tests/darwin/dataset/remote_dataset_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import shutil
import tempfile
import types
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Any, Dict
Expand All @@ -13,10 +15,11 @@
from darwin.client import Client
from darwin.config import Config
from darwin.dataset import RemoteDataset
from darwin.dataset.download_manager import _download_image_from_json_annotation
from darwin.dataset.release import Release
from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2
from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2
from darwin.datatypes import ObjectStore
from darwin.datatypes import ManifestItem, ObjectStore, SegmentManifest
from darwin.exceptions import UnsupportedExportFormat, UnsupportedFileType
from darwin.item import DatasetItem
from tests.fixtures import *
Expand Down Expand Up @@ -806,6 +809,241 @@ def fake_download_zip(self, path):
assert metadata_path.exists()


class TestPullNamingConvention:
def _test_pull_naming_convention(
self, file_name, use_folders, video_frames, force_slots
):
with tempfile.TemporaryDirectory() as temp_dir:
with zipfile.ZipFile("tests/data.zip") as zfile:
zfile.extractall(temp_dir)
file_path = (
Path(temp_dir) / "v7-darwin-json-v2/pull_naming_tests" / file_name
)
download_func = _download_image_from_json_annotation(
api_key="api_key",
annotation_path=file_path,
image_path=Path("dataset_dir_path"),
use_folders=use_folders,
video_frames=video_frames,
force_slots=force_slots,
ignore_slots=False,
)
return download_func

def test_single_slotted_image_flat_structure(self):
file_name = "single_slotted_image_flat.json"
expected_paths = [Path("dataset_dir_path/single_slotted_image_flat.png")]
download_funcs = self._test_pull_naming_convention(
file_name,
use_folders=False,
video_frames=False,
force_slots=False,
)
assert download_funcs[0].args[2] == expected_paths[0]

def test_single_slotted_video_flat_structure(self):
file_name = "single_slotted_video_flat.json"
expected_paths = [Path("dataset_dir_path/single_slotted_video_flat.mp4")]
download_funcs = self._test_pull_naming_convention(
file_name,
use_folders=False,
video_frames=False,
force_slots=False,
)
assert download_funcs[0].args[2] == expected_paths[0]

def test_single_slotted_image_folder_structure(self):
file_name = "single_slotted_image_folder.json"
expected_paths = [Path("dataset_dir_path/dir1/single_slotted_image_folder.png")]
download_funcs = self._test_pull_naming_convention(
file_name,
use_folders=True,
video_frames=False,
force_slots=False,
)
assert download_funcs[0].args[2] == expected_paths[0]

def test_single_slotted_video_folder_structure(self):
file_name = "single_slotted_video_folder.json"
expected_paths = [Path("dataset_dir_path/dir1/single_slotted_video_folder.mp4")]
download_funcs = self._test_pull_naming_convention(
file_name,
use_folders=True,
video_frames=False,
force_slots=False,
)
assert download_funcs[0].args[2] == expected_paths[0]

def test_multi_slotted_item_flat_structure(self):
file_name = "multi_slotted_item_flat.json"
expected_paths = [
Path("dataset_dir_path/multi_slotted_item_flat/0/000000580654.jpg"),
Path("dataset_dir_path/multi_slotted_item_flat/1/000000580676.jpg"),
Path("dataset_dir_path/multi_slotted_item_flat/2/000000580703.jpg"),
]
download_funcs = self._test_pull_naming_convention(
file_name,
use_folders=False,
video_frames=False,
force_slots=True,
)
assert download_funcs[0].args[2] == expected_paths[0]
assert download_funcs[1].args[2] == expected_paths[1]
assert download_funcs[2].args[2] == expected_paths[2]

def test_multi_slotted_item_folder_structure(self):
file_name = "multi_slotted_item_folder.json"
expected_paths = [
Path("dataset_dir_path/dir1/multi_slotted_item_folder/0/000000580654.jpg"),
Path("dataset_dir_path/dir1/multi_slotted_item_folder/1/000000580676.jpg"),
Path("dataset_dir_path/dir1/multi_slotted_item_folder/2/000000580703.jpg"),
]
downloads_funcs = self._test_pull_naming_convention(
file_name,
use_folders=True,
video_frames=False,
force_slots=True,
)
assert downloads_funcs[0].args[2] == expected_paths[0]
assert downloads_funcs[1].args[2] == expected_paths[1]
assert downloads_funcs[2].args[2] == expected_paths[2]

def test_single_slotted_video_item_with_frames(self):
file_name = "single_slotted_video_frames.json"
base_path = "dataset_dir_path/single_slotted_video_frames/"
expected_paths = [Path(base_path + f"{i:07d}.png") for i in range(8)]
download_funcs = self._test_pull_naming_convention(
file_name, use_folders=False, video_frames=True, force_slots=False
)
for i, func in enumerate(download_funcs):
assert func.args[1] == expected_paths[i]

def test_multi_slotted_video_item_with_frames(self):
file_name = "multi_slotted_video_frames.json"
base_path = "dataset_dir_path/multi_slotted_video_frames/"
download_funcs = self._test_pull_naming_convention(
file_name, use_folders=False, video_frames=True, force_slots=True
)

for i in range(30):
slot_name = i // 10
frame_index = i % 10
expected_path = Path(f"{base_path}{slot_name}/{frame_index:07d}.png")
assert download_funcs[i].args[1] == expected_path

def test_single_slotted_item_multiple_source_files(self):
file_name = "single_slot_multiple_source_files.json"
expected_paths = [
Path("dataset_dir_path/single_slot_multiple_source_files/0/slice_1.dcm"),
Path("dataset_dir_path/single_slot_multiple_source_files/0/slice_2.dcm"),
]
download_funcs = self._test_pull_naming_convention(
file_name, use_folders=False, video_frames=False, force_slots=True
)
assert download_funcs[0].args[2] == expected_paths[0]
assert download_funcs[1].args[2] == expected_paths[1]

def test_single_slotted_long_video(self):
file_name = "single_slotted_long_video.json"
expected_paths = [Path("dataset_dir_path/single_slotted_long_video.mp4")]
download_funcs = self._test_pull_naming_convention(
file_name, use_folders=False, video_frames=False, force_slots=False
)
assert download_funcs[0].args[2] == expected_paths[0]

def test_single_slotted_long_video_with_frames(self):
file_name = "single_slotted_long_video_frames.json"
expected_paths = [
Path("dataset_dir_path/single_slotted_long_video_frames/.0000000.ts")
]

with patch(
"darwin.dataset.download_manager.get_segment_manifests"
) as mock_get_segment_manifests:
mock_get_segment_manifests.return_value = [
SegmentManifest(
slot="0",
segment=0,
total_frames=5,
items=[
ManifestItem(
frame=0,
absolute_frame=0,
segment=0,
visibility=True,
timestamp=0,
visible_frame=0,
),
ManifestItem(
frame=1,
absolute_frame=1,
segment=0,
visibility=True,
timestamp=0.04,
visible_frame=1,
),
ManifestItem(
frame=2,
absolute_frame=2,
segment=0,
visibility=True,
timestamp=0.08,
visible_frame=2,
),
ManifestItem(
frame=3,
absolute_frame=3,
segment=0,
visibility=True,
timestamp=0.12,
visible_frame=3,
),
ManifestItem(
frame=4,
absolute_frame=4,
segment=0,
visibility=True,
timestamp=0.16,
visible_frame=4,
),
],
)
]

download_funcs = self._test_pull_naming_convention(
file_name, use_folders=False, video_frames=True, force_slots=False
)
assert download_funcs[0].args[2] == expected_paths[0]

def test_multi_slotted_item_multiple_source_files(self):
file_name = "multiple_slots_multiple_source_files.json"
expected_paths = [
Path(
"dataset_dir_path/multiple_slots_multiple_source_files/0.1/slice_1.dcm"
),
Path(
"dataset_dir_path/multiple_slots_multiple_source_files/0.1/slice_2.dcm"
),
Path(
"dataset_dir_path/multiple_slots_multiple_source_files/1.1/slice_3.dcm"
),
Path(
"dataset_dir_path/multiple_slots_multiple_source_files/1.1/slice_1.dcm"
),
Path(
"dataset_dir_path/multiple_slots_multiple_source_files/1.1/slice_2.dcm"
),
]
download_funcs = self._test_pull_naming_convention(
file_name, use_folders=False, video_frames=False, force_slots=True
)
assert download_funcs[0].args[2] == expected_paths[0]
assert download_funcs[1].args[2] == expected_paths[1]
assert download_funcs[2].args[2] == expected_paths[2]
assert download_funcs[3].args[2] == expected_paths[3]
assert download_funcs[4].args[2] == expected_paths[4]


@pytest.fixture
def dataset_item(dataset_slug: str) -> DatasetItem:
return DatasetItem(
Expand Down
25 changes: 25 additions & 0 deletions tests/darwin/utils/get_image_path_from_stream_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path

from darwin.utils.utils import get_image_path_from_stream


class TestGetImagePathFromStream:
def test_with_folders_true(self):
darwin_json = {"item": {"name": "image.jpg", "path": "/folder"}}
images_dir = Path("/images")
annotation_filepath = Path("/annotations/annotation.json")
expected = Path("/images/folder/image.jpg")
result = get_image_path_from_stream(
darwin_json, images_dir, annotation_filepath, True
)
assert result == expected

def test_with_folders_false(self):
darwin_json = {"item": {"name": "image.jpg", "path": "/folder"}}
images_dir = Path("/images")
annotation_filepath = Path("/annotations/annotation.json")
expected = Path("/images/image.jpg")
result = get_image_path_from_stream(
darwin_json, images_dir, annotation_filepath, False
)
assert result == expected
Binary file modified tests/data.zip
Binary file not shown.

0 comments on commit fff4f13

Please sign in to comment.