Skip to content
Merged
37 changes: 15 additions & 22 deletions darwin/dataset/local_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
from PIL import Image as PILImage

from darwin.dataset.utils import get_classes, get_release_path, load_pil_image
from darwin.utils import SUPPORTED_IMAGE_EXTENSIONS, parse_darwin_json
from darwin.utils import (
SUPPORTED_IMAGE_EXTENSIONS,
get_image_path_from_stream,
parse_darwin_json,
stream_darwin_json,
)


class LocalDataset:
Expand Down Expand Up @@ -126,30 +131,18 @@ def _setup_annotations_and_images(
partition,
split_type,
):
stems = build_stems(
release_path, annotations_dir, annotation_type, split, partition, split_type
)
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
for ext in SUPPORTED_IMAGE_EXTENSIONS:
image_path = images_dir / f"{stem}{ext}"
if image_path.exists():
images.append(image_path)
continue
image_path = images_dir / f"{stem}{ext.upper()}"
if image_path.exists():
images.append(image_path)
if len(images) < 1:
# Find all the annotations and their corresponding images
for annotation_path in sorted(annotations_dir.glob("**/*.json")):
darwin_json = stream_darwin_json(annotation_path)
image_path = get_image_path_from_stream(darwin_json, images_dir)
if image_path.exists():
self.images_path.append(image_path)
self.annotations_path.append(annotation_path)
continue
else:
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
if len(images) > 1:
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)
self.images_path.append(images[0])
self.annotations_path.append(annotation_path)

def _initial_setup(self, dataset_path, release_name):
assert dataset_path is not None
Expand Down
40 changes: 14 additions & 26 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
SUPPORTED_EXTENSIONS,
SUPPORTED_VIDEO_EXTENSIONS,
attempt_decode,
get_image_path_from_stream,
is_unix_like_os,
parse_darwin_json,
)
from darwin.utils.utils import stream_darwin_json

# E.g.: {"partition" => {"class_name" => 123}}
AnnotationDistribution = Dict[str, Counter]
Expand Down Expand Up @@ -569,33 +571,19 @@ def _map_annotations_to_images(
images_paths = []
annotations_paths = []
invalid_annotation_paths = []
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
for ext in SUPPORTED_EXTENSIONS:
image_path = images_dir / f"{stem}{ext}"
if image_path.exists():
images.append(image_path)
continue
image_path = images_dir / f"{stem}{ext.upper()}"
if image_path.exists():
images.append(image_path)

image_count = len(images)
if image_count != 1 and ignore_inconsistent_examples:
invalid_annotation_paths.append(annotation_path)
for annotation_path in annotations_dir.glob("**/*.json"):
darwin_json = stream_darwin_json(annotation_path)
image_path = get_image_path_from_stream(darwin_json, images_dir)
if image_path.exists():
images_paths.append(image_path)
annotations_paths.append(annotation_path)
continue
elif image_count < 1:
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
elif image_count > 1:
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)

images_paths.append(images[0])
annotations_paths.append(annotation_path)
else:
if ignore_inconsistent_examples:
invalid_annotation_paths.append(annotation_path)
continue
else:
raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")

return images_paths, annotations_paths, invalid_annotation_paths

Expand Down
41 changes: 41 additions & 0 deletions darwin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
)

import deprecation
import json_stream
import numpy as np
import orjson as json
import requests
from json_stream.base import PersistentStreamingJSONObject
from jsonschema import exceptions, validators
from requests import Response, request
from rich.progress import ProgressType, track
Expand Down Expand Up @@ -454,6 +456,45 @@ def parse_darwin_json(path: Path, count: Optional[int] = None) -> Optional[dt.An
else:
return _parse_darwin_image(path, data, count)

def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject:
    """
    Opens a Darwin JSON file and loads it with ``json_stream`` in persistent mode, so that
    large files do not have to be read into memory in one go.

    Parameters
    ----------
    path : Path
        Location of the Darwin JSON file to open.

    Returns
    -------
    PersistentStreamingJSONObject
        The object produced by ``json_stream.load(..., persistent=True)`` for the file.
    """
    # NOTE(review): the file handle is closed as soon as the ``with`` block exits, while the
    # returned stream is meant to be consumed lazily. Confirm that every key callers read
    # afterwards is still reachable once the handle is closed.
    with path.open() as json_file:
        stream = json_stream.load(json_file, persistent=True)
    return stream

def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, images_dir: Path) -> Path:
    """
    Builds the location of the image that a Darwin JSON annotation file (V1 or V2) refers to.

    The V2 layout (``item`` -> ``path`` / ``name``) is tried first. If any of those keys is
    missing, the V1 layout (``image`` -> ``path`` / ``filename``) is used instead.

    Parameters
    ----------
    darwin_json : PersistentStreamingJSONObject
        A stream of the JSON file.
    images_dir : Path
        Directory under which the images are stored.

    Returns
    -------
    Path
        ``images_dir`` joined with the annotation's folder path and file name.

    Raises
    ------
    KeyError
        If neither the V2 nor the V1 keys are present.
    """
    # Leading separators are stripped so the stored folder is joined *below* images_dir
    # instead of replacing it as an absolute path would.
    leading_separators = '/\\'
    try:
        item = darwin_json['item']
        folder = images_dir / Path(item['path'].lstrip(leading_separators))
        return folder / Path(item['name'])
    except KeyError:
        image = darwin_json['image']
        folder = images_dir / Path(image['path'].lstrip(leading_separators))
        return folder / Path(image['filename'])

def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
item = data["item"]
Expand Down
Loading