Skip to content

Commit e8c9fe0

Browse files
author
John Wilkie
committed
Split annotation import payloads if required
1 parent 67e0866 commit e8c9fe0

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed

darwin/importer/importer.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import concurrent.futures
22
import uuid
3+
import json
4+
import copy
35
from collections import defaultdict
46
from functools import partial
57
from logging import getLogger
@@ -20,6 +22,7 @@
2022
Union,
2123
)
2224

25+
2326
from darwin.datatypes import (
2427
AnnotationFile,
2528
Property,
@@ -1864,6 +1867,17 @@ def _import_annotations(
18641867

18651868
try:
18661869
dataset.import_annotation(id, payload=payload)
1870+
except RequestEntitySizeExceeded:
1871+
logger.warning(
1872+
"Annotation payload exceeds request entity size. Splitting payload into smaller chunks for import."
1873+
)
1874+
payloads = _split_payloads(payload)
1875+
for chunked_payload in payloads:
1876+
try:
1877+
dataset.import_annotation(id, payload=chunked_payload)
1878+
except Exception as e:
1879+
errors.append(e)
1880+
success = dt.Success.FAILURE
18671881
except Exception as e:
18681882
errors.append(e)
18691883
success = dt.Success.FAILURE
@@ -2185,3 +2199,57 @@ def _warn_for_annotations_with_multiple_instance_ids(
21852199
console.print(
21862200
f"- File: {file} has {files_with_multi_instance_id_annotations[file]} annotation(s) with multiple instance IDs"
21872201
)
2202+
2203+
2204+
def _split_payloads(
2205+
payload: Dict[str, Any], max_payload_size: int = 32_000_000
2206+
) -> List[Dict[str, Any]]:
2207+
"""
2208+
Splits a payload into smaller chunks to avoid HTTP 413 errors due to large request entity sizes.
2209+
2210+
This function takes an input payload and splits it into smaller payloads, ensuring each chunk does not exceed the specified maximum size. This is particularly useful when importing annotations, as it prevents HTTP 413 errors (`RequestEntitySizeExceeded`) from occurring due to large request entity sizes.
2211+
2212+
Parameters
2213+
----------
2214+
payload : Dict[str, Any]
2215+
The input payload to be split.
2216+
max_payload_size : int, optional
2217+
The maximum size of each split payload. Defaults to 32,000,000 bytes.
2218+
2219+
Returns
2220+
-------
2221+
List[Dict[str, Any]]
2222+
A list of split payloads, each not exceeding the specified maximum size.
2223+
2224+
Raises
2225+
------
2226+
ValueError
2227+
If any single annotation exceeds the `max_payload_size` limit
2228+
"""
2229+
payloads = []
2230+
base_payload = {"annotations": [], "overwrite": payload["overwrite"]}
2231+
current_payload = copy.deepcopy(base_payload)
2232+
current_payload_size = 0
2233+
2234+
for annotation in payload["annotations"]:
2235+
annotation_size = len(json.dumps({"annotations": [annotation]}).encode("utf-8"))
2236+
if current_payload_size + annotation_size < max_payload_size:
2237+
current_payload["annotations"].append(annotation)
2238+
current_payload_size += annotation_size
2239+
else:
2240+
if annotation_size > max_payload_size:
2241+
raise ValueError(
2242+
f"One or more annotations exceed the maximum allowed size of 32 MiB ({max_payload_size})"
2243+
)
2244+
payloads.append(current_payload)
2245+
current_payload = copy.deepcopy(base_payload)
2246+
current_payload["overwrite"] = (
2247+
False # Required to make sure subsequent payloads don't overwrite previous ones
2248+
)
2249+
current_payload["annotations"].append(annotation)
2250+
current_payload_size = annotation_size
2251+
2252+
if current_payload["annotations"]:
2253+
payloads.append(current_payload)
2254+
2255+
return payloads

tests/darwin/importer/importer_test.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
_import_properties,
3535
_warn_for_annotations_with_multiple_instance_ids,
3636
_serialize_item_level_properties,
37+
_split_payloads,
3738
)
3839

3940

@@ -3473,3 +3474,62 @@ def test_serialize_item_level_properties_multiple_properties():
34733474
]
34743475

34753476
assert result == expected
3477+
3478+
3479+
def test__split_payloads_returns_multiple_payloads():
    annotations = [
        {"id": "annotation_1", "data": "data1"},
        {"id": "annotation_2", "data": "data2"},
        {"id": "annotation_3", "data": "data3"},
    ]
    payload = {"annotations": annotations, "overwrite": True}

    # 100 bytes is small enough that each annotation lands in its own chunk.
    chunks = _split_payloads(payload, 100)

    assert len(chunks) == 3
    for chunk, annotation in zip(chunks, annotations):
        assert chunk["annotations"] == [annotation]
3497+
3498+
def test__split_payloads_with_annotation_exceeding_size_limit():
    # First annotation is far larger than the 50-byte limit below.
    oversized_annotation = {"id": "annotation_1", "data": "a" * 1000}
    payload = {
        "annotations": [
            oversized_annotation,
            {"id": "annotation_2", "data": "data2"},
        ],
        "overwrite": True,
    }

    with pytest.raises(
        ValueError,
        match="One or more annotations exceed the maximum allowed size",
    ):
        _split_payloads(payload, 50)
3513+
3514+
3515+
def test__split_payloads_overwrites_on_first_payload_and_appends_on_the_rest():
    """
    When importing annotations, we need to respect the overwrite behaviour defined by the user.
    However, if we need to split payloads, all payloads after the first will have to be appended
    """
    payload = {
        "annotations": [
            {"id": f"annotation_{i}", "data": f"data{i}"} for i in (1, 2, 3)
        ],
        "overwrite": True,
    }

    chunks = _split_payloads(payload, 100)

    assert len(chunks) == 3
    # Only the first chunk may carry the caller's overwrite=True.
    assert chunks[0]["overwrite"]
    assert all(not chunk["overwrite"] for chunk in chunks[1:])

0 commit comments

Comments
 (0)