3 changes: 3 additions & 0 deletions darwin/cli.py
@@ -157,6 +157,9 @@ def _run(args: Namespace, parser: ArgumentParser) -> None:
args.w,
args.h,
)
# Annotation schema validation
elif args.command == "validate":
f.validate_schemas(location=args.location, pattern=args.pattern, silent=args.silent, output=args.output)


if __name__ == "__main__":
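The new `validate` branch simply forwards the parsed arguments to `cli_functions.validate_schemas`. A minimal sketch of the equivalent direct call (the folder path and report name are hypothetical):

```python
# Equivalent to running `darwin validate ./exports --silent --output report.json`,
# but calling the handler directly instead of going through argument parsing.
from pathlib import Path

import darwin.cli_functions as f

f.validate_schemas(
    location="./exports",        # file or folder to search for *.json files
    pattern=False,               # set True to treat `location` as a glob pattern
    silent=True,                 # only print files that have errors
    output=Path("report.json"),  # write the per-file error report here
)
```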
80 changes: 79 additions & 1 deletion darwin/cli_functions.py
@@ -1,9 +1,11 @@
import argparse
import concurrent.futures
import datetime
import json
import os
import sys
import traceback
from glob import glob
from itertools import tee
from pathlib import Path
from typing import Any, Dict, Iterator, List, NoReturn, Optional, Set, Union
@@ -34,12 +36,15 @@
from darwin.dataset.utils import get_release_path
from darwin.datatypes import ExportParser, ImportParser, PathLike, Team
from darwin.exceptions import (
AnnotationFileValidationError,
IncompatibleOptions,
InvalidLogin,
MissingConfig,
MissingSchema,
NameTaken,
NotFound,
Unauthenticated,
UnknownAnnotationFileSchema,
UnrecognizableFileEncoding,
UnsupportedExportFormat,
UnsupportedFileType,
@@ -55,6 +60,7 @@
persist_client_configuration,
prompt,
secure_continue_request,
validate_file_against_schema,
)


@@ -798,6 +804,10 @@ def dataset_import(
_error(str(e))
except UnrecognizableFileEncoding as e:
_error(str(e))
except UnknownAnnotationFileSchema as e:
_error(str(e))
except AnnotationFileValidationError as e:
_error(str(e))


def list_files(
@@ -939,7 +949,75 @@ def delete_files(dataset_slug: str, files: List[str], skip_user_confirmation: bo
except NotFound as e:
_error(f"No dataset with name '{e.name}'")
except:
_error(f"An error has occurred, please try again later.")
_error("An error has occurred, please try again later.")


def validate_schemas(
location: str,
pattern: bool = False,
silent: bool = False,
output: Optional[Path] = None,
) -> None:
"""
Validate function for the CLI. Takes one of 3 required key word arguments describing the location of files and prints and/or saves an output

Parameters
----------
location : str
str path to a folder or file location to search
pattern : bool, optional
glob style pattern matching, by default None
silent : bool, optional
flag to set silent console printing, only showing errors, by default False
output : Optional[Path], optional
filename for saving to output, by default None
"""

all_errors = {}
if pattern:
to_validate = [Path(filename) for filename in glob(location)]
elif os.path.isfile(location):
to_validate = [Path(location)]
elif os.path.isdir(location):
to_validate = [Path(filename) for filename in Path(location).glob("*.json")]
else:
to_validate = []

console = Console(theme=_console_theme(), stderr=True)

if not to_validate:
console.print("No files found to validate", style="warning")
return

console.print(f"Validating schemas for {len(to_validate)} files")

for file in to_validate:
try:
errors = [{"message": e.message, "location": e.json_path} for e in validate_file_against_schema(file)]
except MissingSchema as e:
errors = [{"message": e.message, "location": "schema link"}]

all_errors[str(file)] = errors
if not errors:
if not silent:
console.print(f"{str(file)}: No Errors", style="success")
continue
console.print(f"{str(file)}: {len(errors)} errors", style="error")
for error in errors:
console.print(f"\t- Problem found in {error['location']}", style="error")
console.print(f"\t\t- {error['message']}", style="error")

if output:
try:
filename: Path = output
if os.path.isdir(output):
filename = Path(os.path.join(output, "report.json"))
with open(filename, "w") as outfile:
json.dump(all_errors, outfile, indent=2)
console.print(f"Writing report to {filename}", style="success")
except Exception as e:
console.print(f"Error writing output file with {e}", style="error")
console.print("Did you supply an invalid filename?")


def dataset_convert(dataset_identifier: str, format: str, output_dir: Optional[PathLike] = None) -> None:
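Since `--output` writes the accumulated `all_errors` dict with `json.dump`, the report is a JSON object keyed by file path, each value a list of `{"message", "location"}` entries (an empty list means the file validated). A hedged sketch of consuming it, assuming a hypothetical `report.json`:

```python
import json
from pathlib import Path

report = json.loads(Path("report.json").read_text())
for file_path, errors in report.items():
    if not errors:
        continue  # file passed validation
    print(f"{file_path}: {len(errors)} error(s)")
    for error in errors:
        print(f"  {error['location']}: {error['message']}")
```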
3 changes: 3 additions & 0 deletions darwin/datatypes.py
@@ -286,6 +286,9 @@ class AnnotationFileVersion:
minor: int = 0
suffix: str = ""

def __str__(self) -> str:
return f"{self.major}.{self.minor}{self.suffix}"


@dataclass
class AnnotationFile:
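The new `__str__` keeps version reporting human-readable, which the `UnknownAnnotationFileSchema` message below relies on. A small illustrative check, assuming the dataclass exposes `major`, `minor`, and `suffix` fields as the diff suggests (the beta suffix is made up):

```python
from darwin.datatypes import AnnotationFileVersion

print(str(AnnotationFileVersion(major=2, minor=0, suffix="")))       # "2.0"
print(str(AnnotationFileVersion(major=1, minor=0, suffix="-beta")))  # "1.0-beta"
```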
89 changes: 86 additions & 3 deletions darwin/exceptions.py
@@ -1,4 +1,11 @@
from pathlib import Path
from textwrap import dedent
from typing import List

from jsonschema.exceptions import ValidationError as jscValidationError

from darwin.datatypes import AnnotationFileVersion


class IncompatibleOptions(Exception):
@@ -146,6 +153,84 @@ class RequestEntitySizeExceeded(Exception):
"""


class MissingSchema(Exception):
"""
Used to indicate a problem loading or finding the schema
"""

def __init__(self, message: str):
"""_summary_

Parameters
----------
message : str
Message to propagate up the stack
"""
self.message = message

def __str__(self) -> str:
return self.message


class AnnotationFileValidationError(Exception):
"""
Used to indicate an error while validating JSON annotation files.
"""

def __init__(self, parent_error: jscValidationError, file_path: Path):
"""
Parameters
----------
parent_error: ValidationError
Error reported by ``jsonschema``.
file_path: Path
Path to annotation file that failed to validate.
"""
self.parent_error = parent_error
self.file_path = file_path

def __str__(self) -> str:
return f"Unable to verify annotation file: '{self.file_path}'\n\n{self.parent_error.__str__()}".rstrip()


class UnknownAnnotationFileSchema(Exception):
"""
Used to indicate an error when inferring the schema of a JSON annotation file.
"""

def __init__(
self, file_path: Path, supported_versions: List[AnnotationFileVersion], detected_version: AnnotationFileVersion
):
"""
Parameters
----------
file_path: Path
Path to annotation file that failed to validate.

supported_versions: List[AnnotationFileVersion]
Annotation file versions for which a native schema is available.

detected_version: AnnotationFileVersion
Version detected in the given annotation file.
"""
self.file_path = file_path
self.detected_version = detected_version
self.supported_versions = list(map(str, supported_versions))

def __str__(self) -> str:
return dedent(
f"""\
Unable to find JSON schema for annotation file: '{self.file_path}'

The given annotation file should have either:
* an optional `schema_ref` field with a URL to a JSON schema
* a `version` field set to one of the natively supported versions: {self.supported_versions}

Detected annotation file version is: '{self.detected_version}'.
"""
)


class UnknownExportVersion(Exception):
"""Used when dataset version is not recognized."""

@@ -177,9 +262,7 @@ def __init__(self, import_type: str, annotation_type: str):
annotation_type: str
The unsupported annotation type.
"""
super().__init__(
f"Unsupported annotation type {annotation_type} for {import_type} import"
)
super().__init__(f"Unsupported annotation type {annotation_type} for {import_type} import")
self.import_type = import_type
self.annotation_type = annotation_type

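For reference, a hedged sketch (the file path and versions are hypothetical) of what the new exception's message looks like when no schema can be inferred:

```python
from pathlib import Path

from darwin.datatypes import AnnotationFileVersion
from darwin.exceptions import UnknownAnnotationFileSchema

err = UnknownAnnotationFileSchema(
    file_path=Path("exports/image_1.json"),
    supported_versions=[AnnotationFileVersion(major=2, minor=0, suffix="")],
    detected_version=AnnotationFileVersion(major=1, minor=0, suffix=""),
)
print(err)  # lists the file, the natively supported schema versions, and the detected version
```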
18 changes: 18 additions & 0 deletions darwin/options.py
@@ -45,6 +45,24 @@ def __init__(self):

parser_convert.add_argument("output_dir", type=str, help="Where to store output files.")

# VALIDATE SCHEMA
parser_validate_schema = subparsers.add_parser(
"validate", help="Validate annotation files against Darwin schema"
)
parser_validate_schema.add_argument(
"location",
help="Location of file/folder to validate. Accepts single files or a folder to search *.json files",
)
parser_validate_schema.add_argument(
"--pattern",
action="store_true",
help="'location' is a Folder + File glob style pattern to search (eg: ./*.json)",
)

parser_validate_schema.add_argument(
"--silent", action="store_true", help="Flag to suppress all output except errors to console"
)
parser_validate_schema.add_argument("--output", help="name of file to write output json to")
# DATASET
dataset = subparsers.add_parser(
"dataset", help="Dataset related functions.", description="Arguments to interact with datasets"
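A hedged illustration (paths hypothetical) of how `location` is interpreted with and without `--pattern`: without the flag a folder is searched one level deep for `*.json`, with the flag the string is handed to `glob()` unchanged.

```python
from glob import glob
from pathlib import Path

# darwin validate ./exports
folder_files = list(Path("./exports").glob("*.json"))

# darwin validate "./exports/batch_*.json" --pattern
pattern_files = [Path(p) for p in glob("./exports/batch_*.json")]
```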
79 changes: 67 additions & 12 deletions darwin/utils.py
@@ -21,13 +21,21 @@
import deprecation
import numpy as np
import orjson as json
from requests import Response
import requests
from jsonschema import exceptions, validators
from requests import Response, request
from rich.progress import ProgressType, track
from upolygon import draw_polygon

import darwin.datatypes as dt
from darwin.config import Config
from darwin.exceptions import OutdatedDarwinJSONFormat, UnsupportedFileType
from darwin.exceptions import (
AnnotationFileValidationError,
MissingSchema,
OutdatedDarwinJSONFormat,
UnknownAnnotationFileSchema,
UnsupportedFileType,
)
from darwin.version import __version__

if TYPE_CHECKING:
@@ -48,6 +56,8 @@
]
SUPPORTED_EXTENSIONS = SUPPORTED_IMAGE_EXTENSIONS + SUPPORTED_VIDEO_EXTENSIONS

_darwin_schema_cache = {}


def is_extension_allowed_by_filename(filename: str) -> bool:
"""
@@ -337,6 +347,43 @@ def _get_local_filename(metadata: Dict[str, Any]) -> str:
return metadata["filename"]


def _get_schema(data: dict) -> Optional[dict]:
version = _parse_version(data)
schema_url = data.get("schema_ref") or _default_schema(version)
if not schema_url:
return None
if schema_url not in _darwin_schema_cache:
response = requests.get(schema_url)
response.raise_for_status()
schema = response.json()
_darwin_schema_cache[schema_url] = schema
return _darwin_schema_cache[schema_url]


def validate_file_against_schema(path: Path) -> List:
data, _ = load_data_from_file(path)
return validate_data_against_schema(data)


def validate_data_against_schema(data) -> List:
try:
schema = _get_schema(data)
except requests.exceptions.RequestException as e:
raise MissingSchema(f"Error retrieving schema from url: {e}")
if not schema:
raise MissingSchema("Schema not found")
validator = validators.Draft202012Validator(schema)
errors = list(validator.iter_errors(data))
return errors


def load_data_from_file(path: Path):
with path.open() as infile:
data = json.loads(infile.read())
version = _parse_version(data)
return data, version
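
A minimal sketch (hypothetical file path) of using these helpers directly, mirroring what `validate_schemas` does per file:

```python
from pathlib import Path

from darwin.exceptions import MissingSchema
from darwin.utils import validate_file_against_schema

try:
    errors = validate_file_against_schema(Path("exports/image_1.json"))
except MissingSchema as e:
    print(f"Could not resolve a schema: {e}")
else:
    for error in errors:  # jsonschema ValidationError objects
        print(error.json_path, error.message)
```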


def parse_darwin_json(path: Path, count: Optional[int]) -> Optional[dt.AnnotationFile]:
"""
Parses the given JSON file in v7's darwin proprietary format. Works for images, split frame
@@ -363,18 +410,18 @@ def parse_darwin_json(path: Path, count: Optional[int]) -> Optional[dt.Annotatio
"""

path = Path(path)
with path.open() as f:
data = json.loads(f.read())
if "annotations" not in data:
return None

if _parse_version(data).major == 2:
return _parse_darwin_v2(path, data)
data, version = load_data_from_file(path)
if "annotations" not in data:
return None

if version.major == 2:
return _parse_darwin_v2(path, data)
else:
if "fps" in data["image"] or "frame_count" in data["image"]:
return _parse_darwin_video(path, data, count)
else:
if "fps" in data["image"] or "frame_count" in data["image"]:
return _parse_darwin_video(path, data, count)
else:
return _parse_darwin_image(path, data, count)
return _parse_darwin_image(path, data, count)


def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
Expand Down Expand Up @@ -995,3 +1042,11 @@ def _data_to_annotations(data: Dict[str, Any]) -> List[Union[dt.Annotation, dt.V
filter(None, map(_parse_darwin_video_annotation, raw_video_annotations))
)
return [*image_annotations, *video_annotations]


def _supported_schema_versions():
return {(2, 0, ""): "https://darwin-public.s3.eu-west-1.amazonaws.com/darwin_json/2.0/schema.json"}


def _default_schema(version: dt.AnnotationFileVersion):
return _supported_schema_versions().get((version.major, version.minor, version.suffix))
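
Illustrative only: how a parsed file version resolves to a schema URL through these helpers. The 2.0 URL is the one registered above; for any other version `_get_schema` can only use the file's own `schema_ref` field.

```python
import darwin.datatypes as dt
from darwin.utils import _default_schema

print(_default_schema(dt.AnnotationFileVersion(major=2, minor=0, suffix="")))
# -> https://darwin-public.s3.eu-west-1.amazonaws.com/darwin_json/2.0/schema.json

print(_default_schema(dt.AnnotationFileVersion(major=1, minor=0, suffix="")))
# -> None (no native schema; _get_schema then depends on the file providing schema_ref)
```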