3 changes: 3 additions & 0 deletions darwin/cli.py
@@ -157,6 +157,9 @@ def _run(args: Namespace, parser: ArgumentParser) -> None:
args.w,
args.h,
)
# Annotation schema validation
elif args.command == "validate":
f.validate_schemas(location=args.location, pattern=args.pattern, silent=args.silent, output=args.output)


if __name__ == "__main__":
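The new `validate` branch simply forwards the parsed arguments to `cli_functions.validate_schemas`. A minimal sketch of the equivalent direct call (the folder path and report name are hypothetical):

```python
# Equivalent to running `darwin validate ./exports --silent --output report.json`,
# but calling the handler directly instead of going through argument parsing.
from pathlib import Path

import darwin.cli_functions as f

f.validate_schemas(
    location="./exports",        # file or folder to search for *.json files
    pattern=False,               # set True to treat `location` as a glob pattern
    silent=True,                 # only print files that have errors
    output=Path("report.json"),  # write the per-file error report here
)
```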
80 changes: 79 additions & 1 deletion darwin/cli_functions.py
@@ -1,9 +1,11 @@
import argparse
import concurrent.futures
import datetime
import json
import os
import sys
import traceback
from glob import glob
from itertools import tee
from pathlib import Path
from typing import Any, Dict, Iterator, List, NoReturn, Optional, Set, Union
@@ -34,12 +36,15 @@
from darwin.dataset.utils import get_release_path
from darwin.datatypes import ExportParser, ImportParser, PathLike, Team
from darwin.exceptions import (
AnnotationFileValidationError,
IncompatibleOptions,
InvalidLogin,
MissingConfig,
MissingSchema,
NameTaken,
NotFound,
Unauthenticated,
UnknownAnnotationFileSchema,
UnrecognizableFileEncoding,
UnsupportedExportFormat,
UnsupportedFileType,
@@ -55,6 +60,7 @@
persist_client_configuration,
prompt,
secure_continue_request,
validate_file_against_schema,
)


@@ -798,6 +804,10 @@ def dataset_import(
_error(str(e))
except UnrecognizableFileEncoding as e:
_error(str(e))
except UnknownAnnotationFileSchema as e:
_error(str(e))
except AnnotationFileValidationError as e:
_error(str(e))


def list_files(
@@ -939,7 +949,75 @@ def delete_files(dataset_slug: str, files: List[str], skip_user_confirmation: bo
except NotFound as e:
_error(f"No dataset with name '{e.name}'")
except:
_error(f"An error has occurred, please try again later.")
_error("An error has occurred, please try again later.")


def validate_schemas(
location: str,
pattern: bool = False,
silent: bool = False,
output: Optional[Path] = None,
) -> None:
"""
Validate function for the CLI. Takes one of 3 required key word arguments describing the location of files and prints and/or saves an output

Parameters
----------
location : str
str path to a folder or file location to search
pattern : bool, optional
glob style pattern matching, by default None
silent : bool, optional
flag to set silent console printing, only showing errors, by default False
output : Optional[Path], optional
filename for saving to output, by default None
"""

all_errors = {}
if pattern:
to_validate = [Path(filename) for filename in glob(location)]
elif os.path.isfile(location):
to_validate = [Path(location)]
elif os.path.isdir(location):
to_validate = [Path(filename) for filename in Path(location).glob("*.json")]
else:
to_validate = []

console = Console(theme=_console_theme(), stderr=True)

if not to_validate:
console.print("No files found to validate", style="warning")
return

console.print(f"Validating schemas for {len(to_validate)} files")

for file in to_validate:
try:
errors = [{"message": e.message, "location": e.json_path} for e in validate_file_against_schema(file)]
except MissingSchema as e:
errors = [{"message": e.message, "location": "schema link"}]

all_errors[str(file)] = errors
if not errors:
if not silent:
console.print(f"{str(file)}: No Errors", style="success")
continue
console.print(f"{str(file)}: {len(errors)} errors", style="error")
for error in errors:
console.print(f"\t- Problem found in {error['location']}", style="error")
console.print(f"\t\t- {error['message']}", style="error")

if output:
try:
filename: Path = output
if os.path.isdir(output):
filename = Path(os.path.join(output, "report.json"))
with open(filename, "w") as outfile:
json.dump(all_errors, outfile, indent=2)
console.print(f"Writing report to {filename}", style="success")
except Exception as e:
console.print(f"Error writing output file with {e}", style="error")
console.print("Did you supply an invalid filename?")


def dataset_convert(dataset_identifier: str, format: str, output_dir: Optional[PathLike] = None) -> None:
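Since `--output` writes the accumulated `all_errors` dict with `json.dump`, the report is a JSON object keyed by file path, each value a list of `{"message", "location"}` entries (an empty list means the file validated). A hedged sketch of consuming it, assuming a hypothetical `report.json`:

```python
import json
from pathlib import Path

report = json.loads(Path("report.json").read_text())
for file_path, errors in report.items():
    if not errors:
        continue  # file passed validation
    print(f"{file_path}: {len(errors)} error(s)")
    for error in errors:
        print(f"  {error['location']}: {error['message']}")
```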
3 changes: 3 additions & 0 deletions darwin/datatypes.py
@@ -286,6 +286,9 @@ class AnnotationFileVersion:
minor: int = 0
suffix: str = ""

def __str__(self) -> str:
return f"{self.major}.{self.minor}{self.suffix}"


@dataclass
class AnnotationFile:
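The new `__str__` keeps version reporting human-readable, which the `UnknownAnnotationFileSchema` message below relies on. A small illustrative check, assuming the dataclass exposes `major`, `minor`, and `suffix` fields as the diff suggests (the beta suffix is made up):

```python
from darwin.datatypes import AnnotationFileVersion

print(str(AnnotationFileVersion(major=2, minor=0, suffix="")))       # "2.0"
print(str(AnnotationFileVersion(major=1, minor=0, suffix="-beta")))  # "1.0-beta"
```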
89 changes: 86 additions & 3 deletions darwin/exceptions.py
@@ -1,4 +1,11 @@
from pathlib import Path
from textwrap import dedent
from typing import List

from jsonschema.exceptions import ValidationError as jscValidationError

from darwin.datatypes import AnnotationFileVersion


class IncompatibleOptions(Exception):
@@ -146,6 +153,84 @@ class RequestEntitySizeExceeded(Exception):
"""


class MissingSchema(Exception):
"""
Used to indicate a problem loading or finding the schema
"""

def __init__(self, message: str):
"""_summary_

Parameters
----------
message : str
Message to propagate up the stack
"""
self.message = message

def __str__(self) -> str:
return self.message


class AnnotationFileValidationError(Exception):
"""
Used to indicate an error while validating JSON annotation files.
"""

def __init__(self, parent_error: jscValidationError, file_path: Path):
"""
Parameters
----------
parent_error: ValidationError
Error reported by ``jsonschema``.
file_path: Path
Path to annotation file that failed to validate.
"""
self.parent_error = parent_error
self.file_path = file_path

def __str__(self) -> str:
return f"Unable to verify annotation file: '{self.file_path}'\n\n{self.parent_error.__str__()}".rstrip()


class UnknownAnnotationFileSchema(Exception):
"""
Used to indicate an error when inferring the schema of a JSON annotation file.
"""

def __init__(
self, file_path: Path, supported_versions: List[AnnotationFileVersion], detected_version: AnnotationFileVersion
):
"""
Parameters
----------
file_path: Path
Path to annotation file that failed to validate.

supported_versions: List[AnnotationFileVersion]
Annotation file versions for which a native schema is available.

detected_version: AnnotationFileVersion
Version detected in the given annotation file.
"""
self.file_path = file_path
self.detected_version = detected_version
self.supported_versions = list(map(str, supported_versions))

def __str__(self) -> str:
return dedent(
f"""\
Unable to find JSON schema for annotation file: '{self.file_path}'

The given annotation file should have either:
* an optional `schema_ref` field with a URL to a JSON schema
* a `version` field set to one of the natively supported versions: {self.supported_versions}

Detected annotation file version is: '{self.detected_version}'.
"""
)


class UnknownExportVersion(Exception):
"""Used when dataset version is not recognized."""

@@ -177,9 +262,7 @@ def __init__(self, import_type: str, annotation_type: str):
annotation_type: str
The unsupported annotation type.
"""
super().__init__(
f"Unsupported annotation type {annotation_type} for {import_type} import"
)
super().__init__(f"Unsupported annotation type {annotation_type} for {import_type} import")
self.import_type = import_type
self.annotation_type = annotation_type

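For reference, a hedged sketch (the file path and versions are hypothetical) of what the new exception's message looks like when no schema can be inferred:

```python
from pathlib import Path

from darwin.datatypes import AnnotationFileVersion
from darwin.exceptions import UnknownAnnotationFileSchema

err = UnknownAnnotationFileSchema(
    file_path=Path("exports/image_1.json"),
    supported_versions=[AnnotationFileVersion(major=2, minor=0, suffix="")],
    detected_version=AnnotationFileVersion(major=1, minor=0, suffix=""),
)
print(err)  # lists the file, the natively supported schema versions, and the detected version
```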
18 changes: 18 additions & 0 deletions darwin/options.py
@@ -45,6 +45,24 @@ def __init__(self):

parser_convert.add_argument("output_dir", type=str, help="Where to store output files.")

# VALIDATE SCHEMA
parser_validate_schema = subparsers.add_parser(
"validate", help="Validate annotation files against Darwin schema"
)
parser_validate_schema.add_argument(
"location",
help="Location of file/folder to validate. Accepts single files or a folder to search *.json files",
)
parser_validate_schema.add_argument(
"--pattern",
action="store_true",
help="'location' is a Folder + File glob style pattern to search (eg: ./*.json)",
)

parser_validate_schema.add_argument(
"--silent", action="store_true", help="Flag to suppress all output except errors to console"
)
parser_validate_schema.add_argument("--output", help="name of file to write output json to")
# DATASET
dataset = subparsers.add_parser(
"dataset", help="Dataset related functions.", description="Arguments to interact with datasets"
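A hedged illustration (paths hypothetical) of how `location` is interpreted with and without `--pattern`: without the flag a folder is searched one level deep for `*.json`, with the flag the string is handed to `glob()` unchanged.

```python
from glob import glob
from pathlib import Path

# darwin validate ./exports
folder_files = list(Path("./exports").glob("*.json"))

# darwin validate "./exports/batch_*.json" --pattern
pattern_files = [Path(p) for p in glob("./exports/batch_*.json")]
```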
79 changes: 67 additions & 12 deletions darwin/utils.py
@@ -21,13 +21,21 @@
import deprecation
import numpy as np
import orjson as json
from requests import Response
import requests
from jsonschema import exceptions, validators
from requests import Response, request
from rich.progress import ProgressType, track
from upolygon import draw_polygon

import darwin.datatypes as dt
from darwin.config import Config
from darwin.exceptions import OutdatedDarwinJSONFormat, UnsupportedFileType
from darwin.exceptions import (
AnnotationFileValidationError,
MissingSchema,
OutdatedDarwinJSONFormat,
UnknownAnnotationFileSchema,
UnsupportedFileType,
)
from darwin.version import __version__

if TYPE_CHECKING:
@@ -48,6 +56,8 @@
]
SUPPORTED_EXTENSIONS = SUPPORTED_IMAGE_EXTENSIONS + SUPPORTED_VIDEO_EXTENSIONS

_darwin_schema_cache = {}


def is_extension_allowed_by_filename(filename: str) -> bool:
"""
@@ -337,6 +347,43 @@ def _get_local_filename(metadata: Dict[str, Any]) -> str:
return metadata["filename"]


def _get_schema(data: dict) -> Optional[dict]:
version = _parse_version(data)
schema_url = data.get("schema_ref") or _default_schema(version)
if not schema_url:
return None
if schema_url not in _darwin_schema_cache:
response = requests.get(schema_url)
response.raise_for_status()
schema = response.json()
_darwin_schema_cache[schema_url] = schema
return _darwin_schema_cache[schema_url]


def validate_file_against_schema(path: Path) -> List:
data, _ = load_data_from_file(path)
return validate_data_against_schema(data)


def validate_data_against_schema(data) -> List:
try:
schema = _get_schema(data)
except requests.exceptions.RequestException as e:
raise MissingSchema(f"Error retrieving schema from url: {e}")
if not schema:
raise MissingSchema("Schema not found")
validator = validators.Draft202012Validator(schema)
errors = list(validator.iter_errors(data))
return errors


def load_data_from_file(path: Path):
with path.open() as infile:
data = json.loads(infile.read())
version = _parse_version(data)
return data, version
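
A minimal sketch (hypothetical file path) of using these helpers directly, mirroring what `validate_schemas` does per file:

```python
from pathlib import Path

from darwin.exceptions import MissingSchema
from darwin.utils import validate_file_against_schema

try:
    errors = validate_file_against_schema(Path("exports/image_1.json"))
except MissingSchema as e:
    print(f"Could not resolve a schema: {e}")
else:
    for error in errors:  # jsonschema ValidationError objects
        print(error.json_path, error.message)
```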


def parse_darwin_json(path: Path, count: Optional[int]) -> Optional[dt.AnnotationFile]:
"""
Parses the given JSON file in v7's darwin proprietary format. Works for images, split frame
@@ -363,18 +410,18 @@ def parse_darwin_json(path: Path, count: Optional[int]) -> Optional[dt.Annotatio
"""

path = Path(path)
with path.open() as f:
data = json.loads(f.read())
if "annotations" not in data:
return None

if _parse_version(data).major == 2:
return _parse_darwin_v2(path, data)
data, version = load_data_from_file(path)
if "annotations" not in data:
return None

if version.major == 2:
return _parse_darwin_v2(path, data)
else:
if "fps" in data["image"] or "frame_count" in data["image"]:
return _parse_darwin_video(path, data, count)
else:
if "fps" in data["image"] or "frame_count" in data["image"]:
return _parse_darwin_video(path, data, count)
else:
return _parse_darwin_image(path, data, count)
return _parse_darwin_image(path, data, count)


def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
Expand Down Expand Up @@ -995,3 +1042,11 @@ def _data_to_annotations(data: Dict[str, Any]) -> List[Union[dt.Annotation, dt.V
filter(None, map(_parse_darwin_video_annotation, raw_video_annotations))
)
return [*image_annotations, *video_annotations]


def _supported_schema_versions():
return {(2, 0, ""): "https://darwin-public.s3.eu-west-1.amazonaws.com/darwin_json/2.0/schema.json"}


def _default_schema(version: dt.AnnotationFileVersion):
return _supported_schema_versions().get((version.major, version.minor, version.suffix))
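
Illustrative only: how a parsed file version resolves to a schema URL through these helpers. The 2.0 URL is the one registered above; for any other version `_get_schema` can only use the file's own `schema_ref` field.

```python
import darwin.datatypes as dt
from darwin.utils import _default_schema

print(_default_schema(dt.AnnotationFileVersion(major=2, minor=0, suffix="")))
# -> https://darwin-public.s3.eu-west-1.amazonaws.com/darwin_json/2.0/schema.json

print(_default_schema(dt.AnnotationFileVersion(major=1, minor=0, suffix="")))
# -> None (no native schema; _get_schema then depends on the file providing schema_ref)
```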