diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 6dac4d5..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b10085
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.pyc
+*.py[cod]
+*$py.class
+*mypy*
+*.egg*
+*DS_Store*
+envs/
+dist/
+*.egg-info
+tasks/
+*.dump
+*rdb
+MnesiaCore.rabbit*
+kubernetes/config
+#static/
+# C extensions
+nohup.out
+*.so
+minikube-darwin-amd64
+
+# Distribution / packaging
+.Python
+src/staticfiles/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+ml_models/
+
+# Translations
+*.mo
+*.pot
+*.env
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+sample_large_files
+sample_tif_files
+
+# Vscode settings folder
+.vscode/
+
+# python virtual env for SDK testing
+venv_*
+pylint.html
+_mask.c
+tags
+debug_*
+*.idea
+*.DS_Store
+htmlcov
diff --git a/superannotate-databricks-connector/Dockerfile.spark b/Dockerfile.spark
similarity index 100%
rename from superannotate-databricks-connector/Dockerfile.spark
rename to Dockerfile.spark
diff --git a/superannotate-databricks-connector/Dockerfile.test b/Dockerfile.test
similarity index 63%
rename from superannotate-databricks-connector/Dockerfile.test
rename to Dockerfile.test
index b6e7b74..a698d34 100644
--- a/superannotate-databricks-connector/Dockerfile.test
+++ b/Dockerfile.test
@@ -1,18 +1,18 @@
 FROM spark_docker_v2
 
-# Add neccessary data
-ADD superannotate_databricks_connector superannotate_databricks_connector
-ADD setup.py setup.py
-ADD tests tests
-
 # Build the package
 RUN python setup.py sdist bdist_wheel
 
 # Add the distribution
-ADD dist dist
+COPY src src
+
+RUN python -m build src
 
 # Install the package
 RUN pip install dist/*.whl
 
+# Add necessary data
+ADD tests tests
+
 # Run unit tests
 RUN python -m unittest discover tests
\ No newline at end of file
diff --git a/LICENSE b/LICENSE.txt
similarity index 100%
rename from LICENSE
rename to LICENSE.txt
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..854dc61
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,50 @@
+[project]
+
+name = "superannotate_databricks_connector" # Required
+
+version = "0.0.1dev1"
+
+description = "Custom functions to work with SuperAnnotate in Databricks"
+
+readme = "README.md"
+
+requires-python = ">=3.8"
+
+license = { file = "LICENSE.txt" }
+
+keywords = ["superannotate_databricks_connector", "superannotate"]
+
+authors = [
+    { name = "Leo Lindén", email = "leo@superannotate.com" }
+]
+
+maintainers = [
+    { name = "Leo Lindén", email = "leo@superannotate.com" }
+]
+
+classifiers = [# Optional
+    "Intended Audience :: Developers",
+    "Topic :: Software Development :: Build Tools",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3 :: Only",
+]
+
+dependencies = [
+    "pyspark~=3.4.0"
+]
+
+[project.optional-dependencies]
+test = ["pytest"]
+
+[project.urls] # Optional
+"Homepage" = "https://github.com/superannotateai/superannotate-databricks-connector"
+"Bug Reports" = "https://github.com/superannotateai/superannotate-databricks-connector/issues"
+"Source" = "https://github.com/superannotateai/superannotate-databricks-connector/"
+
+[build-system]
+requires = ["setuptools>=43.0.0", "wheel"]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/src/superannotate_databricks_connector/__init__.py b/src/superannotate_databricks_connector/__init__.py
new file mode 100644
index 0000000..ed62a13
--- /dev/null
+++ b/src/superannotate_databricks_connector/__init__.py
@@ -0,0 +1,10 @@
+from superannotate_databricks_connector import schemas
+from superannotate_databricks_connector import text
+from superannotate_databricks_connector import vector
+
+
+__all__ = [
+    'schemas',
+    'text',
+    'vector'
+]
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/__init__.py b/src/superannotate_databricks_connector/schemas/__init__.py
similarity index 100%
rename from superannotate-databricks-connector/superannotate_databricks_connector/__init__.py
rename to src/superannotate_databricks_connector/schemas/__init__.py
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/schemas/comment.py b/src/superannotate_databricks_connector/schemas/comment.py
similarity index 71%
rename from superannotate-databricks-connector/superannotate_databricks_connector/schemas/comment.py
rename to src/superannotate_databricks_connector/schemas/comment.py
index 360bc1f..7a9f16c 100644
--- a/superannotate-databricks-connector/superannotate_databricks_connector/schemas/comment.py
+++ b/src/superannotate_databricks_connector/schemas/comment.py
@@ -13,22 +13,22 @@
 def get_comment_schema():
     comment_schema = StructType([
         StructField("correspondence", ArrayType(MapType(
-            StringType(),
-            StringType())),
+            StringType(),
+            StringType())),
             True),
         StructField("x", FloatType(), True),
         StructField("y", FloatType(), True),
         StructField("resolved", BooleanType(), True),
         StructField("createdAt", StringType(), True),
         StructField("createdBy", MapType(
-            StringType(),
-            StringType()),
+            StringType(),
+            StringType()),
             True),
         StructField("creationType", StringType(), True),
         StructField("updatedAt", StringType(), True),
         StructField("updatedBy", MapType(
-            StringType(),
-            StringType()),
+            StringType(),
+            StringType()),
             True)
     ])
-    return comment_schema
\ No newline at end of file
+    return comment_schema
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/schemas/text_schema.py b/src/superannotate_databricks_connector/schemas/text_schema.py
similarity index 100%
rename from superannotate-databricks-connector/superannotate_databricks_connector/schemas/text_schema.py
rename to src/superannotate_databricks_connector/schemas/text_schema.py
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/schemas/vector_schema.py b/src/superannotate_databricks_connector/schemas/vector_schema.py
similarity index 100%
rename from superannotate-databricks-connector/superannotate_databricks_connector/schemas/vector_schema.py
rename to src/superannotate_databricks_connector/schemas/vector_schema.py
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/text.py b/src/superannotate_databricks_connector/text.py
similarity index 92%
rename from superannotate-databricks-connector/superannotate_databricks_connector/text.py
rename to src/superannotate_databricks_connector/text.py
index a313d0d..a93d642 100644
--- a/superannotate-databricks-connector/superannotate_databricks_connector/text.py
+++ b/src/superannotate_databricks_connector/text.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from .schemas.text_schema import get_text_schema
+from superannotate_databricks_connector.schemas.text_schema import get_text_schema
 
 
 def convert_dates(instance):
@@ -45,7 +45,7 @@ def get_text_dataframe(annotations, spark):
             "status": item["metadata"]["status"],
             "annotatorEmail": item["metadata"]["annotatorEmail"],
             "qaEmail": item["metadata"]["qaEmail"],
-            "entities": [convert_dates(instance) for instance
+            "entities": [convert_dates(instance) for instance
                          in item["instances"] if instance["type"] == "entity"],
             "tags": [convert_dates(instance) for instance
                      in item["instances"] if instance["type"] == "tag"]
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/vector.py b/src/superannotate_databricks_connector/vector.py
similarity index 98%
rename from superannotate-databricks-connector/superannotate_databricks_connector/vector.py
rename to src/superannotate_databricks_connector/vector.py
index 4509cd5..e8d818c 100644
--- a/superannotate-databricks-connector/superannotate_databricks_connector/vector.py
+++ b/src/superannotate_databricks_connector/vector.py
@@ -1,4 +1,4 @@
-from .schemas.vector_schema import get_vector_schema
+from superannotate_databricks_connector.schemas.vector_schema import get_vector_schema
 
 
 def process_comment(comment):
diff --git a/superannotate-databricks-connector/.DS_Store b/superannotate-databricks-connector/.DS_Store
deleted file mode 100644
index cbf4a7b..0000000
Binary files a/superannotate-databricks-connector/.DS_Store and /dev/null differ
diff --git a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/__init__.py b/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/__init__.py
deleted file mode 100644
index 0a141ae..0000000
--- a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .io import *
-from .vector import *
-from .schemas import *
\ No newline at end of file
diff --git a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/io.py b/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/io.py
deleted file mode 100644
index a263dde..0000000
--- a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/io.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from delta.tables import DeltaTable
-from pyspark.sql import DataFrame, SparkSession
-import logging
-
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-
-def write_annotations_to_delta(annotations: DataFrame, database: str, table: str, spark: SparkSession) -> None:
-    """
-    Takes a list of annotations and writes them to the specified delta table
-
-    Args:
-        annotations (dataframe): Spark dataframe with annotations
-        database (string): The name of the database to write to
-        table (string): The name of the delta table to write to
-        spark (sparkContext): The spark context
-        create_if_not_exists (Bool): Should the function create the delta table if it does not exists, or raise an exception,
-
-    Returns:
-        None
-    """
-    logging.info('Writing annotations to delta...')
-    # Set the database and table names
-    database_name = database
-    table_name = f"{database_name}.{table}"
-
-    # Check if the table exists
-    table_exists = spark.catalog.tableExists(table_name)
-
-    if not table_exists:
-        if create_if_not_exists:
-            # If the table does not exist, create the database if it does not exist
-            logging.warning(f"{database_name}.{table} not found.")
-            logging.info("Creating new table..." )
-            spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")
-            # Write the DataFrame as a new Delta table in the specified schema and partitioned by projectId
-            annotations.write.format("delta").partitionBy("projectId").mode("overwrite").saveAsTable(table_name)
-        else:
-            logging.warning(f"{database_name}.{table} not found.")
-            raise Exception(f"{database_name}.{table} not found. 
To create table set the parameter create_if_not_exists=True") - else: - # If the table exists - logging.info("Merging new records..." ) - - # Read the managed Delta table - delta_table = DeltaTable.forName(spark, table_name) - - # Define the merge condition using unique identifiers - merge_condition = "source.image_name = target.image_name AND source.projectId = target.projectId" - - # Merge the DataFrame into the existing managed Delta table - delta_table.alias("target")\ - .merge(annotations.alias("source"), merge_condition)\ - .whenMatchedUpdateAll()\ - .whenNotMatchedInsertAll()\ - .execute() diff --git a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/processing.py b/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/processing.py deleted file mode 100644 index 5375a44..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/processing.py +++ /dev/null @@ -1,125 +0,0 @@ -from .schemas import get_vector_schema - - -def process_comment(comment): - comment["x"] = float(comment.get("x")) - comment["y"] = float(comment.get("y")) - return comment - - -def process_vector_instance(instance, custom_id_map=None): - """ - Takes one annotation instance and unpacks it. - - Args: - instance (dict). One instance from the SuperAnnotate annotation format. - - Returns: - (dict) reformated instance - """ - return {'instance_type': instance['type'], - 'classId': instance["classId"] if custom_id_map is None else - custom_id_map.get(instance["className"]), - 'probability': instance.get('probability'), - 'bbox_points': {k: float(v) for k, v in - instance['points'].items()} - if instance["type"] == "bbox" else None, - 'polygon_points': [float(p) for p in instance['points']] - if instance["type"] == "polygon" else None, - 'polygon_exclude': instance["exclude"] - if instance["type"] == "polygon" else None, - 'point_points': {"x": float(instance["x"]), - "y": float(instance["y"])} - if instance["type"] == "point" else None, - 'ellipse_points': {"cx": float(instance["cx"]), - "cy": float(instance["cy"]), - "rx": float(instance["rx"]), - "ry": float(instance["ry"]), - "angle": float(instance["angle"])} - if instance["type"] == "ellipse" else None, - 'cuboid_points': {outer_k: {inner_k: float(inner_v) - for inner_k, inner_v in outer_v.items() - } for outer_k, outer_v - in instance['points'].items()} - if instance["type"] == "cuboid" else None, - 'groupId': instance['groupId'], - 'locked': instance.get('locked'), - 'attributes': instance['attributes'], - 'trackingId': instance.get('trackingId'), - 'error': instance.get('error'), - 'createdAt': instance.get('createdAt'), - 'createdBy': instance.get('createdBy'), - 'creationType': instance.get('creationType'), - 'updatedAt': instance.get('updatedAt'), - 'updatedBy': instance.get('updatedBy'), - 'className': instance.get('className') - } - - -def process_bounding_box(bbox, custom_id_map=None): - """XYXY class - keras_cv.bounding_box.XYXY() - XYXY contains axis indices for the XYXY format. - - All values in the XYXY format should be absolute pixel values. 
- - The XYXY format consists of the following required indices: - - LEFT: left of the bounding box - TOP: top of the bounding box - RIGHT: right of the bounding box - BOTTOM: bottom of the bounding box""" - - object_box = [int(x) for x in [bbox["points"]["x1"], - bbox["points"]["y1"], - bbox["points"]["x2"], - bbox["points"]["y2"]]] - object_class = bbox["classId"] if custom_id_map is None else custom_id_map.get(bbox["className"]) - return object_box, object_class - - -def get_boxes(instances, custom_id_map=None): - boxes = [] - classes = [] - for instance in instances: - if instance["type"] == "bbox": - ob, oc = process_bounding_box(instance, custom_id_map) - boxes.append(ob) - classes.append(oc) - return {"classes": classes, "boxes": boxes} - - -def get_vector_dataframe(annotations, spark, custom_id_map=None): - """ - Transforms a list of SuperAnnotate annotations from a vector - project into a spark dataframe - - Args: - annotations (list[dict]): The annotations in the SuperAnnotate format - spark (sparkContext): The spark context - - Returns: - spark_df: A spark dataframe containing the annotations. - """ - rows = [] - for item in annotations: - flattened_item = { - "image_height": item["metadata"]["height"], - "image_width": item["metadata"]["width"], - "image_name": item["metadata"]["name"], - 'projectId': item["metadata"]['projectId'], - 'isPredicted': item["metadata"]['isPredicted'], - 'status': item["metadata"]['status'], - 'pinned': item["metadata"]['pinned'], - 'annotatorEmail': item["metadata"]['annotatorEmail'], - 'qaEmail': item["metadata"]['qaEmail'], - "instances": [process_vector_instance(instance, custom_id_map) - for instance in item["instances"]], - "bounding_boxes": get_boxes(item["instances"], custom_id_map), - "comments": [process_comment(comment) - for comment in item["comments"]] - } - rows.append(flattened_item) - schema = get_vector_schema() - spark_df = spark.createDataFrame(rows, schema=schema) - return spark_df diff --git a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/schemas.py b/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/schemas.py deleted file mode 100644 index a878d6a..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/schemas.py +++ /dev/null @@ -1,89 +0,0 @@ -from pyspark.sql.types import ( - StructType, StructField, StringType, IntegerType, FloatType, BooleanType, TimestampType, MapType, ArrayType -) - -def get_comment_schema(): - comment_schema = StructType([ - StructField("correspondence", ArrayType(MapType(StringType(),StringType())),True), - StructField("x",FloatType(),True), - StructField("y",FloatType(),True), - StructField("resolved",BooleanType(),True), - StructField("createdAt",StringType(),True), - StructField("createdBy",MapType(StringType(),StringType()),True), - StructField("creationType",StringType(),True), - StructField("updatedAt",StringType(),True), - StructField("updatedBy",MapType(StringType(),StringType()),True) - ]) - return comment_schema - - - -def get_point_schema(): - point_schema = StructType([ - StructField("x", FloatType(), True), - StructField("y", FloatType(), True) - ]) - return point_schema - - -def get_cuboid_schema(): - cuboid_points_schema = StructType([ - StructField("f1", get_point_schema(), True), - StructField("f2", get_point_schema(), True), - StructField("r1", get_point_schema(), True), - StructField("r2", get_point_schema(), True) - ]) - return cuboid_points_schema - - -def 
get_vector_instance_schema(): - instance_schema = StructType([ - StructField("instance_type", StringType(), True), - StructField("classId", IntegerType(), True), - StructField("probability", IntegerType(), True), - StructField("bbox_points", MapType(StringType(), FloatType()), True), - StructField("polygon_points", ArrayType(FloatType()), True), - StructField("polygon_exclude", ArrayType(ArrayType(FloatType())),True), - StructField("cuboid_points", get_cuboid_schema(), True), - StructField("ellipse_points", MapType(StringType(),FloatType()),True), - StructField("point_points", MapType(StringType(),FloatType()),True), - StructField("groupId", IntegerType(), True), - StructField("locked", BooleanType(), True), - StructField("attributes", ArrayType(MapType(StringType(), StringType())), True), - StructField("trackingId", StringType(), True), - StructField("error", StringType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType(StringType(), StringType()), True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType(StringType(), StringType()), True), - StructField("className", StringType(), True) - ]) - return instance_schema - - -def get_boxes_schema(): - instance_schema = StructType([ - StructField("classes",ArrayType(IntegerType()),True), - StructField("boxes",ArrayType(ArrayType(IntegerType())),True) - ]) - return instance_schema - - -def get_vector_schema(): - schema = StructType([ - StructField("image_height", IntegerType(), True), - StructField("image_width", IntegerType(), True), - StructField("image_name", StringType(), True), - StructField("projectId", IntegerType(), True), - StructField("isPredicted", BooleanType(), True), - StructField("status", StringType(), True), - StructField("pinned", BooleanType(), True), - StructField("annotatorEmail", StringType(), True), - StructField("qaEmail", StringType(), True), - StructField("instances", ArrayType(get_vector_instance_schema()), True), - StructField("bounding_boxes",get_boxes_schema(),True), - StructField("comments", ArrayType(get_comment_schema()),True) - - ]) - return schema \ No newline at end of file diff --git a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/vector.py b/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/vector.py deleted file mode 100644 index c641e79..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate-databricks-connector/vector.py +++ /dev/null @@ -1,241 +0,0 @@ -from pyspark.sql.types import ( - StructType, - StructField, - StringType, - IntegerType, - FloatType, - BooleanType, - MapType, - ArrayType -) - - -def get_comment_schema(): - comment_schema = StructType([ - StructField("correspondence", - ArrayType(MapType( - StringType(), - StringType())), - True), - StructField("x", FloatType(), True), - StructField("y", FloatType(), True), - StructField("resolved", BooleanType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType( - StringType(), - StringType()), - True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType( - StringType(), - StringType()), - True) - ]) - return comment_schema - - -def get_point_schema(): - point_schema = StructType([ - StructField("x", FloatType(), True), - StructField("y", FloatType(), True) - ]) - return point_schema - - -def get_cuboid_schema(): - 
cuboid_points_schema = StructType([ - StructField("f1", get_point_schema(), True), - StructField("f2", get_point_schema(), True), - StructField("r1", get_point_schema(), True), - StructField("r2", get_point_schema(), True) - ]) - return cuboid_points_schema - - -def get_vector_instance_schema(): - instance_schema = StructType([ - StructField("instance_type", StringType(), True), - StructField("classId", IntegerType(), True), - StructField("probability", IntegerType(), True), - StructField("bbox_points", MapType(StringType(), - FloatType()), True), - StructField("polygon_points", ArrayType(FloatType()), True), - StructField("polygon_exclude", ArrayType( - ArrayType( - FloatType())), - True), - StructField("cuboid_points", get_cuboid_schema(), True), - StructField("ellipse_points", MapType(StringType(), - FloatType()), True), - StructField("point_points", MapType(StringType(), - FloatType()), True), - StructField("groupId", IntegerType(), True), - StructField("locked", BooleanType(), True), - StructField("attributes", ArrayType( - MapType(StringType(), - StringType())), True), - StructField("trackingId", StringType(), True), - StructField("error", StringType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType(StringType(), - StringType()), True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType(StringType(), - StringType()), True), - StructField("className", StringType(), True) - ]) - return instance_schema - - -def get_boxes_schema(): - instance_schema = StructType([ - StructField("classes", ArrayType(IntegerType()), True), - StructField("boxes", ArrayType(ArrayType(IntegerType())), True) - ]) - return instance_schema - - -def get_vector_schema(): - schema = StructType([ - StructField("image_height", IntegerType(), True), - StructField("image_width", IntegerType(), True), - StructField("image_name", StringType(), True), - StructField("projectId", IntegerType(), True), - StructField("isPredicted", BooleanType(), True), - StructField("status", StringType(), True), - StructField("pinned", BooleanType(), True), - StructField("annotatorEmail", StringType(), True), - StructField("qaEmail", StringType(), True), - StructField("instances", ArrayType(get_vector_instance_schema()), - True), - StructField("bounding_boxes", ArrayType(IntegerType()), True), - StructField("comments", ArrayType(get_comment_schema()), True) - - ]) - return schema - - -def process_comment(comment): - comment["x"] = float(comment.get("x")) - comment["y"] = float(comment.get("y")) - return comment - - -def process_vector_instance(instance, custom_id_map=None): - """ - Takes one annotation instance and unpacks it. - - Args: - instance (dict). One instance from the SuperAnnotate annotation format. 
- - Returns: - (dict) reformated instance - """ - return { - 'instance_type': instance['type'], - 'classId': instance["classId"] if custom_id_map is None - else custom_id_map.get(instance["className"]), - 'probability': instance.get('probability'), - 'bbox_points': {k: float(v) for k, v in instance['points'].items()} - if instance["type"] == "bbox" else None, - 'polygon_points': [float(p) for p in instance['points']] - if instance["type"] == "polygon" else None, - 'polygon_exclude': instance["exclude"] - if instance["type"] == "polygon" else None, - 'point_points': {"x": float(instance["x"]), - "y": float(instance["y"]) - } if instance["type"] == "point" else None, - 'ellipse_points': {"cx": float(instance["cx"]), - "cy": float(instance["cy"]), - "rx": float(instance["rx"]), - "ry": float(instance["ry"]), - "angle": float(instance["angle"])} - if instance["type"] == "ellipse" else None, - 'cuboid_points': {outer_k: {inner_k: float(inner_v) - for inner_k, inner_v in outer_v.items()} - for outer_k, outer_v in instance['points'].items()} - if instance["type"] == "cuboid" else None, - 'groupId': instance['groupId'], - 'locked': instance.get('locked'), - 'attributes': instance['attributes'], - 'trackingId': instance.get('trackingId'), - 'error': instance.get('error'), - 'createdAt': instance.get('createdAt'), - 'createdBy': instance.get('createdBy'), - 'creationType': instance.get('creationType'), - 'updatedAt': instance.get('updatedAt'), - 'updatedBy': instance.get('updatedBy'), - 'className': instance.get('className') - } - - -def process_bounding_box(bbox, custom_id_map=None): - """Class that converts a bounding box and a class to - XYXYC format - - LEFT: left of the bounding box - TOP: top of the bounding box - RIGHT: right of the bounding box - BOTTOM: bottom of the bounding box - CLASS: class id of the bounding box""" - - object_box = [int(x) for x in [bbox["points"]["x1"], - bbox["points"]["y1"], - bbox["points"]["x2"], - bbox["points"]["y2"]]] - object_class = bbox["classId"] - if custom_id_map is not None: - object_class = custom_id_map.get(bbox["className"]) - object_box.append(object_class) - return object_box - - -def get_boxes(instances, custom_id_map=None): - """ - Takes all the instances and return the bounding boxes - as a one dimensional array with XYXYC format - """ - boxes = [] - for instance in instances: - if instance["type"] == "bbox": - ob = process_bounding_box(instance, custom_id_map) - boxes.extend(ob) - return boxes - - -def get_vector_dataframe(annotations, spark, custom_id_map=None): - """ - Transforms a list of SuperAnnotate annotations from a vector - project into a spark dataframe - - Args: - annotations (list[dict]): The annotations in the SuperAnnotate format - spark (sparkContext): The spark context - - Returns: - spark_df: A spark dataframe containing the annotations. 
- """ - rows = [] - for item in annotations: - flattened_item = { - "image_height": item["metadata"]["height"], - "image_width": item["metadata"]["width"], - "image_name": item["metadata"]["name"], - 'projectId': item["metadata"]['projectId'], - 'isPredicted': item["metadata"]['isPredicted'], - 'status': item["metadata"]['status'], - 'pinned': item["metadata"]['pinned'], - 'annotatorEmail': item["metadata"]['annotatorEmail'], - 'qaEmail': item["metadata"]['qaEmail'], - "instances": [process_vector_instance(instance, custom_id_map) - for instance in item["instances"]], - "bounding_boxes": get_boxes(item["instances"], custom_id_map), - "comments": [process_comment(comment) - for comment in item["comments"]] - } - rows.append(flattened_item) - schema = get_vector_schema() - spark_df = spark.createDataFrame(rows, schema=schema) - return spark_df diff --git a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/__init__.py b/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/__init__.py deleted file mode 100644 index 0a141ae..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .io import * -from .vector import * -from .schemas import * \ No newline at end of file diff --git a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/io.py b/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/io.py deleted file mode 100644 index a263dde..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/io.py +++ /dev/null @@ -1,57 +0,0 @@ -from delta.tables import DeltaTable -from pyspark.sql import DataFrame, SparkSession -import logging - - -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - - -def write_annotations_to_delta(annotations: DataFrame, database: str, table: str, spark: SparkSession) -> None: - """ - Takes a list of annotations and writes them to the specified delta table - - Args: - annotations (dataframe): Spark dataframe with annotations - database (string): The name of the database to write to - table (string): The name of the delta table to write to - spark (sparkContext): The spark context - create_if_not_exists (Bool): Should the function create the delta table if it does not exists, or raise an exception, - - Returns: - None - """ - logging.info('Writing annotations to delta...') - # Set the database and table names - database_name = database - table_name = f"{database_name}.{table}" - - # Check if the table exists - table_exists = spark.catalog.tableExists(table_name) - - if not table_exists: - if create_if_not_exists: - # If the table does not exist, create the database if it does not exist - logging.warning(f"{database_name}.{table} not found.") - logging.info("Creating new table..." ) - spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}") - # Write the DataFrame as a new Delta table in the specified schema and partitioned by projectId - annotations.write.format("delta").partitionBy("projectId").mode("overwrite").saveAsTable(table_name) - else: - logging.warning(f"{database_name}.{table} not found.") - raise Exception(f"{database_name}.{table} not found. To create table set the parameter create_if_not_exists=True") - else: - # If the table exists - logging.info("Merging new records..." 
) - - # Read the managed Delta table - delta_table = DeltaTable.forName(spark, table_name) - - # Define the merge condition using unique identifiers - merge_condition = "source.image_name = target.image_name AND source.projectId = target.projectId" - - # Merge the DataFrame into the existing managed Delta table - delta_table.alias("target")\ - .merge(annotations.alias("source"), merge_condition)\ - .whenMatchedUpdateAll()\ - .whenNotMatchedInsertAll()\ - .execute() diff --git a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/processing.py b/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/processing.py deleted file mode 100644 index 5375a44..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/processing.py +++ /dev/null @@ -1,125 +0,0 @@ -from .schemas import get_vector_schema - - -def process_comment(comment): - comment["x"] = float(comment.get("x")) - comment["y"] = float(comment.get("y")) - return comment - - -def process_vector_instance(instance, custom_id_map=None): - """ - Takes one annotation instance and unpacks it. - - Args: - instance (dict). One instance from the SuperAnnotate annotation format. - - Returns: - (dict) reformated instance - """ - return {'instance_type': instance['type'], - 'classId': instance["classId"] if custom_id_map is None else - custom_id_map.get(instance["className"]), - 'probability': instance.get('probability'), - 'bbox_points': {k: float(v) for k, v in - instance['points'].items()} - if instance["type"] == "bbox" else None, - 'polygon_points': [float(p) for p in instance['points']] - if instance["type"] == "polygon" else None, - 'polygon_exclude': instance["exclude"] - if instance["type"] == "polygon" else None, - 'point_points': {"x": float(instance["x"]), - "y": float(instance["y"])} - if instance["type"] == "point" else None, - 'ellipse_points': {"cx": float(instance["cx"]), - "cy": float(instance["cy"]), - "rx": float(instance["rx"]), - "ry": float(instance["ry"]), - "angle": float(instance["angle"])} - if instance["type"] == "ellipse" else None, - 'cuboid_points': {outer_k: {inner_k: float(inner_v) - for inner_k, inner_v in outer_v.items() - } for outer_k, outer_v - in instance['points'].items()} - if instance["type"] == "cuboid" else None, - 'groupId': instance['groupId'], - 'locked': instance.get('locked'), - 'attributes': instance['attributes'], - 'trackingId': instance.get('trackingId'), - 'error': instance.get('error'), - 'createdAt': instance.get('createdAt'), - 'createdBy': instance.get('createdBy'), - 'creationType': instance.get('creationType'), - 'updatedAt': instance.get('updatedAt'), - 'updatedBy': instance.get('updatedBy'), - 'className': instance.get('className') - } - - -def process_bounding_box(bbox, custom_id_map=None): - """XYXY class - keras_cv.bounding_box.XYXY() - XYXY contains axis indices for the XYXY format. - - All values in the XYXY format should be absolute pixel values. 
- - The XYXY format consists of the following required indices: - - LEFT: left of the bounding box - TOP: top of the bounding box - RIGHT: right of the bounding box - BOTTOM: bottom of the bounding box""" - - object_box = [int(x) for x in [bbox["points"]["x1"], - bbox["points"]["y1"], - bbox["points"]["x2"], - bbox["points"]["y2"]]] - object_class = bbox["classId"] if custom_id_map is None else custom_id_map.get(bbox["className"]) - return object_box, object_class - - -def get_boxes(instances, custom_id_map=None): - boxes = [] - classes = [] - for instance in instances: - if instance["type"] == "bbox": - ob, oc = process_bounding_box(instance, custom_id_map) - boxes.append(ob) - classes.append(oc) - return {"classes": classes, "boxes": boxes} - - -def get_vector_dataframe(annotations, spark, custom_id_map=None): - """ - Transforms a list of SuperAnnotate annotations from a vector - project into a spark dataframe - - Args: - annotations (list[dict]): The annotations in the SuperAnnotate format - spark (sparkContext): The spark context - - Returns: - spark_df: A spark dataframe containing the annotations. - """ - rows = [] - for item in annotations: - flattened_item = { - "image_height": item["metadata"]["height"], - "image_width": item["metadata"]["width"], - "image_name": item["metadata"]["name"], - 'projectId': item["metadata"]['projectId'], - 'isPredicted': item["metadata"]['isPredicted'], - 'status': item["metadata"]['status'], - 'pinned': item["metadata"]['pinned'], - 'annotatorEmail': item["metadata"]['annotatorEmail'], - 'qaEmail': item["metadata"]['qaEmail'], - "instances": [process_vector_instance(instance, custom_id_map) - for instance in item["instances"]], - "bounding_boxes": get_boxes(item["instances"], custom_id_map), - "comments": [process_comment(comment) - for comment in item["comments"]] - } - rows.append(flattened_item) - schema = get_vector_schema() - spark_df = spark.createDataFrame(rows, schema=schema) - return spark_df diff --git a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/schemas.py b/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/schemas.py deleted file mode 100644 index a878d6a..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/schemas.py +++ /dev/null @@ -1,89 +0,0 @@ -from pyspark.sql.types import ( - StructType, StructField, StringType, IntegerType, FloatType, BooleanType, TimestampType, MapType, ArrayType -) - -def get_comment_schema(): - comment_schema = StructType([ - StructField("correspondence", ArrayType(MapType(StringType(),StringType())),True), - StructField("x",FloatType(),True), - StructField("y",FloatType(),True), - StructField("resolved",BooleanType(),True), - StructField("createdAt",StringType(),True), - StructField("createdBy",MapType(StringType(),StringType()),True), - StructField("creationType",StringType(),True), - StructField("updatedAt",StringType(),True), - StructField("updatedBy",MapType(StringType(),StringType()),True) - ]) - return comment_schema - - - -def get_point_schema(): - point_schema = StructType([ - StructField("x", FloatType(), True), - StructField("y", FloatType(), True) - ]) - return point_schema - - -def get_cuboid_schema(): - cuboid_points_schema = StructType([ - StructField("f1", get_point_schema(), True), - StructField("f2", get_point_schema(), True), - StructField("r1", get_point_schema(), True), - StructField("r2", get_point_schema(), True) - ]) - return cuboid_points_schema - - -def 
get_vector_instance_schema(): - instance_schema = StructType([ - StructField("instance_type", StringType(), True), - StructField("classId", IntegerType(), True), - StructField("probability", IntegerType(), True), - StructField("bbox_points", MapType(StringType(), FloatType()), True), - StructField("polygon_points", ArrayType(FloatType()), True), - StructField("polygon_exclude", ArrayType(ArrayType(FloatType())),True), - StructField("cuboid_points", get_cuboid_schema(), True), - StructField("ellipse_points", MapType(StringType(),FloatType()),True), - StructField("point_points", MapType(StringType(),FloatType()),True), - StructField("groupId", IntegerType(), True), - StructField("locked", BooleanType(), True), - StructField("attributes", ArrayType(MapType(StringType(), StringType())), True), - StructField("trackingId", StringType(), True), - StructField("error", StringType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType(StringType(), StringType()), True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType(StringType(), StringType()), True), - StructField("className", StringType(), True) - ]) - return instance_schema - - -def get_boxes_schema(): - instance_schema = StructType([ - StructField("classes",ArrayType(IntegerType()),True), - StructField("boxes",ArrayType(ArrayType(IntegerType())),True) - ]) - return instance_schema - - -def get_vector_schema(): - schema = StructType([ - StructField("image_height", IntegerType(), True), - StructField("image_width", IntegerType(), True), - StructField("image_name", StringType(), True), - StructField("projectId", IntegerType(), True), - StructField("isPredicted", BooleanType(), True), - StructField("status", StringType(), True), - StructField("pinned", BooleanType(), True), - StructField("annotatorEmail", StringType(), True), - StructField("qaEmail", StringType(), True), - StructField("instances", ArrayType(get_vector_instance_schema()), True), - StructField("bounding_boxes",get_boxes_schema(),True), - StructField("comments", ArrayType(get_comment_schema()),True) - - ]) - return schema \ No newline at end of file diff --git a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/vector.py b/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/vector.py deleted file mode 100644 index c641e79..0000000 --- a/superannotate-databricks-connector/build/lib/superannotate_databricks_connector/vector.py +++ /dev/null @@ -1,241 +0,0 @@ -from pyspark.sql.types import ( - StructType, - StructField, - StringType, - IntegerType, - FloatType, - BooleanType, - MapType, - ArrayType -) - - -def get_comment_schema(): - comment_schema = StructType([ - StructField("correspondence", - ArrayType(MapType( - StringType(), - StringType())), - True), - StructField("x", FloatType(), True), - StructField("y", FloatType(), True), - StructField("resolved", BooleanType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType( - StringType(), - StringType()), - True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType( - StringType(), - StringType()), - True) - ]) - return comment_schema - - -def get_point_schema(): - point_schema = StructType([ - StructField("x", FloatType(), True), - StructField("y", FloatType(), True) - ]) - return point_schema - - -def get_cuboid_schema(): - 
cuboid_points_schema = StructType([ - StructField("f1", get_point_schema(), True), - StructField("f2", get_point_schema(), True), - StructField("r1", get_point_schema(), True), - StructField("r2", get_point_schema(), True) - ]) - return cuboid_points_schema - - -def get_vector_instance_schema(): - instance_schema = StructType([ - StructField("instance_type", StringType(), True), - StructField("classId", IntegerType(), True), - StructField("probability", IntegerType(), True), - StructField("bbox_points", MapType(StringType(), - FloatType()), True), - StructField("polygon_points", ArrayType(FloatType()), True), - StructField("polygon_exclude", ArrayType( - ArrayType( - FloatType())), - True), - StructField("cuboid_points", get_cuboid_schema(), True), - StructField("ellipse_points", MapType(StringType(), - FloatType()), True), - StructField("point_points", MapType(StringType(), - FloatType()), True), - StructField("groupId", IntegerType(), True), - StructField("locked", BooleanType(), True), - StructField("attributes", ArrayType( - MapType(StringType(), - StringType())), True), - StructField("trackingId", StringType(), True), - StructField("error", StringType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType(StringType(), - StringType()), True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType(StringType(), - StringType()), True), - StructField("className", StringType(), True) - ]) - return instance_schema - - -def get_boxes_schema(): - instance_schema = StructType([ - StructField("classes", ArrayType(IntegerType()), True), - StructField("boxes", ArrayType(ArrayType(IntegerType())), True) - ]) - return instance_schema - - -def get_vector_schema(): - schema = StructType([ - StructField("image_height", IntegerType(), True), - StructField("image_width", IntegerType(), True), - StructField("image_name", StringType(), True), - StructField("projectId", IntegerType(), True), - StructField("isPredicted", BooleanType(), True), - StructField("status", StringType(), True), - StructField("pinned", BooleanType(), True), - StructField("annotatorEmail", StringType(), True), - StructField("qaEmail", StringType(), True), - StructField("instances", ArrayType(get_vector_instance_schema()), - True), - StructField("bounding_boxes", ArrayType(IntegerType()), True), - StructField("comments", ArrayType(get_comment_schema()), True) - - ]) - return schema - - -def process_comment(comment): - comment["x"] = float(comment.get("x")) - comment["y"] = float(comment.get("y")) - return comment - - -def process_vector_instance(instance, custom_id_map=None): - """ - Takes one annotation instance and unpacks it. - - Args: - instance (dict). One instance from the SuperAnnotate annotation format. 
- - Returns: - (dict) reformated instance - """ - return { - 'instance_type': instance['type'], - 'classId': instance["classId"] if custom_id_map is None - else custom_id_map.get(instance["className"]), - 'probability': instance.get('probability'), - 'bbox_points': {k: float(v) for k, v in instance['points'].items()} - if instance["type"] == "bbox" else None, - 'polygon_points': [float(p) for p in instance['points']] - if instance["type"] == "polygon" else None, - 'polygon_exclude': instance["exclude"] - if instance["type"] == "polygon" else None, - 'point_points': {"x": float(instance["x"]), - "y": float(instance["y"]) - } if instance["type"] == "point" else None, - 'ellipse_points': {"cx": float(instance["cx"]), - "cy": float(instance["cy"]), - "rx": float(instance["rx"]), - "ry": float(instance["ry"]), - "angle": float(instance["angle"])} - if instance["type"] == "ellipse" else None, - 'cuboid_points': {outer_k: {inner_k: float(inner_v) - for inner_k, inner_v in outer_v.items()} - for outer_k, outer_v in instance['points'].items()} - if instance["type"] == "cuboid" else None, - 'groupId': instance['groupId'], - 'locked': instance.get('locked'), - 'attributes': instance['attributes'], - 'trackingId': instance.get('trackingId'), - 'error': instance.get('error'), - 'createdAt': instance.get('createdAt'), - 'createdBy': instance.get('createdBy'), - 'creationType': instance.get('creationType'), - 'updatedAt': instance.get('updatedAt'), - 'updatedBy': instance.get('updatedBy'), - 'className': instance.get('className') - } - - -def process_bounding_box(bbox, custom_id_map=None): - """Class that converts a bounding box and a class to - XYXYC format - - LEFT: left of the bounding box - TOP: top of the bounding box - RIGHT: right of the bounding box - BOTTOM: bottom of the bounding box - CLASS: class id of the bounding box""" - - object_box = [int(x) for x in [bbox["points"]["x1"], - bbox["points"]["y1"], - bbox["points"]["x2"], - bbox["points"]["y2"]]] - object_class = bbox["classId"] - if custom_id_map is not None: - object_class = custom_id_map.get(bbox["className"]) - object_box.append(object_class) - return object_box - - -def get_boxes(instances, custom_id_map=None): - """ - Takes all the instances and return the bounding boxes - as a one dimensional array with XYXYC format - """ - boxes = [] - for instance in instances: - if instance["type"] == "bbox": - ob = process_bounding_box(instance, custom_id_map) - boxes.extend(ob) - return boxes - - -def get_vector_dataframe(annotations, spark, custom_id_map=None): - """ - Transforms a list of SuperAnnotate annotations from a vector - project into a spark dataframe - - Args: - annotations (list[dict]): The annotations in the SuperAnnotate format - spark (sparkContext): The spark context - - Returns: - spark_df: A spark dataframe containing the annotations. 
- """ - rows = [] - for item in annotations: - flattened_item = { - "image_height": item["metadata"]["height"], - "image_width": item["metadata"]["width"], - "image_name": item["metadata"]["name"], - 'projectId': item["metadata"]['projectId'], - 'isPredicted': item["metadata"]['isPredicted'], - 'status': item["metadata"]['status'], - 'pinned': item["metadata"]['pinned'], - 'annotatorEmail': item["metadata"]['annotatorEmail'], - 'qaEmail': item["metadata"]['qaEmail'], - "instances": [process_vector_instance(instance, custom_id_map) - for instance in item["instances"]], - "bounding_boxes": get_boxes(item["instances"], custom_id_map), - "comments": [process_comment(comment) - for comment in item["comments"]] - } - rows.append(flattened_item) - schema = get_vector_schema() - spark_df = spark.createDataFrame(rows, schema=schema) - return spark_df diff --git a/superannotate-databricks-connector/build/lib/superspark/__init__.py b/superannotate-databricks-connector/build/lib/superspark/__init__.py deleted file mode 100644 index 9124cea..0000000 --- a/superannotate-databricks-connector/build/lib/superspark/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .io import * -from .processing import * -from .schemas import * \ No newline at end of file diff --git a/superannotate-databricks-connector/build/lib/superspark/io.py b/superannotate-databricks-connector/build/lib/superspark/io.py deleted file mode 100644 index a263dde..0000000 --- a/superannotate-databricks-connector/build/lib/superspark/io.py +++ /dev/null @@ -1,57 +0,0 @@ -from delta.tables import DeltaTable -from pyspark.sql import DataFrame, SparkSession -import logging - - -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - - -def write_annotations_to_delta(annotations: DataFrame, database: str, table: str, spark: SparkSession) -> None: - """ - Takes a list of annotations and writes them to the specified delta table - - Args: - annotations (dataframe): Spark dataframe with annotations - database (string): The name of the database to write to - table (string): The name of the delta table to write to - spark (sparkContext): The spark context - create_if_not_exists (Bool): Should the function create the delta table if it does not exists, or raise an exception, - - Returns: - None - """ - logging.info('Writing annotations to delta...') - # Set the database and table names - database_name = database - table_name = f"{database_name}.{table}" - - # Check if the table exists - table_exists = spark.catalog.tableExists(table_name) - - if not table_exists: - if create_if_not_exists: - # If the table does not exist, create the database if it does not exist - logging.warning(f"{database_name}.{table} not found.") - logging.info("Creating new table..." ) - spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}") - # Write the DataFrame as a new Delta table in the specified schema and partitioned by projectId - annotations.write.format("delta").partitionBy("projectId").mode("overwrite").saveAsTable(table_name) - else: - logging.warning(f"{database_name}.{table} not found.") - raise Exception(f"{database_name}.{table} not found. To create table set the parameter create_if_not_exists=True") - else: - # If the table exists - logging.info("Merging new records..." 
) - - # Read the managed Delta table - delta_table = DeltaTable.forName(spark, table_name) - - # Define the merge condition using unique identifiers - merge_condition = "source.image_name = target.image_name AND source.projectId = target.projectId" - - # Merge the DataFrame into the existing managed Delta table - delta_table.alias("target")\ - .merge(annotations.alias("source"), merge_condition)\ - .whenMatchedUpdateAll()\ - .whenNotMatchedInsertAll()\ - .execute() diff --git a/superannotate-databricks-connector/build/lib/superspark/processing.py b/superannotate-databricks-connector/build/lib/superspark/processing.py deleted file mode 100644 index 93b78e7..0000000 --- a/superannotate-databricks-connector/build/lib/superspark/processing.py +++ /dev/null @@ -1,113 +0,0 @@ -from .schemas import get_vector_schema - - -def process_comment(comment): - comment["x"] = float(comment.get("x")) - comment["y"] = float(comment.get("y")) - return comment - - -def process_vector_instance(instance,custom_id_map=None): - """ - Takes one annotation instance and unpacks it. - - Args: - instance (dict). One instance from the SuperAnnotate annotation format. - - Returns: - (dict) reformated instance - """ - return { - 'instance_type': instance['type'], - 'classId': instance["classId"] if custom_id_map is None else custom_id_map.get(instance["className"]), - 'probability': instance.get('probability'), - 'bbox_points': {k:float(v) for k,v in instance['points'].items()} if instance["type"] == "bbox" else None, - 'polygon_points': [float(p) for p in instance['points']] if instance["type"] == "polygon" else None, - 'polygon_exclude': instance["exclude"] if instance["type"] == "polygon" else None, - 'point_points':{"x":float(instance["x"]), - "y":float(instance["y"])} if instance["type"] == "point" else None, - 'ellipse_points':{"cx":float(instance["cx"]), - "cy":float(instance["cy"]), - "rx":float(instance["rx"]), - "ry":float(instance["ry"]), - "angle":float(instance["angle"])} if instance["type"] == "ellipse" else None, - 'cuboid_points':{outer_k: {inner_k: float(inner_v) for inner_k, inner_v in outer_v.items()} for outer_k, outer_v in d['points'].items()} if instance["type"] == "cuboid" else None, - 'groupId': instance['groupId'], - 'locked': instance.get('locked'), - 'attributes': instance['attributes'], - 'trackingId': instance.get('trackingId'), - 'error': instance.get('error'), - 'createdAt': instance.get('createdAt'), - 'createdBy': instance.get('createdBy'), - 'creationType': instance.get('creationType'), - 'updatedAt': instance.get('updatedAt'), - 'updatedBy': instance.get('updatedBy'), - 'className': instance.get('className') - } - - -def process_bounding_box(bbox,custom_id_map=None): - """XYXY class - keras_cv.bounding_box.XYXY() - XYXY contains axis indices for the XYXY format. - - All values in the XYXY format should be absolute pixel values. 
- - The XYXY format consists of the following required indices: - - LEFT: left of the bounding box - TOP: top of the bounding box - RIGHT: right of the bounding box - BOTTOM: bottom of the bounding box""" - - object_box = [int(x) for x in [bbox["points"]["x1"], - bbox["points"]["y1"], - bbox["points"]["x2"], - bbox["points"]["y2"]]] - object_class = bbox["classId"] if custom_id_map is None else custom_id_map.get(bbox["className"]) - return object_box,object_class - -def get_boxes(instances,custom_id_map=None): - boxes = [] - classes = [] - for instance in instances: - if instance["type"] == "bbox": - ob,oc = process_bounding_box(instance,custom_id_map) - boxes.append(ob) - classes.append(oc) - return {"classes":classes,"boxes":boxes} - - -def get_vector_dataframe(annotations,spark,custom_id_map = None): - print(custom_id_map) - """ - Transforms a list of SuperAnnotate annotations from a vector - project into a spark dataframe - - Args: - annotations (list[dict]): The annotations in the SuperAnnotate format - spark (sparkContext): The spark context - - Returns: - spark_df: A spark dataframe containing the annotations. - """ - rows = [] - for item in annotations: - flattened_item = { - "image_height": item["metadata"]["height"], - "image_width": item["metadata"]["width"], - "image_name": item["metadata"]["name"], - 'projectId': item["metadata"]['projectId'], - 'isPredicted': item["metadata"]['isPredicted'], - 'status': item["metadata"]['status'], - 'pinned': item["metadata"]['pinned'], - 'annotatorEmail': item["metadata"]['annotatorEmail'], - 'qaEmail': item["metadata"]['qaEmail'], - "instances": [process_vector_instance(instance,custom_id_map) for instance in item["instances"]], - "bounding_boxes":get_boxes(item["instances"],custom_id_map), - "comments":[process_comment(comment) for comment in item["comments"]] - } - rows.append(flattened_item) - schema = get_vector_schema() - spark_df = spark.createDataFrame(rows, schema=schema) - return spark_df \ No newline at end of file diff --git a/superannotate-databricks-connector/build/lib/superspark/schemas.py b/superannotate-databricks-connector/build/lib/superspark/schemas.py deleted file mode 100644 index a878d6a..0000000 --- a/superannotate-databricks-connector/build/lib/superspark/schemas.py +++ /dev/null @@ -1,89 +0,0 @@ -from pyspark.sql.types import ( - StructType, StructField, StringType, IntegerType, FloatType, BooleanType, TimestampType, MapType, ArrayType -) - -def get_comment_schema(): - comment_schema = StructType([ - StructField("correspondence", ArrayType(MapType(StringType(),StringType())),True), - StructField("x",FloatType(),True), - StructField("y",FloatType(),True), - StructField("resolved",BooleanType(),True), - StructField("createdAt",StringType(),True), - StructField("createdBy",MapType(StringType(),StringType()),True), - StructField("creationType",StringType(),True), - StructField("updatedAt",StringType(),True), - StructField("updatedBy",MapType(StringType(),StringType()),True) - ]) - return comment_schema - - - -def get_point_schema(): - point_schema = StructType([ - StructField("x", FloatType(), True), - StructField("y", FloatType(), True) - ]) - return point_schema - - -def get_cuboid_schema(): - cuboid_points_schema = StructType([ - StructField("f1", get_point_schema(), True), - StructField("f2", get_point_schema(), True), - StructField("r1", get_point_schema(), True), - StructField("r2", get_point_schema(), True) - ]) - return cuboid_points_schema - - -def get_vector_instance_schema(): - instance_schema = StructType([ 
- StructField("instance_type", StringType(), True), - StructField("classId", IntegerType(), True), - StructField("probability", IntegerType(), True), - StructField("bbox_points", MapType(StringType(), FloatType()), True), - StructField("polygon_points", ArrayType(FloatType()), True), - StructField("polygon_exclude", ArrayType(ArrayType(FloatType())),True), - StructField("cuboid_points", get_cuboid_schema(), True), - StructField("ellipse_points", MapType(StringType(),FloatType()),True), - StructField("point_points", MapType(StringType(),FloatType()),True), - StructField("groupId", IntegerType(), True), - StructField("locked", BooleanType(), True), - StructField("attributes", ArrayType(MapType(StringType(), StringType())), True), - StructField("trackingId", StringType(), True), - StructField("error", StringType(), True), - StructField("createdAt", StringType(), True), - StructField("createdBy", MapType(StringType(), StringType()), True), - StructField("creationType", StringType(), True), - StructField("updatedAt", StringType(), True), - StructField("updatedBy", MapType(StringType(), StringType()), True), - StructField("className", StringType(), True) - ]) - return instance_schema - - -def get_boxes_schema(): - instance_schema = StructType([ - StructField("classes",ArrayType(IntegerType()),True), - StructField("boxes",ArrayType(ArrayType(IntegerType())),True) - ]) - return instance_schema - - -def get_vector_schema(): - schema = StructType([ - StructField("image_height", IntegerType(), True), - StructField("image_width", IntegerType(), True), - StructField("image_name", StringType(), True), - StructField("projectId", IntegerType(), True), - StructField("isPredicted", BooleanType(), True), - StructField("status", StringType(), True), - StructField("pinned", BooleanType(), True), - StructField("annotatorEmail", StringType(), True), - StructField("qaEmail", StringType(), True), - StructField("instances", ArrayType(get_vector_instance_schema()), True), - StructField("bounding_boxes",get_boxes_schema(),True), - StructField("comments", ArrayType(get_comment_schema()),True) - - ]) - return schema \ No newline at end of file diff --git a/superannotate-databricks-connector/main.py b/superannotate-databricks-connector/main.py deleted file mode 100644 index dd20ecf..0000000 --- a/superannotate-databricks-connector/main.py +++ /dev/null @@ -1,16 +0,0 @@ -from pyspark.sql import SparkSession - -spark = SparkSession.builder.appName("Test Application").getOrCreate() - -simpleData = (("Java", 4000, 5), - ("Python", 4600, 10), - ("Scala", 4100, 15), - ("Scala", 4500, 15), - ("PHP", 3000, 20)) - -columns = ["CourseName", "fee", "discount %"] - -# Create DataFrame -df = spark.createDataFrame(data=simpleData, schema=columns) -df.printSchema() -df.show(truncate=False) diff --git a/superannotate-databricks-connector/requirements.txt b/superannotate-databricks-connector/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/superannotate-databricks-connector/setup.py b/superannotate-databricks-connector/setup.py deleted file mode 100644 index 005f09d..0000000 --- a/superannotate-databricks-connector/setup.py +++ /dev/null @@ -1,10 +0,0 @@ -from setuptools import setup - -setup(name='superannotate_databricks_connector', - version='0.3', - description='Custom functions to work with SuperAnnotate in Databricks', - url='http://superannotate.com', - author='Leo Lindén', - author_email='leo@superannotate.com', - license='All Rights Reserved', - packages=['superannotate_databricks_connector']) diff 
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/PKG-INFO b/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/PKG-INFO
deleted file mode 100644
index 38684d1..0000000
--- a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/PKG-INFO
+++ /dev/null
@@ -1,8 +0,0 @@
-Metadata-Version: 2.1
-Name: superannotate-databricks-connector
-Version: 0.3
-Summary: Custom functions to work with SuperAnnotate in Databricks
-Home-page: http://superannotate.com
-Author: Leo Lindén
-Author-email: leo@superannotate.com
-License: All Rights Reserved
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/SOURCES.txt b/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/SOURCES.txt
deleted file mode 100644
index 32c3a99..0000000
--- a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-setup.py
-superannotate_databricks_connector/__init__.py
-superannotate_databricks_connector/io.py
-superannotate_databricks_connector/processing.py
-superannotate_databricks_connector/schemas.py
-superannotate_databricks_connector/vector.py
-superannotate_databricks_connector.egg-info/PKG-INFO
-superannotate_databricks_connector.egg-info/SOURCES.txt
-superannotate_databricks_connector.egg-info/dependency_links.txt
-superannotate_databricks_connector.egg-info/top_level.txt
\ No newline at end of file
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/dependency_links.txt b/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/dependency_links.txt
deleted file mode 100644
index 8b13789..0000000
--- a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/top_level.txt b/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/top_level.txt
deleted file mode 100644
index c487ce4..0000000
--- a/superannotate-databricks-connector/superannotate_databricks_connector.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-superannotate_databricks_connector
diff --git a/superannotate-databricks-connector/superannotate_databricks_connector/schemas/__init__.py b/superannotate-databricks-connector/superannotate_databricks_connector/schemas/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/superannotate-databricks-connector/test.py b/superannotate-databricks-connector/test.py
deleted file mode 100644
index 2d6950d..0000000
--- a/superannotate-databricks-connector/test.py
+++ /dev/null
@@ -1 +0,0 @@
-from superspark import *
\ No newline at end of file
diff --git a/superannotate-databricks-connector/tests/.DS_Store b/superannotate-databricks-connector/tests/.DS_Store
deleted file mode 100644
index 593d828..0000000
Binary files a/superannotate-databricks-connector/tests/.DS_Store and /dev/null differ
diff --git a/superannotate-databricks-connector/tests/__init__.py b/superannotate-databricks-connector/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..eb87a5f
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,5 @@
+from pathlib import Path
+
+DATA_SET_PATH = str(Path(__file__).parent / "test_data")
+
+__all__ = ["DATA_SET_PATH"]
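Reviewer note: the new `tests/__init__.py` anchors fixture data to the test package instead of the process working directory; the test hunks below switch to this pattern via `os.path.join(DATA_SET_PATH, ...)`. A minimal sketch of the intended usage:

```python
# Sketch of the fixture-path pattern the updated tests rely on; DATA_SET_PATH
# resolves relative to tests/__init__.py, so the suite no longer depends on the CWD.
import json
import os.path

from tests import DATA_SET_PATH

with open(os.path.join(DATA_SET_PATH, "vector/example_annotation.json"), "r") as f:
    data = json.load(f)
```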
diff --git a/superannotate-databricks-connector/tests/test_data/text/example_annotation.json b/tests/test_data/text/example_annotation.json
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/text/example_annotation.json
rename to tests/test_data/text/example_annotation.json
diff --git a/superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/._SUCCESS.crc b/tests/test_data/text/expected_df.parquet/._SUCCESS.crc
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/._SUCCESS.crc
rename to tests/test_data/text/expected_df.parquet/._SUCCESS.crc
diff --git a/superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/.part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet.crc b/tests/test_data/text/expected_df.parquet/.part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet.crc
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/.part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet.crc
rename to tests/test_data/text/expected_df.parquet/.part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet.crc
diff --git a/superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/_SUCCESS b/tests/test_data/text/expected_df.parquet/_SUCCESS
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/_SUCCESS
rename to tests/test_data/text/expected_df.parquet/_SUCCESS
diff --git a/superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet b/tests/test_data/text/expected_df.parquet/part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/text/expected_df.parquet/part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet
rename to tests/test_data/text/expected_df.parquet/part-00000-7f6ed633-a832-4449-a980-3061e425daf2-c000.snappy.parquet
diff --git a/superannotate-databricks-connector/tests/test_data/vector/example_annotation.json b/tests/test_data/vector/example_annotation.json
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/vector/example_annotation.json
rename to tests/test_data/vector/example_annotation.json
diff --git a/superannotate-databricks-connector/tests/test_data/vector/expected_df.parquet/.part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet.crc b/tests/test_data/vector/expected_df.parquet/.part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet.crc
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/vector/expected_df.parquet/.part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet.crc
rename to tests/test_data/vector/expected_df.parquet/.part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet.crc
diff --git a/superannotate-databricks-connector/tests/test_data/vector/expected_df.parquet/_SUCCESS b/tests/test_data/vector/expected_df.parquet/_SUCCESS
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/vector/expected_df.parquet/_SUCCESS
rename to tests/test_data/vector/expected_df.parquet/_SUCCESS
diff --git a/superannotate-databricks-connector/tests/test_data/vector/expected_df.parquet/part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet b/tests/test_data/vector/expected_df.parquet/part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/vector/expected_df.parquet/part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet
rename to tests/test_data/vector/expected_df.parquet/part-00000-d8491445-54cc-4f7e-8293-a3d205e2d9a5-c000.snappy.parquet
diff --git a/superannotate-databricks-connector/tests/test_data/vector/expected_instances.json b/tests/test_data/vector/expected_instances.json
similarity index 100%
rename from superannotate-databricks-connector/tests/test_data/vector/expected_instances.json
rename to tests/test_data/vector/expected_instances.json
diff --git a/superannotate-databricks-connector/tests/test_text.py b/tests/test_text.py
similarity index 59%
rename from superannotate-databricks-connector/tests/test_text.py
rename to tests/test_text.py
index 2d6bb3e..34be759 100644
--- a/superannotate-databricks-connector/tests/test_text.py
+++ b/tests/test_text.py
@@ -1,7 +1,10 @@
-import unittest
 import json
+import os.path
+import unittest
+
+from tests import DATA_SET_PATH
 from pyspark.sql import SparkSession
-from superannotate_databricks_connector.text import (
+from src.superannotate_databricks_connector.text import (
     get_text_dataframe
 )
@@ -9,17 +12,11 @@ class TestVectorDataFrame(unittest.TestCase):
     def test_vector_dataframe(self):
         spark = SparkSession.builder.master("local").getOrCreate()
-        with open("./tests/test_data/text/example_annotation.json",
-                  "r") as f:
+        with open(os.path.join(DATA_SET_PATH, "text/example_annotation.json"), "r") as f:
             data = json.load(f)
 
         actual_df = get_text_dataframe([data], spark)
-        expected_df = spark.read.parquet(
-            "./tests/test_data/text/expected_df.parquet")
+        expected_df = spark.read.parquet(os.path.join(DATA_SET_PATH, "text/expected_df.parquet"))
 
         self.assertEqual(sorted(actual_df.collect()),
                          sorted(expected_df.collect()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/superannotate-databricks-connector/tests/test_vector.py b/tests/test_vector.py
similarity index 87%
rename from superannotate-databricks-connector/tests/test_vector.py
rename to tests/test_vector.py
index d187f32..971f0e9 100644
--- a/superannotate-databricks-connector/tests/test_vector.py
+++ b/tests/test_vector.py
@@ -1,5 +1,9 @@
-import unittest
 import json
+import os.path
+import unittest
+
+from tests import DATA_SET_PATH
+
 from pyspark.sql import SparkSession
 from superannotate_databricks_connector.vector import (
     process_bounding_box,
@@ -13,13 +17,11 @@ class TestVectorInstances(unittest.TestCase):
     def __init__(self, *args):
         super().__init__(*args)
-        with open("./tests/test_data/vector/example_annotation.json",
-                  "r") as f:
+        with open(os.path.join(DATA_SET_PATH, "vector/example_annotation.json"), "r") as f:
             data = json.load(f)
 
         target_data = []
-        with open('./tests/test_data/vector/expected_instances.json',
-                  "r") as f:
+        with open(os.path.join(DATA_SET_PATH, 'vector/expected_instances.json'),"r") as f:
             for line in f:
                 target_data.append(json.loads(line))
@@ -94,7 +96,7 @@ def test_get_boxes(self):
                      "y1": 2.1,
                      "y2": 18.9
                      },
-                     "classId": 10229}]
+            "classId": 10229}]
 
         target = [2, 1, 13, 22, 10228, 3, 2, 4, 19, 10229]
         self.assertEqual(get_boxes(instances), target)
@@ -102,14 +104,12 @@ class TestVectorDataFrame(unittest.TestCase):
     def test_vector_dataframe(self):
         spark = SparkSession.builder.master("local").getOrCreate()
-        with open("./tests/test_data/vector/example_annotation.json",
-                  "r") as f:
+        with open(os.path.join(DATA_SET_PATH, "vector/example_annotation.json"),"r") as f:
             data = json.load(f)
 
         actual_df = get_vector_dataframe([data], spark)
-        expected_df = spark.read.parquet(
-            "./tests/test_data/vector/expected_df.parquet")
+        expected_df = spark.read.parquet(os.path.join(DATA_SET_PATH, "vector/expected_df.parquet"))
 
         self.assertEqual(sorted(actual_df.collect()),
                          sorted(expected_df.collect()))
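Reviewer note: both test modules assert DataFrame equality by sorting fully collected rows. A small helper along these lines could factor that out (hypothetical, not part of this diff); it is fine for these tiny fixtures but would not scale to large frames, since both sides are collected to the driver.

```python
# Hypothetical helper (not in this diff) for the comparison both tests do inline.
from pyspark.sql import DataFrame


def assert_dataframes_equal(actual: DataFrame, expected: DataFrame) -> None:
    """Order-insensitive, row-level equality check for small fixture DataFrames."""
    assert sorted(actual.collect()) == sorted(expected.collect())
```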
open("./tests/test_data/vector/example_annotation.json", - "r") as f: + with open(os.path.join(DATA_SET_PATH, "vector/example_annotation.json"),"r") as f: data = json.load(f) actual_df = get_vector_dataframe([data], spark) - expected_df = spark.read.parquet( - "./tests/test_data/vector/expected_df.parquet") + expected_df = spark.read.parquet(os.path.join(DATA_SET_PATH, "vector/expected_df.parquet")) self.assertEqual(sorted(actual_df.collect()), sorted(expected_df.collect())) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..2de7546 --- /dev/null +++ b/tox.ini @@ -0,0 +1,45 @@ +# this file is *not* meant to cover or endorse the use of tox or pytest or +# testing in general, +# +# It's meant to show the use of: +# +# - check-manifest +# confirm items checked into vcs are in your sdist +# - readme_renderer (when using a ReStructuredText README) +# confirms your long_description will render correctly on PyPI. +# +# and also to help confirm pull requests to this project. + +[tox] +envlist = py{37,38,39,310} + +# Define the minimal tox version required to run; +# if the host tox is less than this the tool with create an environment and +# provision it with a tox that satisfies it under provision_tox_env. +# At least this version is needed for PEP 517/518 support. +minversion = 3.3.0 + +# Activate isolated build environment. tox will use a virtual environment +# to build a source distribution from the source tree. For build tools and +# arguments use the pyproject.toml file as specified in PEP-517 and PEP-518. +isolated_build = true + +[testenv] +deps = + check-manifest >= 0.42 + # If your project uses README.rst, uncomment the following: + # readme_renderer + flake8 + pytest + build + twine +commands = + check-manifest --ignore 'tox.ini,tests/**' + python -m build + python -m twine check dist/* + flake8 . + py.test tests {posargs} + +[flake8] +exclude = .tox,*.egg,build,data +select = E,W,F \ No newline at end of file