In [None]:
# Declare the notebook's text widget "json_data" (empty by default, shown with
# the label "JSON Data"); a later cell reads its value via dbutils.widgets.get.
dbutils.widgets.text(
    name="json_data",
    defaultValue="",
    label="JSON Data",
)

In [None]:
import json
from pyspark.sql import SparkSession

In [None]:
# Define a schema

from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, FloatType, BooleanType, TimestampType, MapType, ArrayType
)

# Schema for one entry of the top-level "comments" array. Every field is
# nullable; createdAt/updatedAt are kept as plain strings and createdBy/updatedBy
# as string-to-string maps.
comment_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("correspondence", ArrayType(MapType(StringType(), StringType()))),
        ("x", FloatType()),
        ("y", FloatType()),
        ("resolved", BooleanType()),
        ("createdAt", StringType()),
        ("createdBy", MapType(StringType(), StringType())),
        ("creationType", StringType()),
        ("updatedAt", StringType()),
        ("updatedBy", MapType(StringType(), StringType())),
    )
])

# A point struct: two nullable float fields named "x" and "y".
point_schema = StructType(
    [StructField(axis_name, FloatType(), True) for axis_name in ("x", "y")]
)

# A cuboid is described by four nullable points named f1, f2, r1 and r2,
# each of which uses point_schema.
cuboid_points_schema = StructType(
    [
        StructField(point_name, point_schema, True)
        for point_name in ("f1", "f2", "r1", "r2")
    ]
)

# Schema for one element of the "instances" array. All fields are nullable.
# The geometry columns (bbox/polygon/cuboid/ellipse/point) sit side by side in
# the same struct, next to the class, grouping and audit fields.
instance_schema = (
    StructType()
    .add("instance_type", StringType(), True)
    .add("classId", IntegerType(), True)
    .add("probability", IntegerType(), True)
    # Geometry payloads.
    .add("bbox_points", MapType(StringType(), FloatType()), True)
    .add("polygon_points", ArrayType(FloatType()), True)
    .add("polygon_exclude", ArrayType(ArrayType(FloatType())), True)
    .add("cuboid_points", cuboid_points_schema, True)
    .add("ellipse_points", MapType(StringType(), FloatType()), True)
    .add("point_points", MapType(StringType(), FloatType()), True)
    # Grouping, state and attributes.
    .add("groupId", IntegerType(), True)
    .add("locked", BooleanType(), True)
    .add("attributes", ArrayType(MapType(StringType(), StringType())), True)
    .add("trackingId", StringType(), True)
    .add("error", StringType(), True)
    # Creation / update audit fields (timestamps stored as strings).
    .add("createdAt", StringType(), True)
    .add("createdBy", MapType(StringType(), StringType()), True)
    .add("creationType", StringType(), True)
    .add("updatedAt", StringType(), True)
    .add("updatedBy", MapType(StringType(), StringType()), True)
    .add("className", StringType(), True)
)

# Top-level schema of one annotation record: flat image/project metadata
# followed by the nested "instances" and "comments" arrays. All fields nullable.
schema = (
    StructType()
    .add("image_height", IntegerType(), True)
    .add("image_width", IntegerType(), True)
    .add("image_name", StringType(), True)
    .add("projectId", IntegerType(), True)
    .add("isPredicted", BooleanType(), True)
    .add("status", StringType(), True)
    .add("pinned", BooleanType(), True)
    .add("annotatorEmail", StringType(), True)
    .add("qaEmail", StringType(), True)
    .add("instances", ArrayType(instance_schema), True)
    .add("comments", ArrayType(comment_schema), True)
)

In [None]:
# Read the JSON payload handed to the notebook through the "json_data" widget.
data_json = dbutils.widgets.get("json_data")
if not data_json.strip():
    # Fail with a clear message instead of an opaque JSONDecodeError.
    raise ValueError('The "json_data" widget is empty; there is nothing to merge.')
data = json.loads(data_json)

# createDataFrame needs a collection of rows, so wrap a single JSON object.
if isinstance(data, dict):
    data = [data]

spark = SparkSession.builder.getOrCreate()
# NOTE(review): createDataFrame verifies Python values against the schema, so an
# integer-valued JSON number in a FloatType field would be rejected -- confirm
# that payloads always carry floats there.
df = spark.createDataFrame(data, schema=schema)

# spark.sql() resolves names against the catalog, not Python variables, so the
# DataFrame has to be registered as a temp view before MERGE can reference it.
source_view = "annotation_updates"
df.createOrReplaceTempView(source_view)

# Target Delta table that receives the upsert.
delta_table = "super_annotate.annotations"

# Upsert keyed on (image_name, projectId): matching rows are overwritten,
# unmatched rows are inserted.
merge_sql = f'''
MERGE INTO {delta_table} AS target
USING {source_view} AS source
ON source.image_name = target.image_name AND source.projectId = target.projectId
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
'''

# Execute the merge statement
spark.sql(merge_sql)
