In [11]:
import com.tccc.dna.synapse.spark._
import com.tccc.dna.synapse._
import scala.io.Source

// Print version information through the project's SynapseSpark helper (output format is defined there).
SynapseSpark.printVersions()

// JVM system properties of the running interpreter; the placeholder text is returned when a property is unset.
sys.props.getOrElse("user.timezone", "**Not Available**")
sys.props.getOrElse("java.runtime.name", "**Not Available**")
sys.props.getOrElse("java.runtime.version", "**Not Available**")
sys.props.getOrElse("os.name", "**Not Available**")

// Look up the Spark configuration entry `spark.sql.session.timeZone` through the same helper.
SynapseSpark.getSparkConfProp("spark.sql.session.timeZone")

In [4]:
// Data contract for the AEM `asset_created` landing feed, kept inline as YAML text.
// Elsewhere in this notebook the string is wrapped with `Source.fromString` and handed to
// `DataFrames.loadAndValidateJson` together with the landing path.
//
// Layout of the document:
//   - info / servers / terms : descriptive metadata.
//   - schema                 : a JSON Schema document carried as a YAML block scalar (`|-`).
//   - examples               : one sample payload.
//   - quality                : only commented-out templates; no quality section is active.
//
// Because this is a plain (non-interpolated) triple-quoted string, `$` and `#` are literal
// characters: `#` lines are YAML comments, not Scala comments.
//
// NOTE(review): `servers.dev.location` names container `dam_adobe` and folder `asset_created`,
//   while the load cell of this notebook reads from `dam-adobe` under `aem/asset-created/...`.
//   Confirm which spelling is authoritative.
// NOTE(review): `asset_created_by` is declared under `properties` but is not listed in
//   `required`, although the example payload carries it. Confirm it is meant to be optional.
// NOTE(review): the example nests the event fields under `data.eventDetails`, whereas the schema
//   lists them at the top level with `additionalProperties: false`. Whether the loader unwraps
//   the envelope before validating is not visible here.
val dataContractYaml = """
dataContractSpecification: 0.9.0
id: urn:landing:adobe:aem:asset:asset_created:uuid:a87d84d4-35fb-425b-ac1a-b621f16e0252
info:
  title: asset_lifecycle_created
  version: 1.0.0
  description: All ASSET_CREATED events for every KO Asset since March 1, 2024. Sourced from Adobe AEM.
  owner: TBD
  dataProduct: Adobe Experience Manager
  contact:
    name: Vamsi Batthula
    url: https://github.com/The-Coca-Cola-Company/tccc-mai-consumer-marketing-analytics
    email: vabatthula@coca-cola.com

### servers
servers:
  dev:
    type: adls2
    location: abfss://dam_adobe@sause2tccctstdev001.dfs.core.windows.net/asset_created
    dataset: asset_created_YYYYMMDD HHMMSS.json

### terms
terms:
  usage: >
    This is the raw data dropped by Adobe in KO landing storage account. This is only meant to be consumed by batch pipelines.
  limitations: >
    Not suitable of user consumption as it is not curated.
    Processed only once daily.

### schema
schema:
  type: json-schema
  specification: |-
    {
      "$id": "http://coca-cola.com/data/schemas/adobe/aem/asset_created_1-0-0.json",
      "$comment": "Uses SchemaVer schema as described here https://snowplow.io/blog/introducing-schemaver-for-semantic-versioning-of-schemas/. This event is non-extensible: additionalProperties is set to false so that properties not defined here fail validation.",
      "$schema": "https://json-schema.org/draft/2020-12/schema",
      "title": "asset_created",
      "description": "JSON schema for 'asset_created' event from AEM.",
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "asset_size": {
          "type": "integer",
          "description": "In bytes."
        },
        "original_asset_path": {
          "type": "string"
        },
        "event_type": {
          "type": "string",
          "description": "ALWAYS ASSET_CREATED."
        },
        "asset_name": {
          "type": "string"
        },
        "asset_title": {
          "type": "string"
        },
        "asset_created_by": {
          "type": "string",
          "description": "Email address."
        },
        "mime_type": {
          "type": "string"
        },
        "asset_id": {
          "type": "string"
        },
        "event_creation_timestamp": {
          "type": "string",
          "description": "ISO 8601 DateTime with TimeZone. Format: yyyy-MM-dd'T'HH:mm:ss.SSSZ, Example: 2000-10-31T01:30:00.000-05:00."
        },
        "new_asset_path": {
          "type": "string"
        },
        "event_created_by": {
          "type": "string",
          "description": "Email address."
        },
        "asset_creation_timestamp": {
          "type": "string",
          "description": "ISO 8601 DateTime with TimeZone. Format: yyyy-MM-dd'T'HH:mm:ss.SSSZ, Example: 2000-10-31T01:30:00.000-05:00."
        }
      },
      "required": [
        "event_type",
        "asset_id",
        "asset_name",
        "asset_title",
        "asset_size",
        "original_asset_path",
        "new_asset_path",
        "asset_creation_timestamp",
        "event_creation_timestamp",
        "event_created_by",
        "mime_type"
      ]
    }

examples:
  - type: json
    model: my_table
    data: |-
      [
        {
        "data": {
            "eventDetails": {
                "asset_created_by": "admin",
                "asset_creation_timestamp": "2023-10-13T13:51:25.454+05:30",
                "asset_id": "056c0bd6-b0cd-4e40-9682-9a523e680db2",
                "asset_name": "Testing.pdf",
                "asset_size": 314632,
                "asset_title": "Testing.pdf",
                "event_created_by": "admin",
                "event_creation_timestamp": "2023-10-13T13:51:25.000+05:30",
                "event_type": "ASSET_CREATED",
                "mime_type": "application/pdf",
                "new_asset_path": "NA",
                "original_asset_path": "/content/dam/tccc/marketing/jazib-testing-folder/events/Testing.pdf"
            }
        },
        "datacontenttype": "application/json",
        "id": "0.3125665289950522",
        "source": "urn:uuid:a87d84d4-35fb-425b-ac1a-b621f16e0252",
        "specversion": "1.0",
        "type": "com.adobe.tccc.datalake.analytics.custom"
        }
      ]

### quality

#quality:
#  type: SodaCL
#  specification:
#    checks for my_table:
#      - duplicate_count(order_id) = 0

#quality:
#  type: SodaCL
#  specification:
#    checks for my_table: |-
#      - duplicate_count(id) = 0

#quality:
#  type: SodaCL
#  specification:
#    checks for my_table: "$ref: checks.yaml"

#quality:
#  type: montecarlo
#  specification: |-
#    montecarlo:
#      field_health:
#        - table: my_project:my_dataset.my_table
#          fields:
#            - id
#            - timestamp
#            - amount
#          timestamp_field: timestamp

#quality:
#  type: custom
#  specification: |-
"""

In [5]:
// Sample of one landed file, addressed through the blob endpoint:
//   https://sause2tccctstdev001.blob.core.windows.net/dam-adobe/aem/asset-created/2023/11/15/asset-created-20231115%20113000.json

// Landing-zone coordinates. The glob matches every `asset-created-*.json` file in any
// sub-folder of 2023/11 (the sample above shows such a sub-folder, `15`).
val storageAcct = "xxx"
val containerOrFileSys = "dam-adobe"
val filePath = "aem/asset-created/2023/11/*/asset-created-*.json"
val landingPath = s"abfss://${containerOrFileSys}@${storageAcct}.dfs.core.windows.net/${filePath}"

// Expose the inline contract text as a Source, the form passed to the loader below.
val dataContractSource = Source.fromString(dataContractYaml)

// Read the landed JSON with the contract (behaviour is defined by DataFrames.loadAndValidateJson)
// and render the resulting frame in the notebook.
val landingValidatedDf = DataFrames.loadAndValidateJson(landingPath, dataContractSource)
display(landingValidatedDf)
// Plain-text alternative: landingValidatedDf.show(truncate = false)

In [6]:
// Render the validated frame again, this time with the notebook's `summary` option switched on.
display(landingValidatedDf, summary = true)