In [0]:
#Libraries management
from pyspark import pipelines as pl
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Glob pattern for the customer JSON source files; the bronze ingest flow passes it to .load().
volume_path = "/Volumes/workspace/sd_schema/datastore/customer_*.json"

In [0]:
# Bronze layer: declare the streaming table that receives the raw customer records.
pl.create_streaming_table("cust_bronze_sd_addNew")


# Append flow that feeds the bronze table from the raw JSON files.
@pl.append_flow(
    target="cust_bronze_sd_addNew",  # object (table) name
    name="cust_bronze_sd_addNew_ingest_flow",  # flow name
)
def cust_bronze_sd_addNew_ingest_flow():
    """Stream the customer JSON files through the cloudFiles source into bronze.

    Column types are inferred from the data ("cloudFiles.inferColumnTypes").
    Schema evolution uses "addNewColumns"; "failOnNewColumns" was tried and
    rejected because customer_data_1.json and customer_data_2.json have
    different schemas, which made the stream fail with
    [UNKNOWN_FIELD_EXCEPTION.NEW_FIELDS_IN_RECORD_WITH_FILE_PATH] and stop
    processing.

    Returns:
        The streaming DataFrame read from ``volume_path`` with two extra
        columns: ``ingestion_datetime`` (current_timestamp()) and
        ``source_filename`` (taken from ``_metadata.file_path``).
    """
    json_reader = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferColumnTypes", "true")  # auto-scan the schema
        .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    )
    raw_customers = json_reader.load(volume_path)

    # Lineage columns: when the row was ingested and which file it came from.
    stamped = raw_customers.withColumn("ingestion_datetime", current_timestamp())
    return stamped.withColumn("source_filename", col("_metadata.file_path"))


In [0]:
# Silver layer: streaming table guarded by the "valid_id" expectation, registered
# under expect_all_or_drop with the constraint CustomerID IS NOT NULL.
pl.create_streaming_table(
    name="cust_silver_sd_addNew",
    expect_all_or_drop={"valid_id": "CustomerID IS NOT NULL"},
)


@pl.append_flow(target="cust_silver_sd_addNew", name="cust_silver_sd_addNew_flow")
def cust_silver_sd_addNew_flow():
    """Read the bronze stream and cast selected columns to their target types.

    A cast is applied only when the column is present in the bronze schema,
    so the flow does not fail when a given column is absent from the schema.

    Returns:
        The streaming DataFrame from ``cust_bronze_sd_addNew`` with
        ``SignupDate`` cast to DateType and ``Age`` / ``CreditScore`` cast to
        IntegerType, for whichever of those columns exist.
    """
    # (column, target type) pairs, applied in this order.
    column_casts = (
        ("SignupDate", DateType()),      # string -> date (from customer_data_1.json)
        ("Age", IntegerType()),          # from customer_data_2.json
        ("CreditScore", IntegerType()),  # from customer_data_4.json
    )

    typed_stream = spark.readStream.table("cust_bronze_sd_addNew")
    for column_name, target_type in column_casts:
        if column_name not in typed_stream.columns:
            continue
        typed_stream = typed_stream.withColumn(
            column_name, col(column_name).cast(target_type)
        )

    # Further silver-layer steps (data quality checks, business logic,
    # derived columns, data cleansing) can be added here.
    return typed_stream
