In [0]:
%sql
-- Create the Unity Catalog volume used by this notebook for landing files and
-- stream checkpoints. IF NOT EXISTS makes the cell safe to re-run.
CREATE VOLUME IF NOT EXISTS dev_catalog.default.raw_json_data
COMMENT "Volume for raw JSON data storage";


In [0]:
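# One-time setup (kept commented out): create the landing directory for raw files.
# NOTE(review): the Auto Loader cell below reads from ".../landing/raw/", not
# ".../landing/raw_datas" - confirm which directory is intended before running this.
#dbutils.fs.mkdirs("dbfs:/Volumes/dev_catalog/default/raw_json_data/landing/raw_datas")
# One-time setup (kept commented out): create the checkpoint directory for the bronze layer.
#dbutils.fs.mkdirs("dbfs:/Volumes/dev_catalog/default/raw_json_data/checkpoints/customer/")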



In [0]:
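# One-time setup (kept commented out): create the directory passed to Auto Loader as its schema location.
#dbutils.fs.mkdirs("dbfs:/Volumes/dev_catalog/default/raw_json_data/checkpoints/schema/")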

In [0]:
# ---------------------------------------------
# Bronze Notebook (bronze_autoloader_customer.py)
# ---------------------------------------------
from pyspark.sql.functions import input_file_name, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Target table coordinates: the bronze stream writes to {catalog}.{schema}.{table}.
catalog = "dev_catalog"
schema = "default"
table = "bronze_customer"

# Root of the Unity Catalog volume that holds both the landing files and the
# stream's bookkeeping. Deriving it from `catalog`/`schema` keeps the paths in
# step with the target table when either value is changed.
volume_root = f"/Volumes/{catalog}/{schema}/raw_json_data"

# Directory Auto Loader scans for incoming JSON files.
input_path = f"{volume_root}/landing/raw/"
# Structured Streaming checkpoint directory for the bronze write.
checkpoint_path = f"{volume_root}/checkpoints/customer/"
# Directory handed to Auto Loader as cloudFiles.schemaLocation.
schema_path = f"{volume_root}/checkpoints/schema/"

# Static schema for the incoming customer JSON records. The schema is flat:
# every field, including "timestamp", is read as a nullable string.
_customer_columns = (
    "timestamp",
    "customer_id",
    "operation",
    "name",
    "email",
    "address",
    "city",
    "state",
    "zip_code",
)
customer_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _customer_columns]
)

# Incrementally read JSON files from the landing directory with Auto Loader,
# applying the explicit customer schema defined above.
from pyspark.sql.functions import current_timestamp, col

raw_customer_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path)
    .schema(customer_schema)
    .load(input_path)
)

# Audit columns: when the row was ingested and which file it came from.
# The file path is taken from the hidden `_metadata` column of the file source.
ingest_time_col = current_timestamp()
source_file_col = col("_metadata.file_path")

df = (
    raw_customer_stream
    .withColumn("ingest_time", ingest_time_col)
    .withColumn("source_file", source_file_col)
)


# Append the stream to the bronze Delta table.
#  - checkpointLocation: where Structured Streaming records its progress.
#  - mergeSchema: let the Delta write add columns that are new to the table.
#  - availableNow: process everything currently available, then stop.
bronze_query = (
    df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable(f"{catalog}.{schema}.{table}")
)

# toTable() only starts the query and returns immediately. Block until the
# availableNow run has finished so that later cells (e.g. the SELECT on the
# bronze table) see the written data, and so that a stream failure is raised
# in this cell instead of being missed.
bronze_query.awaitTermination()


In [0]:
%sql
-- Inspect the rows that the bronze stream wrote.
SELECT *
FROM dev_catalog.default.bronze_customer