In [0]:
%sql
-- One-time setup: create the catalog, the schema inside it, and the volume inside that schema.
-- IF NOT EXISTS makes each statement a no-op when the object is already there, so the cell can be re-run.
-- NOTE(review): "profiile" looks like a misspelling of "profile". Every path in this notebook
-- uses the same spelling, so rename it everywhere at once or leave it as is.
CREATE CATALOG IF NOT EXISTS customer_profiile_ingestion;
CREATE SCHEMA IF NOT EXISTS customer_profiile_ingestion.customer;
CREATE VOLUME IF NOT EXISTS customer_profiile_ingestion.customer.customer_volume;

In [0]:
# Create the "raw" landing folder inside the customer_volume volume.
dbutils.fs.mkdirs("dbfs:/Volumes/customer_profiile_ingestion/customer/customer_volume/raw/")

In [0]:
# Create the raw/customers folder; the Day-1 customer CSV files are written here.
dbutils.fs.mkdirs("dbfs:/Volumes/customer_profiile_ingestion/customer/customer_volume/raw/customers")

# STEP 1: Bronze – Read Raw Data (Day-1 Schema)

In [0]:
# Day-1 sample records: data rows only, no header row.
data = [
    [1, "Sunil", "sunil@gmail.com", "India"],
    [2, "Ravi", "ravi@gmail.com", "India"],
]

# Column names are supplied separately from the rows; column types are inferred from the values.
columns = ["customer_id", "name", "email", "country"]

df1_raw_bronze = spark.createDataFrame(data, columns)

# printSchema() writes the schema to stdout itself and returns None, so it must not be
# wrapped in print() -- doing so emits a stray "None" line after the schema.
df1_raw_bronze.printSchema()

# Land the records as CSV (with a header row) in the raw zone, replacing any previous output.
df1_raw_bronze.write.mode("overwrite").csv(
    "dbfs:/Volumes/customer_profiile_ingestion/customer/customer_volume/raw/customers",
    header=True,
)
df1_raw_bronze.display()

In [0]:
# Register the Day-1 DataFrame as a temporary view named "temp" so that SQL cells can query it.
df1_raw_bronze.createOrReplaceTempView("temp")

In [0]:
%sql
-- List the column names and data types of the "temp" view.
DESCRIBE TABLE temp

# STEP 2: Bronze – Write as Delta (Safe Storage)

In [0]:
# Create the "bronze" folder inside the customer_volume volume.
dbutils.fs.mkdirs("dbfs:/Volumes/customer_profiile_ingestion/customer/customer_volume/bronze/")

In [0]:
# Create the bronze/customers folder; the customer Delta table is saved at this location.
dbutils.fs.mkdirs("dbfs:/Volumes/customer_profiile_ingestion/customer/customer_volume/bronze/customers")


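A common practice is to convert raw files into Delta format as soon as they land, so that the bronze layer is stored in a transactional, schema-enforced table.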

In [0]:
# Persist the Day-1 records as a Delta table at bronze/customers, replacing any existing contents.
(
    df1_raw_bronze.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/Volumes/customer_profiile_ingestion/customer/customer_volume/bronze/customers")
)

# STEP 3: Schema Change Happens (Day-2)<br>
**New Raw File**<br>
customer_id,name,email,country,phone_number<br>
3,Asha,asha@gmail.com,USA,9876543210<br>

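**Without Schema Handling**<br>
The following append will FAIL, because the Day-2 data contains a `phone_number` column that the existing Delta table does not have:<br>
`df.write.format("delta").mode("append").save(path)`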

# STEP 4: Handle Schema Evolution (REAL-TIME FIX)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Read back the bronze Delta table and show its rows.
df_bronze_new = spark.read.format("delta").load(
    "/Volumes/customer_profiile_ingestion/customer/customer_volume/bronze/customers/"
)
df_bronze_new.display()

# Target Delta location; both writes below go here.
# NOTE(review): this folder sits inside the bronze/customers Delta table directory.
# Nesting one Delta table inside another is risky (e.g. maintenance commands run on the
# parent table) -- confirm this location is intended.
streaming_target_path = "/Volumes/customer_profiile_ingestion/customer/customer_volume/bronze/customers/Streaming_data/real_time_streaming"

schema = df_bronze_new.schema

# Reset the target by overwriting it with an empty DataFrame that carries the bronze schema.
# This runs unconditionally, so anything already stored at the target is replaced.
(
    spark.createDataFrame([], schema)
    .write
    .format("delta")
    .mode("overwrite")
    .save(streaming_target_path)
)

# Append the bronze rows. mergeSchema allows an append to add columns the target lacks;
# here the target was just created from the same schema, so no columns are actually added.
(
    df_bronze_new.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(streaming_target_path)
)

display(df_bronze_new)