In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# List the contents of the e-commerce data volume (Python form of `%fs ls`).
display(dbutils.fs.ls("/Volumes/workspace/ecommerce/ecommerce_data"))

In [0]:
# Load the October event data from the Parquet folder in the Unity Catalog volume.
oct_parquet_path = "/Volumes/workspace/ecommerce/ecommerce_data/parquet/oct/"
df_oct = spark.read.format("parquet").load(oct_parquet_path)

In [0]:
# Print the column names and types Spark read from the October Parquet files.
df_oct.printSchema()

In [0]:
# Row count of the October data; count() triggers a full scan of the source.
total_records = df_oct.count()
print("Total records:", total_records)

#### In modern Databricks (with Unity Catalog):

#### Public DBFS root (/delta, /dbfs, /FileStore, etc.) is DISABLED

#### You cannot write directly to /delta/...

#### You must write to Unity Catalog locations, i.e. Volumes or Managed Tables,

#### which are designed for security & governance.

In [0]:
# Convert to Delta -- attempt the legacy DBFS-root path.
# On Unity Catalog workspaces this write is expected to be rejected (see the notes
# above), so the error is caught and shown instead of aborting "Run All".
try:
    df_oct.write.format("delta").mode("overwrite").save("/delta/events")
except Exception as e:
    print("Write to DBFS root rejected:")
    print(e)

In [0]:
# Write the October events as a path-based Delta table inside the Unity Catalog
# volume. Overwrite mode replaces any previous contents, so the cell is re-runnable.
events_oct_delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct"
(
    df_oct.write
    .format("delta")
    .mode("overwrite")
    .save(events_oct_delta_path)
)


In [0]:
# Create managed table: register the October events in Unity Catalog as
# workspace.ecommerce.events_oct, replacing the table contents on every run.
events_oct_writer = df_oct.write.format("delta").mode("overwrite")
events_oct_writer.saveAsTable("workspace.ecommerce.events_oct")

In [0]:
%sql
-- Show table-level details for the managed table workspace.ecommerce.events_oct.
DESCRIBE DETAIL workspace.ecommerce.events_oct;

In [0]:
# Create a second Delta table, events_delta, as a full copy of events_oct (CTAS).
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails with
# "table already exists" on the second run, unlike the overwrite-mode writes above.
spark.sql("""
CREATE OR REPLACE TABLE workspace.ecommerce.events_delta USING DELTA
AS
SELECT *
FROM workspace.ecommerce.events_oct
""")

### SCHEMA ENFORCEMENT
#### I design pipelines with schema enforcement, schema evolution, and data quality checks using Delta Lake.
#### Delta Lake will NOT allow data that does not match the table's schema to be written.
#### No silent corruption. No guessing. No bad data.
#### Schema enforcement - Block wrong data.
#### Schema evolution - Allow controlled new columns


#### Schema enforcement == data firewall. We test it to make sure garbage never enters our system.

In [0]:
# Schema-enforcement check: try to append a DataFrame whose columns (x, y, z) do not
# match workspace.ecommerce.events_oct and confirm that the write is rejected.
#
# The DataFrame is built outside the try block so that a failure while constructing
# it is not mistaken for schema enforcement.
wrong_schema = spark.createDataFrame(
    [("a", "b", "c")],
    ["x", "y", "z"],
)

try:
    wrong_schema.write.format("delta") \
        .mode("append") \
        .saveAsTable("workspace.ecommerce.events_oct")
except Exception as e:
    # Expected path: the mismatched append is refused and the error is displayed.
    print("Schema enforcement triggered:")
    print(e)
else:
    # The mismatched rows were accepted, so the check has failed. Fail loudly
    # instead of finishing silently as if enforcement had worked.
    raise AssertionError(
        "Schema enforcement did NOT trigger: the mismatched DataFrame was "
        "appended to workspace.ecommerce.events_oct"
    )

#### This confirms schema enforcement is working correctly in your Delta + Unity Catalog setup.

In [0]:
from pyspark.sql.functions import lit

# Derive a DataFrame with one extra constant column, "source_system" = "web",
# to use as input for the schema-evolution append.
source_system_col = lit("web")
df_new_col = df_oct.withColumn("source_system", source_system_col)

In [0]:
# Print the schema of df_new_col to confirm the added "source_system" column is present.
df_new_col.printSchema()

In [0]:
# Schema evolution: append the widened DataFrame with mergeSchema enabled so that the
# new "source_system" column is added to the table instead of the write being rejected.
# NOTE: this is an append of df_oct's rows (plus the new column) onto a table that was
# itself written from df_oct, so every run adds another copy of the October data.
(
    df_new_col.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable("workspace.ecommerce.events_oct")
)

In [0]:
%sql
-- List the table's columns after the mergeSchema append; "source_system" should now appear.
DESCRIBE workspace.ecommerce.events_oct;