In [0]:
#Step 0: Import required modules
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Step 1: Load the raw event CSV into a DataFrame

# Location of the source CSV on the volume
path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"

# The first line of the file supplies the column names, and the column types
# are inferred from the data instead of being declared up front.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(path)

# Show the first five rows
df.show(5)

# Print the column names and the inferred types
df.printSchema()

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 00:00:04|      view|   1004237|2053013555631882655|electr

In [0]:
#Step 2 – Persist the DataFrame as a managed Delta table

# No storage path is passed, so the platform decides where the table's files
# live. "overwrite" replaces the contents of an existing table of that name.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("ecommerce_oct2019")
print("Delta table 'ecommerce_oct2019' created successfully!")

Delta table 'ecommerce_oct2019' created successfully!


In [0]:
# Start from a clean slate: remove any earlier copy of the SQL-created table
# so that the CREATE TABLE statement in Step 3 cannot collide with it.
spark.sql(
    "DROP TABLE IF EXISTS ecommerce_oct2019_sql"
)


DataFrame[]

In [0]:
#Step 3 – Create a second Delta table, this time through SQL
# CREATE TABLE ... AS SELECT: the new table takes both its columns and its
# rows from ecommerce_oct2019.
create_table_sql = """
    CREATE TABLE ecommerce_oct2019_sql
    USING DELTA
    AS SELECT * FROM ecommerce_oct2019   
    """
spark.sql(create_table_sql)
print("Delta table 'ecommerce_oct2019_sql' created via SQL!")


Delta table 'ecommerce_oct2019_sql' created via SQL!


In [0]:
#Step 4: Confirm that the table exists and holds data

# The first statement lists the tables in the current schema; the second
# previews five rows of the SQL-created table. Both results are printed.
verification_queries = (
    "SHOW TABLES",
    "SELECT * FROM ecommerce_oct2019_sql LIMIT 5",
)
for query in verification_queries:
    spark.sql(query).show()


+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|   ecommerce_oct2019|      false|
| default|ecommerce_oct2019...|      false|
+--------+--------------------+-----------+

+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|2019-10-23 09:44:06|      view|   2800638|2053013563835941749|appliances.kitche...|   NULL|194.32|523560041|f2da82d4-3aca-4eb...|
|2019-10-23 09:44:06|      view|   1306746|2053013558920217191|  computers.notebook| lenovo|873.61|532561846|d2a62c6a-8529-469...|
|2019-10-23 09:44:06|      view|   4700549|2053013560899928785|auto.accessories.

In [0]:
# Inspect the Delta table's metadata (format, name, file count, size, ...)
detail_df = spark.sql("DESCRIBE DETAIL ecommerce_oct2019_sql")
detail_df.show()

+------+--------------------+--------------------+-----------+--------+--------------------+-------------------+----------------+-----------------+--------+-----------+--------------------+----------------+----------------+--------------------+--------------------+-------------+
|format|                  id|                name|description|location|           createdAt|       lastModified|partitionColumns|clusteringColumns|numFiles|sizeInBytes|          properties|minReaderVersion|minWriterVersion|       tableFeatures|          statistics|clusterByAuto|
+------+--------------------+--------------------+-----------+--------+--------------------+-------------------+----------------+-----------------+--------+-----------+--------------------+----------------+----------------+--------------------+--------------------+-------------+
| delta|7a471df9-654b-4df...|workspace.default...|       NULL|        |2026-01-12 04:13:...|2026-01-12 04:13:21|              []|               []|      12| 147

In [0]:
#Schema Enforcement Test

#Step 5: Build a DataFrame whose schema does NOT match the table
from pyspark.sql import Row

# A single row with columns id / name / value, none of which is a column of
# the e-commerce table.
mismatched_row = Row(id=1, name="Vid", value=100)
wrong_schema_df = spark.createDataFrame([mismatched_row])


In [0]:
#Step 6: Try appending it to the Delta table
# Imported here so this cell stays runnable on its own.
from pyspark.errors import AnalysisException

# The append is expected to be rejected: wrong_schema_df carries the columns
# id / name / value, which do not match the table's columns.
try:
    wrong_schema_df.write.format("delta").mode("append").saveAsTable("ecommerce_oct2019_sql")
except AnalysisException as e:
    # Only analysis errors are reported as schema enforcement. Any other
    # failure propagates instead of being mislabelled as a successful
    # demonstration.
    print("Schema enforcement triggered!")
    print(e)
else:
    # No exception means the mismatched row was actually written to the table,
    # so say so loudly rather than finishing silently.
    print("WARNING: append succeeded - schema enforcement was NOT triggered!")

Schema enforcement triggered!
[_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 7a471df9-654b-4df5-971b-d94b8ed284b7).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- event_time: timestamp (nullable = true)
-- event_type: string (nullable = true)
-- product_id: integer (nullable = true)
-- category_id: long (nullable = true)
-- category_code: string (nullable = true)
-- brand: string (nullable = true)
-- price: double (nullable = true)
-- user_id: integer (nullable = true)
-- user_session: string (nullable = true)


Data schema:
root
-- id: long (nullable = true)
-- name: string (nullable = true)
-- value: long (nullable = true)

         
Table ACLs are enabled in this cluste

## Day 5 task (continues from the Day 4 task)

In [0]:
#STEP 7: Incremental MERGE (UPSERT)
from delta.tables import DeltaTable
from pyspark.sql import functions as F


#Get Delta table
delta_table = DeltaTable.forName(spark, "ecommerce_oct2019_sql")

# Columns used to identify one event. event_time + user_id alone may not be
# unique (one user can produce several events within the same second), and a
# non-unique key is harmful in two ways:
#   * several source rows matching one target row makes Delta abort the MERGE;
#   * one source row matching several target rows overwrites every one of them
#     with the same values, because whenMatchedUpdateAll copies all columns.
merge_keys = ["event_time", "event_type", "product_id", "user_id", "user_session"]

# Simulate incremental data: ten already-loaded events with the price raised
# by 100. Because these rows come from the same data the table was built from,
# this run exercises the UPDATE branch; the INSERT branch only fires for
# source rows whose key is not in the table yet.
# NOTE(review): limit() without an ordering does not guarantee which ten rows
# are picked.
updates_df = (
    df.limit(10)
    # Guarantee at most one source row per key so the MERGE cannot be ambiguous.
    .dropDuplicates(merge_keys)
    .withColumn("price", F.col("price") + 100)
)

# Null-safe equality (<=>) so that a NULL in a key column (the table schema
# declares every column nullable) still matches its counterpart instead of
# falling through to the INSERT branch.
merge_condition = " AND ".join(f"t.{key} <=> s.{key}" for key in merge_keys)

#Merge: update matching events, insert the ones the table does not have yet
delta_table.alias("t").merge(
    updates_df.alias("s"),
    merge_condition
).whenMatchedUpdateAll() \
.whenNotMatchedInsertAll() \
.execute()


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# STEP 8: Time Travel (read old version)
#1. Check history
history_df = spark.sql("DESCRIBE HISTORY ecommerce_oct2019_sql")
history_df.show(truncate = False)

# 2. Read first version (version 0 is the commit that created the table)
old_version = spark.read.format("delta") \
    .option("versionAsOf", 0) \
    .table("ecommerce_oct2019_sql")
old_version.show(20)

# 3. Read by timestamp
# The timestamp is taken from the table history instead of being hard-coded:
# re-creating the table restarts its history, so a fixed literal from an
# earlier run would point to a moment before the first commit and fail.
# The history is small, so collecting it to the driver is cheap.
commits = (
    history_df
    .select("version", "operation", F.col("timestamp").cast("string").alias("commit_ts"))
    .orderBy("version")
    .collect()
)
# Prefer the first MERGE commit (the state right after the upsert); fall back
# to the earliest commit when the table has not been merged into yet.
merge_commits = [commit for commit in commits if commit["operation"] == "MERGE"]
target_commit = merge_commits[0] if merge_commits else commits[0]
print(f"Reading table as of {target_commit['commit_ts']} (version {target_commit['version']})")

# NOTE: the name `yesterday` is kept for compatibility with any later cell;
# it holds the snapshot at the commit chosen above, not literally yesterday's.
yesterday = spark.read.format("delta") \
    .option("timestampAsOf", target_commit["commit_ts"]) \
    .table("ecommerce_oct2019_sql")

yesterday.show(20)

+-------+-----------------------+--------------+-------------------+----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+------------------------+-----------+-----------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
#STEP 9: OPTIMIZE & ZORDER
# Run OPTIMIZE with Z-ordering on event_type and user_id, then print the
# path/metrics result that the command returns.
optimize_sql = """OPTIMIZE ecommerce_oct2019_sql 
          ZORDER BY (event_type, user_id)
          """
spark.sql(optimize_sql).show()


+----+--------------------+
|path|             metrics|
+----+--------------------+
|    |{0, 0, {NULL, NUL...|
+----+--------------------+



In [0]:
#STEP 10: Remove files that are no longer needed (safe retention)
# RETAIN 168 HOURS is a 7-day retention window (168 / 24 = 7).
vacuum_sql = """
          VACUUM ecommerce_oct2019_sql RETAIN 168 HOURS
          """
spark.sql(vacuum_sql).show()

+----+
|path|
+----+
|    |
+----+

