In [0]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

builder = (
    SparkSession.builder.appName("DeltaDemo")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [0]:
data = [
    (1, "John", "Electronics", 2, 300),
    (2, "Sara", "Clothing", 1, 50),
    (3, "Mike", "Electronics", 4, 600),
    (4, "Nina", "Clothing", 3, 150),
]
columns = ["order_id", "customer_name", "category", "quantity", "amount"]

df = spark.createDataFrame(data, columns)

# Save as Delta
# Save Delta table to DBFS tmp folder
df.write.format("delta").mode("overwrite").save("/dbfs/tmp/orders_delta")

# Read it back
df2 = spark.read.format("delta").load("/dbfs/tmp/orders_delta")
df2.show()


+--------+-------------+-----------+--------+------+
|order_id|customer_name|   category|quantity|amount|
+--------+-------------+-----------+--------+------+
|       1|         John|Electronics|       2|   300|
|       2|         Sara|   Clothing|       1|    50|
|       3|         Mike|Electronics|       4|   600|
|       4|         Nina|   Clothing|       3|   150|
+--------+-------------+-----------+--------+------+



In [0]:
spark.sql("DROP TABLE IF EXISTS orders_managed")
spark.sql("DROP TABLE IF EXISTS orders_unmanaged")

DataFrame[]

1. Managed vs Unmanaged Tables

In [0]:

spark.sql("""
    CREATE TABLE orders_managed
    USING DELTA
    AS SELECT * FROM delta.`dbfs:/tmp/orders_delta`
""")



DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.sql("""
CREATE TABLE orders_unmanaged
USING DELTA
LOCATION '/tmp/orders_delta'
""")


2. CRUD Operations

In [0]:
from pyspark.sql import functions as F

# Read from the table
df = spark.table("orders_managed")
display(df)



order_id,customer_name,category,quantity,amount
4,Nina,Clothing,3,170
3,Mike,Electronics,4,600
1,Alice,Clothing,3,250
5,Tom,Electronics,1,300
6,Jerry,Furniture,2,500


In [0]:
# UPDATE: Clothing → amount + 20
spark.sql("""
UPDATE orders_managed
SET amount = amount + 20
WHERE category = 'Clothing'
""")
display(df)

order_id,customer_name,category,quantity,amount
3,Mike,Electronics,4,600
6,Jerry,Furniture,2,500
4,Nina,Clothing,3,210
1,Alice,Clothing,3,290


In [0]:
# DELETE: Remove quantity < 2
spark.sql("""
DELETE FROM orders_managed
WHERE quantity < 2
""")
display(df)


order_id,customer_name,category,quantity,amount
4,Nina,Clothing,3,210
1,Alice,Clothing,3,290
3,Mike,Electronics,4,600
6,Jerry,Furniture,2,500


In [0]:
# MERGE: New & Updated Orders
from pyspark.sql import Row

new_data = [
    Row(order_id=1, customer_name="Alice", category="Clothing", quantity=3, amount=250),  # updated
    Row(order_id=5, customer_name="Tom", category="Electronics", quantity=1, amount=300), # new
    Row(order_id=6, customer_name="Jerry", category="Furniture", quantity=2, amount=500)  # new
]
new_df = spark.createDataFrame(new_data)

new_df.createOrReplaceTempView("updates")

spark.sql("""
MERGE INTO orders_managed AS target
USING updates AS source
ON target.order_id = source.order_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")
display(df)

order_id,customer_name,category,quantity,amount
4,Nina,Clothing,3,210
3,Mike,Electronics,4,600
5,Tom,Electronics,1,300
6,Jerry,Furniture,2,500
1,Alice,Clothing,3,250


3. History & Time Travel

In [0]:
# View history
spark.sql("DESCRIBE HISTORY orders_managed").show(truncate=False)

# Read version 0
df_v0 = spark.read.format("delta").option("versionAsOf", 0).table("orders_managed")

# Latest version
df_latest = spark.table("orders_managed")

print("Version 0:")
display(df_v0)

print("Latest version:")
display(df_latest)


+-------+-----------------------+---------------+----------------------------------+----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

order_id,customer_name,category,quantity,amount
1,John,Electronics,2,300
2,Sara,Clothing,1,50
3,Mike,Electronics,4,600
4,Nina,Clothing,3,150


Latest version:


order_id,customer_name,category,quantity,amount
4,Nina,Clothing,3,210
3,Mike,Electronics,4,600
1,Alice,Clothing,3,250
5,Tom,Electronics,1,300
6,Jerry,Furniture,2,500


4. Incremental Load Pattern

In [0]:
# New mini-batch orders
batch_new = [
    (5, "Tom", "Electronics", 1, 300),   # already present in merge above (example only)
    (6, "Jerry", "Furniture", 2, 500)    # already present (example only)
]
columns = ["order_id", "customer_name", "category", "quantity", "amount"]
batch_df = spark.createDataFrame(batch_new, columns)

In [0]:
# Append
batch_df.write.format("delta").mode("append").saveAsTable("orders_managed")
# Final result
display(spark.table("orders_managed"))

order_id,customer_name,category,quantity,amount
4,Nina,Clothing,3,210
3,Mike,Electronics,4,600
1,Alice,Clothing,3,250
5,Tom,Electronics,1,300
6,Jerry,Furniture,2,500
5,Tom,Electronics,1,300
6,Jerry,Furniture,2,500
5,Tom,Electronics,1,300
6,Jerry,Furniture,2,500


5. Simple Streaming Read

In [0]:
stream_df = (
    spark.readStream
         .format("delta")
         .load("/tmp/orders_delta")
)

query = (
    stream_df.writeStream
             .format("console")
             .outputMode("append")
             .trigger(processingTime="5 seconds")
             .start()
)



In [0]:
query.stop()


6. Visualization

In [0]:

pdf = spark.table("orders_managed").toPandas()
category_sum = pdf.groupby("category")["amount"].sum().reset_index()
for cat in ["Electronics", "Clothing", "Furniture"]:
    if cat not in category_sum["category"].values:
        category_sum = pd.concat([
            category_sum,
            pd.DataFrame({"category": [cat], "amount": [0]})
        ], ignore_index=True)
category_sum = category_sum.sort_values("category")
max_amount = category_sum["amount"].max()
for _, row in category_sum.iterrows():
    category = row["category"]
    amount = row["amount"]
    bar_length = int((amount / max_amount) * 20) if max_amount > 0 else 0
    bar = "█" * bar_length
    print(f"{category:<12} | {bar:<20} {amount}")


Clothing     | ██████               460
Electronics  | ████████████████████ 1500
Furniture    | ████████████████████ 1500
