# Delta Lake Test Notebook

This notebook demonstrates the usage of Delta Lake with Spark 3.4, along with Pandas and Polars integration.

In [12]:
# Import required libraries
import pyspark
from pyspark.sql import SparkSession
from delta import *
import pandas as pd
import polars as pl
import os

print("PySpark version:", pyspark.__version__)
print("Pandas version:", pd.__version__)
print("Polars version:", pl.__version__)

PySpark version: 3.4.4
Pandas version: 2.1.4
Polars version: 0.20.31


In [13]:
# Initialize Spark with Delta Lake support
builder = SparkSession.builder.appName("DeltaLakeDemo") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("INFO")

print(f"Spark version: {spark.version}")
print(f"Spark UI available at: http://localhost:4040")

Spark version: 3.4.4
Spark UI available at: http://localhost:4040


In [14]:
# Create sample data with Pandas
pandas_df = pd.DataFrame({
    'id': range(1, 101),
    'name': [f'User_{i}' for i in range(1, 101)],
    'age': [20 + (i % 50) for i in range(100)],
    'city': ['New York', 'London', 'Tokyo', 'Sydney', 'Paris'] * 20
})

print("Pandas DataFrame:")
print(pandas_df.head())
print(f"Shape: {pandas_df.shape}")

Pandas DataFrame:
   id    name  age      city
0   1  User_1   20  New York
1   2  User_2   21    London
2   3  User_3   22     Tokyo
3   4  User_4   23    Sydney
4   5  User_5   24     Paris
Shape: (100, 4)


In [15]:
# Convert Pandas to Spark DataFrame
spark_df = spark.createDataFrame(pandas_df)

print("Spark DataFrame Schema:")
spark_df.printSchema()
print("\nFirst 5 rows:")
spark_df.show(5)

Spark DataFrame Schema:
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)


First 5 rows:


25/08/10 01:33:03 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/08/10 01:33:03 INFO SharedState: Warehouse path is 'file:/workspace/notebooks/spark-warehouse'.
25/08/10 01:33:03 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/08/10 01:33:03 INFO DAGScheduler: Got job 0 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/08/10 01:33:03 INFO DAGScheduler: Final stage: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0)
25/08/10 01:33:03 INFO DAGScheduler: Parents of final stage: List()
25/08/10 01:33:03 INFO DAGScheduler: Missing parents: List()
25/08/10 01:33:03 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[6] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/08/10 01:33:03 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 13.1 KiB, free 434.4 MiB)
25/08/10 01:33:03 INFO MemoryStore

+---+------+---+--------+
| id|  name|age|    city|
+---+------+---+--------+
|  1|User_1| 20|New York|
|  2|User_2| 21|  London|
|  3|User_3| 22|   Tokyo|
|  4|User_4| 23|  Sydney|
|  5|User_5| 24|   Paris|
+---+------+---+--------+
only showing top 5 rows



25/08/10 01:33:03 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1976 bytes result sent to driver
25/08/10 01:33:03 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 275 ms on 3a5b1b71f157 (executor driver) (1/1)
25/08/10 01:33:03 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
25/08/10 01:33:03 INFO PythonAccumulatorV2: Connected to AccumulatorServer at host: 127.0.0.1 port: 36805
25/08/10 01:33:03 INFO DAGScheduler: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0) finished in 0.281 s
25/08/10 01:33:03 INFO DAGScheduler: Job 0 is finished. Cancelling potential speculative or zombie tasks for this job
25/08/10 01:33:03 INFO TaskSchedulerImpl: Killing all running tasks in stage 0: Stage finished
25/08/10 01:33:03 INFO DAGScheduler: Job 0 finished: showString at NativeMethodAccessorImpl.java:0, took 0.283483 s


In [16]:
# Write data to Delta Lake format
delta_path = "/workspace/data/users_delta"

# Write as Delta table
spark_df.write.format("delta").mode("overwrite").save(delta_path)

print(f"Data written to Delta Lake at: {delta_path}")
print("Files in Delta directory:")
os.system(f"ls -la {delta_path}")

25/08/10 01:33:03 INFO MapPartitionsRDD: Removing RDD 26 from persistence list
25/08/10 01:33:03 INFO DelegatingLogStore: LogStore `LogStoreAdapter(io.delta.storage.HDFSLogStore)` is used for scheme `file`
25/08/10 01:33:03 INFO DeltaLog: Loading version 0.
25/08/10 01:33:03 INFO DeltaLogFileIndex: Created DeltaLogFileIndex(JSON, numFilesInSegment: 1, totalFileSize: 5873)
25/08/10 01:33:03 INFO FileSourceStrategy: Pushed Filters: 
25/08/10 01:33:03 INFO FileSourceStrategy: Post-Scan Filters: 
25/08/10 01:33:03 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 202.2 KiB, free 434.2 MiB)
25/08/10 01:33:03 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 35.1 KiB, free 434.1 MiB)
25/08/10 01:33:03 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 3a5b1b71f157:40499 (size: 35.1 KiB, free: 434.4 MiB)
25/08/10 01:33:03 INFO SparkContext: Created broadcast 1 from toString at String.java:2951
25/08/10 01:33:03 INFO Fil

Data written to Delta Lake at: /workspace/data/users_delta
Files in Delta directory:
total 192
drwxr-xr-x 51 root root 1632 Aug 10 01:33 .
drwxr-xr-x  3 root root   96 Aug 10 01:31 ..
-rw-r--r--  1 root root   20 Aug 10 01:33 .part-00000-39173877-7131-47b2-ba08-f2e2732c1599-c000.snappy.parquet.crc
-rw-r--r--  1 root root   20 Aug 10 01:31 .part-00000-8464b882-0ec1-4f4f-8cd5-8f035ae315bc-c000.snappy.parquet.crc
-rw-r--r--  1 root root   20 Aug 10 01:33 .part-00001-9ce9c880-90c6-437a-8f68-b9d4be503488-c000.snappy.parquet.crc
-rw-r--r--  1 root root   20 Aug 10 01:31 .part-00001-d51e14e6-0193-4313-b6c7-2981a24c8508-c000.snappy.parquet.crc
-rw-r--r--  1 root root   20 Aug 10 01:33 .part-00002-5780ac2f-14c6-4b63-86ac-346823e01ebf-c000.snappy.parquet.crc
-rw-r--r--  1 root root   20 Aug 10 01:31 .part-00002-a5cdbfe6-0b39-489a-b764-bd2cd95329a2-c000.snappy.parquet.crc
-rw-r--r--  1 root root   20 Aug 10 01:31 .part-00003-4e67ab50-5859-43d4-925c-47eff18277c8-c000.snappy.parquet.crc
-rw-r--r-- 

25/08/10 01:33:05 INFO Executor: Finished task 0.0 in stage 17.0 (TID 170). 6692 bytes result sent to driver
25/08/10 01:33:05 INFO TaskSetManager: Finished task 0.0 in stage 17.0 (TID 170) in 23 ms on 3a5b1b71f157 (executor driver) (1/1)
25/08/10 01:33:05 INFO TaskSchedulerImpl: Removed TaskSet 17.0, whose tasks have all completed, from pool 
25/08/10 01:33:05 INFO BlockManagerInfo: Removed broadcast_15_piece0 on 3a5b1b71f157:40499 in memory (size: 122.6 KiB, free: 434.1 MiB)
25/08/10 01:33:05 INFO DAGScheduler: ResultStage 17 ($anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128) finished in 0.027 s
25/08/10 01:33:05 INFO DAGScheduler: Job 10 is finished. Cancelling potential speculative or zombie tasks for this job
25/08/10 01:33:05 INFO TaskSchedulerImpl: Killing all running tasks in stage 17: Stage finished
25/08/10 01:33:05 INFO DAGScheduler: Job 10 finished: $anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128, took 0.030787 s
25/08/10 01:33:0

0

In [17]:
# Read data from Delta Lake
delta_df = spark.read.format("delta").load(delta_path)

print("Reading from Delta Lake:")
print(f"Total records: {delta_df.count()}")
delta_df.show(5)

25/08/10 01:33:05 INFO DAGScheduler: Registering RDD 57 (count at NativeMethodAccessorImpl.java:0) as input to shuffle 4
25/08/10 01:33:05 INFO DAGScheduler: Got map stage job 11 (count at NativeMethodAccessorImpl.java:0) with 50 output partitions
25/08/10 01:33:05 INFO DAGScheduler: Final stage: ShuffleMapStage 19 (count at NativeMethodAccessorImpl.java:0)
25/08/10 01:33:05 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 18)
25/08/10 01:33:05 INFO DAGScheduler: Missing parents: List()
25/08/10 01:33:05 INFO DAGScheduler: Submitting ShuffleMapStage 19 (MapPartitionsRDD[57] at count at NativeMethodAccessorImpl.java:0), which has no missing parents
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_17 stored as values in memory (estimated size 625.0 KiB, free 432.3 MiB)
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_17_piece0 stored as bytes in memory (estimated size 148.5 KiB, free 432.2 MiB)
25/08/10 01:33:05 INFO BlockManagerInfo: Added broadcast_17_piece0 in mem

Reading from Delta Lake:
Total records: 100


25/08/10 01:33:05 INFO PrepareDeltaScan: DELTA: Filtering files for query
25/08/10 01:33:05 INFO SparkContext: Starting job: $anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128
25/08/10 01:33:05 INFO DAGScheduler: Got job 13 ($anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128) with 50 output partitions
25/08/10 01:33:05 INFO DAGScheduler: Final stage: ResultStage 24 ($anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128)
25/08/10 01:33:05 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 23)
25/08/10 01:33:05 INFO DAGScheduler: Missing parents: List()
25/08/10 01:33:05 INFO DAGScheduler: Submitting ResultStage 24 (MapPartitionsRDD[62] at $anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128), which has no missing parents
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_19 stored as values in memory (estimated size 632.0 KiB, free 432.1 MiB)
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_19_pi

+---+-------+---+--------+
| id|   name|age|    city|
+---+-------+---+--------+
| 33|User_33| 52|   Tokyo|
| 34|User_34| 53|  Sydney|
| 35|User_35| 54|   Paris|
| 36|User_36| 55|New York|
| 37|User_37| 56|  London|
+---+-------+---+--------+
only showing top 5 rows



25/08/10 01:33:05 INFO TaskSetManager: Starting task 30.0 in stage 24.0 (TID 252) (3a5b1b71f157, executor driver, partition 30, PROCESS_LOCAL, 8747 bytes) 
25/08/10 01:33:05 INFO TaskSetManager: Starting task 31.0 in stage 24.0 (TID 253) (3a5b1b71f157, executor driver, partition 31, PROCESS_LOCAL, 8747 bytes) 
25/08/10 01:33:05 INFO Executor: Running task 30.0 in stage 24.0 (TID 252)
25/08/10 01:33:05 INFO TaskSetManager: Starting task 32.0 in stage 24.0 (TID 254) (3a5b1b71f157, executor driver, partition 32, PROCESS_LOCAL, 8747 bytes) 
25/08/10 01:33:05 INFO TaskSetManager: Starting task 33.0 in stage 24.0 (TID 255) (3a5b1b71f157, executor driver, partition 33, PROCESS_LOCAL, 8747 bytes) 
25/08/10 01:33:05 INFO TaskSetManager: Starting task 34.0 in stage 24.0 (TID 256) (3a5b1b71f157, executor driver, partition 34, PROCESS_LOCAL, 8747 bytes) 
25/08/10 01:33:05 INFO Executor: Running task 33.0 in stage 24.0 (TID 255)
25/08/10 01:33:05 INFO Executor: Running task 34.0 in stage 24.0 (TID 

In [18]:
# Demonstrate Delta Lake time travel
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, delta_path)

print("Delta Lake Table History:")
deltaTable.history().show()

25/08/10 01:33:05 INFO CodeGenerator: Code generated in 4.633792 ms
25/08/10 01:33:05 INFO SparkContext: Starting job: getHistory at DeltaTableOperations.scala:54
25/08/10 01:33:05 INFO DAGScheduler: Got job 15 (getHistory at DeltaTableOperations.scala:54) with 12 output partitions
25/08/10 01:33:05 INFO DAGScheduler: Final stage: ResultStage 26 (getHistory at DeltaTableOperations.scala:54)
25/08/10 01:33:05 INFO DAGScheduler: Parents of final stage: List()
25/08/10 01:33:05 INFO DAGScheduler: Missing parents: List()
25/08/10 01:33:05 INFO DAGScheduler: Submitting ResultStage 26 (MapPartitionsRDD[73] at getHistory at DeltaTableOperations.scala:54), which has no missing parents


Delta Lake Table History:
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      1|2025-08-10 01:33:...|  null|    null|    WRITE|{mode -> Overwrit...|null|    null|     null|          0|  Serializable|        false|{numFiles -> 12, ...|        null|Apache-Spark/3.4....|
|      0|2025-08-10 01:31:...|  null|    null|    WRITE|{mode -> Overwrit...|null|    null|     null|       null|  Serializable|        false|{numFiles -> 12, ...|        null|Apache

25/08/10 01:33:05 INFO MemoryStore: Block broadcast_22 stored as values in memory (estimated size 173.7 KiB, free 432.7 MiB)
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_22_piece0 stored as bytes in memory (estimated size 55.4 KiB, free 432.7 MiB)
25/08/10 01:33:05 INFO BlockManagerInfo: Added broadcast_22_piece0 in memory on 3a5b1b71f157:40499 (size: 55.4 KiB, free: 434.1 MiB)
25/08/10 01:33:05 INFO SparkContext: Created broadcast 22 from broadcast at DAGScheduler.scala:1540
25/08/10 01:33:05 INFO DAGScheduler: Submitting 12 missing tasks from ResultStage 26 (MapPartitionsRDD[73] at getHistory at DeltaTableOperations.scala:54) (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))
25/08/10 01:33:05 INFO TaskSchedulerImpl: Adding task set 26.0 with 12 tasks resource profile 0
25/08/10 01:33:05 INFO TaskSetManager: Starting task 0.0 in stage 26.0 (TID 273) (3a5b1b71f157, executor driver, partition 0, PROCESS_LOCAL, 8872 bytes) 
25/08/10 01:33:05 INFO Tas

In [19]:
# Convert Spark DataFrame back to Pandas
result_pandas = delta_df.toPandas()

print("Converted back to Pandas:")
print(result_pandas.head())
print(f"Data types:\n{result_pandas.dtypes}")

25/08/10 01:33:05 INFO PrepareDeltaScan: DELTA: Filtering files for query
25/08/10 01:33:05 INFO SparkContext: Starting job: toPandas at /tmp/ipykernel_21/3442882317.py:2
25/08/10 01:33:05 INFO DAGScheduler: Got job 16 (toPandas at /tmp/ipykernel_21/3442882317.py:2) with 50 output partitions
25/08/10 01:33:05 INFO DAGScheduler: Final stage: ResultStage 28 (toPandas at /tmp/ipykernel_21/3442882317.py:2)
25/08/10 01:33:05 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 27)
25/08/10 01:33:05 INFO DAGScheduler: Missing parents: List()
25/08/10 01:33:05 INFO DAGScheduler: Submitting ResultStage 28 (MapPartitionsRDD[75] at toPandas at /tmp/ipykernel_21/3442882317.py:2), which has no missing parents
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_23 stored as values in memory (estimated size 590.8 KiB, free 433.1 MiB)
25/08/10 01:33:05 INFO MemoryStore: Block broadcast_23_piece0 stored as bytes in memory (estimated size 138.1 KiB, free 433.0 MiB)
25/08/10 01:33:05 INFO Blo

Converted back to Pandas:
   id     name  age      city
0  89  User_89   58    Sydney
1  90  User_90   59     Paris
2  91  User_91   60  New York
3  92  User_92   61    London
4  93  User_93   62     Tokyo
Data types:
id       int64
name    object
age      int64
city    object
dtype: object


In [20]:
# Demonstrate Polars integration
# Convert Pandas to Polars
polars_df = pl.from_pandas(result_pandas)

print("Polars DataFrame:")
print(polars_df.head())
print(f"\nShape: {polars_df.shape}")
print(f"\nSchema:\n{polars_df.schema}")

Polars DataFrame:
shape: (5, 4)
┌─────┬─────────┬─────┬──────────┐
│ id  ┆ name    ┆ age ┆ city     │
│ --- ┆ ---     ┆ --- ┆ ---      │
│ i64 ┆ str     ┆ i64 ┆ str      │
╞═════╪═════════╪═════╪══════════╡
│ 89  ┆ User_89 ┆ 58  ┆ Sydney   │
│ 90  ┆ User_90 ┆ 59  ┆ Paris    │
│ 91  ┆ User_91 ┆ 60  ┆ New York │
│ 92  ┆ User_92 ┆ 61  ┆ London   │
│ 93  ┆ User_93 ┆ 62  ┆ Tokyo    │
└─────┴─────────┴─────┴──────────┘

Shape: (100, 4)

Schema:
OrderedDict([('id', Int64), ('name', String), ('age', Int64), ('city', String)])


In [21]:
# Polars data analysis example
analysis_result = polars_df.group_by("city").agg([
    pl.count().alias("count"),
    pl.col("age").mean().alias("avg_age"),
    pl.col("age").min().alias("min_age"),
    pl.col("age").max().alias("max_age")
])

print("Analysis with Polars:")
print(analysis_result)

Analysis with Polars:
shape: (5, 5)
┌──────────┬───────┬─────────┬─────────┬─────────┐
│ city     ┆ count ┆ avg_age ┆ min_age ┆ max_age │
│ ---      ┆ ---   ┆ ---     ┆ ---     ┆ ---     │
│ str      ┆ u32   ┆ f64     ┆ i64     ┆ i64     │
╞══════════╪═══════╪═════════╪═════════╪═════════╡
│ New York ┆ 20    ┆ 42.5    ┆ 20      ┆ 65      │
│ Sydney   ┆ 20    ┆ 45.5    ┆ 23      ┆ 68      │
│ London   ┆ 20    ┆ 43.5    ┆ 21      ┆ 66      │
│ Paris    ┆ 20    ┆ 46.5    ┆ 24      ┆ 69      │
│ Tokyo    ┆ 20    ┆ 44.5    ┆ 22      ┆ 67      │
└──────────┴───────┴─────────┴─────────┴─────────┘


  pl.count().alias("count"),


In [22]:
# Clean up
spark.stop()
print("Spark session stopped.")
print("\n🎉 All tests completed successfully!")
print("✅ Spark 3.4.4 working")
print("✅ Delta Lake 2.4.0 working")
print("✅ Pandas integration working")
print("✅ Polars integration working")

25/08/10 01:33:06 INFO SparkContext: SparkContext is stopping with exitCode 0.
25/08/10 01:33:06 INFO BlockManagerInfo: Removed broadcast_22_piece0 on 3a5b1b71f157:40499 in memory (size: 55.4 KiB, free: 434.3 MiB)
25/08/10 01:33:06 INFO SparkUI: Stopped Spark web UI at http://3a5b1b71f157:4040
25/08/10 01:33:06 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
25/08/10 01:33:06 INFO MemoryStore: MemoryStore cleared
25/08/10 01:33:06 INFO BlockManager: BlockManager stopped
25/08/10 01:33:06 INFO BlockManagerMaster: BlockManagerMaster stopped
25/08/10 01:33:06 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
25/08/10 01:33:06 INFO SparkContext: Successfully stopped SparkContext


Spark session stopped.

🎉 All tests completed successfully!
✅ Spark 3.4.4 working
✅ Delta Lake 2.4.0 working
✅ Pandas integration working
✅ Polars integration working
