# Spark with Delta Lake in Jupyter Notebooks

There are several things that need special attention.

- You need the config `"spark.jars.packages", "io.delta:delta-spark_2.13:4.0.0"` so that Spark downloads the Delta Lake JVM artifact.
- You need the config `"spark.sql.warehouse.dir", "./spark-warehouse"` to set the location of the Spark SQL warehouse. This will be the location where managed tables are stored.
- You need the config `"javax.jdo.option.ConnectionURL", "jdbc:derby:;databaseName=./metastore/metastore_db;create=true"` and `enableHiveSupport()` in order to use the Hive metastore to manage tables. This creates a local Derby database that stores the table metadata and allows tables to be looked up by name.
- You need to have the `sparksql-magic` extension installed and `%load_ext sparksql_magic` in order to be able to run SQL queries in a cell magic using `%%sparksql`.
- Alternatively (and perhaps more simply), you can use the DataFrame API and point to the data files directly. In that case you don't need the SQL warehouse, the Hive metastore, or the `sparksql-magic` extension.

In [1]:
%load_ext sparksql_magic

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Session settings, applied one by one to the builder below.
spark_conf = {
    "spark.ui.enabled": "true",
    # Maven coordinates of the Delta Lake JVM artifact to download at start-up.
    "spark.jars.packages": "io.delta:delta-spark_2.13:4.0.0",
    # Register Delta's SQL extension and catalog implementation.
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    # Directory where managed tables are stored.
    "spark.sql.warehouse.dir": "./spark-warehouse",
    # Local Derby database that holds the Hive metastore metadata.
    "javax.jdo.option.ConnectionURL": "jdbc:derby:;databaseName=./metastore/metastore_db;create=true",
}

builder = SparkSession.builder.appName("DeltaExample").master("local[*]")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

# Hive support persists the metastore across sessions (embedded Hive metastore).
spark = builder.enableHiveSupport().getOrCreate()

# Show the Spark UI URL (useful for monitoring and debugging)
spark.sparkContext.uiWebUrl

:: loading settings :: url = jar:file:/home/yannis/Development/tmp/pyspark-delta/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/yannis/.ivy2.5.2/cache
The jars for the packages stored in: /home/yannis/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-91b95ac2-82f1-4bf8-8011-75e41c55a9eb;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 303ms :: artifacts dl 12ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules       

'http://ouranos:4040'

In [3]:
%%sparksql
-- Create the demo database once; re-running the cell is a no-op thanks to IF NOT EXISTS.
CREATE DATABASE IF NOT EXISTS marvel_db;

25/10/17 17:54:31 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/10/17 17:54:31 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore yannis@192.168.10.76
25/10/17 17:54:31 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
25/10/17 17:54:32 WARN ObjectStore: Failed to get database marvel_db, returning NoSuchObjectException
25/10/17 17:54:32 WARN ObjectStore: Failed to get database marvel_db, returning NoSuchObjectException
25/10/17 17:54:32 WARN ObjectStore: Failed to get database marvel_db, returning NoSuchObjectException


In [4]:
%%sparksql
-- Make marvel_db the current database so unqualified table names resolve against it.
USE marvel_db;

In [5]:
%%sparksql
-- Show the database metadata (catalog, namespace, location, owner).
DESCRIBE DATABASE marvel_db;

0,1
info_name,info_value
Catalog Name,spark_catalog
Namespace Name,marvel_db
Comment,
Location,file:/home/yannis/Development/tmp/pyspark-delta/spark-warehouse/marvel_db.db
Owner,yannis


In [12]:
%%sparksql
-- List the tables and temporary views visible from the current database.
SHOW TABLES;

0,1,2
namespace,tableName,isTemporary
marvel_db,superheroes,False
,superheroes_raw,True


In [7]:
%%sparksql
-- Managed Delta table that holds the curated superhero records.
CREATE TABLE IF NOT EXISTS marvel_db.superheroes (
  id              INT,
  hero_name       STRING,
  secret_identity STRING,
  power_level     INT
)
USING DELTA

25/10/17 17:55:39 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`marvel_db`.`superheroes` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
25/10/17 17:55:39 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/10/17 17:55:39 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist


In [9]:
%%sparksql
-- Show the column names and data types of the Delta table.
DESCRIBE TABLE marvel_db.superheroes

0,1,2
col_name,data_type,comment
id,int,
hero_name,string,
secret_identity,string,
power_level,int,


In [10]:
# Explicit schema for the superhero CSV files (id, hero_name, secret_identity, power_level).
# NOTE(review): every field is declared non-nullable, yet the merged table later shows a
# blank secret_identity for Thor - presumably the CSV reader relaxes nullability; confirm.
schema = (
    T.StructType()
    .add("id", T.IntegerType(), False)
    .add("hero_name", T.StringType(), False)
    .add("secret_identity", T.StringType(), False)
    .add("power_level", T.IntegerType(), False)
)

In [11]:
# Load the initial superhero CSV (with header row) using the explicit schema.
raw_df = spark.read.csv("data/marvel.csv", schema=schema, header=True)

# Register the DataFrame as a temp view so the SQL cells can query it by name.
raw_df.createOrReplaceTempView("superheroes_raw")

In [13]:
%%sparksql
-- Preview the rows loaded from the CSV through the temp view.
SELECT *
FROM superheroes_raw

0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,95
2,Captain America,Steve Rogers,88
3,Thor,Thor Odinson,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
8,Doctor Strange,Stephen Strange,93
9,Scarlet Witch,Wanda Maximoff,94


In [14]:
%%sparksql
-- Append every row from the raw CSV temp view to the Delta table.
INSERT INTO marvel_db.superheroes (id, hero_name, secret_identity, power_level)
SELECT
  src.id,
  src.hero_name,
  src.secret_identity,
  src.power_level
FROM superheroes_raw AS src;

25/10/17 17:56:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [15]:
%%sparksql
-- Show the table contents after the initial load.
-- The table name is fully qualified so the cell does not depend on `USE marvel_db`
-- having been run in the current session (e.g. after a kernel restart).
-- ORDER BY id keeps the displayed row order deterministic.
SELECT
  id,
  hero_name,
  secret_identity,
  power_level
FROM marvel_db.superheroes
ORDER BY id;

                                                                                

0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,95
2,Captain America,Steve Rogers,88
3,Thor,Thor Odinson,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
8,Doctor Strange,Stephen Strange,93
9,Scarlet Witch,Wanda Maximoff,94


In [16]:
# Load the modified superhero CSV (with header row) using the same explicit schema.
raw_df_mod = spark.read.csv("data/marvel_mod.csv", schema=schema, header=True)

# Register it as a temp view; it is the source of the MERGE statement.
raw_df_mod.createOrReplaceTempView("superheroes_raw_mod")

In [17]:
%%sparksql
-- Synchronise the Delta table with the modified CSV snapshot:
--   * rows present in both are overwritten with the source values,
--   * rows only in the source are inserted,
--   * rows only in the target are deleted (full sync, not just an upsert).
-- The target is fully qualified so the cell does not depend on `USE marvel_db`
-- having been run in the current session (e.g. after a kernel restart).
MERGE INTO marvel_db.superheroes AS target
USING superheroes_raw_mod AS source
  ON target.id = source.id
-- id is the join key and already equal on a match, so it is not reassigned.
WHEN MATCHED THEN UPDATE SET
  target.hero_name = source.hero_name,
  target.secret_identity = source.secret_identity,
  target.power_level = source.power_level
WHEN NOT MATCHED THEN INSERT (
  id, hero_name, secret_identity, power_level
) VALUES (
  source.id, source.hero_name, source.secret_identity, source.power_level
)
WHEN NOT MATCHED BY SOURCE THEN DELETE;

25/10/17 17:57:09 WARN MapPartitionsRDD: RDD 40 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


0,1,2,3
num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
11,9,1,1


In [18]:
%%sparksql
-- Show the table contents after the MERGE.
-- The table name is fully qualified so the cell does not depend on `USE marvel_db`
-- having been run in the current session (e.g. after a kernel restart).
-- ORDER BY id keeps the displayed row order deterministic.
SELECT
  id,
  hero_name,
  secret_identity,
  power_level
FROM marvel_db.superheroes
ORDER BY id;

                                                                                

0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,97
2,Captain America,Steve Rogers,88
3,Thor,,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
9,Scarlet Witch,Wanda Maximoff,94
10,Hawkeye,Clint Barton,70


In [None]:
%%sparksql
-- Bump Iron Man's power level by one.
-- The table name is fully qualified so the cell does not depend on `USE marvel_db`
-- having been run in the current session (e.g. after a kernel restart).
UPDATE marvel_db.superheroes
SET power_level = power_level + 1
WHERE hero_name = 'Iron Man'

25/10/17 18:21:06 WARN UpdateCommand: Could not validate number of records due to missing statistics.
                                                                                

0
num_affected_rows
1


In [52]:
%%sparksql
-- Remove the Ant-Man row.
-- The table name is fully qualified so the cell does not depend on `USE marvel_db`
-- having been run in the current session (e.g. after a kernel restart).
DELETE FROM marvel_db.superheroes
WHERE hero_name = 'Ant-Man'

25/10/17 18:23:03 WARN DeleteCommand: Could not validate number of records due to missing statistics.
                                                                                

0
num_affected_rows
1


In [3]:
%%sparksql
-- List the table's commit log, newest version first: one row per operation
-- (CREATE TABLE, WRITE, MERGE, UPDATE, DELETE) with its parameters and metrics.
DESCRIBE HISTORY marvel_db.superheroes;

25/10/17 18:25:55 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/10/17 18:25:55 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore yannis@192.168.10.76
                                                                                

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2025-10-17 18:23:03.868000,,,DELETE,"{'predicate': '[""(hero_name#11526 = Ant-Man)""]'}",,,,3,Serializable,False,"{'numDeletionVectorsUpdated': '0', 'numAddedFiles': '1', 'executionTimeMs': '2033', 'numDeletionVectorsRemoved': '0', 'numRemovedFiles': '1', 'rewriteTimeMs': '335', 'numRemovedBytes': '1642', 'scanTimeMs': '1697', 'numCopiedRows': '9', 'numDeletionVectorsAdded': '0', 'numAddedChangeFiles': '0', 'numDeletedRows': '1', 'numAddedBytes': '1620'}",,Apache-Spark/4.0.1 Delta-Lake/4.0.0
3,2025-10-17 18:21:06.091000,,,UPDATE,"{'predicate': '[""(hero_name#10912 = Iron Man)""]'}",,,,2,Serializable,False,"{'numDeletionVectorsUpdated': '0', 'numAddedFiles': '1', 'executionTimeMs': '1288', 'numDeletionVectorsRemoved': '0', 'numUpdatedRows': '1', 'numRemovedFiles': '1', 'rewriteTimeMs': '427', 'numRemovedBytes': '1642', 'scanTimeMs': '860', 'numCopiedRows': '9', 'numDeletionVectorsAdded': '0', 'numAddedChangeFiles': '0', 'numAddedBytes': '1642'}",,Apache-Spark/4.0.1 Delta-Lake/4.0.0
2,2025-10-17 17:57:07.292000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1097 = id#1093)""]', 'notMatchedBySourcePredicates': '[{""actionType"":""delete""}]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,1,Serializable,False,"{'numOutputRows': '10', 'numTargetBytesAdded': '1642', 'numTargetRowsInserted': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesAdded': '1', 'materializeSourceTimeMs': '123', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '9', 'executionTimeMs': '2864', 'numTargetDeletionVectorsUpdated': '0', 'numTargetRowsCopied': '0', 'rewriteTimeMs': '852', 'numTargetRowsUpdated': '9', 'numTargetDeletionVectorsRemoved': '0', 'numTargetRowsDeleted': '1', 'scanTimeMs': '1874', 'numSourceRows': '10', 'numTargetDeletionVectorsAdded': '0', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '1', 'numTargetBytesRemoved': '1679'}",,Apache-Spark/4.0.1 Delta-Lake/4.0.0
1,2025-10-17 17:56:39.280000,,,WRITE,"{'mode': 'Append', 'partitionBy': '[]'}",,,,0,Serializable,True,"{'numOutputRows': '10', 'numOutputBytes': '1679', 'numFiles': '1'}",,Apache-Spark/4.0.1 Delta-Lake/4.0.0
0,2025-10-17 17:55:36.558000,,,CREATE TABLE,"{'partitionBy': '[]', 'description': None, 'properties': '{}', 'clusterBy': '[]', 'isManaged': 'true'}",,,,,Serializable,True,{},,Apache-Spark/4.0.1 Delta-Lake/4.0.0


In [20]:
%%sparksql
-- Show table-level details: format, location, file count, size and protocol versions.
DESCRIBE DETAIL marvel_db.superheroes;

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
delta,acd6e044-f060-4b96-88b5-37ab228c8991,spark_catalog.marvel_db.superheroes,,file:/home/yannis/Development/tmp/pyspark-delta/spark-warehouse/marvel_db.db/superheroes,2025-10-17 17:55:36.098000,2025-10-17 17:57:07.292000,[],[],1,1642,{},1,2,"['appendOnly', 'invariants']"


In [21]:
%%sparksql
-- Time travel by version: version 1 is the initial append (WRITE) in the table history.
SELECT *
FROM marvel_db.superheroes VERSION AS OF 1;

                                                                                

0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,95
2,Captain America,Steve Rogers,88
3,Thor,Thor Odinson,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
8,Doctor Strange,Stephen Strange,93
9,Scarlet Witch,Wanda Maximoff,94


In [22]:
%%sparksql
-- Time travel by version: version 2 is the MERGE commit in the table history.
SELECT *
FROM marvel_db.superheroes VERSION AS OF 2;


0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,97
2,Captain America,Steve Rogers,88
3,Thor,,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
9,Scarlet Witch,Wanda Maximoff,94
10,Hawkeye,Clint Barton,70


In [38]:
%%sparksql
-- Time travel by timestamp: 17:56:00 falls after CREATE TABLE (17:55:36) but before
-- the first WRITE (17:56:39), so the table is still empty at this point.
SELECT *
FROM marvel_db.superheroes TIMESTAMP AS OF '2025-10-17 17:56:00';

                                                                                

0,1,2,3
id,hero_name,secret_identity,power_level


In [None]:
%%sparksql
-- Time travel by timestamp: 17:57:00 falls after the first WRITE (17:56:39) but before
-- the MERGE (17:57:07), so this returns the initially loaded rows.
SELECT *
FROM marvel_db.superheroes TIMESTAMP AS OF '2025-10-17 17:57:00';

                                                                                

0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,95
2,Captain America,Steve Rogers,88
3,Thor,Thor Odinson,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
8,Doctor Strange,Stephen Strange,93
9,Scarlet Witch,Wanda Maximoff,94


In [41]:
%%sparksql
-- Time travel by timestamp: this is the exact commit timestamp of the MERGE (version 2),
-- so the result matches the post-merge state.
SELECT *
FROM marvel_db.superheroes TIMESTAMP AS OF '2025-10-17 17:57:07.292000';

0,1,2,3
id,hero_name,secret_identity,power_level
1,Iron Man,Tony Stark,97
2,Captain America,Steve Rogers,88
3,Thor,,98
4,Hulk,Bruce Banner,97
5,Black Widow,Natasha Romanoff,75
6,Spider-Man,Peter Parker,92
7,Black Panther,T'Challa,89
9,Scarlet Witch,Wanda Maximoff,94
10,Hawkeye,Clint Barton,70
