# Spark with Delta Lake in Jupyter Notebooks with Scala

There are several things that need special attention.

- Install `cs`
```bash
curl -fL "https://github.com/coursier/launchers/raw/master/cs-x86_64-pc-linux.gz" | gzip -d > cs
chmod +x cs
./cs setup
```
- Install `scala` kernel
```bash
cs launch --use-bootstrap almond:0.14.1 --scala 2.13 -- --install 
```
- Set the config `"spark.jars.packages", "io.delta:delta-spark_2.13:4.0.0"` in order to download the Delta Lake JVM artifact. (This notebook instead pulls the artifact in with `import $ivy`.)
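- You need the config `"spark.sql.warehouse.dir", "./spark-warehouse"` to set the location of the Spark SQL warehouse. This is where managed tables are stored. (The cells below use `../`-relative paths for the warehouse and the metastore; adjust the path to your directory layout.)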
- You need the config `"javax.jdo.option.ConnectionURL", "jdbc:derby:;databaseName=./metastore/metastore_db;create=true"` and `enableHiveSupport()` in order to use the Hive metastore to manage tables. This creates a local Derby database that stores the table metadata and allows you to look up tables by name.
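- You need to have the `sparksql-magic` extension installed and run `%load_ext sparksql_magic` in order to run SQL queries in a cell magic using `%%sparksql`. Note that `%load_ext` is an IPython (Python kernel) feature; the Scala cells in this notebook run SQL through `spark.sql(...)` instead.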
- Alternatively, and perhaps more easily, you can use the DataFrame API and point to the data files directly. In that case you don't need the SQL warehouse, the Hive metastore, or the `sparksql-magic` extension.

In [None]:
import $ivy.`org.apache.spark::spark-sql:4.0.1`
import $ivy.`org.apache.spark::spark-hive:4.0.1`
//import $ivy.`sh.almond::almond-spark:0.14.1` 
import $ivy.`io.delta::delta-spark:4.0.0`

In [None]:
import org.apache.spark.sql._
import org.apache.spark.sql.types._

// Build (or reuse) a local Spark session with the Delta Lake SQL extension and catalog,
// a warehouse directory one level above the notebook, and a Hive metastore whose
// connection URL points at an embedded Derby database.
// Each key/value pair is applied to the builder in the order listed.
val spark = Seq(
  "spark.ui.enabled" -> "true",
  "spark.sql.extensions" -> "io.delta.sql.DeltaSparkSessionExtension",
  "spark.sql.catalog.spark_catalog" -> "org.apache.spark.sql.delta.catalog.DeltaCatalog",
  "spark.sql.warehouse.dir" -> "../spark-warehouse",
  "javax.jdo.option.ConnectionURL" -> "jdbc:derby:;databaseName=../metastore/metastore_db;create=true",
  "spark.log.level" -> "WARN"
).foldLeft(SparkSession.builder().master("local[*]")) {
  case (builder, (key, value)) => builder.config(key, value)
}.enableHiveSupport().getOrCreate()

// Show the Spark UI URL (useful for monitoring and debugging).
spark.sparkContext.uiWebUrl

In [None]:
// Create the demo database; a no-op when it already exists.
spark.sql(sqlText = "create database if not exists marvel_db")

In [None]:
// Make marvel_db the current database for this session.
spark.sql(sqlText = "use marvel_db")

In [None]:
// Print the database description without truncating long values.
spark.sql("describe database marvel_db").show(truncate = false)

In [None]:
// List the tables visible in the current database, untruncated.
spark.sql("show tables").show(truncate = false)

In [None]:
// DDL for the Delta table that holds the superheroes; skipped when the table already exists.
val createSuperheroesDdl = """
create table if not exists marvel_db.superheroes
(
  id INT,
  hero_name STRING,
  secret_identity STRING,
  power_level INT
)
using delta
"""

spark.sql(createSuperheroesDdl)

In [None]:
// Print the column layout of the superheroes table.
val superheroesDescription = spark.sql("describe table marvel_db.superheroes")
superheroesDescription.show()

In [None]:
// Explicit schema for the CSV input: four nullable columns, two integers and two strings.
val schema = new StructType()
  .add("id", IntegerType, nullable = true)
  .add("hero_name", StringType, nullable = true)
  .add("secret_identity", StringType, nullable = true)
  .add("power_level", IntegerType, nullable = true)

In [None]:
// Read the CSV with the explicit schema, treating the first line as a header row.
val raw_df = spark.read.schema(schema).options(Map("header" -> "true")).csv("../data/marvel.csv")

// Register the rows as a temp view so SQL statements can refer to them as `superheroes_raw`.
raw_df.createOrReplaceTempView("superheroes_raw")

In [None]:
// Preview the rows exposed through the superheroes_raw temp view.
val rawPreview = spark.sql("select * from superheroes_raw")
rawPreview.show()

In [None]:
// Remove every row from the table (no WHERE clause) and print the statement's result.
val deleteAllResult = spark.sql("DELETE FROM marvel_db.superheroes")
deleteAllResult.show()

In [None]:
// Copy all four columns from the superheroes_raw temp view into the Delta table.
val insertFromRawSql = """
INSERT INTO marvel_db.superheroes (
  id,
  hero_name,
  secret_identity,
  power_level
)
SELECT
  id,
  hero_name,
  secret_identity,
  power_level
FROM superheroes_raw
"""

// Run the insert and print the statement's result.
spark.sql(insertFromRawSql).show()

In [None]:
// Show the table contents after the insert.
// The table name is qualified with its database so this cell does not depend on the
// session's current database having been switched by an earlier `use` statement.
spark.sql("select * from marvel_db.superheroes").show(false)

In [None]:
// Read the modified CSV with the same explicit schema, treating the first line as a header row.
val raw_df_mod = spark.read.schema(schema).options(Map("header" -> "true")).csv("../data/marvel_mod.csv")

// Register the rows as a temp view so SQL statements can refer to them as `superheroes_raw_mod`.
raw_df_mod.createOrReplaceTempView("superheroes_raw_mod")

In [None]:
// Synchronise the Delta table with the superheroes_raw_mod view, matching rows on id:
//   - ids present on both sides are overwritten with the source values,
//   - ids only in the source are inserted,
//   - ids only in the target are deleted.
// The target is qualified with its database so the statement does not depend on the
// session's current database having been switched by an earlier `use` statement.
spark.sql("""
    MERGE INTO marvel_db.superheroes AS target
    USING superheroes_raw_mod AS source
      ON target.id = source.id
    WHEN MATCHED THEN UPDATE SET
      target.id = source.id,
      target.hero_name = source.hero_name,
      target.secret_identity = source.secret_identity,
      target.power_level = source.power_level
    WHEN NOT MATCHED THEN INSERT (
      id, hero_name, secret_identity, power_level
    ) VALUES (
      source.id, source.hero_name, source.secret_identity, source.power_level
    )
    WHEN NOT MATCHED BY SOURCE THEN DELETE
""").show(false)

In [None]:
// Show the table contents after the merge.
// The table name is qualified with its database so this cell does not depend on the
// session's current database having been switched by an earlier `use` statement.
spark.sql("select * from marvel_db.superheroes").show(false)

In [None]:
// Raise the power level by one for every row whose hero_name is 'Iron Man'.
// The table name is qualified with its database so the statement does not depend on the
// session's current database having been switched by an earlier `use` statement.
spark.sql("""
    UPDATE marvel_db.superheroes
    SET power_level = power_level + 1
    WHERE hero_name = 'Iron Man'
""").show(false)

In [None]:
// Delete the rows whose hero_name is 'Ant-Man'.
// The table name is qualified with its database so the statement does not depend on the
// session's current database having been switched by an earlier `use` statement.
spark.sql("DELETE FROM marvel_db.superheroes WHERE hero_name = 'Ant-Man'").show(false)

In [None]:
// Print the table's history, untruncated.
spark.sql("DESCRIBE HISTORY marvel_db.superheroes").show(truncate = false)

In [None]:
// Print the table's detail record, untruncated.
spark.sql("DESCRIBE DETAIL marvel_db.superheroes").show(truncate = false)

In [None]:
// Time travel: read the table as of version 1.
spark.sql("SELECT * FROM marvel_db.superheroes VERSION AS OF 1").show(truncate = false)

In [None]:
// Time travel: read the table as of version 2.
spark.sql("SELECT * FROM marvel_db.superheroes VERSION AS OF 2").show(truncate = false)

In [None]:
// Roll the table back to version 1.
spark.sql(sqlText = "RESTORE TABLE marvel_db.superheroes TO VERSION AS OF 1")

In [None]:
// spark.sql("RESTORE TABLE marvel_db.superheroes TO TIMESTAMP AS OF '2025-10-17 22:35:00'")