In [None]:
# Load the sparksql_magic IPython extension; the %%sparksql cells in this notebook depend on it.
%load_ext sparksql_magic

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Local Spark session configured for Delta Lake: the Delta package, SQL
# extension and catalog are all set on the builder before the session starts.
spark = (
    SparkSession.builder
    .appName("DeltaExample")
    .master("local[*]")
    .config(map={
        "spark.ui.enabled": "true",
        "spark.jars.packages": "io.delta:delta-spark_2.13:4.0.0",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.sql.warehouse.dir": "./spark-warehouse",
    })
    # Hive support persists the metastore across sessions (embedded Hive metastore).
    .enableHiveSupport()
    .getOrCreate()
)

# Display the Spark UI URL (useful for monitoring and debugging).
spark.sparkContext.uiWebUrl

In [None]:
%%sparksql
-- Create the demo database once; re-running this cell is a no-op.
CREATE DATABASE IF NOT EXISTS marvel_db;

In [None]:
%%sparksql
-- Make marvel_db the current database so unqualified table names resolve against it.
USE marvel_db;

In [None]:
%%sparksql
-- Show the metadata recorded for the demo database.
DESCRIBE DATABASE marvel_db;

In [None]:
%%sparksql
-- List the tables visible in the current database.
SHOW TABLES;

In [None]:
%%sparksql
-- Delta-backed table holding one row per superhero.
-- IF NOT EXISTS makes the cell safe to re-run against an existing table.
CREATE TABLE IF NOT EXISTS marvel_db.superheroes (
  id              INT,
  hero_name       STRING,
  secret_identity STRING,
  power_level     INT
)
USING DELTA

In [None]:
%%sparksql
-- Show the column definitions of the superheroes table.
DESCRIBE TABLE marvel_db.superheroes

In [None]:
# Explicit schema applied when reading the superhero CSV files.
# The third argument to add() is the nullable flag: every field is declared non-nullable.
# NOTE(review): Spark file sources may relax non-nullable fields to nullable on read - confirm.
schema = (
    T.StructType()
    .add("id", T.IntegerType(), False)
    .add("hero_name", T.StringType(), False)
    .add("secret_identity", T.StringType(), False)
    .add("power_level", T.IntegerType(), False)
)

In [None]:
# Read the original hero CSV with the explicit schema, treating the first line as a header.
raw_df = spark.read.csv("data/marvel.csv", schema=schema, header=True)

# Register the DataFrame as a temp view so SQL cells can query it as superheroes_raw.
raw_df.createOrReplaceTempView("superheroes_raw")

In [None]:
%%sparksql
-- Preview the rows staged from the CSV file.
SELECT * FROM superheroes_raw

In [None]:
%%sparksql
-- Load the staged CSV rows into the Delta table.
-- INSERT OVERWRITE is used instead of INSERT INTO so the cell is idempotent:
-- the table persists across sessions, and appending would duplicate every
-- row each time the notebook is re-run from the top.
INSERT OVERWRITE TABLE marvel_db.superheroes (
  id,
  hero_name,
  secret_identity,
  power_level
)
SELECT
  id,
  hero_name,
  secret_identity,
  power_level
FROM superheroes_raw;

In [None]:
%%sparksql
-- Inspect the Delta table after the initial load. The table name is fully
-- qualified so the cell does not depend on the current database of the session.
SELECT * FROM marvel_db.superheroes;

In [None]:
# Read the modified hero CSV with the same explicit schema and header handling.
raw_df_mod = spark.read.csv("data/marvel_mod.csv", schema=schema, header=True)

# Register the DataFrame as a temp view so SQL cells can query it as superheroes_raw_mod.
raw_df_mod.createOrReplaceTempView("superheroes_raw_mod")

In [None]:
%%sparksql
-- Synchronise the Delta table with the modified CSV snapshot, keyed on id:
--   ids present on both sides   : overwrite the non-key columns from the source
--   ids present only in source  : insert as new rows
--   ids present only in target  : delete
-- The id column is not reassigned on match because the ON clause already
-- guarantees target.id = source.id. The target is fully qualified so the
-- statement does not depend on the current database of the session.
MERGE INTO marvel_db.superheroes AS target
USING superheroes_raw_mod AS source
  ON target.id = source.id
WHEN MATCHED THEN UPDATE SET
  target.hero_name = source.hero_name,
  target.secret_identity = source.secret_identity,
  target.power_level = source.power_level
WHEN NOT MATCHED THEN INSERT (
  id, hero_name, secret_identity, power_level
) VALUES (
  source.id, source.hero_name, source.secret_identity, source.power_level
)
WHEN NOT MATCHED BY SOURCE THEN DELETE;

In [None]:
%%sparksql
-- Inspect the Delta table after the MERGE. The table name is fully qualified
-- so the cell does not depend on the current database of the session.
SELECT * FROM marvel_db.superheroes;