**Note:** This is a DDL notebook. Run it only once.

In [None]:
from pyspark.sql import SparkSession

from seed.unity import conf

# Build the session from the configuration imported from seed.unity,
# reusing an already-running session if there is one.
session_builder = SparkSession.builder.config(conf=conf)
spark: SparkSession = session_builder.getOrCreate()
print(f"Spark {spark.version} is up and running!")

In [None]:
# Show the catalogs visible to this session.
catalogs = spark.sql("SHOW CATALOGS")
catalogs.show()

# Make sure the dev schema exists in the unity catalog
# (IF NOT EXISTS makes this a no-op when it is already there).
spark.sql("CREATE SCHEMA IF NOT EXISTS unity.dev")

# List the schemas of the unity catalog.
unity_schemas = spark.sql("SHOW SCHEMAS FROM unity")
unity_schemas.show()

In [None]:
# List the tables of the default and dev schemas in the unity catalog.
for listing_query in (
    "SHOW TABLES IN unity.default",
    "SHOW TABLES IN unity.dev",
):
    spark.sql(listing_query).show()

# Disabled sample query, kept for reference:
# spark.sql("SELECT * FROM default.marksheet LIMIT 5;").show()

In [None]:
# DDL for a single-column Delta table used as a smoke test.
# NOTE(review): the table location is the root of the s3a://unity/ bucket —
# confirm that this is intended.
create_test_table_sql = """
    CREATE TABLE IF NOT EXISTS unity.dev.test (
        name STRING
    )
    USING delta
    LOCATION 's3a://unity/'
"""
spark.sql(create_test_table_sql)

# Insert two sample rows, one statement per row. These are not guarded, so
# running the cell again appends the same rows again.
for insert_sql in (
    "INSERT INTO unity.dev.test VALUES ('test 1')",
    "INSERT INTO unity.dev.test VALUES ('test 2')",
):
    spark.sql(insert_sql)

# Read the table back to verify the inserts.
test_rows = spark.sql("SELECT * FROM unity.dev.test")
test_rows.show()

In [None]:
# Load the flights parquet file and preview its first rows and its schema.

df = spark.read.parquet("s3a://seed/flights-1m.parquet")
df.show(5)
df.printSchema()

# Register the DataFrame as the temp view raw_flights so it can be queried
# with SQL.
df.createOrReplaceTempView("raw_flights")

# Smallest and largest FL_DATE plus the total row count.
date_range_sql = """
    SELECT
        MIN(FL_DATE) AS min_date,
        MAX(FL_DATE) AS max_date,
        COUNT(*) AS num_rows
    FROM raw_flights;
"""
spark.sql(date_range_sql).show()

# Row counts for the first five FL_DATE values in ascending order.
rows_per_date_sql = """
    SELECT
        FL_DATE,
        COUNT(*) AS num_rows
    FROM raw_flights
    GROUP BY FL_DATE
    ORDER BY FL_DATE
    LIMIT 5;
"""
spark.sql(rows_per_date_sql).show()

In [None]:
# Create the flights Delta table from the raw_flights temp view, keeping only
# the distance, dep_time and arr_time columns.
#
# The table name is fully qualified (unity.dev.flights), like the other
# statements in this notebook, so the result does not depend on which catalog
# is the session's current one: the dev schema is only created under the
# unity catalog.
# NOTE(review): the Delta table is stored at the root of the s3a://iceberg/
# bucket — confirm that this location is intended.

spark.sql("""
    CREATE TABLE IF NOT EXISTS unity.dev.flights
    USING delta
    LOCATION 's3a://iceberg/'
    AS
    SELECT
        distance,
        dep_time,
        arr_time
    FROM raw_flights;
""")

# Preview a few rows of the newly created table.
spark.sql("SELECT * FROM unity.dev.flights LIMIT 5;").show()

In [None]:
spark.stop()