**Note:** This is a DDL notebook: it creates the `nessie.dev` namespace and the `nessie.dev.flights` table. Run it only once.

In [None]:
from pyspark.sql import SparkSession

from seed.nessie import conf

# Build a Spark session from the project's Nessie configuration, or reuse the
# one already attached to this kernel.
spark: SparkSession = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
print(f"Spark {spark.version} is up and running!")

In [None]:
# Make sure the `dev` namespace exists in the `nessie` catalog.
# IF NOT EXISTS keeps this statement safe to re-run.
#
# For a clean slate, run this by hand first:
#   spark.sql("DROP NAMESPACE IF EXISTS nessie.dev;")
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.dev;")

# List the catalog's namespaces to confirm the result.
spark.sql("SHOW NAMESPACES FROM nessie").show()

In [None]:
# Load the raw flights dataset from the parquet file and take a first look:
# a five-row sample followed by the inferred schema.
df = spark.read.format("parquet").load("s3a://seed/flights-1m.parquet")
df.show(n=5)
df.printSchema()

# Register the DataFrame as a temp view so it can be queried with Spark SQL.
df.createOrReplaceTempView("raw_flights")

# Profile 1: date range covered by the file and its total row count.
spark.sql("""
    SELECT
        MIN(FL_DATE) AS min_date,
        MAX(FL_DATE) AS max_date,
        COUNT(*) AS num_rows
    FROM raw_flights;
""").show()

# Profile 2: row counts for the ten earliest FL_DATE values.
spark.sql("""
    SELECT
        FL_DATE,
        COUNT(*) AS num_rows
    FROM raw_flights
    GROUP BY FL_DATE
    ORDER BY FL_DATE
    LIMIT 10;
""").show()

In [None]:
# Materialize the `raw_flights` temp view as an Iceberg table in the Nessie
# catalog, partitioned by flight date. IF NOT EXISTS means the CTAS does
# nothing when `nessie.dev.flights` is already present.
# NOTE(review): 'gc.enabled' is an Iceberg table property; presumably set so
# that files can be cleaned up / purged later -- confirm against Iceberg docs.
spark.sql("""
    CREATE TABLE IF NOT EXISTS nessie.dev.flights 
    USING iceberg
    PARTITIONED BY (fl_date)
    TBLPROPERTIES ('gc.enabled' = 'true')
    AS
    SELECT
        *
    FROM raw_flights;
""")

# Sanity check: preview a few rows of the table.
spark.sql(
    "SELECT * FROM nessie.dev.flights LIMIT 5;"
).show()

# To remove the table and its data again, run this by hand:
#   spark.sql("DROP TABLE nessie.dev.flights PURGE;")