In [0]:
%run ./_include

In [0]:
# Check the current database (schema)
current_db = spark.catalog.currentDatabase()
print(f"Current Database: {current_db}")

# currentCatalog() is also available in Unity Catalog environments (Spark 3.4+)
current_catalog = spark.catalog.currentCatalog()
print(f"Current Catalog: {current_catalog}")


## ボリュームのCSVファイルを取り込む

In [0]:
from pyspark.sql.functions import current_timestamp

# Directory under /Volumes that holds one `<table>.csv` file per source table.
volumes_path = "/Volumes/practice03/supply_chain/dataset"

# Source table names; each one maps to `<name>.csv` on input and
# `bronze_<name>` on output.
tables = [
    "categories",
    "customers",
    "employees",
    "orders",
    "order_details",
    "products",
    "suppliers",
    "shippers",
    "regions",
    "territories",
    "employee_territories",
]

# Load each CSV into a DataFrame and persist it as a bronze Delta table.
for table in tables:
    # inferSchema=False keeps every CSV column as a string; the first line of
    # each file is consumed as the header.
    df = spark.read.csv(
        f"{volumes_path}/{table}.csv",
        header=True,
        inferSchema=False,
    )
    # Stamp each row with the load time.
    df = df.withColumn("ingested_at", current_timestamp())

    bronze_table_name = f"bronze_{table}"
    # Replace contents and schema in a single overwrite instead of
    # DROP TABLE + overwrite: the table never disappears for readers and its
    # Delta history is retained.
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(bronze_table_name)
    )

    # Count what actually landed in the table. Calling df.count() here would
    # re-scan the source CSV a second time because `df` is lazy.
    row_count = spark.table(bronze_table_name).count()
    print(f"Loaded {table}: {row_count} rows")

## テーブルの存在確認

In [0]:

# Run SHOW TABLES and render the result so the bronze tables can be confirmed.
print("Show tables")
df_tables = spark.sql("SHOW TABLES")
display(df_tables)

## テーブルのスキーマチェック

In [0]:
# Render the column layout of every bronze table derived from `tables`.
# (A stale `table = "bronze_airports"` assignment used to precede this loop;
# it was overwritten by the loop variable before ever being read.)
for table in tables:
    bronze_table = f"bronze_{table}"
    print(f"{bronze_table} schema")

    # DESCRIBE TABLE yields one row per column of the table.
    df_schema = spark.sql(f"DESCRIBE TABLE {bronze_table}")
    display(df_schema)

