In [0]:
# 1) Download the raw parquet files into DBFS:/tmp/raw/covid_nyt
import os
import urllib.request

# DBFS locations (without the "dbfs:" scheme) for the raw landing folder
# and for the Bronze Delta output.
raw_dbfs = "/tmp/raw/covid_nyt"
bronze_dbfs = "/tmp/bronze/covid_nyt"

# Remove whatever a previous run left behind in both locations so that a
# re-run starts from empty folders.
dbutils.fs.rm(f"dbfs:{raw_dbfs}", recurse=True)
dbutils.fs.rm(f"dbfs:{bronze_dbfs}", recurse=True)

# Recreate the raw landing folder.
dbutils.fs.mkdirs(f"dbfs:{raw_dbfs}")

base_url = "https://raw.githubusercontent.com/delta-incubator/delta-lake-definitive-guide/main/datasets/COVID-19_NYT/"
files = ["lookup.parquet", "us-counties.parquet", "us-states.parquet"]

for fname in files:
    # urlretrieve writes to a local path, so each file is staged on the
    # driver's local disk first and then copied into DBFS.
    local_tmp = f"/tmp/{fname}"
    print(f"⬇️  Downloading {base_url+fname} → {local_tmp}")
    try:
        urllib.request.urlretrieve(base_url + fname, local_tmp)
        dest = f"{raw_dbfs}/{fname}"
        print(f"📂  Copying {local_tmp} → dbfs:{dest}")
        dbutils.fs.cp(f"file:{local_tmp}", f"dbfs:{dest}")
    finally:
        # Always drop the staged copy: it is no longer needed once it is in
        # DBFS, and a partial file from a failed download must not linger
        # on the driver either.
        if os.path.exists(local_tmp):
            os.remove(local_tmp)

# 2) Load every downloaded parquet file into one DataFrame, then persist
#    that DataFrame as the Bronze Delta table.
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# One fully-qualified DBFS URI per file in the raw landing folder.
urls = ["dbfs:" + raw_dbfs + "/" + name for name in files]

# NOTE(review): all files are read through a single parquet() call. Spark
# does not merge differing Parquet schemas by default, so if these files do
# not share a schema some columns may be missing from `df` — confirm this
# combined read is intended.
df = spark.read.parquet(*urls)

bronze_uri = f"dbfs:{bronze_dbfs}"
print(f"🚀  Writing Bronze delta to {bronze_uri}")

# "overwrite" replaces any table content already at the target location.
bronze_writer = df.write.format("delta").mode("overwrite")
bronze_writer.save(bronze_uri)

print("✅  Bronze layer ready!")

In [0]:
# List the raw landing folder to confirm the downloaded files arrived.
# The path comes from raw_dbfs (assigned in the download cell) rather than
# a repeated literal, so this listing cannot drift from the folder that
# was actually written.
display(dbutils.fs.ls(f"dbfs:{raw_dbfs}"))