In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, BooleanType, DoubleType

In [0]:
# Unity Catalog coordinates for this project. Declared first so every path and
# SQL statement below is derived from a single source of truth.
CATALOG_NAME = "cscie103_catalog"
SCHEMA_NAME = "final_project"

# Root of the "data" volume inside the project schema. Built from the constants
# above so the path cannot drift from the catalog/schema used in SQL.
VOLUME_ROOT_PATH = f"/Volumes/{CATALOG_NAME}/{SCHEMA_NAME}/data"
# place where raw csvs land after download
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"

# Make the project schema the session default for the SQL issued by this notebook.
spark.sql(f"USE {CATALOG_NAME}.{SCHEMA_NAME}")

class DataframeNames:
    """Logical dataset names used as keys for files and table-name suffixes."""

    # Source datasets
    HOLIDAYS = "holidays"
    OIL = "oil"
    STORES = "stores"
    TEST = "test"
    TRAIN = "train"
    TRANSACTIONS = "transactions"
    TRAINING = "training"

    # Every name declared above, in declaration order.
    ALL = [
        HOLIDAYS,
        OIL,
        STORES,
        TEST,
        TRAIN,
        TRANSACTIONS,
        TRAINING,
    ]

class DataTier:
    """Medallion tier prefixes and helpers that build tier-qualified table names.

    The helpers are static methods, so they work both on the class
    (``DataTier.getBronzeName("stores")``) and on an instance.
    """

    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"

    @staticmethod
    def getBronzeName(tablename):
        """Return ``tablename`` prefixed with the bronze tier, e.g. ``bronze_stores``."""
        return DataTier.BRONZE + "_" + tablename

    @staticmethod
    def getSilverName(tablename):
        """Return ``tablename`` prefixed with the silver tier, e.g. ``silver_stores``."""
        return DataTier.SILVER + "_" + tablename

    @staticmethod
    def getGoldName(tablename):
        """Return ``tablename`` prefixed with the gold tier, e.g. ``gold_stores``."""
        return DataTier.GOLD + "_" + tablename

In [0]:
# Directories that must exist before any CSV is read from them.
required_dirs = (VOLUME_TARGET_DIR,)
for volume in required_dirs:
    dbutils.fs.mkdirs(volume)

In [0]:
# Maps each logical dataset name to the CSV file that holds it in the raw landing directory.
filenames = dict(
    holidays="holidays_events.csv",
    oil="oil.csv",
    stores="stores.csv",
    test="test.csv",
    train="train.csv",
    transactions="transactions.csv",
)

In [0]:
def extractLoad(csv_filename, bronze_tablename, schema, tbl_ddl_cb):
    """Extract a CSV from the raw landing directory and load it into a bronze Delta table.

    The function reads ``{VOLUME_TARGET_DIR}/{csv_filename}`` with the given
    schema, drops any existing ``bronze_tablename``, invokes ``tbl_ddl_cb`` to
    (re)create the table, and finally overwrites the table with the CSV rows.

    Args:
        csv_filename: File name of the CSV inside ``VOLUME_TARGET_DIR``.
        bronze_tablename: Name of the destination table.
        schema: Schema applied when reading the CSV (the first row is treated
            as a header).
        tbl_ddl_cb: Zero-argument callable that issues the table DDL.

    Returns:
        The DataFrame read from the CSV file.

    Raises:
        ValueError: If ``csv_filename`` or ``bronze_tablename`` is empty or None
            (e.g. a failed ``filenames.get(...)`` lookup).
        TypeError: If ``tbl_ddl_cb`` is not callable.
    """
    # Validate everything before touching the catalog: a bad argument must not
    # leave the existing table dropped with nothing created to replace it.
    if not csv_filename:
        raise ValueError("csv_filename must be a non-empty file name")
    if not bronze_tablename:
        raise ValueError("bronze_tablename must be a non-empty table name")
    if not callable(tbl_ddl_cb):
        raise TypeError("tbl_ddl_cb must be a callable that creates the table")

    print(f"Loading {csv_filename}...")
    df = spark.read \
      .schema(schema) \
      .csv(f"{VOLUME_TARGET_DIR}/{csv_filename}", header=True)
    # count() is an action, so the CSV is scanned here and again by the write below.
    print(f"Read {csv_filename} with {df.count()} rows.")

    print(f"Creating bronze table {bronze_tablename}...")
    spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename}")
    tbl_ddl_cb()

    print(f"Writing {csv_filename} to bronze table {bronze_tablename}...")
    df.write.mode("overwrite").format("delta").saveAsTable(bronze_tablename)
    print("Finished.")

    return df

### Stores

In [0]:
# Destination table and source file for the stores dataset.
bronze_tablename_stores = DataTier.getBronzeName(DataframeNames.STORES)
csv_filename_stores = filenames.get(DataframeNames.STORES)

# (column, Spark type, nullable) triples; nullable=False mirrors the NOT NULL column in the DDL below.
stores_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("store_nbr", IntegerType(), False),
        ("city", StringType(), True),
        ("state", StringType(), True),
        ("type", StringType(), True),
        ("cluster", IntegerType(), True),
    )
])


def stores_ddl_cb():
  """Drop and recreate the bronze stores table with its primary-key constraint."""
  spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename_stores}")
  create_stmt = f"""
    CREATE OR REPLACE TABLE {bronze_tablename_stores} (
      store_nbr INTEGER NOT NULL,  -- Must be NOT NULL for a Primary Key
      city STRING,
      state STRING,
      type STRING,
      cluster INTEGER,
      
      CONSTRAINT pk_store_nbr_v2 -- v2 because Databricks, cannot figure out, without it - fails !!!
      PRIMARY KEY (store_nbr)
    )
    USING DELTA
    COMMENT 'Unity Catalog Managed Delta table storing store location and type information.';
  """
  spark.sql(create_stmt)


extractLoad(
    csv_filename_stores,
    bronze_tablename_stores,
    stores_schema,
    stores_ddl_cb,
)

### Transactions

In [0]:
# Destination table and source file for the transactions dataset.
bronze_tablename_transactions = DataTier.getBronzeName(DataframeNames.TRANSACTIONS)
csv_filename_transactions = filenames.get(DataframeNames.TRANSACTIONS)

# (column, Spark type, nullable) triples; nullable=False mirrors the NOT NULL columns in the DDL below.
transactions_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("date", DateType(), False),
        ("store_nbr", IntegerType(), False),
        ("transactions", IntegerType(), True),
    )
])


def transactions_ddl_cb():
  """Drop and recreate the bronze transactions table with its PK and its FK to the bronze stores table."""
  spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename_transactions}")
  create_stmt = f"""
    CREATE OR REPLACE TABLE {bronze_tablename_transactions} (
      date DATE NOT NULL,
      store_nbr INTEGER NOT NULL,
      transactions INTEGER,
      
      CONSTRAINT pk_transaction_id 
      PRIMARY KEY (date, store_nbr),
      
      CONSTRAINT fk_txn_store_nbr 
      FOREIGN KEY (store_nbr)
      REFERENCES {CATALOG_NAME}.{SCHEMA_NAME}.{bronze_tablename_stores} (store_nbr)
    )
    USING DELTA
    COMMENT 'Unity Catalog Managed Delta table storing daily transaction counts per store with PK and FK constraints.';
  """
  spark.sql(create_stmt)


extractLoad(
    csv_filename_transactions,
    bronze_tablename_transactions,
    transactions_schema,
    transactions_ddl_cb,
)

### Oil

In [0]:
# Destination table and source file for the oil price dataset.
bronze_tablename_oil = DataTier.getBronzeName(DataframeNames.OIL)
csv_filename_oil = filenames.get(DataframeNames.OIL)

# (column, Spark type, nullable) triples; nullable=False mirrors the NOT NULL column in the DDL below.
oil_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("date", DateType(), False),
        ("dcoilwtico", DoubleType(), True),
    )
])


def oil_ddl_cb():
  """Drop and recreate the bronze oil table with its primary key on date."""
  spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename_oil}")
  create_stmt = f"""
    CREATE OR REPLACE TABLE {bronze_tablename_oil} (
    date DATE NOT NULL,
    dcoilwtico DOUBLE,
    
    -- Primary Key: Ensures a unique price per date.
    CONSTRAINT pk_oil_date 
      PRIMARY KEY (date)
    )
    USING DELTA
    COMMENT 'Unity Catalog Managed Delta table storing daily WTI Crude Oil prices with PK constraint.';
  """
  spark.sql(create_stmt)


extractLoad(
    csv_filename_oil,
    bronze_tablename_oil,
    oil_schema,
    oil_ddl_cb,
)

### Holidays

In [0]:
# Destination table and source file for the holidays/events dataset.
bronze_tablename_holidays_events = DataTier.getBronzeName(DataframeNames.HOLIDAYS)
csv_filename_holidays_events = filenames.get(DataframeNames.HOLIDAYS)

# (column, Spark type, nullable) triples; the four non-nullable columns form the composite key in the DDL below.
holidays_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("date", DateType(), False),
        ("type", StringType(), False),
        ("locale", StringType(), False),
        ("locale_name", StringType(), False),
        ("description", StringType(), True),
        ("transferred", BooleanType(), True),
    )
])


def holidays_ddl_cb():
  """Drop and recreate the bronze holidays table with its composite primary key."""
  spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename_holidays_events}")
  create_stmt = f"""
    CREATE OR REPLACE TABLE {bronze_tablename_holidays_events} (
      date DATE NOT NULL,
      type STRING NOT NULL,
      locale STRING NOT NULL,
      locale_name STRING NOT NULL,
      description STRING,
      transferred BOOLEAN,
      
      CONSTRAINT pk_holiday_id 
      PRIMARY KEY (date, type, locale, locale_name)
    )
    USING DELTA
    COMMENT 'Unity Catalog Managed Delta table storing holiday data with Composite PK constraint.';
  """
  spark.sql(create_stmt)


extractLoad(
    csv_filename_holidays_events,
    bronze_tablename_holidays_events,
    holidays_schema,
    holidays_ddl_cb,
)

### Train

In [0]:
# Destination table and source file for the sales training dataset.
bronze_tablename_train = DataTier.getBronzeName(DataframeNames.TRAIN)
csv_filename_train = filenames.get(DataframeNames.TRAIN)

# (column, Spark type, nullable) triples; nullable=False mirrors the NOT NULL columns in the DDL below.
train_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("id", IntegerType(), False),
        ("date", DateType(), False),
        ("store_nbr", IntegerType(), False),
        ("family", StringType(), False),
        ("sales", DoubleType(), True),
        ("onpromotion", IntegerType(), True),
    )
])


def train_ddl_cb():
  """Drop and recreate the bronze train table with its composite PK and its FK to the bronze stores table."""
  spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename_train}")
  create_stmt = f"""
    CREATE OR REPLACE TABLE {bronze_tablename_train} (
        id INTEGER NOT NULL,
        date DATE NOT NULL,
        store_nbr INTEGER NOT NULL,
        family STRING NOT NULL,
        sales DOUBLE,
        onpromotion INTEGER,
        
        CONSTRAINT pk_sales_record 
        PRIMARY KEY (date, store_nbr, family), 
        
        CONSTRAINT fk_sales_store_nbr 
        FOREIGN KEY (store_nbr)
        REFERENCES {CATALOG_NAME}.{SCHEMA_NAME}.{bronze_tablename_stores} (store_nbr)
    )
    USING DELTA
    COMMENT 'Unity Catalog Managed Delta table storing daily sales training data with Composite PK and FK constraints.';
  """
  spark.sql(create_stmt)


extractLoad(
    csv_filename_train,
    bronze_tablename_train,
    train_schema,
    train_ddl_cb,
)

### Test

In [0]:
# Destination table and source file for the sales test dataset.
bronze_tablename_test = DataTier.getBronzeName(DataframeNames.TEST)
csv_filename_test = filenames.get(DataframeNames.TEST)

# (column, Spark type, nullable) triples; nullable=False mirrors the NOT NULL columns in the DDL below.
test_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("id", IntegerType(), False),
        ("date", DateType(), False),
        ("store_nbr", IntegerType(), False),
        ("family", StringType(), False),
        ("onpromotion", IntegerType(), True),
    )
])


def test_ddl_cb():
  """Drop and recreate the bronze test table with its PK on id and its FK to the bronze stores table."""
  spark.sql(f"DROP TABLE IF EXISTS {bronze_tablename_test}")
  create_stmt = f"""
    CREATE OR REPLACE TABLE {bronze_tablename_test} (
        id INTEGER NOT NULL,
        date DATE NOT NULL,
        store_nbr INTEGER NOT NULL,
        family STRING NOT NULL,
        onpromotion INTEGER,
        
        CONSTRAINT pk_test_id 
        PRIMARY KEY (id), 
        
        CONSTRAINT fk_test_store_nbr 
        FOREIGN KEY (store_nbr)
        REFERENCES {CATALOG_NAME}.{SCHEMA_NAME}.{bronze_tablename_stores} (store_nbr)
    )
    USING DELTA
    COMMENT 'Unity Catalog Managed Delta table storing sales test data with PK and FK constraints.';
    """
  spark.sql(create_stmt)


extractLoad(
    csv_filename_test,
    bronze_tablename_test,
    test_schema,
    test_ddl_cb,
)