In [0]:
# imports
import os
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.ml.feature import FeatureHasher

In [0]:
# root directory under which every data tier of the project lives
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
# Bronze tier: raw data
VOLUME_BRONZE_DIR = f"{VOLUME_ROOT_PATH}/bronze"
# Silver tier: destination of the prepared data
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"

# create whichever tier directory is still missing
for path in (VOLUME_BRONZE_DIR, VOLUME_SILVER_DIR):
  if os.path.exists(path):
    continue
  os.makedirs(path, exist_ok=True)

In [0]:
# name of the table handled by this notebook; it is appended to the
# Bronze and Silver directories to build the read and write paths
tablename = "holidays"

In [0]:
# location of this notebook's table inside the Bronze tier
bronze_path = f"{VOLUME_BRONZE_DIR}/{tablename}"

# the Bronze table is stored in Delta format
holidays_events_df = (
    spark.read
    .format("delta")
    .load(bronze_path)
)

In [0]:
# inspect the data as read from the Bronze tier:
# print the column names and types, then render the rows
holidays_events_df.printSchema()
display(holidays_events_df)

In [0]:
def rows_to_value(df, name):
    """Return the `name` field of every row as a plain Python list.

    Despite its name, `df` is simply iterated and each item is indexed with
    `name`; the call in this notebook passes the list of rows returned by
    `.collect()`.

    Args:
        df: iterable of row-like objects that support `row[name]`.
        name: key (column name) to read from each row.

    Returns:
        list with one value per row, in iteration order.
    """
    values = []
    for row in df:
        values.append(row[name])
    return values

# distinct 'locale_name' values, collected to the driver and shown as a Python list
locale_name_rows = holidays_events_df.select('locale_name').distinct().collect()
display(rows_to_value(locale_name_rows, 'locale_name'))

# distinct values of the two remaining categorical columns, printed as tables
for column_name in ('locale', 'type'):
    holidays_events_df.select(column_name).distinct().show()

In [0]:
# Preparation of holidays data (holidays_events_df):
# 1. Drop transferred holidays -> these were moved to another date.
#    Identifiable by 'type' = 'Transfer'.
# 2. Explode nationwide holidays to one row per state; nationwide rows are
#    identifiable by 'locale_name' = 'Ecuador'.
# 3. Deduplicate dates. This is made under the assumption that all the remaining
#    holiday types are actual holidays.
# 4. Construct a new dataframe with 3 columns: 'date', 'state', 'is_holiday'.

# 1. Drop transferred holidays.
# The filter has to look at 'type': 'locale_name' holds place names (it is compared
# with 'Ecuador' and used as the state below), so comparing it with 'Transfer'
# would not remove the transfer rows.
# NOTE(review): if the Bronze table also carries a boolean "transferred" flag, confirm
# whether that flag (rather than type == 'Transfer') marks the rows that must be dropped.
holidays_events_df = holidays_events_df.where(F.col('type') != 'Transfer')

# 2. Explode nationwide holidays to one row per state.
# The list of states is provided by the stores table of the Bronze tier.
stores_bronze_path = f"{VOLUME_BRONZE_DIR}/stores"
stores_df = spark.read.format("delta").load(stores_bronze_path)

ecuador_states = [
    row['state']
    for row in (
        stores_df
        .select('state')
        .where(F.col('state').isNotNull())  # a null state would become a null key after the explode
        .distinct()
        .collect()
    )
]
# with no states the nationwide rows would get an empty array and silently
# disappear in the explode below, so fail loudly instead
if not ecuador_states:
    raise ValueError(f"no states found in {stores_bronze_path}; cannot expand nationwide holidays")

# 'Ecuador' rows get an array with every state; all other rows keep their own
# 'locale_name' wrapped in a single-element array
# NOTE(review): rows that are not nationwide keep 'locale_name' as-is under the
# 'state' alias — confirm that every such value really is a state name.
holidays_events_df = holidays_events_df.withColumn(
    'locale_name_array',
    F.when(
        F.col('locale_name') == 'Ecuador',
        F.array(*[F.lit(state_name) for state_name in ecuador_states])
    ).otherwise(
        F.array(F.col('locale_name'))
    )
)

# Explode &
# 4. keep only 'date', 'state' and a constant 'is_holiday' = 1 flag
holidays_events_df = holidays_events_df.select(
    'date',
    F.explode('locale_name_array').alias('state'),
    F.lit(1).alias('is_holiday')
)

# 3. Deduplicate rows so that each (date, state) pair appears once
holidays_events_df = holidays_events_df.dropDuplicates(['date', 'state'])

In [0]:
# inspect the prepared data (after the filter / explode / de-duplication step):
# print the column names and types, then render the rows
holidays_events_df.printSchema()
display(holidays_events_df)

In [0]:
# hash the 'state' and 'is_holiday' columns into one feature vector

# Add exactly ONE fake row with 'is_holiday' = 0.
# The row is built from spark.range(1): selecting literals from holidays_events_df
# itself yields one literal row per existing row, i.e. as many fake rows as there
# are holidays. The date literal is cast to the existing column type so that the
# union cannot change the type of the 'date' column.
# NOTE(review): FeatureHasher has no fit step and hashes each row on its own, so this
# row should not influence how the real rows are hashed — confirm whether it is still
# needed, since it ends up in the Silver table.
sentinel_row_df = spark.range(1).select(
    F.lit('2099-01-01').cast(holidays_events_df.schema['date'].dataType).alias('date'),
    F.lit('Bolivar').alias('state'),
    F.lit(0).alias('is_holiday')
)
# match columns by name rather than by position
holidays_events_df = holidays_events_df.unionByName(sentinel_row_df)

holidays_hasher = FeatureHasher(
    inputCols=['state', 'is_holiday'],
    outputCol='hash_state_isHoliday',
    numFeatures=1024
)
holidays_events_df = holidays_hasher.transform(holidays_events_df)

# the hashed vector replaces the 'state' and 'is_holiday' columns
holidays_events_df = holidays_events_df.drop('state', 'is_holiday')

display(holidays_events_df)

In [0]:
# write to Silver tier as Delta table
silver_path = f"{VOLUME_SILVER_DIR}/{tablename}"

# mode('overwrite') replaces the table contents as a single Delta write, so the
# directory is no longer deleted first: deleting it leaves a window in which the
# table does not exist and throws away the table's Delta log.
# 'overwriteSchema' lets a changed schema replace the stored one, which is what
# wiping the directory used to guarantee.
(
    holidays_events_df.write
    .format("delta")
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .save(silver_path)
)