In [0]:
# imports
import os
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.ml.feature import FeatureHasher

In [0]:
# Root of the project's data volume; every tier lives in a subdirectory of it.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
# Bronze tier: raw data as ingested.
VOLUME_BRONZE_DIR = f"{VOLUME_ROOT_PATH}/bronze"
# Silver tier: destination for the prepared data written by this notebook.
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"

# Create any tier directory that is not there yet; existing paths are left alone.
for path in (VOLUME_BRONZE_DIR, VOLUME_SILVER_DIR):
    if os.path.exists(path):
        continue
    os.makedirs(path, exist_ok=True)

In [0]:
# Name of the table handled by this notebook; it is appended to the Bronze and
# Silver directories to build the read and write paths.
tablename = "stores"

In [0]:
# Load the Bronze-tier Delta table stored under <bronze dir>/<tablename>.
bstores_df = (
    spark.read
    .format("delta")
    .load(f"{VOLUME_BRONZE_DIR}/{tablename}")
)

In [0]:
# Inspect the Bronze table: print its column names/types, then render its rows.
bstores_df.printSchema()
display(bstores_df)

In [0]:
def smart_na_drop(df: "SparkDataFrame", subset=None) -> "SparkDataFrame":
    """
    Drop rows that contain null values and print how many rows were removed.

    Args:
        df: DataFrame to clean. It is only read; the cleaned result is a
            separate object returned to the caller.
        subset: Optional list of column names to check for nulls. The default
            ``None`` checks every column, matching a plain ``df.dropna()``.

    Returns:
        The DataFrame produced by ``dropna`` (rows with nulls in the checked
        columns removed).
    """
    # Count before and after so the notebook output records the data loss.
    rows_before = df.count()
    cleaned = df.dropna(subset=subset)
    rows_after = cleaned.count()
    print(f"dropped {rows_before - rows_after} rows")
    return cleaned

In [0]:
# Start the Silver frame from the Bronze rows that have no nulls in any column.
sstores_df = smart_na_drop(bstores_df)

In [0]:
# Store attributes that are hashed together into a single feature vector.
sstores_df_categorical_cols = ['city', 'state', 'type', 'cluster']

# Hash the categorical columns into one fixed-width (1024-slot) vector column.
# categoricalCols is passed explicitly because FeatureHasher only treats string
# and boolean inputs as categories by default; a numeric input (e.g. `cluster`,
# if it is stored as a number - the Bronze schema is not visible here) would
# otherwise be hashed as a continuous value instead of a category. Listing a
# string column here has no effect, so this is safe for all four columns.
sstores_hasher = FeatureHasher(
    inputCols=sstores_df_categorical_cols,
    outputCol='hash_city_state_type_cluster',
    numFeatures=1024,
    categoricalCols=sstores_df_categorical_cols,
)

# NOTE: this reassigns sstores_df and removes the hasher's input columns, so
# re-running this cell requires re-running the cell that creates sstores_df.
sstores_df = sstores_hasher.transform(sstores_df)
sstores_df = sstores_df.drop(*sstores_df_categorical_cols)

display(sstores_df)

In [0]:
# Persist the prepared frame to the Silver tier as a Delta table.
silver_path = f"{VOLUME_SILVER_DIR}/{tablename}"

# Recursively delete whatever is at the target path, then write the table.
dbutils.fs.rm(silver_path, True)
(
    sstores_df.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path)
)