In [0]:
# imports
import os
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.ml.feature import FeatureHasher

In [0]:
# Root directory under which every tier of the project data lives.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
# Bronze tier: raw data.
VOLUME_BRONZE_DIR = f"{VOLUME_ROOT_PATH}/bronze"
# Silver tier: destination for the prepared data.
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"

# Name of the table this notebook reads from bronze and writes to silver.
tablename = 'train'

# Create whichever tier directories are still missing; existing paths are
# left untouched.
for path in (VOLUME_BRONZE_DIR, VOLUME_SILVER_DIR):
    if os.path.exists(path):
        continue
    os.makedirs(path, exist_ok=True)

In [0]:
# Bronze tier: load the table from its Delta location.
bronze_table_path = f'{VOLUME_BRONZE_DIR}/{tablename}'
btrain_df = (
    spark.read
    .format('delta')
    .load(bronze_table_path)
)

In [0]:
# Inspect the bronze data before cleaning: print the column names/types,
# then render a preview of the rows.
btrain_df.printSchema()
display(btrain_df)

In [0]:
def smart_na_drop(df, subset=None, how="any"):
    """
    Drop rows containing null values and report how many were removed.

    Args:
        df: DataFrame-like object exposing ``count()`` and
            ``dropna(how=..., subset=...)`` (e.g. a Spark DataFrame).
        subset: Optional column name or list of column names to restrict the
            null check to. ``None`` (the default) checks every column.
        how: ``"any"`` (default) drops a row if any checked column is null;
            ``"all"`` drops it only if every checked column is null.

    Returns:
        The DataFrame with the offending rows removed. The input is not
        modified.

    Side effects:
        Prints ``dropped <n> rows``. ``count()`` is called once before and
        once after the drop, so on Spark this triggers two actions.
    """
    before = df.count()
    cleaned = df.dropna(how=how, subset=subset)
    after = cleaned.count()
    print(f"dropped {before - after} rows")
    return cleaned

In [0]:
# Build the silver-tier frame by removing every bronze row that contains a
# null; smart_na_drop also prints how many rows were dropped.
strain_df = smart_na_drop(btrain_df)

In [0]:
# Columns to encode as categories.
categorical_cols = [ 'store_nbr', 'family' ]
hash_output_col = 'hash_storenbr_family'

# Hash the categorical columns into a single fixed-width feature vector.
# (Original note: "hash features because databricks" - rationale not recorded.)
#
# FeatureHasher only treats string/boolean columns as categorical by default;
# a numeric column is hashed as a real-valued feature unless it is listed in
# `categoricalCols`. Listing every column here guarantees categorical encoding
# regardless of the column's dtype (it is a no-op for string columns).
hasher = FeatureHasher(
    inputCols=categorical_cols,
    outputCol=hash_output_col,
    numFeatures=1024,
    categoricalCols=categorical_cols,
)

# Drop any previous copy of the output column first so that re-running this
# cell does not fail with "output column already exists"; dropping a column
# that is not present is a no-op.
strain_df = hasher.transform(strain_df.drop(hash_output_col))
display(strain_df)

In [0]:
# Save to the Silver tier as a Delta table.
# The format is set explicitly (matching the Bronze read) instead of relying
# on the session's default data source, which is not guaranteed to be Delta.
silver_path = f'{VOLUME_SILVER_DIR}/{tablename}'
(
    strain_df.write
    .format('delta')
    .mode('overwrite')
    .save(silver_path)
)