In [0]:
#Setup Paths
from pyspark.sql import functions as F
from pyspark.sql.window import Window

VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_BRONZE_DIR = f"{VOLUME_ROOT_PATH}/bronze"
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"


In [0]:
#Loading Bronze Oil

bronze_oil = spark.read.format("delta").load(
    f"{VOLUME_BRONZE_DIR}/oil"
)

bronze_oil.printSchema()
display(bronze_oil)


root
 |-- date: date (nullable = true)
 |-- dcoilwtico: double (nullable = true)



date,dcoilwtico
2013-01-01,
2013-01-02,93.14
2013-01-03,92.97
2013-01-04,93.12
2013-01-07,93.2
2013-01-08,93.21
2013-01-09,93.08
2013-01-10,93.81
2013-01-11,93.6
2013-01-14,94.27


In [0]:
# Basic Cleaning
silver_oil_clean = (
    bronze_oil
    .withColumn("date", F.to_date("date"))
    .withColumn("dcoilwtico", F.col("dcoilwtico").cast("double"))
)


In [0]:
# Filling in missing oil prices with forward fill
window_ffill = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, 0)

silver_oil_filled = silver_oil_clean.withColumn(
    "dcoilwtico",
    F.last("dcoilwtico", ignorenulls=True).over(window_ffill)
)




In [0]:
silver_oil = silver_oil_filled.select(
    "date",
    "dcoilwtico"
)




In [0]:
# Writing Oil to a Silver Table
silver_oil.write.format("delta") \
    .mode("overwrite") \
    .save(f"{VOLUME_SILVER_DIR}/oil")




In [0]:
# Validation Step
test_silver_oil = spark.read.format("delta").load(
    f"{VOLUME_SILVER_DIR}/oil"
)

test_silver_oil.printSchema()
display(test_silver_oil.orderBy("date"))


root
 |-- date: date (nullable = true)
 |-- dcoilwtico: double (nullable = true)



date,dcoilwtico
2013-01-01,
2013-01-02,93.14
2013-01-03,92.97
2013-01-04,93.12
2013-01-07,93.2
2013-01-08,93.21
2013-01-09,93.08
2013-01-10,93.81
2013-01-11,93.6
2013-01-14,94.27
