In [0]:
%sql
-- Bootstrap step: make sure the project schema and its `data` volume exist.
-- Both statements are no-ops when the objects are already present.
CREATE SCHEMA IF NOT EXISTS
  cscie103_catalog.final_project;

CREATE VOLUME IF NOT EXISTS
  cscie103_catalog.final_project.data;

In [0]:
#Setup Paths
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Root of the Unity Catalog volume; every file path below hangs off it.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"

# Per-layer directories inside the volume.
VOLUME_RAW_DIR = VOLUME_ROOT_PATH + "/raw/oil_stream"
VOLUME_BRONZE_DIR = VOLUME_ROOT_PATH + "/bronze"
VOLUME_SILVER_DIR = VOLUME_ROOT_PATH + "/silver/oil_stream"

# Fully qualified table names, one per medallion layer.
bronze_table_name, silver_table_name, gold_table_name = (
    f"cscie103_catalog.final_project.oil_stream_{layer}"
    for layer in ("bronze", "silver", "gold")
)

# NOTE(review): this points at the assignment_03 volume rather than
# final_project, and no cell visible here reads it - confirm it is still needed.
userhome = "/Volumes/cscie103_catalog/assignment_03/data"

# Checkpoint directories handed to the streaming writers as `checkpointLocation`.
bronzeCheckpoint, silverCheckpoint, goldCheckpoint = (
    f"{VOLUME_ROOT_PATH}/{layer}_checkpoint_oil"
    for layer in ("bronze", "silver", "gold")
)

## Request Daily Oil Prices from US Energy Information Administration

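Calling the EIA Open Data API (v2) to download daily spot prices for the `RWTC` series over the date window computed in the next cell.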


In [0]:
%python
from datetime import datetime, timedelta

# Request window for the API call: the 100 days leading up to today.
enddate = datetime.today().date()
startdate = enddate + timedelta(days=-100)

print(f"Start Date: {startdate}, End Date: {enddate}")

In [0]:
from datetime import date
# for testing only 
# load data from previous years 
# startdate = date(2019, 8, 1)
# enddate = date(2019, 11, 29)
# print(f"Start Date: {startdate}, End Date: {enddate}")

In [0]:
import requests
import json

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Download the daily RWTC spot-price series from the EIA v2 API for
# [startdate, enddate] and land it as JSON under VOLUME_RAW_DIR, where the
# bronze stream picks it up.

# The API key is read from the Databricks secret scope instead of being
# committed in the notebook. The key that was previously hardcoded in this
# cell must be treated as leaked and rotated.
API_KEY = dbutils.secrets.get("oil-api", "EIA_API_KEY")

url = (
    "https://api.eia.gov/v2/petroleum/pri/spt/data/"
    "?frequency=daily"
    "&facets[series][]=RWTC"
    f"&start={startdate}&end={enddate}"
    "&data[0]=value"
    f"&api_key={API_KEY}"
)

# A timeout keeps the cell from hanging forever on a stalled connection.
http_response = requests.get(url, timeout=30)
if not http_response.ok:
    # Deliberately not using raise_for_status(): its message embeds the full
    # URL, which contains the API key.
    raise RuntimeError(
        f"EIA API request failed with HTTP {http_response.status_code}"
    )

response = http_response.json()["response"]
records = response["data"]
dateFormat = response["dateFormat"]
total = response["total"]

print(f"Length: {len(records)} - Total {total} - Date Format: {dateFormat}")

# Build the DataFrame with an explicit two-column string schema rather than
# inferring it from the raw dicts: inference fails on an empty result and on
# mixed str/number values. Only the fields used downstream are kept.
rows = [
    (
        record.get("period"),
        None if record.get("value") is None else str(record["value"]),
    )
    for record in records
]
df = spark.createDataFrame(rows, schema="period string, value string")

# Parse the period into a date and the quoted value into a double.
df_clean = df.select(
    F.to_date("period", "yyyy-MM-dd").alias("day"),
    F.col("value").cast("double").alias("price"),
)

# Each run writes to a directory named after the window start date.
df_clean.write.mode("overwrite").json(f"{VOLUME_RAW_DIR}/{startdate}.json")


## Streaming RAW Oil data into Bronze Layer 

In [0]:
# VOLUME_BRONZE_DIR
# VOLUME_SILVER_DIR
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType, DateType
from pyspark.sql.functions import from_json, to_json

# Raw -> bronze: incrementally ingest the JSON files landed under
# VOLUME_RAW_DIR into the bronze Delta table using Auto Loader.

# Explicit schema of the raw files: one price per day.
schema = StructType([
    StructField("day", DateType(), True),
    StructField("price", DoubleType(), True)
])

df_bronze = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .schema(schema)
    .load(VOLUME_RAW_DIR)
)

bronze_query = (
    df_bronze.writeStream
    .format("delta")
    .option("checkpointLocation", bronzeCheckpoint)
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable(bronze_table_name)
)

# The query runs asynchronously. Block until the availableNow batch has been
# committed so the following cells read a fully populated bronze table.
bronze_query.awaitTermination()


In [0]:
%sql
-- Inspect the rows currently held in the bronze table.
SELECT *
FROM cscie103_catalog.final_project.oil_stream_bronze

## Stream from Bronze to Silver 

In [0]:
# Bronze -> silver: rename the columns to the names used by the existing
# silver_oil table (date, dcoilwtico) and append them to the silver stream table.
bronze_df = spark.readStream.table(bronze_table_name)

silver_df = bronze_df.select(
    F.to_date(F.col("day")).alias("date"),
    F.col("price").cast("double").alias("dcoilwtico"),
)

stream = (
    silver_df.writeStream
    .format("delta")
    .option("checkpointLocation", silverCheckpoint)
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable(silver_table_name)
)

# The query runs asynchronously. Wait for the availableNow batch to finish so
# the SELECT and MERGE cells below do not read a partially written table.
stream.awaitTermination()

In [0]:
%sql
-- Inspect the rows currently held in the silver stream table.
SELECT *
FROM cscie103_catalog.final_project.oil_stream_silver

# Merge with Existing `silver_oil` Table

### Before Merge 

In [0]:
%sql
-- Rows per calendar year in silver_oil, newest year first.
SELECT
  year(date),
  count(*)
FROM cscie103_catalog.final_project.silver_oil
GROUP BY year(date)
ORDER BY year(date) DESC

### Merging Stream of oil prices into existing table

In [0]:
%sql
-- Select * from cscie103_catalog.final_project.oil_stream_silver
-- Select * from cscie103_catalog.final_project.silver_oil

-- Insert dates that silver_oil does not have yet.
-- The source is reduced to one row per date first: oil_stream_silver is
-- append-only, so the same date can appear several times, and an insert-only
-- MERGE inserts every unmatched duplicate.
-- Rows with a NULL date are skipped: `t.date = s.date` never matches them,
-- so they would be inserted again on every run.
-- MAX() ignores NULLs, so a non-null price wins over a missing one; if a date
-- carries several different prices, the highest is kept.
MERGE INTO cscie103_catalog.final_project.silver_oil AS t
USING (
  SELECT
    date,
    MAX(dcoilwtico) AS dcoilwtico
  FROM cscie103_catalog.final_project.oil_stream_silver
  WHERE date IS NOT NULL
  GROUP BY date
) AS s
ON t.date = s.date
WHEN NOT MATCHED THEN
  INSERT (date, dcoilwtico)
  VALUES (s.date, s.dcoilwtico);


### After Merging Data

In [0]:
%sql
-- Rows per calendar year in silver_oil, newest year first.
SELECT
  year(date),
  count(*)
FROM cscie103_catalog.final_project.silver_oil
GROUP BY year(date)
ORDER BY year(date) DESC