In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
# --- CONFIG ---
# Inputs
#   speed_path : Delta location that is read as a stream (speed layer).
#   batch_path : table name handed to spark.read.table() for the static side.
batch_path = "gold_valid_audit"
speed_path = "/Volumes/workspace/default/storage/gold/ticker_data_v7"

# Outputs
#   serving_path    : Delta location the dashboard stream is written to.
#   checkpoint_path : checkpoint location used by that streaming write.
checkpoint_path = "/Volumes/workspace/default/storage/checkpoints/job_serving_dashboard_v4"
serving_path = "/Volumes/workspace/default/storage/serving/valuation_dashboard_v4"

In [0]:
# 3. Credentials
# Three values are fetched from the "ticker" secret scope, in this order:
# access key, secret key, session token (stored under the key "session_key").
# They are later passed as fs.s3a.* options on the batch read and stream write.
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    dbutils.secrets.get(scope="ticker", key=secret_name)
    for secret_name in ("access_key", "secret_key", "session_key")
)

In [0]:
# --- 1. READ SPEED LAYER (STREAMING) ---
# The speed layer is loaded from speed_path as a streaming Delta source.
# No fs.s3a.* credential options are set on this reader.
df_speed = (
    spark.readStream
    .format("delta")
    .load(speed_path)
    # Expose column "s" under the name "symbol", the key used for the join.
    .withColumnRenamed("s", "symbol")
)


In [0]:
# --- 2. READ BATCH LAYER (STATIC S3) ---
# The temporary AWS credentials are supplied to the reader as fs.s3a.* options.
_s3_reader_options = {
    "fs.s3a.access.key": ACCESS_KEY,
    "fs.s3a.secret.key": SECRET_KEY,
    "fs.s3a.session.token": SESSION_TOKEN,
    "fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider",
}

# Static (non-streaming) read of the table named by batch_path.
df_batch_raw = spark.read.format("delta").options(**_s3_reader_options).table(batch_path)

# Print the first 10 rows as a quick look at the batch data.
df_batch_raw.show(10)

In [0]:
# --- 3. PREPARE BATCH (LATEST REPORT PER SYMBOL) ---
# Rank each symbol's rows by "date", newest first, and keep only rank 1.
window_spec = Window.partitionBy("symbol").orderBy(F.col("date").desc())

df_batch_latest = (
    df_batch_raw
    .withColumn("rank", F.row_number().over(window_spec))
    .filter("rank = 1")
    .drop("rank")
    .withColumnRenamed("date", "report_date")
)

# --- 4. CALCULATE BATCH RATIOS (PRE-CALC) ---
# Both ratios are derived once on the static side. nullif() turns a zero
# denominator into NULL, so the division yields NULL instead of failing.
_cost_of_debt = F.col("interest_expense") / F.nullif(F.col("total_debt"), F.lit(0))

# Approximation used here: pre-tax income = NOPAT + tax expense.
_pre_tax_income = F.col("nopat") + F.col("tax_expense")
_effective_tax_rate = F.col("tax_expense") / F.nullif(_pre_tax_income, F.lit(0))

# Columns carried into the join (calculated_fcf is kept for the dashboard).
_batch_columns = [
    "symbol", "report_date", "shares_outstanding",
    "net_debt", "total_debt", "cost_of_debt", "effective_tax_rate",
    "calculated_fcf", "nopat",
]

df_batch_prepared = (
    df_batch_latest
    .withColumn("cost_of_debt", _cost_of_debt)
    .withColumn("effective_tax_rate", _effective_tax_rate)
    .select(*_batch_columns)
)

# --- 5. STREAM-STATIC JOIN ---
# Left join with the stream as the driving side: symbols that have no batch
# row are kept, with NULLs in the batch-derived columns.
df_serving = df_speed.join(
    F.broadcast(df_batch_prepared),
    on="symbol",
    how="left",
)

In [0]:
# --- 6. VALUATION LOGIC (THE WACC ENGINE) ---
# Everything below is a lazy Column expression; nothing is evaluated until the
# expressions are attached to df_serving in step 7.

# A. Market Cap = Price * Shares
market_cap = F.col("close_price") * F.col("shares_outstanding")

# B. Capital base and Enterprise Value
# Total debt is used for the WACC weights; net debt is used for EV.
total_capital = market_cap + F.col("total_debt")
enterprise_value = market_cap + F.col("net_debt")

# C. Weights (nullif avoids division by zero: a zero capital base gives NULL)
weight_equity = market_cap / F.nullif(total_capital, F.lit(0))
weight_debt = F.col("total_debt") / F.nullif(total_capital, F.lit(0))

# D. WACC Formula
# WACC = (We * Ke) + (Wd * Kd * (1 - T))
#
# cost_of_debt is computed as interest_expense / nullif(total_debt, 0), so it is
# NULL for a debt-free company. Multiplying by that NULL would turn the whole
# WACC into NULL even though the debt weight is 0. When total_debt is exactly 0
# the debt term therefore contributes 0 and WACC reduces to We * Ke.
# A NULL total_debt (no batch row matched) still falls through to the otherwise
# branch and keeps WACC NULL, as before.
after_tax_debt_term = (
    weight_debt * F.col("cost_of_debt") * (1 - F.col("effective_tax_rate"))
)
debt_component = (
    F.when(F.col("total_debt") == 0, F.lit(0))
    .otherwise(after_tax_debt_term)
)
wacc_calc = (weight_equity * F.col("cost_of_equity")) + debt_component

# E. PE Ratio
# EPS proxy = NOPAT / Shares (simplified); zero denominators become NULL.
eps_proxy = F.col("nopat") / F.nullif(F.col("shares_outstanding"), F.lit(0))
pe_ratio = F.col("close_price") / F.nullif(eps_proxy, F.lit(0))

# --- 7. APPLY TRANSFORMATION ---
# Attach the derived metrics plus the valuation timestamp and its date part.
df_final = (
    df_serving
    .withColumn("market_cap", market_cap)
    .withColumn("enterprise_value", enterprise_value)
    .withColumn("wacc", wacc_calc)
    .withColumn("pe_ratio_implied", pe_ratio)
    .withColumn("valuation_timestamp", F.current_timestamp())
    .withColumn("valuation_date", F.to_date(F.current_timestamp()))
)
                     

In [0]:
# --- 8. SELECT FINAL DASHBOARD COLUMNS ---
# Column names, in output order, that make up the serving table.
output_schema = [
    # Identity
    "symbol", "valuation_timestamp", "report_date",  "valuation_date",
    # Speed Metrics
    "close_price", "beta", "volatility", "momentum", "cost_of_equity",
    # Batch Metrics
    "calculated_fcf", "cost_of_debt", "effective_tax_rate",
    # Synthesis
    "market_cap", "enterprise_value", "wacc", "pe_ratio_implied"
]

df_dashboard = df_final.select(*output_schema)

# --- 9. WRITE STREAM (TO serving_path) ---
print(f"Starting Serving Stream... Writing to {serving_path}")

# Append-only Delta sink partitioned by valuation_date. The availableNow
# trigger processes the data available at start and then stops the query.
# The fs.s3a.* options pass the temporary AWS credentials to the writer.
serving_query = (
    df_dashboard.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .option("mergeSchema", "true")
    .option("fs.s3a.access.key", ACCESS_KEY)
    .option("fs.s3a.secret.key", SECRET_KEY)
    .option("fs.s3a.session.token", SESSION_TOKEN)
    .option("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    .trigger(availableNow=True)
    .partitionBy("valuation_date")
    .start(serving_path)
)

# start() returns immediately. Waiting on the query handle makes this cell
# block until the availableNow run has finished and re-raises any exception
# the query terminated with, instead of letting a failed write go unnoticed.
serving_query.awaitTermination()