In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType , StringType , StructField , StructType , LongType , DoubleType , IntegerType , BooleanType
from pyspark.sql.window import Window

In [0]:

# Volume paths follow /Volumes/<catalog>/<schema>/<volume>/<folder>; the volume
# segment ("storage") must be present in each of the paths below.
source_path = "/Volumes/workspace/default/storage/silver/ticker_data_v5"
destination_path = "/Volumes/workspace/default/storage/gold/ticker_data_v7"
checkpoint_path = "/Volumes/workspace/default/storage/checkpoints/job_gold_ticker_checkpoint_v10"

# Echo the locations this run will use (one item per line).
print(
    "Streaming Strategy:",
    f"Checkpoint: {checkpoint_path}",
    f"Data (Volume): {source_path}",
    sep="\n",
)


In [0]:
# CAPM inputs, fixed for now (can be replaced by dynamic macro data later).
RISK_FREE_RATE = 0.0425  # 4.25% (10Y Treasury)
MARKET_PREMIUM = 0.0500  # 5.00% (standard assumption)

# Stream the Delta table at source_path, beginning from table version 0.
df = (
    spark.readStream
    .format("delta")
    .option("startingVersion", "0")
    .load(source_path)
)

In [0]:
# --- 1. SEPARATE THE STREAMS ---
# Two independent aggregations over the ticker stream: per-symbol price
# statistics (this block) and a beta estimate that additionally needs market rows.
# NOTE(review): df_tickers is not defined in any visible cell — confirm it is
# built (presumably from df) before this cell when run on a fresh kernel.

# A. TICKER STATS
# Uses ticker rows only, so close price / volatility / momentum are produced
# regardless of whether the market join finds a match.
df_ticker_stats = (
    df_tickers
    .withWatermark("event_time", "1 hour")  # tolerate events up to 1 hour late
    .groupBy(
        F.col("s"),
        F.window("event_time", "2 hours", "5 minutes"),  # 2h windows sliding every 5 min
    )
    .agg(
        # Relative volatility: sample std-dev of price divided by mean price.
        # nullif turns a zero mean into NULL so the division cannot raise a
        # divide-by-zero error (same guard as the beta variance division).
        (
            F.stddev_samp("currentPrice")
            / F.nullif(F.avg("currentPrice"), F.lit(0))
        ).alias("volatility"),

        # Momentum: mean changePercent over the window.
        F.avg("changePercent").alias("momentum"),

        # Close price: the price on the row with the greatest event_time.
        # F.last() depends on row order, which is not guaranteed after a
        # shuffle, so it could return an arbitrary price from the window.
        F.max_by("currentPrice", "event_time").alias("close_price"),
    )
)

# B. BETA CALCULATION
# Ticker rows are paired with market rows, then beta is estimated per
# symbol and window.
# NOTE(review): df_tickers and df_market are not defined in any visible cell;
# the m_-prefixed columns are presumably the market side — confirm.

# A ticker row matches a market row when the join keys agree and the ticker
# event_time lies within one hour either side of the market event time.
join_condition = (
    (F.col("m_join_key") == F.col("join_key"))
    & (F.col("event_time") >= F.col("m_event_time") - F.expr("INTERVAL 1 HOUR"))
    & (F.col("event_time") <= F.col("m_event_time") + F.expr("INTERVAL 1 HOUR"))
)

# Left outer join: ticker rows without a matching market row are kept.
df_beta_calc = df_tickers.join(df_market, on=join_condition, how="left_outer")

# Beta = sample covariance(changePercent, m_return) / sample variance(m_return).
# nullif turns a zero variance into NULL, so the ratio is NULL rather than a
# division by zero; it is also NULL when no market rows matched.
beta_estimate = (
    F.covar_samp("changePercent", "m_return")
    / F.nullif(F.var_samp("m_return"), F.lit(0))
)

# NOTE(review): the watermark here is declared after the join. Check the
# watermark requirements for stream-stream outer joins against how
# df_tickers / df_market are defined.
df_beta_agg = (
    df_beta_calc
    .withWatermark("event_time", "1 hour")
    .groupBy(
        F.col("s"),
        F.window("event_time", "2 hours", "5 minutes"),
    )
    .agg(beta_estimate.alias("calc_beta"))
)

# --- 2. MERGE & FALLBACK / 3. APPLY DEFAULTS ---
# Left-join the price statistics with the beta estimates on symbol + window so
# every statistics row is kept, then derive beta and cost of equity.
df_final = (
    df_ticker_stats
    .join(df_beta_agg, on=["s", "window"], how="left_outer")
    # Use the calculated beta when present; otherwise default to 1.0
    # (market average).
    .withColumn("beta", F.coalesce(F.col("calc_beta"), F.lit(1.0)))
    # CAPM: Rf + Beta * (Rm - Rf), where MARKET_PREMIUM is the (Rm - Rf) term.
    .withColumn(
        "cost_of_equity",
        F.lit(RISK_FREE_RATE) + (F.col("beta") * F.lit(MARKET_PREMIUM)),
    )
)

# --- 4. OUTPUT ---
# Columns handed to the downstream WACC step, in output order.
wacc_columns = [
    "window",
    "s",
    "close_price",
    "volatility",
    "momentum",
    "beta",
    "cost_of_equity",
]
df_result = df_final.select(*wacc_columns)

In [0]:


# --- 3. WRITE STREAM ---
# Append the result rows to the gold Delta table at destination_path.
# The availableNow trigger processes the data available at start and then
# stops the query on its own.
gold_query = (
    df_result.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .start(destination_path)
)

# Block until the run has finished so that a failure inside the streaming
# query is raised in this cell instead of being lost in the background.
gold_query.awaitTermination()