In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType , StringType , StructField , StructType , LongType , DoubleType , IntegerType , BooleanType
from pyspark.sql.window import Window

In [0]:

# Storage locations on the Unity Catalog volume.
# Path format: /Volumes/<catalog>/<schema>/<VOLUME_NAME>/<folder>
# The volume name ('storage') must be part of every path.
destination_path = "/Volumes/workspace/default/storage/gold/ticker_data_v10"
source_path = "/Volumes/workspace/default/storage/silver/ticker_data_v5"
checkpoint_path = "/Volumes/workspace/default/storage/checkpoints/job_gold_ticker_checkpoint_v17"

# Echo the locations this run uses (one line per entry).
print(
    "Streaming Strategy:",
    f"Checkpoint: {checkpoint_path}",
    f"Data (Volume): {source_path}",
    sep="\n",
)


# --- 1. READ SOURCE ---
# Stream the Delta table at `source_path`, starting at table version 0.
silver_reader = spark.readStream.format("delta").option("startingVersion", "0")

# Every row gets a constant `dummy_join_key` (used later as the equality key of
# the ticker/market join), and `event_time` carries a 40-second watermark.
df = (
    silver_reader.load(source_path)
    .withColumn("dummy_join_key", F.lit(1))
    .withWatermark("event_time", "40 seconds")
)


# backfill

# df = spark.read.format("delta") \
#   .option("startingVersion", "0") \
#   .load(source_path) \
#         .withColumn("dummy_join_key", F.lit(1)) 




In [0]:
# Constants for CAPM Model (Can be replaced by dynamic macro data later)
# Hard-coded CAPM inputs; candidates for replacement by dynamic macro data.
MARKET_PREMIUM = 0.05    # 5.00% market premium (standard assumption)
RISK_FREE_RATE = 0.0425  # 4.25% risk-free rate (10Y Treasury)


In [0]:
# Sliding-window geometry shared by both aggregation branches.
SLIDE_DURATION = "15 minutes"   # a new window starts every 15 minutes
WINDOW_DURATION = "3 hours"     # length of each window
# Tolerance (+/-) of the ticker/market time-range join: half of WINDOW_DURATION.
JOIN_LOOKBACK = "90 minutes"

# --- 2. BRANCH A: TICKER STATS (Volatility/Price) ---
# Calculated on RAW data to ensure Standard Deviation is accurate (not biased by join duplication)
# One output row per (ticker symbol `s`, sliding event-time window).
df_ticker_stats = (
    df
    .groupBy(
        F.col("s"),
        F.window("event_time", WINDOW_DURATION, SLIDE_DURATION),
    )
    .agg(
        # Relative dispersion: sample standard deviation of the price divided by its mean.
        (F.stddev_samp("currentPrice") / F.avg("currentPrice")).alias("volatility"),
        # Mean of changePercent over the window.
        F.avg("changePercent").alias("momentum"),
        # Closing price = price of the row with the greatest event_time in the window.
        # F.last() was used here before, but its result depends on row order, which
        # is not deterministic after a shuffle, so it could return any price.
        F.max_by("currentPrice", "event_time").alias("close_price"),
    )
)

# --- 3. BRANCH B: BETA CALCULATION (Recursive Join) ---

# Market side of the join: the SPY rows of the same source stream `df`.
# Every column is renamed with an `m_` prefix so the self-join below has no
# ambiguous column names.
is_market_row = F.col("s") == "SPY"
df_market = df.where(is_market_row).select(
    F.col("event_time").alias("m_event_time"),
    F.col("changePercent").alias("m_return"),
    F.col("s").alias("m_join_key"),
    F.col("dummy_join_key").alias("m_dummy_join_key"),
)

# "Recursive" join condition: the constant key matches every ticker event with
# every market event, and the two time bounds keep only the pairs whose event
# times are at most JOIN_LOOKBACK apart.
lookback = F.expr(f"INTERVAL {JOIN_LOOKBACK}")
same_key = F.col("dummy_join_key") == F.col("m_dummy_join_key")
not_before_range = F.col("event_time") >= F.col("m_event_time") - lookback
not_after_range = F.col("event_time") <= F.col("m_event_time") + lookback
join_condition = same_key & not_before_range & not_after_range

# Left outer join: ticker events without a market match are kept, with null m_* columns.
df_beta_calc = df.join(df_market, on=join_condition, how="left_outer")

# Beta per (ticker, window) = Covariance(Ticker, Market) / Variance(Market).
# NULLIF turns a zero market variance (flat market) into NULL, so the division
# yields NULL instead of dividing by zero.
ticker_market_covariance = F.covar_samp("changePercent", "m_return")
market_variance = F.nullif(F.var_samp("m_return"), F.lit(0))
df_beta_agg = df_beta_calc.groupBy(
    F.col("s"),
    F.window("event_time", WINDOW_DURATION, SLIDE_DURATION),
).agg(
    (ticker_market_covariance / market_variance).alias("calc_beta")
)



# --- 4. MERGE & COMPUTE WACC WITH QUALITY FLAGS ---
# Column expressions are named up front so the chain below reads as a recipe.
has_calculated_beta = F.col("calc_beta").isNotNull()
beta_source_flag = (
    F.when(has_calculated_beta, F.lit("VALID_CALC"))
    .otherwise(F.lit("DEFAULT_FALLBACK"))
)
beta_with_fallback = F.coalesce(F.col("calc_beta"), F.lit(1.0))  # default beta = 1.0
capm_cost_of_equity = F.lit(RISK_FREE_RATE) + (F.col("beta") * F.lit(MARKET_PREMIUM))

# Attach the beta of the same (ticker, window) to the ticker stats, keeping
# stats rows that have no beta, then derive the flag, the beta and the CAPM cost.
df_final = (
    df_ticker_stats
    .join(df_beta_agg, on=["s", "window"], how="left_outer")
    .withColumn("beta_source", beta_source_flag)
    .withColumn("beta", beta_with_fallback)
    .withColumn("cost_of_equity", capm_cost_of_equity)
)

# Gold layer only receives rows with a genuinely calculated beta; rows that
# fell back to the default (e.g. "overnight" data with a flat market) are dropped.
gold_columns = ["window", "s", "close_price", "volatility", "momentum", "beta", "cost_of_equity"]
df_result = (
    df_final
    .where(F.col("beta_source") == "VALID_CALC")
    .select(*gold_columns)
)


Are there any valid points in the dataset (to rule out the source-data corruption theory)?

In [0]:
# from pyspark.sql.functions import col, avg, min, max, count, stddev

# # Aggregate by Ticker to check stability
# df_stats = df_result.groupBy("s") \
#     .agg(
#         avg("beta").alias("avg_beta"),
#         stddev("beta").alias("beta_volatility"),
#         min("beta").alias("min_beta"),
#         max("beta").alias("max_beta"),
#         count("beta").alias("windows_count")
#     ) \
#     .orderBy("s")

# df_stats.show()

In [0]:
# %sql
# SELECT 
#     T.s AS ticker,
    
#     -- 1. BETA CALCULATION WITH SAFE DIVISION
#     -- Logic: If Variance is 0 (or null), NULLIF returns NULL. 
#     --        Then COALESCE converts that NULL into 0.
#     COALESCE(
#         covar_samp(T.changePercent, M.changePercent) / NULLIF(var_samp(M.changePercent), 0), 
#         0
#     ) as calculated_beta,

#     -- 2. Debug Info
#     count(*) as data_points_used

# FROM try_calc_beta AS T 
# LEFT JOIN try_calc_beta AS M
#   ON M.event_time >= T.event_time - INTERVAL 1 HOUR
#   AND M.event_time <= T.event_time + INTERVAL 1 HOUR

# WHERE 
#     M.s = 'SPY' 
#     AND T.s != 'SPY'

# GROUP BY T.s

After confirming that the data source is valid, we suspected that the window interval was responsible for the null beta. We compared windows from 2H to 7H, and the results were correct.

In [0]:
# %sql
# WITH 
# -- 1. TEST 2-HOUR WINDOW (The "Noise" Zone)
# beta_2h AS (
#     SELECT 
#         T.s as ticker,
#         COALESCE(
#             covar_samp(T.changePercent, M.changePercent) / NULLIF(var_samp(M.changePercent), 0), 
#             0
#         ) as beta_val
#     FROM try_calc_beta T 
#     LEFT JOIN try_calc_beta M 
#       -- +/- 60 Minutes = 2 Hours Total
#       ON M.event_time BETWEEN T.event_time - INTERVAL 60 MINUTES AND T.event_time + INTERVAL 60 MINUTES
#     WHERE M.s = 'SPY' AND T.s != 'SPY'
#     GROUP BY T.s
# ),

# -- 2. TEST 3-HOUR WINDOW (The "Signal" Zone?)
# beta_3h AS (
#     SELECT 
#         T.s as ticker,
#         COALESCE(
#             covar_samp(T.changePercent, M.changePercent) / NULLIF(var_samp(M.changePercent), 0), 
#             0
#         ) as beta_val
#     FROM try_calc_beta T 
#     LEFT JOIN try_calc_beta M 
#       -- +/- 90 Minutes = 3 Hours Total
#       ON M.event_time BETWEEN T.event_time - INTERVAL 90 MINUTES AND T.event_time + INTERVAL 90 MINUTES
#     WHERE M.s = 'SPY' AND T.s != 'SPY'
#     GROUP BY T.s
# ),

# -- 3. TEST 4-HOUR WINDOW
# beta_4h AS (
#     SELECT 
#         T.s as ticker,
#         COALESCE(
#             covar_samp(T.changePercent, M.changePercent) / NULLIF(var_samp(M.changePercent), 0), 
#             0
#         ) as beta_val
#     FROM try_calc_beta T 
#     LEFT JOIN try_calc_beta M 
#       -- +/- 120 Minutes = 4 Hours Total
#       ON M.event_time BETWEEN T.event_time - INTERVAL 120 MINUTES AND T.event_time + INTERVAL 120 MINUTES
#     WHERE M.s = 'SPY' AND T.s != 'SPY'
#     GROUP BY T.s
# ),

# -- 4. TEST 5-HOUR WINDOW
# beta_5h AS (
#     SELECT 
#         T.s as ticker,
#         COALESCE(
#             covar_samp(T.changePercent, M.changePercent) / NULLIF(var_samp(M.changePercent), 0), 
#             0
#         ) as beta_val
#     FROM try_calc_beta T 
#     LEFT JOIN try_calc_beta M 
#       -- +/- 150 Minutes = 5 Hours Total
#       ON M.event_time BETWEEN T.event_time - INTERVAL 150 MINUTES AND T.event_time + INTERVAL 150 MINUTES
#     WHERE M.s = 'SPY' AND T.s != 'SPY'
#     GROUP BY T.s
# )

# -- 5. THE FINAL COMPARISON
# SELECT 
#     A.ticker,
    
#     -- 2 Hours (Expected: 0)
#     ROUND(A.beta_val, 3) as beta_2h,
    
#     -- 3 Hours (Expected: Breaking Point / Jump)
#     ROUND(B.beta_val, 3) as beta_3h,
    
#     -- 4 Hours (Expected: Stable)
#     ROUND(C.beta_val, 3) as beta_4h,
    
#     -- 5 Hours (Expected: Stable)
#     ROUND(D.beta_val, 3) as beta_5h,
    
#     -- Visual Confirmation of the Jump
#     CASE 
#         WHEN A.beta_val = 0 AND B.beta_val != 0 THEN 'BREAKING POINT'
#         ELSE 'STABLE'
#     END as status

# FROM beta_2h A
# JOIN beta_3h B ON A.ticker = B.ticker
# JOIN beta_4h C ON A.ticker = C.ticker
# JOIN beta_5h D ON A.ticker = D.ticker
# ORDER BY A.ticker

In [0]:
# This initializes the table.
# df_result.write \
#     .format("delta") \
#     .mode("overwrite") \
#     .option("overwriteSchema", "true") \
#     .save(destination_path)

# print("Batch Backfill Complete. Table Initialized.")

# --- 3. WRITE STREAM ---
# Append the result rows to the Delta table at `destination_path`.
# The `availableNow` trigger processes the data available when the query
# starts and then stops the query on its own.
gold_query = (
    df_result.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .start(destination_path)
)

# start() returns immediately. Block until the run has finished so that an
# error inside the stream is raised here rather than lost in the background,
# and so nothing after this cell runs against a half-written table.
gold_query.awaitTermination()