In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType , StringType , StructField , StructType , LongType , DoubleType , IntegerType , BooleanType
from pyspark.sql.window import Window

In [0]:
# Temporary AWS credentials, read from the "ticker" secret scope in the
# order access key, secret key, session token.
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    dbutils.secrets.get(scope="ticker", key=secret_name)
    for secret_name in ("access_key", "secret_key", "session_key")
)
BUCKET = "mzon-to-databricks-5482"

# Volume paths follow /Volumes/<catalog>/<schema>/<volume>/<folder>.
# All three locations below live in the `storage` volume of workspace.default:
# the silver input, the gold output, and the streaming checkpoint directory.
source_path = "/Volumes/workspace/default/storage/silver/ticker_data_v5"
destination_path = "/Volumes/workspace/default/storage/gold/ticker_data_v6"
checkpoint_path = "/Volumes/workspace/default/storage/checkpoints/job_gold_ticker_checkpoint_v2"

# Echo the locations this run will use.
# NOTE(review): the "Data (S3)" label prints source_path, which is a /Volumes path.
print("Streaming Strategy:")
print(f"Checkpoint: {checkpoint_path}")
print(f"Data (S3): {source_path}")


In [0]:
# CAPM inputs, hard-coded for now (could later be fed from dynamic macro data).
RISK_FREE_RATE = 0.0425  # 4.25% (10Y Treasury)
MARKET_PREMIUM = 0.0500  # 5.00% (standard assumption)

# Open the silver Delta location as a stream, starting from table version 0.
df = (
    spark.readStream
    .format("delta")
    .option("startingVersion", "0")
    .load(source_path)
)

In [0]:
# --- 4. INJECT NOISE ---
# Add a small random perturbation to 'changePercent' so its variance is > 0.
# F.rand() is uniform on [0, 1), so (rand() - 0.5) lies in [-0.5, 0.5);
# scaling by 0.1 gives noise in [-0.05, 0.05), in the same units as
# 'changePercent'. rand() is unseeded, so the noise differs between runs.
df = df.withColumn(
    "changePercent",
    F.col("changePercent") + ((F.rand() - 0.5) * 0.1),
)

# Market side: SPY rows only, reduced to the return and its event time.
# Columns are renamed with an m_ prefix so they cannot clash with the ticker
# columns after the join.
df_market = (
    df.filter("s = 'SPY'")
    .selectExpr(
        "changePercent as m_return",
        "CAST(event_time as TIMESTAMP) as m_event_time",
    )
    .withColumn("m_join_key", F.lit(1))
    .withWatermark("m_event_time", "3 hours")
)

# Ticker side: every non-SPY row, all columns kept.
# event_time is cast to TIMESTAMP here as well, mirroring the market side,
# because withWatermark needs a timestamp column; the cast is a no-op when the
# column is already a timestamp.
df_tickers = (
    df.filter("s != 'SPY'")
    .withColumn("event_time", F.col("event_time").cast("timestamp"))
    .withColumn("join_key", F.lit(1))
    .withWatermark("event_time", "3 hours")
)

# The constant join keys (always 1 == 1) provide an equality predicate; the
# effective match is the time range: a ticker row pairs with every SPY row
# whose event time is within one hour either side of its own.
join_condition = (
    F.col("m_join_key") == F.col("join_key")
) & (
    F.col("event_time") >= F.col("m_event_time") - F.expr("INTERVAL 1 HOUR")
) & (
    F.col("event_time") <= F.col("m_event_time") + F.expr("INTERVAL 1 HOUR")
)

# Inner stream-stream join: ticker rows with no SPY row in range are dropped.
df = df_tickers.join(df_market, join_condition, "inner")

In [0]:

# Reusable aggregate expressions for beta = cov(ticker, market) / var(market).
market_variance = F.var_samp("m_return")
covariance = F.covar_samp("changePercent", "m_return")

# Beta is left NULL when the market variance is exactly 0; otherwise it is the
# covariance divided by the variance.
beta_safe = F.when(market_variance == 0, F.lit(None)).otherwise(
    covariance / market_variance
)

# Per-symbol metrics, in output column order.
window_metrics = [
    # 1. Valuation metric (for WACC/NPV)
    beta_safe.alias("beta"),
    # 2. Risk metric (for position sizing)
    F.stddev_samp("changePercent").alias("volatility"),
    # 3. Timing metric (for trend following)
    F.avg("changePercent").alias("momentum"),
    # Metadata
    # NOTE(review): F.last depends on row order within the group, so this is
    # not guaranteed to be the price at the latest event_time.
    F.last("currentPrice").alias("close_price"),
    # NOTE(review): counts rows of the joined stream `df`, i.e. ticker/SPY
    # pairs; confirm that is the intended sample definition.
    F.count("*").alias("sample_size"),
]

# Aggregate per symbol over 2-hour windows that slide every 5 minutes, derive
# the CAPM cost of equity (risk-free rate + beta * market premium), and keep
# only windows with at least 20 rows.
df_result = (
    df.groupBy(
        F.col("s"),
        F.window("event_time", "2 hours", "5 minutes"),
    )
    .agg(*window_metrics)
    .withColumn(
        "cost_of_equity",
        F.lit(RISK_FREE_RATE) + (F.col("beta") * F.lit(MARKET_PREMIUM)),
    )
    .filter("sample_size >= 20")
)
           

In [0]:


# --- 3. WRITE STREAM ---
# S3A credential settings handed to the writer as options; the provider class
# is the one for temporary (session-token) credentials.
s3a_options = {
    "fs.s3a.access.key": ACCESS_KEY,
    "fs.s3a.secret.key": SECRET_KEY,
    "fs.s3a.session.token": SESSION_TOKEN,
    "fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider",
}

# Append the aggregated rows to the Delta location at destination_path, with
# progress tracked under checkpoint_path and the availableNow trigger.
(
    df_result.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .options(**s3a_options)
    .trigger(availableNow=True)
    .start(destination_path)
)