In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType , StringType , StructField , StructType , LongType , DoubleType , IntegerType , BooleanType
from pyspark.sql.window import Window



In [0]:
# Location of the bronze Delta table (a /Volumes path) that the streaming read loads.
source_path = "/Volumes/workspace/default/storage/bronze/ticker_data"


In [0]:
# Open the bronze Delta table as a streaming source, with an explicit
# starting table version of 0.
df = (
    spark.readStream
    .format("delta")
    .option("startingVersion", "0")
    .load(source_path)
)

In [0]:
# Print the column layout of the bronze stream before it is parsed.
df.printSchema()

In [0]:
# Layout of the JSON document carried in the `value` column.
# All fields are nullable. `s` is the symbol the later cells filter on
# (e.g. 'SPY'); `timestamp` is a long that the next cells divide by 1000
# before casting to a timestamp.
schema = (
    StructType()
    .add("s", StringType(), True)
    .add("currentPrice", DoubleType(), True)
    .add("change", DoubleType(), True)
    .add("changePercent", DoubleType(), True)
    .add("high", DoubleType(), True)
    .add("low", DoubleType(), True)
    .add("open", DoubleType(), True)
    .add("previousClose", DoubleType(), True)
    .add("timestamp", LongType(), True)
)

In [0]:
# Cast the raw key/value columns to strings, decode the JSON held in `value`
# against `schema`, then promote the decoded fields to top-level columns.
# After the final select only the parsed fields remain (key, topic,
# partition and offset are dropped).
# NOTE(review): `schema` declares no `_corrupt_record` field, so the
# columnNameOfCorruptRecord option presumably has no visible effect - confirm.
df = (
    df.selectExpr(
        "CAST(key AS STRING)",
        "CAST(value AS STRING)",
        "topic",
        "partition",
        "offset",
    )
    .withColumn(
        "ticker",
        F.from_json(
            F.col("value"),
            schema,
            options={"mode": "PERMISSIVE", "columnNameOfCorruptRecord": "_corrupt_record"},
        ),
    )
    .select("ticker.*")
)



In [0]:
# Build `event_time` from the `timestamp` field (divided by 1000 before the
# cast) and drop rows whose changePercent is missing or exactly zero.
df = (
    df.withColumn("event_time", (F.col("timestamp") / 1000).cast("timestamp"))
    .filter("changePercent IS NOT NULL AND changePercent != 0")
)

# Market side: SPY rows only, reduced to the return and its event time.
# The constant `m_join_key` exists solely to give the join an equality term.
df_market = (
    df.filter("s = 'SPY'")
    .selectExpr("changePercent as m_return", "CAST(event_time as TIMESTAMP) as m_event_time")
    .withColumn("m_join_key", F.lit(1))
    .withWatermark("m_event_time", "2 minutes")
)

# Ticker side: every non-SPY row, tagged with the matching constant key.
df_tickers = (
    df.filter("s != 'SPY'")
    .withColumn("join_key", F.lit(1))
    .withWatermark("event_time", "2 minutes")
)

# Pair each ticker row with every SPY row whose event time lies within
# two minutes on either side (bounds inclusive).
join_condition = (F.col("m_join_key") == F.col("join_key")) & F.col("event_time").between(
    F.col("m_event_time") - F.expr("INTERVAL 2 MINUTES"),
    F.col("m_event_time") + F.expr("INTERVAL 2 MINUTES"),
)

df = df_tickers.join(df_market, join_condition, "inner")



In [0]:
# Disabled variant kept for reference: per-symbol beta over 10-minute windows,
# together with OHLC-style aggregates and a tick count.
# df_result = (df.groupBy(
#                 F.col("s"),
#                 F.window("event_time", "10 minutes")
#             )
#             .agg(
#                 # The Beta Logic
#                 (F.covar_samp("changePercent", "m_return") / F.var_samp("m_return")).alias("beta"),
#
#                 # The "Missing" Columns - We must aggregate them
#                 F.first("open").alias("open_price"),
#                 F.max("high").alias("high_price"),
#                 F.min("low").alias("low_price"),
#                 F.last("currentPrice").alias("close_price"),
#                 F.count("s").alias("tick_count")
#             ))

# Active aggregation: number of joined rows per symbol and 10-minute
# event-time window.
df_result = (
    df.groupBy("s", F.window("event_time", "10 minutes"))
    .agg(F.count("*").alias("match_count"))
)
           

In [0]:
# Read the three credential values from the "ticker" secret scope, in the
# order access key, secret key, session token.
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    dbutils.secrets.get(scope="ticker", key=secret_name)
    for secret_name in ("access_key", "secret_key", "session_key")
)

In [0]:


# NOTE(review): BUCKET is not referenced by any cell visible here.
BUCKET = "mzon-to-databricks-5482"

# Volume paths follow /Volumes/<catalog>/<schema>/<volume>/<folder>; both the
# silver table and its checkpoint sit under the `storage` volume.
destination_path = "/Volumes/workspace/default/storage/silver/ticker_data_v3"
checkpoint_path = "/Volumes/workspace/default/storage/checkpoints/job_silver_ticker_checkpoint_v3"

print("Streaming Strategy:")
print(f"Checkpoint: {checkpoint_path}")
print(f"Data (S3): {destination_path}")

# Start the Delta sink in append mode with an availableNow trigger.
# The S3A credential values are handed to the writer as options.
(
    df_result.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .option("fs.s3a.access.key", ACCESS_KEY)
    .option("fs.s3a.secret.key", SECRET_KEY)
    .option("fs.s3a.session.token", SESSION_TOKEN)
    .option("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    .trigger(availableNow=True)
    .start(destination_path)
)

In [0]:
# %fs rm -r /Volumes/workspace/default/storage/checkpoints/job_silver_ticker_v2


In [0]:
# %fs rm -r /Volumes/workspace/default/storage/silver/ticker_data_v2

In [0]:
# # 1. DELETE EVERYTHING (The "Double Tap")
# print("Deleting Checkpoint...")
# dbutils.fs.rm("/Volumes/workspace/default/storage/checkpoints/job_silver_ticker_v2", True)

# print("Deleting Silver Table...")
# dbutils.fs.rm("/Volumes/workspace/default/storage/silver/ticker_data_v2", True)

# # 2. VERIFY DELETION
# # dbutils.fs.ls MUST fail with "java.io.FileNotFoundException" once the path is gone.
# # If it returns a listing instead, the deletion failed.
# print(f"Checkpoint Exists? {dbutils.fs.ls('/Volumes/workspace/default/storage/checkpoints/job_silver_ticker_v2')}")

In [0]:
# Debug cell, part 1: load the bronze table as a static (batch) DataFrame and
# parse it the same way the streaming pipeline does, so the join inputs can be
# inspected with ordinary actions such as count().
# The imports are repeated so this cell can run on its own.
# NOTE: this cell reassigns `source_path`, `df` and `schema`, which the
# streaming cells also define; re-run those cells before restarting the stream.
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

source_path = "/Volumes/workspace/default/storage/bronze/ticker_data"

print("--- 1. READING BRONZE (Batch Mode) ---")
# Read Bronze as a static table (ignores checkpoints)
df = spark.read.format("delta").load(source_path)

# 1. Define Schema & Parse
# Only the fields needed for the diagnosis are declared.
schema = StructType([
    StructField("s", StringType(), True),
    StructField("currentPrice", DoubleType(), True),
    StructField("change", DoubleType(), True),
    StructField("changePercent", DoubleType(), True),
    StructField("timestamp", LongType(), True) # Keeping it simple for debug
])

# Parse JSON
df_parsed = df.selectExpr("CAST(value AS STRING) as json_val") \
              .select(F.from_json("json_val", schema).alias("data")) \
              .select("data.*")

# Create Timestamp
# Divide by 1000 before casting, exactly as the streaming cell does. A bare
# cast would read the value as seconds, so any difference computed from
# event_time would be 1000x too large and would not match the streaming join.
df_parsed = df_parsed.withColumn("event_time", (F.col("timestamp") / 1000).cast("timestamp"))
df_parsed = df_parsed.filter("changePercent IS NOT NULL AND changePercent != 0")

print(f"Total Rows after Parsing/Filtering: {df_parsed.count()}")

# --- 2. CHECK SPY vs TICKERS ---
# Debug cell, part 2: split the parsed rows into the SPY side and the ticker
# side, then check how many SPY/ticker pairs fall inside the 120-second
# tolerance. Both sides get the same constant `key` so they can be cross-paired.
df_market = df_parsed.filter("s = 'SPY'") \
    .select(F.col("event_time").alias("m_event_time")) \
    .withColumn("key", F.lit(1))

df_tickers = df_parsed.filter("s != 'SPY'") \
    .withColumn("key", F.lit(1))

spy_count = df_market.count()
ticker_count = df_tickers.count()

print(f"SPY Rows: {spy_count}")
print(f"Ticker Rows: {ticker_count}")

if spy_count == 0 or ticker_count == 0:
    print("❌ CRITICAL FAILURE: One side of the join is empty!")
else:
    print("✅ Both sides have data. Checking Time Alignment...")

    # --- 3. CHECK TIME ALIGNMENT (The most common failure) ---
    # We join on the dummy key to see the raw time difference between SPY and Tickers
    df_debug_join = df_tickers.join(df_market, "key", "inner")

    # Absolute gap between the two event times, each cast to a long first.
    df_debug_join = df_debug_join.withColumn(
        "diff_seconds",
        F.abs(F.col("event_time").cast("long") - F.col("m_event_time").cast("long"))
    )

    print("\n--- TIME GAP ANALYSIS ---")
    df_debug_join.select("s", "event_time", "m_event_time", "diff_seconds").show(5, False)

    # Check if ANY rows satisfy your 2-minute (120 seconds) condition
    valid_matches = df_debug_join.filter("diff_seconds <= 120").count()

    # Every ticker row pairs with every SPY row because `key` is the same
    # constant on both sides, so the pair count is just the product of the two
    # counts already computed; counting the cross join again is unnecessary.
    total_combinations = spy_count * ticker_count

    print("\n--- CONCLUSION ---")
    print(f"Total Potential Combinations: {total_combinations}")
    print(f"Matches within 2 Minutes (120s): {valid_matches}")

    if valid_matches == 0:
        print("❌ FAILURE: Your timestamps are too far apart! Increase the INTERVAL in your join.")
    else:
        print(f"✅ SUCCESS: {valid_matches} rows should pass the join in Streaming.")