In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType , StringType , StructField , StructType , LongType , DoubleType , IntegerType , BooleanType
from pyspark.sql.window import Window



In [0]:
# Location of the bronze table under the /Volumes mount; a later cell
# streams it with the Delta reader.
source_path = "/Volumes/workspace/default/storage/bronze/ticker_data_v2"


In [0]:
# Open the bronze table as a Delta stream, explicitly pinning the starting
# table version to 0 via the `startingVersion` reader option.
df = (
    spark.readStream
    .format("delta")
    .option("startingVersion", "0")
    .load(source_path)
)

In [0]:
# Show the column names and types of the streaming DataFrame loaded above.
df.printSchema()

In [0]:
# # --- 2. PARSE MINIMAL DATA ---
# # We only need Ticker and Timestamp to check frequency
# schema = StructType([
#     StructField("s", StringType(), True),
#     StructField("timestamp", LongType(), True)
# ])

# # Handle the Base64/Binary casting we discussed
# df = df.withColumn("json_str", F.col("value").cast("string")) \
#        .select(F.from_json("json_str", schema).alias("data")) \
#        .select("data.*")

# # --- 3. CALCULATE GAPS ---
# # We verify if your timestamp is Seconds or Milliseconds dynamically
# # If the average gap is ~15, it's seconds. If it's ~15000, it's millis.
# df = df.filter("s = 'SPY'") \
#        .withColumn("prev_ts", F.lag("timestamp").over(Window.orderBy("timestamp"))) \
#        .withColumn("gap", F.col("timestamp") - F.col("prev_ts"))

# df_tickers = df.filter("s != 'SPY'") \
#        .withColumn("prev_ts", F.lag("timestamp").over(Window.orderBy("timestamp"))) \
#        .withColumn("gap", F.col("timestamp") - F.col("prev_ts"))

# # --- 4. SHOW STATISTICS ---
# print("--- FREQUENCY ANALYSIS (SPY) ---")
# print("Units: Raw Timestamp Units (likely Seconds)")
# df.select("gap").summary("count", "min", "25%", "50%", "75%", "max", "mean").show()

# print("show statistics for all tickers")
# df_tickers.select("gap").summary("count", "min", "25%", "50%", "75%", "max", "mean").show()
# # --- 5. VISUALIZE THE PATTERN ---
# print("--- LATEST 10 GAPS ---")
# df.select("timestamp", "prev_ts", "gap").orderBy(F.col("timestamp").desc()).show(10)

# df_tickers.select("timestamp", "prev_ts", "gap").orderBy(F.col("timestamp").desc()).show(10)


In [0]:
# Layout of the JSON document stored in each record's `value` column:
# a string field `s` (compared against 'SPY' elsewhere in this notebook)
# followed by nullable double-precision quote fields.
schema = StructType(
    [StructField("s", StringType(), True)]
    + [
        StructField(quote_field, DoubleType(), True)
        for quote_field in (
            "currentPrice",
            "change",
            "changePercent",
            "high",
            "low",
            "open",
            "previousClose",
        )
    ]
)

In [0]:
# 1. Cast the raw key/value columns to strings and keep the record metadata.
# 2. Parse the JSON text in `value` into a `ticker` struct using `schema`.
# 3. Flatten the struct and expose the record `timestamp` as `event_time`.
#
# NOTE(review): `columnNameOfCorruptRecord` names "_corrupt_record", but
# `schema` declares no such field, so this option presumably has no visible
# effect on the output columns -- confirm whether malformed payloads should
# be captured.
df = (
    df.selectExpr(
        "CAST(key AS STRING)",
        "CAST(value AS STRING)",
        "topic",
        "partition",
        "offset",
        "timestamp",
    )
    .withColumn(
        "ticker",
        F.from_json(
            F.col("value"),
            schema,
            options={
                "mode": "PERMISSIVE",
                "columnNameOfCorruptRecord": "_corrupt_record",
            },
        ),
    )
    .select("ticker.*", F.col("timestamp").alias("event_time"))
)



In [0]:

# Keep only rows whose changePercent is present and different from zero.
df = df.where(
    F.col("changePercent").isNotNull() & (F.col("changePercent") != 0)
)


# df_market = df.filter("s = 'SPY'") \
#               .selectExpr("changePercent as m_return" , "CAST(event_time as TIMESTAMP) as m_event_time") \
#                    .withColumn("m_join_key", F.lit(1)) \
#               .withWatermark("m_event_time", "2 minutes")

# df_tickers = df.filter("s != 'SPY'") \
#                .withColumn("join_key", F.lit(1)) \
#                 .withWatermark("event_time", "2 minutes")

# join_condition = (
#     F.col("m_join_key") == F.col("join_key")
# ) & (
#     F.col("event_time") >= F.col("m_event_time") - F.expr("INTERVAL 2 MINUTES")
# ) & (
#     F.col("event_time") <= F.col("m_event_time") + F.expr("INTERVAL 2 MINUTES")
# )

# df = df_tickers.join(df_market, join_condition , "inner")



In [0]:
# df_result = (df.groupBy(
#                 F.col("s"),
#                 F.window("event_time", "10 minutes")
#             )
#             .agg(
#                 # The Beta Logic
#                 (F.covar_samp("changePercent", "m_return") / F.var_samp("m_return")).alias("beta"),
                
#                 # The "Missing" Columns - We must aggregate them
#                 F.first("open").alias("open_price"),
#                 F.max("high").alias("high_price"),
#                 F.min("low").alias("low_price"),
#                 F.last("currentPrice").alias("close_price"),
#                 F.count("s").alias("tick_count")
#             ))

# df_result = df.groupBy("s", F.window("event_time", "10 minutes")) \
#                      .agg(F.count("*").alias("match_count"))
           

In [0]:
# Read the three credential values from the "ticker" secret scope, in the
# same order as before: access key, secret key, session token.
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    dbutils.secrets.get(scope="ticker", key=secret_name)
    for secret_name in ("access_key", "secret_key", "session_key")
)

In [0]:


# NOTE(review): BUCKET is not referenced anywhere in this cell; kept in case
# other cells rely on it.
BUCKET = "mzon-to-databricks-5482"
destination_path = "/Volumes/workspace/default/storage/silver/ticker_data_v5"
# --- 2. THE FIX: PATH MUST INCLUDE THE VOLUME NAME ---
# Path format: /Volumes/<catalog>/<schema>/<VOLUME_NAME>/<folder>
# 'storage' is the volume segment of the path (the volume created in SQL).
checkpoint_path = "/Volumes/workspace/default/storage/checkpoints/job_silver_ticker_checkpoint_v5"

print("Streaming Strategy:")
print(f"Checkpoint: {checkpoint_path}")
print(f"Data (S3): {destination_path}")

# --- 3. WRITE STREAM ---
# Append the filtered ticker rows to the silver Delta location. The
# credential values read from the secret scope are forwarded as fs.s3a.*
# writer options together with the temporary-credentials provider class.
# NOTE(review): destination_path is a /Volumes path -- confirm the S3A
# options are still required for this sink.
silver_query = (
    df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .option("fs.s3a.access.key", ACCESS_KEY)
    .option("fs.s3a.secret.key", SECRET_KEY)
    .option("fs.s3a.session.token", SESSION_TOKEN)
    .option("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    .trigger(availableNow=True)
    .start(destination_path)
)

# An availableNow trigger processes the data currently available and then
# stops by itself. Waiting here keeps the cell from returning before the
# write has finished, and lets a failure in the streaming query be raised in
# this cell instead of going unnoticed in the background.
silver_query.awaitTermination()

In [0]:
# %fs rm -r /Volumes/workspace/default/storage/checkpoints/job_silver_ticker_v2


In [0]:
# %fs rm -r /Volumes/workspace/default/storage/silver/ticker_data_v2

In [0]:
# # 1. DELETE EVERYTHING (The "Double Tap")
# print("Deleting Checkpoint...")
# dbutils.fs.rm("/Volumes/workspace/default/storage/checkpoints/job_silver_ticker_v2", True)

# print("Deleting Silver Table...")
# dbutils.fs.rm("/Volumes/workspace/default/storage/silver/ticker_data_v2", True)

# # 2. VERIFY DELETION
# # This MUST fail with "java.io.FileNotFoundException" or return False
# # If it returns True, the deletion failed.
# print(f"Checkpoint Exists? {dbutils.fs.ls('/Volumes/workspace/default/storage/checkpoints/job_silver_ticker')}")