In [0]:
%run ./_resources/01-setup $reset_all_data=false

## Reading and cleaning the raw events:

In [0]:
# Infer the event schema from one representative JSON row, so the raw `value`
# column can be parsed without hand-writing the struct definition.
row_example = """{"user_id": "5ee7ba5f-77b2-47e4-8061-dd89f19626f3", "platform": "other", "event_id": "03c3d410-f01f-4f51-8ee0-7fab9be96855", "event_date": 1669301257, "action": "view", "uri": "https://databricks.com/home.htm"}"""
json_schema = F.schema_of_json(row_example)

# Cleaned event stream: parse the JSON payload, flatten it into top-level
# columns, and keep only rows that carry every mandatory identifier.
stream = (spark.readStream.table("events_raw")
          .withColumn('json', F.from_json(F.col("value"), json_schema))
          .select("json.*")
          .where("event_id IS NOT NULL AND user_id IS NOT NULL AND event_date IS NOT NULL")
          # event_date holds epoch seconds. Convert it directly to a timestamp
          # rather than formatting it to a string (from_unixtime) and parsing it
          # back (to_timestamp): the string round-trip goes through the session
          # time zone and is ambiguous around DST transitions.
          .withColumn("event_datetime", F.timestamp_seconds(F.col("event_date")))
         )

In [0]:
# Live preview of the cleaned stream. The checkpoint folder is passed by
# keyword: display() takes its streaming options (checkpointLocation, trigger,
# streamName) as keyword arguments, so a positional value is not treated as
# the checkpoint location.
display(stream, checkpointLocation=get_chkp_folder())

## Writing cleaned events to the silver table:

In [0]:
# Deduplicate the cleaned events and continuously write them to the `events`
# table, triggering a micro-batch every 20 seconds.
(stream
 .withWatermark("event_datetime", "1 hours")
 # The watermarked event-time column must be part of the deduplication key;
 # otherwise the watermark is never used to evict old keys and the
 # deduplication state grows without bound. event_datetime is derived from the
 # event payload, so a re-delivered event carries the same value and is still
 # recognised as a duplicate.
 # NOTE: this changes the deduplication state schema; a checkpoint created with
 # the previous key (event_id only) must be cleared before restarting.
 .dropDuplicates(["event_id", "event_datetime"])
 .writeStream
    .trigger(processingTime="20 seconds")
    .option("checkpointLocation", volume_folder + "/checkpoints/silver")
    .option("mergeSchema", "true")
    .table("events"))

# Presumably blocks until the `events` table is available so the following
# cells can query it - verify against Utils.wait_for_table.
Utils.wait_for_table("events")

In [0]:
%sql
-- Inspect the rows written to the events table so far.
SELECT *
FROM events;

## Displaying the number of events for each platform within 10-second windows:

In [0]:
# Register a streaming read of the `events` table as the temp view
# `events_stream`, so it can be queried from SQL.
(
    spark.readStream
    .table("events")
    .createOrReplaceTempView("events_stream")
)

In [0]:
# Count events per platform in 10-second windows over the `events_stream` view.
# - The WHERE clause keeps only events whose event_datetime (as epoch seconds)
#   is within the last 120 seconds of the current timestamp.
# - The CTE groups by (window, platform); the outer SELECT expands the window
#   struct into its fields (start/end) next to platform and the count.
df = spark.sql('''
               WITH events_cte AS (
                   SELECT WINDOW(event_datetime, "10 seconds") wnd,
                          platform,
                          COUNT(*) AS cnt
                   FROM events_stream
                   WHERE CAST(event_datetime AS INT) > CAST(CURRENT_TIMESTAMP() AS INT) - 120
                   GROUP BY wnd, platform
               )
               SELECT wnd.*, platform, cnt
               FROM events_cte
               ''')
# Render the streaming aggregation, checkpointing to the folder returned by
# get_chkp_folder().
display(df, checkpointLocation = get_chkp_folder())

In [0]:
%sql
-- Ten URIs with the highest event count in the events_stream view.
SELECT COUNT(*) AS count, uri
FROM events_stream
GROUP BY uri
ORDER BY count DESC
LIMIT 10;

In [0]:
# Shut down the streaming queries started by this notebook.
# NOTE(review): sleep_time=120 presumably lets the streams run for 120 seconds
# before they are stopped - confirm against Utils.stop_all_streams.
Utils.stop_all_streams(sleep_time=120)