# **Assignment 4: PySpark Structured Streaming Using Kafka Source**

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("pyspark-kafka-streaming")
    .master("spark://spark-master:7077")
    # Kafka connector package for Structured Streaming.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

TypeError: an integer is required (got type bytes)

## ==== Q2 ====

#### **Q2.1:** All your code for 2.1 should be in the following cell

In [2]:
#Answer to 2.1
# Configure the Kafka source first, then open the stream.
kafka_source = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("subscribe", "topic_test1")
)
df_streamed_raw = kafka_source.load()


In [3]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

# Cast the Kafka key and value columns to strings so they can be read/parsed.
df_streamed_kv = (
    df_streamed_raw
    .withColumn("key", col("key").cast(StringType()))
    .withColumn("value", col("value").cast(StringType()))
)

# Smoke-test query: mirror the stream into an in-memory table that can be
# inspected with spark.sql(...).
test_query = (
    df_streamed_kv.writeStream
    .format("memory")                # output to memory
    .outputMode("update")            # only write updated rows to the sink
    .queryName("test_query_table")   # name of the in-memory table
    .start()
)

23/04/12 16:25:59 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-66229bb6-00a2-4e8e-8f04-969f38ef27ec. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


#### If all goes well, the following cell should display a table populated with values being streamed from your Kafka producer. NOTE: If you recently ran the producer, it may take a while before the table is populated. Keep rerunning the cell to check for updates:

In [None]:
# Print the current contents of the in-memory table fed by `test_query`.
spark.sql("select * from test_query_table").show()

In [None]:
# Stop the in-memory smoke-test stream.
test_query.stop()

#### The following cells contain code that takes the streamed dataframe and formats it properly into a table. If any of the given cells fail, there might be a formatting issue with one of your previous solutions.

In [4]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, BooleanType, LongType, IntegerType

# Field names of the JSON payload, in schema order. Every field is read as a
# string; "valid" is converted to a timestamp in a later cell.
event_field_names = [
    "station", "valid", "tmpf", "dwpf", "relh", "feel", "drct", "sped",
    "alti", "mslp", "p01m", "vsby", "skyc1", "skyl1", "wxcodes",
    "ice_acceretion_1hr",
]
event_schema = StructType(
    [StructField(field_name, StringType()) for field_name in event_field_names]
)

# Replace the raw JSON string in "value" with a struct parsed against the schema.
df_parsed = df_streamed_kv.withColumn("value", from_json(col("value"), event_schema))

In [5]:
from pyspark.sql.functions import to_timestamp, unix_timestamp

# Flatten the parsed struct into top-level columns. The "valid" string is
# parsed into a timestamp using the "yyyy-MM-dd HH:mm" pattern; every other
# measurement is copied through unchanged (still as a string).
passthrough_fields = [
    "tmpf", "dwpf", "relh", "feel", "drct", "sped", "alti", "mslp",
    "p01m", "vsby", "skyc1", "skyl1", "wxcodes", "ice_acceretion_1hr",
]

df_formatted = df_parsed.select(
    # Kafka record metadata
    col("key").alias("event_key"),
    col("topic").alias("event_topic"),
    col("timestamp").alias("event_timestamp"),
    # Payload fields
    col("value.station").alias("station"),
    to_timestamp(col("value.valid"), "yyyy-MM-dd HH:mm").alias("valid"),
    *[col(f"value.{field_name}").alias(field_name) for field_name in passthrough_fields]
)

#### **Q2.2:** All your code for 2.2 should be in the following cell


In [6]:
# Answer to 2.2
# Print every newly arrived (append-mode) row to the console every 5 seconds,
# without truncating wide columns.
formatted_console_writer = (
    df_formatted.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", 'false')
    .trigger(processingTime='5 seconds')
)
query = formatted_console_writer.start()

23/04/12 16:26:02 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-7f524734-f72b-49e3-aabe-804bc14480e8. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------


[Stage 1:>                                                          (0 + 1) / 1]

+---------+-----------+---------------+-------+-----+----+----+----+----+----+----+----+----+----+----+-----+-----+-------+------------------+
|event_key|event_topic|event_timestamp|station|valid|tmpf|dwpf|relh|feel|drct|sped|alti|mslp|p01m|vsby|skyc1|skyl1|wxcodes|ice_acceretion_1hr|
+---------+-----------+---------------+-------+-----+----+----+----+----+----+----+----+----+----+----+-----+-----+-------+------------------+
+---------+-----------+---------------+-------+-----+----+----+----+----+----+----+----+----+----+----+-----+-----+-------+------------------+



23/04/12 16:26:08 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 6190 milliseconds
                                                                                

In [None]:
# Debugging aid: list the id and name of every streaming query still running.
for active_query in spark.streams.active:
    print(f"ID:{active_query.id} | NAME:{active_query.name}")

In [7]:
# Stop the console stream of formatted rows.
query.stop()

23/04/12 16:26:15 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@552debf2 is aborting.
23/04/12 16:26:15 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@552debf2 aborted.
23/04/12 16:26:18 WARN TaskSetManager: Lost task 0.0 in stage 2.0 (TID 2, 172.18.0.6, executor 1): TaskKilled (Stage cancelled)
                                                                                

# ==== Project - Start your feature extraction queries from here ====

### Day and Night temp calculation

In [None]:
# Import only what is needed: a wildcard import of pyspark.sql.functions
# shadows Python built-ins such as sum, min, max, abs and round.
from pyspark.sql.functions import hour

# Daytime observations: `valid` hour in the range 08..18 inclusive.
valid_hour = hour(df_formatted.valid)
df_day = df_formatted.filter((valid_hour >= 8) & (valid_hour < 19))

In [None]:
from pyspark.sql.functions import col, date_add, to_date
# Per-row cutoff: the calendar date of the row's own `valid` value minus 7 days.
one_week_ago = date_add(to_date(col("valid"), "yyyy-MM-dd HH:mm"), -7)
# Keep daytime rows whose `valid` date is on or after that cutoff.
# NOTE(review): the cutoff is derived from the same `valid` column it is
# compared with (not from the current date), so this appears to keep every row
# with a non-null `valid` - confirm which reference date was intended.
valid_date = col("valid").cast("date")
filtered_df = df_day.filter(valid_date >= one_week_ago)

In [None]:
# Stream the filtered daytime rows to the console (append mode, 5 s trigger).
day_console_writer = (
    filtered_df.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", 'false')
    .trigger(processingTime='5 seconds')
)
query1 = day_console_writer.start()

In [None]:
# Stop the daytime console stream.
query1.stop()

In [None]:
# Night-time observations: `valid` hour in 19..23 or 00..07.
# hour() only yields 0..23 (or null), so the upper bound on the evening range
# and the lower bound on the morning range are implied.
night_hour = hour(df_formatted.valid)
df_night = df_formatted.filter((night_hour >= 19) | (night_hour < 8))

In [None]:
from pyspark.sql.functions import col, date_add, to_date
# Per-row cutoff: the calendar date of the row's own `valid` value minus 7 days.
one_week_ago = date_add(to_date(col("valid"), "yyyy-MM-dd HH:mm"), -7)
# Keep night-time rows whose `valid` date is on or after that cutoff.
# NOTE(review): the cutoff is derived from the same `valid` column it is
# compared with (not from the current date), so this appears to keep every row
# with a non-null `valid` - confirm which reference date was intended.
valid_date = col("valid").cast("date")
filtered_df1 = df_night.filter(valid_date >= one_week_ago)

In [None]:
# Stream the filtered night-time rows to the console (append mode, 5 s trigger).
night_console_writer = (
    filtered_df1.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", 'false')
    .trigger(processingTime='5 seconds')
)
query2 = night_console_writer.start()

In [None]:
# Stop the night-time console stream.
query2.stop()

## Weekly

In [None]:
from pyspark.sql.functions import avg, col, window,stddev
# Per-station mean and standard deviation of `tmpf` over 7-day tumbling
# windows of `valid`, computed on the daytime rows with a 7-day watermark.
day_weekly_window = window("valid", "7 days")
df_stream_day = (
    df_day
    .withWatermark("valid", "7 days")
    .groupBy("station", day_weekly_window)
    .agg(
        avg("tmpf").alias("avg_temp"),
        stddev("tmpf").alias("stddev_temp"),
    )
)


In [None]:
# Print the full daytime weekly result table to the console every 10 seconds.
query_3 = (
    df_stream_day.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .trigger(processingTime="10 seconds")
    .start()
)

In [None]:
# Stop the daytime weekly-statistics stream.
query_3.stop()

In [None]:
from pyspark.sql.functions import avg, col, window,stddev
# Per-station mean and standard deviation of `tmpf` over 7-day tumbling
# windows of `valid`, computed on the night-time rows with a 15-minute watermark.
night_weekly_window = window("valid", "7 days")
df_stream_night = (
    df_night
    .withWatermark("valid", "15 minutes")
    .groupBy("station", night_weekly_window)
    .agg(
        avg("tmpf").alias("avg_temp"),
        stddev("tmpf").alias("stddev_temp"),
    )
)


In [None]:
# Print the full night-time weekly result table to the console every 10 seconds.
query_4 = (
    df_stream_night.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .trigger(processingTime="10 seconds")
    .start()
)

In [None]:
# Stop the night-time weekly-statistics stream.
query_4.stop()

In [None]:
# df_day_night=df_stream_day.union(df_stream_night)

In [None]:
# query_day_night = df_day_night\
#     .writeStream\
#     .format("console")\
#     .trigger(processingTime="10 seconds")\
#     .outputMode("append")\
#     .option("truncate", "false")\
#     .start()

In [None]:
# query_day_night.stop()

In [9]:
#batch

# `df_stream` is otherwise only defined in a cell further down the notebook,
# so on a fresh kernel (Restart & Run All) this cell raised NameError.
# Build it here so the cell is self-contained: per-station mean and standard
# deviation of `tmpf` over 7-day windows sliding by 1 day.
from pyspark.sql.functions import avg, stddev, window

df_stream = (
    df_formatted
    .groupBy("station", window("valid", "7 days", "1 day"))
    .agg(avg("tmpf").alias("avg_temp"), stddev("tmpf").alias("stddev_temp"))
)

# Materialise the weekly statistics into the in-memory table
# "first_aggregation", rewritten in full on every 10-second trigger.
first_aggregation_query = (
    df_stream.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("first_aggregation")
    .trigger(processingTime="10 seconds")
    .start()
)

23/04/12 16:26:39 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cece2d47-7ade-43db-b36b-239cbb2d6148. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.

In [19]:
#batch
# Stop the stream that feeds the "first_aggregation" in-memory table.
first_aggregation_query.stop()

23/04/12 16:29:50 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@64744448 is aborting.
23/04/12 16:29:50 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@64744448 aborted.
23/04/12 16:29:50 WARN TaskSetManager: Lost task 39.0 in stage 61.0 (TID 2484, 172.18.0.6, executor 1): TaskKilled (Stage cancelled)
23/04/12 16:29:50 WARN TaskSetManager: Lost task 38.0 in stage 61.0 (TID 2483, 172.18.0.7, executor 0): TaskKilled (Stage cancelled)

In [24]:
#batch
# DataFrame over the "first_aggregation" table written by first_aggregation_query.
first_aggregation_output = spark.sql("SELECT * FROM first_aggregation")

                                                                                

-------------------------------------------
Batch: 16
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2       |
|[2013-01-01 04:00:00, 2013-01-01 06:00:00]|VSF    |33.08      |
|[2013-01-01 04:00:00, 2013-01-01 06:00:00]|VSF    |33.1       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|VSF    |33.8       |
|[2013-01-01 04:00:00, 2013-01-01 06:00:00]|FSO    |30.2       |
|[2013-01-01 04:00:00, 2013-01-01 06:00:00]|FSO    |29.7       |
|[2013-01-01 04:00:00, 2013-01-01 06:00:00]|FSO    |30.4       |
|[2013-01-01 04:00:00, 2013-01-01 06:00:00]|FSO    |31.3       |
|[2013-01-01 03:00:00, 2013-01-01 05:00:00]|VSF    |33.08

                                                                                

In [8]:
from pyspark.sql.functions import avg, col, window,stddev
# Per-station mean and standard deviation of `tmpf` over 7-day windows of
# `valid` that slide forward by 1 day (all rows, no day/night split).
sliding_weekly_window = window("valid", "7 days", "1 day")
df_stream = (
    df_formatted
    .groupBy("station", sliding_weekly_window)
    .agg(
        avg("tmpf").alias("avg_temp"),
        stddev("tmpf").alias("stddev_temp"),
    )
)

In [None]:
# Print only the windows updated since the last trigger, every 10 seconds.
query_stream = (
    df_stream.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", "false")
    .trigger(processingTime="10 seconds")
    .start()
)

In [None]:
# Stop the sliding weekly-statistics console stream.
query_stream.stop()

## 2-hour Window 

In [14]:
# Explicit imports instead of `from pyspark.sql.functions import *`, which
# shadows Python built-ins (sum, min, max, abs, round, ...). date_sub, expr
# and isnan are imported here because later cells use them and previously
# obtained them through the wildcard import.
from pyspark.sql.functions import collect_list, date_sub, expr, isnan, window

# Collect every `tmpf` reading per station into 2-hour windows of `valid`
# that slide forward by 1 hour. No watermark is applied here.
df_windowed_2hrs = (
    df_formatted
    .groupBy("station", window("valid", "2 hours", "1 hour"))
    .agg(collect_list("tmpf").alias("temp_list"))
)

23/04/12 16:29:04 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 11213 milliseconds
23/04/12 16:29:15 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 10351 milliseconds

In [15]:
# Print the full table of 2-hour windows to the console every 10 seconds.
query_5 = (
    df_windowed_2hrs.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .trigger(processingTime="10 seconds")
    .start()
)

23/04/12 16:29:18 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-71ff4c1b-4471-4dde-9d06-3821fb46d505. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
[Stage 50:(119 + 2) / 200][Stage 51:>   (0 + 0) / 1][Stage 52:>   (0 + 0) / 1]1]

In [16]:
# Stop the 2-hour window console stream.
query_5.stop()

23/04/12 16:29:23 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@33ac857b is aborting.
23/04/12 16:29:23 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@33ac857b aborted.

In [17]:
from pyspark.sql.functions import explode

# One output row per temperature reading: unpack each window's `temp_list`
# into a `temperature` column, keeping the window and station alongside it.
df_exploded = df_windowed_2hrs.select(
    "Window",
    "station",
    explode("temp_list").alias("temperature"),
)

23/04/12 16:29:26 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 11546 milliseconds


In [35]:
# Date range of the week preceding each 2-hour window: from 7 days before the
# window start up to 1 day before it.
df_exploded = (
    df_exploded
    .withColumn("prev_week_start", date_sub("window.start", 7))
    .withColumn("prev_week_end", date_sub("window.start", 1))
)

[Stage 807:(160 + 2) / 200][Stage 809:>(0 + 0) / 200][Stage 810:>  (0 + 0) / 1]

In [36]:
# Print the full exploded table to the console every 10 seconds.
exploded_console_writer = (
    df_exploded.writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime="10 seconds")
    .option("truncate", 'false')
)
query_6 = exploded_console_writer.start()

23/04/12 16:54:06 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f132deeb-e011-48c9-a891-c88325108b4a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/12 16:54:08 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 17296 milliseconds


-------------------------------------------
Batch: 35
-------------------------------------------
+------+-------+-----------+-------+------+--------+-----------+
|Window|station|temperature|station|window|avg_temp|stddev_temp|
+------+-------+-----------+-------+------+--------+-----------+
+------+-------+-----------+-------+------+--------+-----------+



                                                                                

-------------------------------------------
Batch: 69
-------------------------------------------
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|Window                                    |station|temperature|station|window                                    |avg_temp|stddev_temp|
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-30 00:00:00, 2013-01-06 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-28 00:00:00, 2013-01-04 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-29 00:00:00, 2013-01-05 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2013-01

23/04/12 16:54:15 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 34262 milliseconds
                                                                                

-------------------------------------------
Batch: 96
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.02      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |32.0 

23/04/12 16:54:23 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 22779 milliseconds
23/04/12 16:54:29 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 22487 milliseconds


-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+-----------+---------------+-------------+
|Window|station|temperature|prev_week_start|prev_week_end|
+------+-------+-----------+---------------+-------------+
+------+-------+-----------+---------------+-------------+



                                                                                

-------------------------------------------
Batch: 97
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.02      |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2 

23/04/12 16:54:36 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 13157 milliseconds
[Stage 821:(173 + 2) / 200][Stage 823:>(0 + 0) / 200][Stage 824:>  (0 + 0) / 1]

In [45]:
# Stop the exploded-temperature console stream.
query_6.stop()

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+--------+-------------------+-------------------+
|Window                                    |station|temperature|prev_week_start|prev_week_end|station|window                                    |avg_temp|stddev_temp        |z_score            |
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+--------+-------------------+-------------------+
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |13.64      |2012-12-25     |2012-12-31   |CDA    |[2012-12-26 00:00:00, 2013-01-02 00:00:00]|23.252  |0.5489262245511686 |-17.510549815431542|
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |13.64      |2012-12-25     |2012-12-31   |CDA    |[2012-12-31 00:00:00,

23/04/12 16:58:18 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 45014 milliseconds
[Stage 918:(79 + 2) / 200][Stage 920:>  (0 + 0) / 1][Stage 921:>  (0 + 0) / 1]

In [38]:
from pyspark.sql import functions as F

# Pair each reading with the weekly statistics of the same station whose
# window starts inside the reading's previous-week date range.
weekly_window_start = first_aggregation_output["window.start"]
same_station = df_exploded["station"] == first_aggregation_output["station"]
on_or_after_week_start = weekly_window_start >= df_exploded["prev_week_start"]
on_or_before_week_end = weekly_window_start <= df_exploded["prev_week_end"]

comparison_df = df_exploded.join(
    first_aggregation_output,
    same_station & on_or_after_week_start & on_or_before_week_end,
)

23/04/12 16:55:27 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 33517 milliseconds


-------------------------------------------
Batch: 37
-------------------------------------------
+------+-------+-----------+-------+------+--------+-----------+
|Window|station|temperature|station|window|avg_temp|stddev_temp|
+------+-------+-----------+-------+------+--------+-----------+
+------+-------+-----------+-------+------+--------+-----------+



[Stage 840:(83 + 2) / 200][Stage 842:>  (0 + 0) / 1][Stage 843:>  (0 + 0) / 1]

In [39]:
#batch

# Print the full joined table to the console every 10 seconds.
comparison_query = (
    comparison_df.writeStream
    .format("console")
    .outputMode("complete")
    .option("truncate", "false")
    .trigger(processingTime="10 seconds")
    .start()
)

23/04/12 16:55:29 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-67d776a3-d807-4d9d-bab4-f55030fe1b05. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
[Stage 840:(199 + 1) / 200][Stage 844:>(7 + 1) / 200][Stage 846:>(0 + 0) / 200] 

-------------------------------------------
Batch: 99
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.02      |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.2       |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|VSF    |24.08      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|MVL    |30.2 

23/04/12 16:55:34 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 27108 milliseconds
[Stage 844:(98 + 2) / 200][Stage 846:>(0 + 0) / 200][Stage 847:=> (1 + 0) / 2]

In [44]:
# Stop the comparison console stream.
comparison_query.stop()

                                                                                

-------------------------------------------
Batch: 105
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.36      |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |13.64      |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |12.9

23/04/12 16:58:10 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 31019 milliseconds
[Stage 915:(44 + 2) / 200][Stage 918:>(7 + 0) / 200][Stage 920:>  (0 + 0) / 1]  

### z_score

In [41]:
# z-score of each reading against the matched weekly mean and standard deviation.
z_score_expression = expr('(temperature - avg_temp) / stddev_temp')
df_with_zscore = comparison_df.withColumn('z_score', z_score_expression)

                                                                                

-------------------------------------------
Batch: 39
-------------------------------------------
+------+-------+-----------+-------+------+--------+-----------+
|Window|station|temperature|station|window|avg_temp|stddev_temp|
+------+-------+-----------+-------+------+--------+-----------+
+------+-------+-----------+-------+------+--------+-----------+



23/04/12 16:56:29 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 27920 milliseconds
[Stage 869:(97 + 2) / 200][Stage 871:>  (0 + 0) / 1][Stage 873:>  (0 + 0) / 1]

In [42]:
# Anomalies: discard NaN z-scores, then keep readings more than 2 standard
# deviations away from the weekly mean in either direction.
z_score_column = col("z_score")
df_anomalous_requests = (
    df_with_zscore
    .filter(~isnan(z_score_column))
    .filter("z_score > 2 or z_score < -2")
)

[Stage 869:(195 + 2) / 200][Stage 871:>  (0 + 0) / 1][Stage 873:>  (0 + 0) / 1]

-------------------------------------------
Batch: 102
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.36      |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.02      |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.2

23/04/12 16:56:36 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 21160 milliseconds


In [43]:
# Print the full table of anomalous readings to the console every 10 seconds.
anomaly_console_writer = (
    df_anomalous_requests.writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime="10 seconds")
    .option("truncate", 'false')
)
final_query = anomaly_console_writer.start()

23/04/12 16:56:37 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6b13bbb8-49f4-46fe-8e90-c7108d82b876. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+------------------+-------------------+
|Window                                    |station|temperature|prev_week_start|prev_week_end|station|window                                    |avg_temp          |stddev_temp        |
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+------------------+-------------------+
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |2012-12-25     |2012-12-31   |CDA    |[2012-12-26 00:00:00, 2013-01-02 00:00:00]|23.252            |0.5489262245511686 |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |2012-12-25     |2012-12-31   |CDA    |[2012-12-31 00:00:00, 2013-01-07 00:00:00]|23.252            

23/04/12 16:56:44 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 35474 milliseconds
                                                                                

-------------------------------------------
Batch: 74
-------------------------------------------


23/04/12 16:56:53 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 30391 milliseconds
[Stage 879:(14 + 2) / 200][Stage 882:>  (0 + 0) / 2][Stage 883:>  (0 + 0) / 2]

+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|Window                                    |station|temperature|station|window                                    |avg_temp|stddev_temp|
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-30 00:00:00, 2013-01-06 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-28 00:00:00, 2013-01-04 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-29 00:00:00, 2013-01-05 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2013-01-01 00:00:00, 2013-01-08 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:0

23/04/12 16:56:58 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 28721 milliseconds


-------------------------------------------
Batch: 40
-------------------------------------------
+------+-------+-----------+-------+------+--------+-----------+
|Window|station|temperature|station|window|avg_temp|stddev_temp|
+------+-------+-----------+-------+------+--------+-----------+
+------+-------+-----------+-------+------+--------+-----------+



                                                                                

-------------------------------------------
Batch: 103
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.36      |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|BTV    |17.06      |
|[2013-01-01 18:00:00, 2013-01-01 20:00:00]|EFK    |12.0

23/04/12 16:57:04 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 27526 milliseconds
                                                                                

-------------------------------------------
Batch: 75
-------------------------------------------
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|Window                                    |station|temperature|station|window                                    |avg_temp|stddev_temp|
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-30 00:00:00, 2013-01-06 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-28 00:00:00, 2013-01-04 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-29 00:00:00, 2013-01-05 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2013-01

23/04/12 16:57:11 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 18400 milliseconds
23/04/12 16:57:18 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 34124 milliseconds


-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+--------+-------------------+
|Window                                    |station|temperature|prev_week_start|prev_week_end|station|window                                    |avg_temp|stddev_temp        |
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+--------+-------------------+
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |2012-12-25     |2012-12-31   |CDA    |[2012-12-26 00:00:00, 2013-01-02 00:00:00]|23.252  |0.5489262245511686 |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |2012-12-25     |2012-12-31   |CDA    |[2012-12-31 00:00:00, 2013-01-07 00:00:00]|23.252  |0.5489262245511686 |
|[2013-01-01 19:00:00, 2013-

                                                                                

-------------------------------------------
Batch: 41
-------------------------------------------
+------+-------+-----------+-------+------+--------+-----------+
|Window|station|temperature|station|window|avg_temp|stddev_temp|
+------+-------+-----------+-------+------+--------+-----------+
+------+-------+-----------+-------+------+--------+-----------+



23/04/12 16:57:25 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 26346 milliseconds
[Stage 895:(194 + 2) / 200][Stage 896:>  (0 + 0) / 1][Stage 898:>  (0 + 0) / 2]

-------------------------------------------
Batch: 0
-------------------------------------------


                                                                                

+------+-------+-----------+---------------+-------------+-------+------+--------+-----------+-------+
|Window|station|temperature|prev_week_start|prev_week_end|station|window|avg_temp|stddev_temp|z_score|
+------+-------+-----------+---------------+-------------+-------+------+--------+-----------+-------+
+------+-------+-----------+---------------+-------------+-------+------+--------+-----------+-------+



23/04/12 16:57:33 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 55698 milliseconds
23/04/12 16:57:39 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 35428 milliseconds


-------------------------------------------
Batch: 104
-------------------------------------------
+------------------------------------------+-------+-----------+
|Window                                    |station|temperature|
+------------------------------------------+-------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |24.98      |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 06:00:00, 2013-01-01 08:00:00]|RUT    |30.2       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.36      |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |13.64      |
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |12.9

                                                                                

-------------------------------------------
Batch: 76
-------------------------------------------
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|Window                                    |station|temperature|station|window                                    |avg_temp|stddev_temp|
+------------------------------------------+-------+-----------+-------+------------------------------------------+--------+-----------+
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-30 00:00:00, 2013-01-06 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-28 00:00:00, 2013-01-04 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2012-12-29 00:00:00, 2013-01-05 00:00:00]|32.0    |0.0        |
|[2013-01-01 14:00:00, 2013-01-01 16:00:00]|VSF    |26.06      |VSF    |[2013-01

23/04/12 16:57:49 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 37547 milliseconds
[Stage 905:(199 + 1) / 200][Stage 907:>(0 + 2) / 200][Stage 908:>  (0 + 0) / 1]

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+--------+------------------+
|Window                                    |station|temperature|prev_week_start|prev_week_end|station|window                                    |avg_temp|stddev_temp       |
+------------------------------------------+-------+-----------+---------------+-------------+-------+------------------------------------------+--------+------------------+
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |2012-12-25     |2012-12-31   |CDA    |[2012-12-26 00:00:00, 2013-01-02 00:00:00]|23.252  |0.5489262245511686|
|[2013-01-01 19:00:00, 2013-01-01 21:00:00]|CDA    |14.0       |2012-12-25     |2012-12-31   |CDA    |[2012-12-31 00:00:00, 2013-01-07 00:00:00]|23.252  |0.5489262245511686|
|[2013-01-01 19:00:00, 2013-01-01

23/04/12 16:57:56 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 38087 milliseconds
                                                                                

-------------------------------------------
Batch: 42
-------------------------------------------
+------+-------+-----------+-------+------+--------+-----------+
|Window|station|temperature|station|window|avg_temp|stddev_temp|
+------+-------+-----------+-------+------+--------+-----------+
+------+-------+-----------+-------+------+--------+-----------+



23/04/12 16:58:03 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 38957 milliseconds
[Stage 911:(28 + 2) / 200][Stage 913:=> (1 + 0) / 2][Stage 914:>  (0 + 0) / 1]2]

In [None]:
# Stop the final streaming query started above.
final_query.stop()

# The console output shows batches from several other streaming queries
# (different schemas, independent batch counters) still running on this
# session and repeatedly logging "Current batch is falling behind".
# Handles to queries started by earlier cell runs may have been overwritten,
# so stop whatever is still registered on the session to free the executor.
# `spark.streams.active` returns a snapshot list, so stopping queries while
# iterating over it is safe.
for leftover_query in spark.streams.active:
    leftover_query.stop()