# Exercise 02 : Structured Streaming

In [2]:
# Start from a clean folder so events left over from an earlier run are not read again.
events_dir = "/tmp/structured-streaming/events"
dbutils.fs.rm(events_dir, recurse=True)

# First batch of sample events: one JSON object per line, with event_time
# stored as an integer (epoch seconds).
first_batch_path = "/tmp/structured-streaming/events/file01.json"
first_batch = """{"event_name":"Open","event_time":1540601000}
{"event_name":"Open","event_time":1540601010}
{"event_name":"Fail","event_time":1540601020}
{"event_name":"Open","event_time":1540601030}
{"event_name":"Open","event_time":1540601040}
{"event_name":"Open","event_time":1540601050}
{"event_name":"Open","event_time":1540601060}
{"event_name":"Fail","event_time":1540601070}
{"event_name":"Open","event_time":1540601080}
{"event_name":"Open","event_time":1540601090}
"""
dbutils.fs.put(first_batch_path, first_batch, True)  # third argument: overwrite

In [3]:
%fs head /tmp/structured-streaming/events/file01.json

In [4]:
# Optional sanity check (batch read instead of streaming): uncomment the lines below to run it.
# from pyspark.sql.types import *
# from pyspark.sql.functions import *

# test_schema = StructType([
#  StructField("event_name", StringType(), True),
#  StructField("event_time", TimestampType(), True)])
# test_df = (
#  spark
#    .read
#    .schema(test_schema)
#    .json("/tmp/structured-streaming/events/")
# )
# display(test_df)

event_name,event_time
Open,2018-10-27T00:43:20.000+0000
Open,2018-10-27T00:43:30.000+0000
Fail,2018-10-27T00:43:40.000+0000
Open,2018-10-27T00:43:50.000+0000
Open,2018-10-27T00:44:00.000+0000
Open,2018-10-27T00:44:10.000+0000
Open,2018-10-27T00:44:20.000+0000
Fail,2018-10-27T00:44:30.000+0000
Open,2018-10-27T00:44:40.000+0000
Open,2018-10-27T00:44:50.000+0000


In [5]:
# Create streaming dataframe
# Import only the names this notebook uses. A wildcard import of
# pyspark.sql.functions would shadow Python built-ins such as sum, min, max and round.
# `window` is not used in this cell; it is imported for the windowing analysis cell below.
from pyspark.sql.functions import window
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

# Schema of one event record; both fields are nullable.
read_schema = StructType([
  StructField("event_name", StringType(), True),
  StructField("event_time", TimestampType(), True)])

# Streaming reader over the events folder.
read_df = (
  spark
    .readStream
    .schema(read_schema)
    .option("maxFilesPerTrigger", 1)  # one file at a time
    .json("/tmp/structured-streaming/events/")
)

The following cell starts a streaming query that keeps running as a background job until it is stopped.

In [7]:
# Write results in memory
# Make the cell safe to re-run: starting a second query named "read_simple"
# while the previous one is still active raises an error, so stop it first.
for running_query in spark.streams.active:
  if running_query.name == "read_simple":
    running_query.stop()

# The memory sink publishes the streamed rows as an in-memory table named
# after the query ("read_simple"), which the %sql cell below reads.
query1 = (
  read_df
    .writeStream
    .format("memory")
    .queryName("read_simple")
    .start()
)

In [8]:
%sql
-- Rows received so far by the "read_simple" stream, oldest event first.
select
  event_name,
  date_format(event_time, "MMM-dd HH:mm")
from read_simple
order by event_time

event_name,"date_format(event_time, MMM-dd HH:mm)"
Open,Oct-27 00:43
Open,Oct-27 00:43
Fail,Oct-27 00:43
Open,Oct-27 00:43
Open,Oct-27 00:44
Open,Oct-27 00:44
Open,Oct-27 00:44
Fail,Oct-27 00:44
Open,Oct-27 00:44
Open,Oct-27 00:44


Publish the next batch of events into the folder, then run the previous query again to see the updated results. (Also watch how the streaming progress is displayed in the background job above.)

In [10]:
# Second batch of events: dropping it into the folder the stream reads from
# lets the running job pick it up, so the earlier query can be checked again.
second_batch_path = "/tmp/structured-streaming/events/file02.json"
second_batch = """{"event_name":"Open","event_time":1540601100}
{"event_name":"Open","event_time":1540601110}
{"event_name":"Fail","event_time":1540601120}
{"event_name":"Open","event_time":1540601130}
{"event_name":"Open","event_time":1540601140}
{"event_name":"Open","event_time":1540601150}
{"event_name":"Open","event_time":1540601160}
{"event_name":"Fail","event_time":1540601170}
{"event_name":"Open","event_time":1540601180}
{"event_name":"Open","event_time":1540601190}
"""
dbutils.fs.put(second_batch_path, second_batch, True)  # third argument: overwrite

In [11]:
# Windowing analysis
# Count events per event name within 1-minute windows over event_time.
one_minute_window = window(read_df["event_time"], "1 minute")
group_df = read_df.groupBy(read_df["event_name"], one_minute_window).count()

The following cell also starts a streaming query that keeps running as a background job until it is stopped.

In [13]:
# Write the windowed counts in memory
# Make the cell safe to re-run: starting a second query named "read_counts"
# while the previous one is still active raises an error, so stop it first.
for running_query in spark.streams.active:
  if running_query.name == "read_counts":
    running_query.stop()

# "complete" output mode rewrites the whole aggregated result on every trigger;
# the result is exposed as the in-memory table "read_counts" for the %sql cell below.
query2 = (
  group_df
    .writeStream
    .format("memory")
    .queryName("read_counts")
    .outputMode("complete")
    .start()
)

In [14]:
%sql
-- Per-window event counts from the "read_counts" stream.
-- Sort on the real window end timestamp: sorting on the formatted
-- "MMM-dd HH:mm" alias would order rows alphabetically by month name
-- (for example "Nov" before "Oct").
select
  event_name,
  date_format(window.end, "MMM-dd HH:mm") as event_time,
  count
from read_counts
order by window.end, event_name

event_name,event_time,count
Fail,Oct-27 00:44,1
Open,Oct-27 00:44,3
Fail,Oct-27 00:45,1
Open,Oct-27 00:45,5
Fail,Oct-27 00:46,1
Open,Oct-27 00:46,5
Fail,Oct-27 00:47,1
Open,Oct-27 00:47,3


Publish the next batch of events into the folder, then run the previous query again to see the updated counts. (Also watch how the streaming progress is displayed in the background job above.)

In [16]:
# Third batch of events: dropping it into the folder the stream reads from
# lets the running jobs pick it up, so the windowed counts can be checked again.
third_batch_path = "/tmp/structured-streaming/events/file03.json"
third_batch = """{"event_name":"Open","event_time":1540601200}
{"event_name":"Open","event_time":1540601210}
{"event_name":"Fail","event_time":1540601220}
{"event_name":"Open","event_time":1540601230}
{"event_name":"Open","event_time":1540601240}
{"event_name":"Open","event_time":1540601250}
{"event_name":"Open","event_time":1540601260}
{"event_name":"Fail","event_time":1540601270}
{"event_name":"Open","event_time":1540601280}
{"event_name":"Open","event_time":1540601290}
"""
dbutils.fs.put(third_batch_path, third_batch, True)  # third argument: overwrite

When you are finished, cancel (stop) the background streaming jobs started above, for example by running `query1.stop()` and `query2.stop()`.