In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the notebook's SparkSession (getOrCreate reuses an existing session
# if there is one), labelled with this chapter's application name.
spark = (
    SparkSession.builder
    .appName("chapter-21-streaming")
    .getOrCreate()
)

import os
# Root directory of the sample data, taken from the environment.
# Raises KeyError if SPARK_BOOK_DATA_PATH is not set.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

Dataset: Heterogeneity Human Activity Recognition

The data consists of smartphone and smartwatch sensor readings (such as accelerometer and gyroscope) from a variety of devices, sampled at the highest frequency supported by each device. Readings from these sensors were recorded while users performed activities like biking, sitting, standing, walking, and so on.

In [2]:
# Directory holding the activity JSON files.
file_path = "{}/data/activity-data/".format(SPARK_BOOK_DATA_PATH)

# Batch-read the files once; the resulting schema is kept so the streaming
# reader can be given an explicit schema.
static = spark.read.json(file_path)
dataSchema = static.schema

In [3]:
# Preview the first five rows of the static (batch) DataFrame.
static.show(5)

+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand| 3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand|-0.005722046| 0.029083252| 0.005569458|
|1424686735500|1424686733498505625|nexus4_1|   99|nexus4|   g|stand|   0.0078125|-0.017654419| 0.010025024|
|1424686735691|1424688581745026978|nexus4_2|  145|nexus4|   g|stand|-3.814697E-4|   0.0184021|-0.013656616|
|1424686735890|1424688581945252808|nexus4_2|  185|nexus4|   g|stand|-3.814697E-4|-0.031799316| -0.00831604|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
only showing top 5 rows



In [4]:
## Extract

# Streaming read of the same directory, using the schema captured from the
# batch read. maxFilesPerTrigger=1 limits each trigger to one new file, so
# the results below can be watched growing step by step.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json(file_path)
)

In [6]:
# Both reads produce the same DataFrame class; only the isStreaming flag
# tells the streaming DataFrame apart from the static one.
type(static), type(streaming), streaming.isStreaming

(pyspark.sql.dataframe.DataFrame, pyspark.sql.dataframe.DataFrame, True)

In [8]:
# watch stream

from time import sleep

def show_streaming(SQL_stmt, ntimes=5, sleep_sec=1):
    """Run a SQL statement repeatedly and print each result.

    Intended for watching an in-memory streaming result table change over
    time: the statement is re-executed against the notebook's ``spark``
    session on every poll and displayed with ``DataFrame.show()``.

    Args:
        SQL_stmt: SQL text passed to ``spark.sql``.
        ntimes: Number of times to run and display the statement.
        sleep_sec: Seconds to wait between consecutive polls.

    Returns:
        None. The results are printed as a side effect.
    """
    for poll in range(ntimes):
        if poll:
            # Pause only *between* polls; sleeping after the last one would
            # just delay the cell without showing anything new.
            sleep(sleep_sec)
        spark.sql(SQL_stmt).show()

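Use the `memory` sink (`format("memory")`) so the results can be queried with SQL from this notebook. Other `writeStream` formats include `console` and `kafka`; `socket` is available as a `readStream` source format.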

In [9]:
## Load (action)

# Transform: count of rows per activity label ("gt").
gt_counts = streaming.groupBy("gt").count()

# Load: start a query that keeps the full aggregate in an in-memory table
# named "activity_counts", which can then be queried with spark.sql.
gt_counts_writer = (
    gt_counts.writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
)
activityQuery = gt_counts_writer.start()

# activityQuery.awaitTermination()

In [11]:
# Poll the "activity_counts" table 10 times, 2 seconds apart, to watch the
# per-activity counts grow as more files are ingested.
show_streaming(SQL_stmt="SELECT * FROM activity_counts", ntimes=10, sleep_sec=2)

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|41810|
|       sit|49234|
|     stand|45541|
|      walk|53024|
|      bike|43189|
|stairsdown|37459|
|      null|41791|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|52262|
|       sit|61542|
|     stand|56926|
|      walk|66280|
|      bike|53987|
|stairsdown|46824|
|      null|52239|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|62716|
|       sit|73849|
|     stand|68311|
|      walk|79536|
|      bike|64786|
|stairsdown|56186|
|      null|62688|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|73171|
|       sit|86156|
|     stand|79698|
|      walk|92792|
|      bike|75583|
|stairsdown|65549|
|      null|73135|
+----------+-----+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup| 83625|
|       sit| 98463|
|     stand| 91083|
|      walk|106048|
|      bike| 86382|


In [12]:
# List the streaming queries currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7fa730b8d940>]

In [13]:
# Keep only rows whose activity label contains "stairs", then project the
# columns of interest. The helper "stairs" column is used for filtering only
# and is not part of the final selection.
stairs_rows = (
    streaming
    .withColumn("stairs", F.expr("gt like '%stairs%'"))
    .where("stairs")
    .where("gt is not null")
    .select("gt", "model", "arrival_time", "creation_time")
)

# No aggregation here, so new rows are appended to the in-memory table
# "simple_transform2" as they arrive.
simpleTransform = (
    stairs_rows.writeStream
    .queryName("simple_transform2")
    .format("memory")
    .outputMode("append")
    .start()
)

In [14]:
# Poll the "simple_transform2" table using the helper's default polling settings.
show_streaming(SQL_stmt="select * from simple_transform2")

+---+-----+------------+-------------+
| gt|model|arrival_time|creation_time|
+---+-----+------------+-------------+
+---+-----+------------+-------------+

+---+-----+------------+-------------+
| gt|model|arrival_time|creation_time|
+---+-----+------------+-------------+
+---+-----+------------+-------------+

+--------+------+-------------+-------------------+
|      gt| model| arrival_time|      creation_time|
+--------+------+-------------+-------------------+
|stairsup|nexus4|1424687983730|1424687981736873519|
|stairsup|nexus4|1424687984021|1424687982023708236|
|stairsup|nexus4|1424687984422|1424687982431691365|
|stairsup|nexus4|1424687984826|1424687982835622029|
|stairsup|nexus4|1424687985228|1424687983237459357|
|stairsup|nexus4|1424687985634|1424687983640474493|
|stairsup|nexus4|1424687986036|1424687984043306525|
|stairsup|nexus4|1424687986438|1424687984441042120|
|stairsup|nexus4|1424687986841|1424687984849086587|
|stairsup|nexus4|1424687987244|1424687985251949135|
|stairsup|

In [15]:
# Average every numeric column for each cube grouping of (gt, model), then
# discard the averages of the bookkeeping columns so only the x/y/z averages
# remain.
sensor_avgs = (
    streaming
    .cube("gt", "model")
    .avg()
    .drop("avg(Arrival_time)", "avg(Creation_Time)", "avg(Index)")
)

# Publish the complete aggregate as the in-memory table "device_counts".
deviceModelStats = (
    sensor_avgs.writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [16]:
# Poll the "device_counts" table; the first polls may be empty until the
# query has produced its first result.
show_streaming(SQL_stmt="select * from device_counts")

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|      null|nexus4|-0.00786799708513591|-0.00148733897879...|0.006517400118233163|
|      null|nexus4|0.002493916706910...|-0.00693672737540...|-0.00999528491813...|
|      null|  null|0.002493916706910...|-0.0069367273

In [None]:
# COMMAND ----------

# Batch (non-streaming) averages of the numeric columns per (gt, model),
# computed from the static DataFrame; joined against the stream further down.
historicalAgg = static.groupBy("gt", "model").avg()

In [20]:
# Display the historical (batch) aggregates.
historicalAgg.show()

+----------+------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|1.424751134339985E12|1.424752127369588...| 326459.6867328154|  0.0226887595508668|-0.00877912156368...|-0.08251001663412343|
|      walk|nexus4|1.424746420641789...|1.424747351060674...|149760.09974990616|-0.00390116006094...|0.001052508689953...|-6.95435553042997...|
|stairsdown|nexus4|1.424744591412857...|1.424745503635636...|230452.44623187225|0.021613908669165474|-0.03249018824752616| 0.12035922691504075|
|       sit|nexus4|1.424741207868231E12|1.424742112220355...| 74577.84690275553|-5.49433244039557...|2.791446281700046E-4|-2.33994461689

In [17]:
# Streaming side: drop the bookkeeping columns first, then average the
# remaining numeric columns for each cube grouping of (gt, model).
stream_avgs = (
    streaming
    .drop("Arrival_Time", "Creation_Time", "Index")
    .cube("gt", "model")
    .avg()
)

# Join the streaming averages with the static historical averages on
# (gt, model) and publish the result as the in-memory table "join_hist".
# NOTE: both sides contribute avg(x)/avg(y)/avg(z), so the joined table has
# duplicate column names (streaming values first, historical values last).
# NOTE: this rebinds `deviceModelStats`, so the handle to the earlier
# "device_counts" query is no longer held by that name.
deviceModelStats = (
    stream_avgs
    .join(historicalAgg, ["gt", "model"])
    .writeStream
    .queryName("join_hist")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [19]:
# Poll the "join_hist" table 10 times with the helper's default interval.
show_streaming(SQL_stmt="select * from join_hist", ntimes=10)

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4| 0.02756607566542555|-0.01192289439331294|-0.08014216329739748|1.424751134339985E12|1.424752127369588...| 326459.6867328154|  0.0226887595508668|-0.00877912156368...|-0.08251001663412343|
|      walk|nexus4|-5.99343870247432...|0.003705911656842188|-0.00431396913741...|1.424746420641789...|1.424747351060674...|149760.09974990616|-0.00390116006094...|0.001052

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|0.023873200891614647|-0.00804133605683961|-0.08362340956341158|1.424751134339985E12|1.424752127369588...| 326459.6867328154|  0.0226887595508668|-0.00877912156368...|-0.08251001663412343|
|      walk|nexus4|-0.00570308152615...| 0.00311402437097163|7.115927557875535E-4|1.424746420641789...|1.424747351060674...|149760.09974990616|-0.00390116006094...|0.001052

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|0.022835922274970166|-0.00917119389977...|-0.08313589635836904|1.424751134339985E12|1.424752127369588...| 326459.6867328154|  0.0226887595508668|-0.00877912156368...|-0.08251001663412343|
|      walk|nexus4|-0.00464067337126...|0.002647685358633...| 4.51539225862489E-4|1.424746420641789...|1.424747351060674...|149760.09974990616|-0.00390116006094...|0.001052

See `chapter-21-stream-kafka.ipynb` for an example of using streaming with Kafka.

In [None]:
# Subscribe to a single topic.
# (Broker addresses and topic names below are placeholders.)
df1 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribe", "topic1")
    .load()
)

In [None]:
# Subscribe to several topics, given as a comma-separated list.
df2 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribe", "topic1,topic2")
    .load()
)
# Subscribe to every topic whose name matches a pattern.
df3 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribePattern", "topic.*")
    .load()
)


# COMMAND ----------

# Write back to Kafka without a "topic" option: the selected "topic" column
# is what identifies the destination of each record.
# NOTE(review): the other Kafka write in this cell uses the same
# checkpointLocation placeholder; confirm each query gets its own directory
# before running both.
(
    df1.selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("checkpointLocation", "/to/HDFS-compatible/dir")
    .start()
)
# Write back to Kafka with the destination fixed by the "topic" option, so
# only key and value are selected.
# NOTE(review): the checkpointLocation placeholder is the same one used by
# the other Kafka write in this cell; confirm each query gets its own
# directory before running both.
(
    df1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("checkpointLocation", "/to/HDFS-compatible/dir")
    .option("topic", "topic1")
    .start()
)


# COMMAND ----------

# Streaming read from a TCP socket on localhost:9999.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)


# COMMAND ----------

# `activityCounts` is not defined anywhere earlier in the notebook, so the
# trigger examples below would fail with a NameError. Define it as the same
# per-activity count that backs the "activity_counts" query.
activityCounts = streaming.groupBy("gt").count()

# Processing-time trigger: emit the complete result to the console on a
# 5-second interval.
(
    activityCounts.writeStream
    .trigger(processingTime='5 seconds')
    .format("console")
    .outputMode("complete")
    .start()
)


# COMMAND ----------

# Once trigger: run a single trigger over the available data instead of
# running continuously.
(
    activityCounts.writeStream
    .trigger(once=True)
    .format("console")
    .outputMode("complete")
    .start()
)


# COMMAND ----------