In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the session used by every cell below (getOrCreate returns an
# existing session when one is already running).
spark = (
    SparkSession.builder
    .appName("chapter-21-streaming")
    .getOrCreate()
)

import os
# Root directory of the book's sample data; a missing variable raises
# KeyError here rather than failing later on a bad path.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

dataset - Heterogeneity Human Activity Recognition

The data consists of smartphone and smartwatch sensor readings (such as accelerometer and gyroscope) from a variety of devices, sampled at the highest possible frequency supported by the devices. Readings from these sensors were recorded while users performed activities like biking, sitting, standing, walking, and so on.

In [2]:
# Directory holding the activity JSON files, relative to the data root.
file_path = f"{SPARK_BOOK_DATA_PATH}/data/activity-data/"

# Read the files as a batch DataFrame and keep its inferred schema; the
# streaming reader further down is given this schema explicitly.
static = spark.read.json(file_path)
dataSchema = static.schema

In [3]:
# Preview the first five rows of the batch-loaded activity data.
static.show(5)

+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand| 3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand|-0.005722046| 0.029083252| 0.005569458|
|1424686735500|1424686733498505625|nexus4_1|   99|nexus4|   g|stand|   0.0078125|-0.017654419| 0.010025024|
|1424686735691|1424688581745026978|nexus4_2|  145|nexus4|   g|stand|-3.814697E-4|   0.0184021|-0.013656616|
|1424686735890|1424688581945252808|nexus4_2|  185|nexus4|   g|stand|-3.814697E-4|-0.031799316| -0.00831604|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
only showing top 5 rows



In [4]:
# COMMAND ----------

## Extract

# Streaming view over the same JSON directory, using the schema captured from
# the batch read. maxFilesPerTrigger=1 limits each trigger to one new file.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json(file_path)
)

In [5]:
# COMMAND ----------

## Transform

# Running count of records per value of the `gt` column. This only defines the
# transformation; nothing executes until a writeStream query is started.
activityCounts = streaming.groupBy("gt").count()

In [6]:
# COMMAND ----------

## Load (action)

# Start the query: the full set of counts is rewritten on every trigger
# ("complete" mode) into an in-memory table named activity_counts.
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

# activityQuery.awaitTermination()

In [7]:
# COMMAND ----------

from time import sleep
# Poll the in-memory table five times, one second apart, to watch the counts
# change while the stream is running.
for _attempt in range(5):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|10452|
|       sit|12309|
|     stand|11385|
|      walk|13256|
|      bike|10797|
|stairsdown| 9365|
|      null|10448|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|20905|
|       sit|24619|
|     stand|22770|
|      walk|26512|
|      bike|21594|
|stairsdown|18729|
|      null|20895|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|31357|
|       sit|36929|
|     stand|34155|
|      walk|39768|
|      bike|32391|
|stairsdown|28094|
|      null|31342|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|41809|
|       sit|49238|
|     stand|45539|
|      walk|53024|
|      bike|43187|
|stairsdown|37459|
|      null|41791|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|52262|
|       sit|61545|
|     stand|56924|
|      walk|66280|
|      bike|53985|
|stairsd

In [8]:
# List the streaming queries currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7fade9c18898>]

In [13]:
# COMMAND ----------

from pyspark.sql.functions import expr
# Flag rows whose `gt` value contains "stairs", keep only those rows, project
# four columns, and append the results to the in-memory table simple_transform2.
simpleTransform = (
    streaming
    .withColumn("stairs", expr("gt like '%stairs%'"))
    .where("stairs")
    .where("gt is not null")
    .select("gt", "model", "arrival_time", "creation_time")
    .writeStream
    .queryName("simple_transform2")
    .format("memory")
    .outputMode("append")
    .start()
)

In [14]:
# Inspect the rows appended so far to the simple_transform2 in-memory table.
spark.sql("select * from simple_transform2").show()

+--------+------+-------------+-------------------+
|      gt| model| arrival_time|      creation_time|
+--------+------+-------------+-------------------+
|stairsup|nexus4|1424687983725|1424687981731838118|
|stairsup|nexus4|1424687984021|1424687982029994857|
|stairsup|nexus4|1424687984421|1424687982426655964|
|stairsup|nexus4|1424687984825|1424687982830373005|
|stairsup|nexus4|1424687985223|1424687983232454474|
|stairsup|nexus4|1424687985633|1424687983635439093|
|stairsup|nexus4|1424687986031|1424687984038271124|
|stairsup|nexus4|1424687986438|1424687984447786505|
|stairsup|nexus4|1424687986837|1424687984843868081|
|stairsup|nexus4|1424687987240|1424687985241725747|
|stairsup|nexus4|1424687987640|1424687985649636253|
|stairsup|nexus4|1424687987997|1424687986007149681|
|stairsup|nexus4|1424687988202|1424689834242463027|
|stairsup|nexus4|1424687988400|1424687986409981712|
|stairsup|nexus4|1424687988602|1424687986611397727|
|stairsup|nexus4|1424687988805|1424689834851685440|
|stairsup|ne

In [11]:
# COMMAND ----------

# Average the numeric columns over every cube combination of (gt, model), then
# discard the averages of the timestamp and index columns so that only the
# x/y/z averages remain. Results go to the in-memory table device_counts.
deviceModelStats = (
    streaming
    .cube("gt", "model")
    .avg()
    .drop("avg(Arrival_time)", "avg(Creation_Time)", "avg(Index)")
    .writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [12]:
# Inspect the current contents of the device_counts in-memory table.
spark.sql("select * from device_counts").show()

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|      null|nexus4|-0.00709412616238...|-8.58791204326168...|0.006706891709609522|
|      null|nexus4|8.401978016402726E-4|-0.00641096529662...|-0.00813326767938...|
|      null|  null|8.401978016402726E-4|-0.00641096529662...|-0.00813326767938...|
|      bike|nexus4|0.024354459763982594|-0.00985390261844...| -0.0821334040104677|
|     stand|  null|-4.14058840975687...|3.671475178272786E-4|3.247939523294214...|
|       sit|nexus4|-5.35942622812577...|3.534359010935097...|-2.12916954426842...|
|     stand|nexus4|-4.14058840975687...|3.671475178272786E-4|3.247939523294214...|
|stairsdown|  null|0.024112003211586183|-0.03817385825913334| 0.12698510835722174|
|  stairsup|  null| -0.0276360480197965|-0.01028970048418...|-0.09847309651438897|
|   

In [15]:
# COMMAND ----------

# Static (batch) per-(gt, model) averages to compare the stream against.
historicalAgg = static.groupBy("gt", "model").avg()

# Join the streaming cube averages with the static averages on (gt, model).
# NOTE: this rebinds deviceModelStats, so the handle to the device_counts
# query started earlier is no longer reachable through that name.
# NOTE: both sides contribute avg(x)/avg(y)/avg(z), so the join_hist table
# ends up with duplicate column names.
deviceModelStats = (
    streaming
    .drop("Arrival_Time", "Creation_Time", "Index")
    .cube("gt", "model")
    .avg()
    .join(historicalAgg, ["gt", "model"])
    .writeStream
    .queryName("join_hist")
    .format("memory")
    .outputMode("complete")
    .start()
)


# COMMAND ----------

In [17]:
# Inspect the stream-static join results held in the join_hist in-memory table.
spark.sql("select * from join_hist").show()

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|0.023557159293850147|-0.01037107974742...|-0.08150749488259697|1.424751134339985...|1.424752127369589...| 326459.6867328154|0.022688759550866855|-0.00877912156368...|-0.08251001663412343|
|      walk|nexus4|-0.00477321663226...|0.007647625398144233|1.359592299788791...|1.424746420641789...|1.424747351060674...|149760.09974990616|-0.00390116006094...|0.001052

In [18]:
# Show the static per-(gt, model) averages for comparison with join_hist.
historicalAgg.show()

+----------+------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|1.424751134339985...|1.424752127369589...| 326459.6867328154|0.022688759550866855|-0.00877912156368...|-0.08251001663412343|
|      walk|nexus4|1.424746420641789...|1.424747351060674...|149760.09974990616|-0.00390116006094...|0.001052508689953...|-6.95435553042997...|
|stairsdown|nexus4|1.424744591412857E12|1.424745503635636...|230452.44623187225|0.021613908669165436|-0.03249018824752...| 0.12035922691504075|
|       sit|nexus4|1.424741207868231...|1.424742112220356...| 74577.84690275553|-5.49433244039557...|2.791446281700041E-4|-2.33994461689

See `chapter-21-stream-kafka.ipynb` for an example of streaming with Kafka.

In [None]:
# Subscribe to 1 topic
# Kafka source bound to the single topic "topic1" (placeholder brokers).
df1 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribe", "topic1")
    .load()
)

In [None]:
# Subscribe to multiple topics
# Kafka source bound to a comma-separated list of topics.
df2 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribe", "topic1,topic2")
    .load()
)
# Subscribe to a pattern
# Kafka source bound to every topic whose name matches the pattern "topic.*".
df3 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribePattern", "topic.*")
    .load()
)


# COMMAND ----------

# Two ways of writing a stream back to Kafka. Each streaming query needs its
# OWN checkpoint directory: the checkpoint stores the query's identity and
# offsets, so two queries pointed at the same location conflict with each
# other. The paths below are placeholders; replace them with real,
# HDFS-compatible directories, one per query.

# Variant 1: the destination topic is taken from the "topic" column of each row.
(
    df1.selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("checkpointLocation", "/to/HDFS-compatible/dir-topic-column")
    .start()
)

# Variant 2: every row goes to the single topic named by the "topic" option.
(
    df1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("checkpointLocation", "/to/HDFS-compatible/dir-topic-option")
    .option("topic", "topic1")
    .start()
)


# COMMAND ----------

# Socket source reading from localhost:9999.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)


# COMMAND ----------

# Print the complete activity counts to the console, triggering on a
# 5-second processing-time interval.
(
    activityCounts.writeStream
    .trigger(processingTime='5 seconds')
    .format("console")
    .outputMode("complete")
    .start()
)


# COMMAND ----------

# Print the complete activity counts to the console using a one-shot trigger.
(
    activityCounts.writeStream
    .trigger(once=True)
    .format("console")
    .outputMode("complete")
    .start()
)


# COMMAND ----------