In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = SparkSession\
    .builder\
    .appName("chapter-21-stream")\
    .getOrCreate()

import os
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

dataset - Heterogeneity Human Activity Recognition

The data consists of smartphone and smartwatch sensor readings from a variety of devices (such as accelerometer, gyroscope), sampled at the highest possible frequency supported by the devices. Readings from these sensors weree recorded while users perfomed activities like biking, sitting, standing, walking, and so on

In [3]:
file_path = SPARK_BOOK_DATA_PATH + "/data/activity-data/"
static = spark.read.json(file_path)
dataSchema = static.schema

In [5]:
static.show(5)

+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand| 3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand|-0.005722046| 0.029083252| 0.005569458|
|1424686735500|1424686733498505625|nexus4_1|   99|nexus4|   g|stand|   0.0078125|-0.017654419| 0.010025024|
|1424686735691|1424688581745026978|nexus4_2|  145|nexus4|   g|stand|-3.814697E-4|   0.0184021|-0.013656616|
|1424686735890|1424688581945252808|nexus4_2|  185|nexus4|   g|stand|-3.814697E-4|-0.031799316| -0.00831604|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
only showing top 5 rows



In [4]:
# COMMAND ----------

## Extract

streaming = spark.readStream.schema(dataSchema)\
  .option("maxFilesPerTrigger", 1)\
  .json(file_path)

In [6]:
# COMMAND ----------

## Transform

activityCounts = streaming.groupBy("gt").count()

In [7]:
# COMMAND ----------

## Load (action)

activityQuery = activityCounts.writeStream\
    .queryName("activity_counts")\
    .format("memory")\
    .outputMode("complete")\
    .start()

activityQuery.awaitTermination()

In [8]:
# COMMAND ----------

from time import sleep
for x in range(5):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(2)

+----------+------+
|        gt| count|
+----------+------+
|  stairsup| 83624|
|       sit| 98468|
|     stand| 91080|
|      walk|106047|
|      bike| 86379|
|stairsdown| 74912|
|      null| 83585|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup| 94080|
|       sit|110775|
|     stand|102465|
|      walk|119303|
|      bike| 97178|
|stairsdown| 84273|
|      null| 94033|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|104536|
|       sit|123082|
|     stand|113851|
|      walk|132559|
|      bike|107976|
|stairsdown| 93635|
|      null|104480|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|114992|
|       sit|135389|
|     stand|125238|
|      walk|145815|
|      bike|118773|
|stairsdown|102997|
|      null|114927|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|125442|
|       sit|147697|
|     stand|1366

In [10]:
# COMMAND ----------

from time import sleep
for x in range(5):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|198636|
|       sit|233851|
|     stand|216319|
|      walk|251864|
|      bike|205154|
|stairsdown|177899|
|      null|198503|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|209097|
|       sit|246159|
|     stand|227703|
|      walk|265120|
|      bike|215951|
|stairsdown|187259|
|      null|208949|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|219558|
|       sit|258467|
|     stand|239087|
|      walk|278376|
|      bike|226748|
|stairsdown|196618|
|      null|219395|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|230013|
|       sit|270775|
|     stand|250473|
|      walk|291632|
|      bike|237544|
|stairsdown|205981|
|      null|229842|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|240474|
|       sit|283083|
|     stand|2618

In [11]:
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7fa054033ac8>]

In [12]:
# COMMAND ----------

from pyspark.sql.functions import expr
simpleTransform = streaming.withColumn("stairs", expr("gt like '%stairs%'"))\
  .where("stairs")\
  .where("gt is not null")\
  .select("gt", "model", "arrival_time", "creation_time")\
  .writeStream\
  .queryName("simple_transform")\
  .format("memory")\
  .outputMode("append")\
  .start()

In [13]:
# COMMAND ----------

deviceModelStats = streaming.cube("gt", "model").avg()\
  .drop("avg(Arrival_time)")\
  .drop("avg(Creation_Time)")\
  .drop("avg(Index)")\
  .writeStream.queryName("device_counts")\
  .format("memory")\
  .outputMode("complete")\
  .start()

In [14]:
# COMMAND ----------

historicalAgg = static.groupBy("gt", "model").avg()
deviceModelStats = streaming.drop("Arrival_Time", "Creation_Time", "Index")\
  .cube("gt", "model").avg()\
  .join(historicalAgg, ["gt", "model"])\
  .writeStream.queryName("device_counts")\
  .format("memory")\
  .outputMode("complete")\
  .start()


# COMMAND ----------

IllegalArgumentException: 'Cannot start query with name device_counts as a query with that name is already active'

In [None]:
# Subscribe to 1 topic
df1 = spark.readStream.format("kafka")\
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")\
  .option("subscribe", "topic1")\
  .load()
# Subscribe to multiple topics
df2 = spark.readStream.format("kafka")\
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")\
  .option("subscribe", "topic1,topic2")\
  .load()
# Subscribe to a pattern
df3 = spark.readStream.format("kafka")\
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")\
  .option("subscribePattern", "topic.*")\
  .load()


# COMMAND ----------

df1.selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")\
  .writeStream\
  .format("kafka")\
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")\
  .option("checkpointLocation", "/to/HDFS-compatible/dir")\
  .start()
df1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")\
  .writeStream\
  .format("kafka")\
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")\
  .option("checkpointLocation", "/to/HDFS-compatible/dir")\
  .option("topic", "topic1")\
  .start()


# COMMAND ----------

socketDF = spark.readStream.format("socket")\
  .option("host", "localhost").option("port", 9999).load()


# COMMAND ----------

activityCounts.writeStream.trigger(processingTime='5 seconds')\
  .format("console").outputMode("complete").start()


# COMMAND ----------

activityCounts.writeStream.trigger(once=True)\
  .format("console").outputMode("complete").start()


# COMMAND ----------