# Spark Streaming

In [1]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
static = spark.read.json("../data/activity-data/")
dataSchema = static.schema

In [3]:
dataSchema

StructType(List(StructField(Arrival_Time,LongType,true),StructField(Creation_Time,LongType,true),StructField(Device,StringType,true),StructField(Index,LongType,true),StructField(Model,StringType,true),StructField(User,StringType,true),StructField(gt,StringType,true),StructField(x,DoubleType,true),StructField(y,DoubleType,true),StructField(z,DoubleType,true)))

### Stream Creation

In [4]:
streaming = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1).json("../data/activity-data")

In [5]:
activityCounts = streaming.groupBy("gt").count()

In [6]:
spark.conf.set("spark.sql.shuffle.partitions", 5)

### Simple Query

In [7]:
activityQuery = activityCounts.writeStream.queryName("activity_counts")\
  .format("memory").outputMode("complete")\
  .start()

#activityQuery.awaitTermination()

In [8]:
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x202ed6031c8>]

In [9]:
from time import sleep

for i in range(10):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|12309|
|     stand|11384|
|stairsdown| 9365|
|      walk|13256|
|  stairsup|10452|
|      null|10449|
|      bike|10796|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|24619|
|     stand|22769|
|stairsdown|18729|
|      walk|26512|
|  stairsup|20905|
|      null|20896|
|      bike|21593|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|24619|
|     stand|22769|
|stairsdown|18729|
|      walk|26512|
|  stairsup|20905|
|      null|20896|
|      bike|21593|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|36929|
|     stand|34154|
|stairsdown|28094|
|      walk|39768|
|  stairsup|31357|
|      null|31343|
|      bike|32390|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|    

### Transformation and Filtering

In [10]:
from pyspark.sql.functions import expr

simpleTransform = streaming.withColumn("stairs", expr("gt like '%stairs%'"))\
                            .where("stairs")\
                            .where("gt is not null")\
                            .select("gt", "model", "arrival_time", "creation_Time")\
                            .writeStream\
                            .queryName("simple_transform")\
                            .format("memory")\
                            .outputMode("append")\
                            .start()

In [11]:
for i in range(10):
    spark.sql("SELECT * FROM simple_transform").show()
    sleep(1)

+---+-----+------------+-------------+
| gt|model|arrival_time|creation_Time|
+---+-----+------------+-------------+
+---+-----+------------+-------------+

+--------+------+-------------+-------------------+
|      gt| model| arrival_time|      creation_Time|
+--------+------+-------------+-------------------+
|stairsup|nexus4|1424687983719|1424687981726802718|
|stairsup|nexus4|1424687984000|1424687982009853255|
|stairsup|nexus4|1424687984404|1424687982411977009|
|stairsup|nexus4|1424687984805|1424687982814351277|
|stairsup|nexus4|1424687985210|1424687983217500861|
|stairsup|nexus4|1424687985620|1424687983620332892|
|stairsup|nexus4|1424687986016|1424687984023164923|
|stairsup|nexus4|1424687986420|1424687984425874884|
|stairsup|nexus4|1424687986820|1424687984828822915|
|stairsup|nexus4|1424687987225|1424687985231654946|
|stairsup|nexus4|1424687987625|1424687985634469017|
|stairsup|nexus4|1424687987992|1424687986002114280|
|stairsup|nexus4|1424687988191|1424689834237427627|
|stairsup|n

+--------+------+-------------+-------------------+
|      gt| model| arrival_time|      creation_Time|
+--------+------+-------------+-------------------+
|stairsup|nexus4|1424687983719|1424687981726802718|
|stairsup|nexus4|1424687984000|1424687982009853255|
|stairsup|nexus4|1424687984404|1424687982411977009|
|stairsup|nexus4|1424687984805|1424687982814351277|
|stairsup|nexus4|1424687985210|1424687983217500861|
|stairsup|nexus4|1424687985620|1424687983620332892|
|stairsup|nexus4|1424687986016|1424687984023164923|
|stairsup|nexus4|1424687986420|1424687984425874884|
|stairsup|nexus4|1424687986820|1424687984828822915|
|stairsup|nexus4|1424687987225|1424687985231654946|
|stairsup|nexus4|1424687987625|1424687985634469017|
|stairsup|nexus4|1424687987992|1424687986002114280|
|stairsup|nexus4|1424687988191|1424689834237427627|
|stairsup|nexus4|1424687988392|1424689834438660537|
|stairsup|nexus4|1424687988592|1424689834640076553|
|stairsup|nexus4|1424687988794|1424689834841675674|
|stairsup|ne

In [12]:
deviceModelStats = streaming.cube("gt", "model").avg()\
                            .drop("avg(Arrival_time)")\
                            .drop("avg(Creation_Time)")\
                            .drop("avg(Index)")\
                            .writeStream.queryName("device_counts").format("memory")\
                            .outputMode("complete")\
                            .start()

In [13]:
for i in range(10):
    spark.sql("SELECT * FROM device_counts").show()
    sleep(1)

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|       sit|  null|-4.92132825379803...|3.757376157445784...|-4.42863346250729...|
|     stand|  null|-3.00989804137386...|4.133303473120163...|-2.86960196767402...|
|       sit|nexus4|-4.92132825379803...|3.757376157445784...|-4.42863346250729...|
|     stand|nexus4|-3.00989804137386...|4.133303473120163...|-2.86960196767402...|
|      null|  null|-

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|       sit|  null|-5.02209146083024...|3.799962816160751...|-2.52092116718569...|
|     stand|  null|-3.85281971657785...|4.911896808133742E-4|3.596050619546717E-5|
|       sit|nexus4|-5.02209146083024...|3.799962816160751...|-2.52092116718569...|
|     stand|nexus4|-3.85281971657785...|4.911896808133742E-4|3.596050619546717E-5|
|      null|  null|-0.00551275851715...|-6.49222625233702...|0.005846553661280036|
|      null|  null|0.001263425007459...|-0.00650110655964...|-0.00819209174867...|
|      walk|  null|-0.00317765395921...|0.003289352996140115|0.002783577999142...|
|      null|nexus4|-0.00551275851715...|-6.49222625233702...|0.005846553661280036|
|      null|nexus4|0.001263425007459...|-0.00650110655964...|-0.00819209174867...|
|   

### Joining 

In [15]:
historicalAgg = static.groupBy("gt", "model").avg()
deviceModelStats = streaming.drop("Arrival_Time", "Creation_Time", "Index")\
                            .cube("gt", "model").avg()\
                            .join(historicalAgg, ["gt", "model"])\
                            .writeStream.queryName("device_counts_joins").format("memory")\
                            .outputMode("complete")\
                            .start()

In [16]:
for i in range(10):
    spark.sql("SELECT * FROM device_counts_joins").show()
    sleep(1)

+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|avg(Arrival_Time)|avg(Creation_Time)|avg(Index)|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-------------

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|0.020961064903431637|-0.00915222693840...|-0.08538726699652664|1.424751134339985...|1.424752127369589...| 326459.6867328154| 0.02268875955086685|-0.00877912156368...|-0.08251001663412344|
|      null|nexus4|-0.00601179783148...|-7.22352747272202...|0.003847526633724...|1.424749002876339...|1.424749919482127...| 219276.9663669269|-0.00847688860109...|-7.30455

+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|   avg(Arrival_Time)|  avg(Creation_Time)|        avg(Index)|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|      bike|nexus4|0.022510255124868792|-0.00898604823548317|-0.08336229972395487|1.424751134339985...|1.424752127369589...| 326459.6867328154| 0.02268875955086685|-0.00877912156368...|-0.08251001663412344|
|      null|nexus4|-0.00551275851715...|-6.49222625233702...|0.005846553661280036|1.424749002876339...|1.424749919482127...| 219276.9663669269|-0.00847688860109...|-7.30455