In [1]:
from demolib import spark
from demolib.streams import *

In [52]:
# See: https://archive.ics.uci.edu/ml/datasets/Heterogeneity+Activity+Recognition
data_dir = '/Sandbox/notebooks/SparkTheDefinitiveGuide/SparkTheDefinitiveGuide/data/activity-data'

In [3]:
static = spark.read.json(data_dir)
dataSchema = static.schema

In [4]:
static.printSchema()

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [5]:
static.show(2)

+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand| 3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand|-0.005722046| 0.029083252| 0.005569458|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
only showing top 2 rows



In [6]:
streaming = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1)\
        .json(data_dir)
activityCounts = streaming.groupBy("gt").count()
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [7]:
# in Python
activityQuery = activityCounts.writeStream.queryName("activity_counts")\
.format("memory").outputMode("complete")\
.start()

In [8]:
# activityQuery.awaitTermination()

In [12]:
from time import sleep
for x in range(5):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)

+----------+-------+
|        gt|  count|
+----------+-------+
|       sit| 984714|
|     stand| 910783|
|stairsdown| 749059|
|      walk|1060402|
|  stairsup| 836598|
|      null| 835725|
|      bike| 863710|
+----------+-------+

+----------+-------+
|        gt|  count|
+----------+-------+
|       sit| 984714|
|     stand| 910783|
|stairsdown| 749059|
|      walk|1060402|
|  stairsup| 836598|
|      null| 835725|
|      bike| 863710|
+----------+-------+

+----------+-------+
|        gt|  count|
+----------+-------+
|       sit| 984714|
|     stand| 910783|
|stairsdown| 749059|
|      walk|1060402|
|  stairsup| 836598|
|      null| 835725|
|      bike| 863710|
+----------+-------+

+----------+-------+
|        gt|  count|
+----------+-------+
|       sit| 984714|
|     stand| 910783|
|stairsdown| 749059|
|      walk|1060402|
|  stairsup| 836598|
|      null| 835725|
|      bike| 863710|
+----------+-------+

+----------+-------+
|        gt|  count|
+----------+-------+
|       s

In [16]:
sa = spark.streams.active

In [19]:
sa.count(sa)

0

In [20]:
from pyspark.sql.functions import expr
simpleTransform = streaming.withColumn("stairs", expr("gt like '%stairs%'"))\
    .where("stairs")\
    .where("gt is not null")\
    .select("gt", "model", "arrival_time", "creation_time")\
    .writeStream\
    .queryName("simple_transform")\
    .format("memory")\
    .outputMode("append")\
    .start()

In [23]:
spark.streams.active.count(spark.streams.active)

0

In [24]:
spark.sql("SELECT * FROM simple_transform").show(5)

+--------+------+-------------+-------------------+
|      gt| model| arrival_time|      creation_time|
+--------+------+-------------+-------------------+
|stairsup|nexus4|1424687983719|1424687981726802718|
|stairsup|nexus4|1424687984000|1424687982009853255|
|stairsup|nexus4|1424687984404|1424687982411977009|
|stairsup|nexus4|1424687984805|1424687982814351277|
|stairsup|nexus4|1424687985210|1424687983217500861|
+--------+------+-------------+-------------------+
only showing top 5 rows



In [3]:
df2 = spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", "localhost:9092")\
    .option("subscribe", "ccfraud")\
    .load()

In [4]:
df2.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
df2 = spark.read.format("kafka")\
    .option("kafka.bootstrap.servers", "localhost:9092")\
    .option("subscribe", "ccfraud")\
    .option('startingOffsets', -2)
    .option('endingOffsets', -1)
    .load()

In [None]:
df2.show(1)

In [1]:
from demolib import spark

In [3]:
df_kafka = spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", "localhost:9092")\
    .option("subscribe", "ccfraud")\
    .option("startingOffsets", "earliest") \
    .load()

In [None]:
def process_row(r):

query = df_kafka.writeStream.foreach(process_row)

In [5]:
s = df_kafka.writeStream

In [7]:
def persist_batch(batchDF, batch_id):
    batchDF.persist()
    batchDF.write.format('csv').save('../data/kafka/{}/out'.format(batch_id))
    batchDF.unpersist()

In [9]:
s.foreachBatch(persist_batch) \
    .outputMode("append")  \
    .start()

<pyspark.sql.streaming.StreamingQuery at 0x2cced8e85c0>

In [8]:
df_kafka.toDF?

query = df_kafka.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.stop()

In [3]:
query = df_kafka.writeStream\
    .queryName("simple_transform")\
    .format("memory")\
    .outputMode("append")\
    .start()

In [16]:
spark.sql("SELECT * FROM simple_transform").count()

0

## Structured Streaming from Kafka

In [11]:
from demolib import spark, Namespace
from demolib.streams import *

kafkacfg = Namespace
kafkacfg.table = "ccfraud_stream"
kafkacfg.topic = 'ccfraud'
kafkacfg.bootstrap = 'localhost:9092'
kafkacfg.groupid = 'sstreaming'

In [12]:
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", kafkacfg.bootstrap) \
  .option("subscribe", kafkacfg.topic) \
  .option("group.id", kafkacfg.groupid) \
  .load()
#  .option("startingOffsets", "earliest") \

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

DataFrame[key: string, value: string]

In [13]:
query = df.writeStream\
    .queryName(kafkacfg.table)\
    .format("memory")\
    .outputMode("append")\
    .start()

In [14]:
spark.sql("SHOW TABLES").show()

+--------+--------------+-----------+
|database|     tableName|isTemporary|
+--------+--------------+-----------+
|        |ccfraud_stream|       true|
+--------+--------------+-----------+



In [19]:
spark.sql(f"SELECT COUNT(*) FROM {kafkacfg.table}").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [42]:
spark.sql(f"SELECT * FROM {kafkacfg.table}").show()

+----+--------------------+-------+---------+------+--------------------+-------------+
| key|               value|  topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-------+---------+------+--------------------+-------------+
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   119|2019-09-02 21:14:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   120|2019-09-02 21:14:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   121|2019-09-02 21:14:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   122|2019-09-02 21:14:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   123|2019-09-02 21:14:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   124|2019-09-02 21:14:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   125|2019-09-02 21:15:...|            0|
|null|[7B 22 63 63 5F 6...|ccfraud|        0|   126|2019-09-02 21:15:...|            0|
|null|[7B 22 63 63 5F 6...|ccfra

In [44]:
streams_list()

[]

In [43]:
streams_stop_all()

Query ccfraud_stream stopped


In [29]:
list(spark.streams.active)

[<pyspark.sql.streaming.StreamingQuery at 0x25f42c89f28>]

## Sink To File with foreachBatch Sink

In [62]:
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", kafkacfg.bootstrap) \
  .option("subscribe", kafkacfg.topic) \
  .option("group.id", kafkacfg.groupid) \
  .load()

In [None]:
def persist_batch(batchDF, batch_id):
    batchDF.persist()
    batchDF.write \
        .mode('append') \
        .format('csv').save('../data/kafka/out'.format(batch_id))
    batchDF.unpersist()

query = df.coalesce(1).writeStream \
    .trigger(processingTime = '20 seconds') \
    .foreachBatch(persist_batch) \
    .outputMode("append")  \
    .start()

In [None]:
streams_list()

In [40]:
streams_stop_all()

Query None stopped


## Sink to MongoDB with foreachBatch

In [51]:
from demolib.mongo import *

static = spark.read.json(data_dir)
dataSchema = static.schema

In [53]:
from pyspark.sql.functions import *
file_stream = spark.readStream \
    .schema(dataSchema) \
    .option("maxFilesPerTrigger", 1)\
    .json(data_dir) \
    .withColumn('_id', expr("Device || ':' || Index || ':' || User"))

In [54]:

def mongo_persist_df(batchDF, batch_id):
    """Persist all records from batch_df into MongoDB collection"""
    batchDF.persist()
    mongo_save(batchDF, "sensors", database="test")
    batchDF.unpersist()

# Create write stream query which uses foreachBatch as sink
query = file_stream.coalesce(1).writeStream \
    .queryName('mongo_sink') \
    .trigger(processingTime = '0 seconds') \
    .foreachBatch(mongo_persist_df) \
    .outputMode("append")  \
    .start()

In [57]:
streams_list()

[]

In [56]:
streams_stop_all()

Query mongo_sink stopped


In [1]:
import shutil