In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Get (or create) the SparkSession that every cell below relies on.
spark = (
    SparkSession.builder
    .appName("chapter-21-stream-kafka")
    .getOrCreate()
)

import os

# Data directory is supplied through the environment; a missing variable
# raises KeyError here rather than failing later.
SPARK_BOOK_DATA_PATH = os.environ["SPARK_BOOK_DATA_PATH"]

## Kafka

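The Kafka source and sink are not bundled with Spark. See the [deployment section of the Structured Streaming + Kafka integration guide](https://spark.apache.org/docs/2.4.0/structured-streaming-kafka-integration.html#deploying) and add the integration package when launching Spark, for example: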

`./bin/spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0`


### setup

[Steps to run Kafka and create a topic](https://github.com/wgong/py4kids/blob/master/lesson-71-kafka/Calories-Alert-Kafka/kafka.README.md)

In [2]:
# Subscribe to a single topic ("Hello-Kafka") on the local broker.
streaming = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "Hello-Kafka")
    .load()
)

In [3]:
# Print the schema of the streaming DataFrame read from Kafka.
streaming.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### write to memory for test

In [4]:
# Cast the binary Kafka key/value columns to strings and append them to an
# in-memory table named "test_transform" so it can be queried via spark.sql.
(
    streaming
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .queryName("test_transform")
    .format("memory")
    .outputMode("append")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7fe08da2cac8>

In [8]:
# Query the in-memory sink table populated by the "test_transform" query.
spark.sql("select * from test_transform").show(truncate=False)

+----+---------------------------------------------+
|key |value                                        |
+----+---------------------------------------------+
|null|kafka is a distributed pub/sub message broker|
|null|spark is a distributed big-data platform     |
+----+---------------------------------------------+



In [9]:
# Same query again: rows appended since the previous run now appear.
spark.sql("select * from test_transform").show(truncate=False)

+----+---------------------------------------------+
|key |value                                        |
+----+---------------------------------------------+
|null|kafka is a distributed pub/sub message broker|
|null|spark is a distributed big-data platform     |
|null|spark also has mllib for machine learning    |
|null|databricks is the company behind spark       |
+----+---------------------------------------------+



In [10]:
# Same query a third time, after more messages were appended to the table.
spark.sql("select * from test_transform").show(truncate=False)

+----+---------------------------------------------+
|key |value                                        |
+----+---------------------------------------------+
|null|kafka is a distributed pub/sub message broker|
|null|spark is a distributed big-data platform     |
|null|spark also has mllib for machine learning    |
|null|databricks is the company behind spark       |
|null|tensorflow 2.0 was released last week        |
|null|tensorflow.js is very interesting            |
+----+---------------------------------------------+



### write to another topic

In [11]:
# Echo every message read from the source stream into the "Hello-Echo" topic.
# The Kafka sink is given a checkpoint directory via checkpointLocation.
(
    streaming
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("checkpointLocation", "/tmp/kafka-checkpoint")
    .option("topic", "Hello-Echo")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7fe08da2cf60>

Check in Kafka Tool that the messages are echoed to the new topic, `Hello-Echo`.

<img src="kafka-tool.png" alt="Kafka Tool screenshot">

**Note:** the code below has not been tested.

In [None]:
# Subscribe to a single topic.
df1 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribe", "topic1")
    .load()
)

In [None]:
# Subscribe to several topics at once (comma-separated list).
df2 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribe", "topic1,topic2")
    .load()
)

In [None]:
# Subscribe to every topic whose name matches a pattern.
df3 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("subscribePattern", "topic.*")
    .load()
)

In [None]:
# COMMAND ----------

# Two ways to write a stream to Kafka. Each query gets its OWN checkpoint
# directory: the query id and progress are persisted there, so two queries
# pointed at the same path conflict with each other.
#
# NOTE(review): df1 is read from "topic1" and both variants write back to
# "topic1", which re-feeds the source topic — confirm this is intended
# before running against a real broker.

# Variant 1: the destination topic is taken from the "topic" column per row.
(
    df1
    .selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("checkpointLocation", "/to/HDFS-compatible/dir/topic-from-column")
    .start()
)

# Variant 2: the destination topic is fixed with the "topic" option.
(
    df1
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
    .option("checkpointLocation", "/to/HDFS-compatible/dir/topic-from-option")
    .option("topic", "topic1")
    .start()
)


# COMMAND ----------

# Open a streaming read from the socket source at localhost:9999.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

In [None]:
# COMMAND ----------

# NOTE(review): `activityCounts` is not defined in the visible cells of this
# notebook — confirm where it comes from before running this cell.
# Write the complete result to the console with a 5-second processing-time trigger.
(
    activityCounts.writeStream
    .trigger(processingTime="5 seconds")
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
# COMMAND ----------

# NOTE(review): `activityCounts` is not defined in the visible cells of this
# notebook — confirm where it comes from before running this cell.
# Write the complete result to the console using the run-once trigger.
(
    activityCounts.writeStream
    .trigger(once=True)
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
# COMMAND ----------