In [1]:
# findspark is initialised first so that pyspark can be imported from this kernel
# (presumably it locates the local Spark installation — confirm against the findspark docs).
import findspark
findspark.init()

# Load the Kafka connector for Spark so that Spark can receive Kafka stream messages; setup notes:
# https://github.com/nodesense/cts-aws-spark-april-2021/blob/main/spark-setup-for-packages.md

# Create the Kafka topic "test":
# $KAFKA_HOME/bin/kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic test
# Start a console producer to type messages into that topic:
# $KAFKA_HOME/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test


In [2]:
import pyspark


from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook, running with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkStreamingKafkaBasic")
    .getOrCreate()
)



In [3]:
# Subscribe to the Kafka topic "test"; Spark acts as the Kafka consumer here.
# The stream is exposed through the regular DataFrame/SQL API.
kafkaDf = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "192.168.93.128:9092")
    .option("subscribe", "test")
    .load()
)
 

In [4]:
# .show()/print do not work directly on a streaming DataFrame...
# kafkaDf.show() # won't work
kafkaDf.printSchema() # works

# Columns delivered by the Kafka source (see the schema printed below):
# key           - Kafka message key, in binary format
# value         - Kafka message value, in binary format
# topic         - topic name, string
# partition     - partition number, integer
# offset        - message offset, long
# timestamp     - message timestamp, timestamp type
# timestampType - integer; presumably encodes source (create) time vs. record write time — confirm

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# Kafka delivers the message value as bytes: cast it to STRING.
# Only the value column is selected; the key and the other columns are ignored for now.
linesDf = kafkaDf.selectExpr(
    "CAST(value AS STRING)",
)

# The schema now holds a single string column named "value".
linesDf.printSchema()

root
 |-- value: string (nullable = true)



In [6]:
# Turn every incoming line into one row per word, then keep a running count per word.
import pyspark.sql.functions as F

# Step by step, for the input line "welcome to spark" (linesDf.value is a column):
#   F.split(linesDf.value, r"\s+")  -> one array column: [welcome, to, spark]
#   F.explode(<that array>)         -> one row per array element; the column is named "col"
#   .alias("word")                  -> the same rows, with the column renamed to "word"
#
# The second argument of split is a regular expression, so r"\s+" treats any run of
# whitespace as a single separator. Splitting on one literal " " would emit empty-string
# "words" for consecutive spaces, and those would show up in the counts. The filter
# below drops the empty tokens that can still remain (blank lines, leading whitespace).
wordsDf = (
    linesDf
    .select(F.explode(F.split(linesDf.value, r"\s+")).alias("word"))
    .where(F.col("word") != "")
)

# Running word count over the stream; "word" is the column to group on.
wordCountsDf = wordsDf.groupBy("word").count()

In [7]:
# Write the running word counts to the console sink so they can be watched while
# messages are produced to the Kafka topic. Output mode is "complete".
echoOnconsole = (
    wordCountsDf.writeStream
    .outputMode("complete")
    .format("console")
    .start()  # starts the query; Spark begins consuming from the subscribed source
)

In [None]:
# Block this cell until the streaming query ends. If the cell is interrupted, the
# exception still propagates, but the finally clause stops the query first so it
# does not keep running in the background of the Spark session.
try:
    echoOnconsole.awaitTermination()
finally:
    echoOnconsole.stop()

# Afterwards the Jupyter kernel can be terminated.