In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

scala_version = '2.12'
spark_version = '3.4.1'
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:2.13.0'
]

spark = SparkSession.builder.master("local").appName("kafka-example").config("spark.jars.packages",",".join(packages)).getOrCreate()
spark

In [5]:
topic_name = 'RandomNumber'
kafka_server = 'localhost:9092'

# Construct a streaming DataFrame that reads from test-topic
kafka_Df = spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_server) \
        .option("subscribe", topic_name) \
        .option("startingOffsets", "earliest") \
        .load()

In [7]:
kafka_Df.show()

+----+--------------------+------------+---------+------+--------------------+-------------+
| key|               value|       topic|partition|offset|           timestamp|timestampType|
+----+--------------------+------------+---------+------+--------------------+-------------+
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     0|2023-08-24 17:12:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     1|2023-08-24 17:12:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     2|2023-08-24 17:12:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     3|2023-08-24 17:13:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     4|2023-08-24 17:13:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     5|2023-08-24 17:13:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     6|2023-08-24 17:13:...|            0|
|null|[7B 22 6E 75 6D 6...|RandomNumber|        0|     7|2023-08-24 17

In [10]:
from pyspark.sql.functions import col

batchDF = kafka_Df.select(col('topic'),col('offset'),col('value').cast('string').substr(12,1).alias('rand_number'))
from time import sleep
from IPython.display import display, clear_output

for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchDF.show())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 310
+------------+------+-----------+
|       topic|offset|rand_number|
+------------+------+-----------+
|RandomNumber|     0|          0|
|RandomNumber|     1|          1|
|RandomNumber|     2|          2|
|RandomNumber|     3|          3|
|RandomNumber|     4|          4|
|RandomNumber|     5|          5|
|RandomNumber|     6|          6|
|RandomNumber|     7|          7|
|RandomNumber|     8|          8|
|RandomNumber|     9|          9|
|RandomNumber|    10|          1|
|RandomNumber|    11|          1|
|RandomNumber|    12|          1|
|RandomNumber|    13|          1|
|RandomNumber|    14|          1|
|RandomNumber|    15|          1|
|RandomNumber|    16|          1|
|RandomNumber|    17|          1|
|RandomNumber|    18|          1|
|RandomNumber|    19|          1|
+------------+------+-----------+
only showing top 20 rows



None

break
Live view ended...


In [11]:
batchCountDF = batchDF.groupBy('rand_number').count()
for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchDF.show())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 10
+------------+------+-----------+
|       topic|offset|rand_number|
+------------+------+-----------+
|RandomNumber|     0|          0|
|RandomNumber|     1|          1|
|RandomNumber|     2|          2|
|RandomNumber|     3|          3|
|RandomNumber|     4|          4|
|RandomNumber|     5|          5|
|RandomNumber|     6|          6|
|RandomNumber|     7|          7|
|RandomNumber|     8|          8|
|RandomNumber|     9|          9|
|RandomNumber|    10|          1|
|RandomNumber|    11|          1|
|RandomNumber|    12|          1|
|RandomNumber|    13|          1|
|RandomNumber|    14|          1|
|RandomNumber|    15|          1|
|RandomNumber|    16|          1|
|RandomNumber|    17|          1|
|RandomNumber|    18|          1|
|RandomNumber|    19|          1|
+------------+------+-----------+
only showing top 20 rows



None

break
Live view ended...


In [3]:
from pyspark.sql.functions import col

topic_name = 'RandomNumber'
kafka_server = 'localhost:9092'

streamRawDf = spark.readStream.format("kafka").option("kafka.bootstrap.servers", kafka_server).option("subscribe", topic_name).load()
streamDF = streamRawDf.select(col('topic'),col('offset'),col('value').cast('string').substr(12,1).alias('rand_number'))
checkEvenDF = streamDF.withColumn('Is_Even',col('rand_number').cast('int') % 2 == 0 )

In [4]:
from random import randint
randNum=str(randint(0,10000))
q1name = "queryNumber"+randNum
q2name = "queryCheckEven"+randNum
stream_writer1 = (streamDF.writeStream.queryName(q1name).trigger(processingTime="5 seconds").outputMode("append").format("memory"))
stream_writer2 = (checkEvenDF.writeStream.queryName(q2name).trigger(processingTime="5 seconds").outputMode("append").format("memory"))
query1 = stream_writer1.start()
query2 = stream_writer2.start()

In [7]:
from time import sleep
from IPython.display import display, clear_output

for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        result1 = spark.sql(f"SELECT * from {query1.name}")
        result2 = spark.sql(f"SELECT * from {query2.name}")
        display(result1.show())
        display(result2.show())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 165
+-----+------+-----------+
|topic|offset|rand_number|
+-----+------+-----------+
+-----+------+-----------+



None

+-----+------+-----------+-------+
|topic|offset|rand_number|Is_Even|
+-----+------+-----------+-------+
+-----+------+-----------+-------+



None

break
Live view ended...
