In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from time import sleep
from IPython.display import display, clear_output
from random import randint


In [3]:
scala_version = '2.12'
spark_version = '3.5.0'
packages = [ f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}' ,
                                                'org.apache.kafka:kafka-clients:3.6.0']
spark = SparkSession.builder.master("local")\
                            .appName("kafka-example")\
                            .config("spark.jars.packages", ",".join(packages))\
                            .getOrCreate()
spark

23/11/23 19:34:03 WARN Utils: Your hostname, dothinh.local resolves to a loopback address: 127.0.0.1; using 192.168.1.3 instead (on interface en0)
23/11/23 19:34:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/usr/local/Cellar/apache-spark/3.5.0/libexec/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/dothinhtpr247gmai.com/.ivy2/cache
The jars for the packages stored in: /Users/dothinhtpr247gmai.com/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4eb84a70-c96b-4210-9847-723879f6098c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found org.apache.kafka#kafka-clients;3.6.0 in central
	found com.github.

# Creating a Kafka Source for Batch Queries

## Create dataframe from Kafka data

In [4]:
TOPIC_NAME = 'RandomNumber'
KAFKA_SERVER = 'localhost:9092'

kafkaDf = spark.read\
    			.format("kafka")\
				.option("kafka.bootstrap.servers", KAFKA_SERVER)\
				.option("subscribe", TOPIC_NAME)\
				.option("startingOffsets", "earliest")\
				.load()

## Show data (converting dataframe to pandas for cleaner view of data)

In [5]:
kafkaDf.toPandas()

                                                                                

Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,0,2023-11-18 09:19:14.917,0
1,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,1,2023-11-18 09:19:19.921,0
2,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,2,2023-11-18 09:19:24.922,0
3,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,3,2023-11-18 09:19:29.928,0
4,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,4,2023-11-18 09:19:34.931,0
...,...,...,...,...,...,...,...
1796,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,1796,2023-11-23 19:33:54.114,0
1797,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,1797,2023-11-23 19:33:59.119,0
1798,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,1798,2023-11-23 19:34:04.124,0
1799,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,1799,2023-11-23 19:34:09.124,0


## Show streaming data using for loop

In [6]:
batchDF = kafkaDf.select(f.col('topic'),
                         f.col('offset'),
                         f.col('value').cast('string').substr(12,1).alias('rand_number'))
for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchDF.toPandas())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 500


Unnamed: 0,topic,offset,rand_number
0,RandomNumber,0,0
1,RandomNumber,1,1
2,RandomNumber,2,2
3,RandomNumber,3,3
4,RandomNumber,4,4
...,...,...,...
1908,RandomNumber,1908,1
1909,RandomNumber,1909,1
1910,RandomNumber,1910,1
1911,RandomNumber,1911,1


break
Live view ended...


## Perform some data aggregation and show live results

In [7]:
batchCountDF = batchDF.groupBy('rand_number').count()
for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchCountDF.toPandas())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 10


Unnamed: 0,rand_number,count
0,7,142
1,3,245
2,8,135
3,0,6
4,5,245
5,6,234
6,9,135
7,1,276
8,4,245
9,2,251


break
Live view ended...


# Creating a Kafka Source for Streaming Queries

##  Create Streaming dataframe from Kafka

In [8]:
streamRawDf = spark.readStream\
    				.format("kafka")\
            		.option("kafka.bootstrap.servers", KAFKA_SERVER)\
                  	.option("subscribe", TOPIC_NAME)\
                    .load()
                    
streamDF = streamRawDf.select(
    						f.col('topic'),
							f.col('offset'),
							f.col('value').cast('string').substr(12,1).alias('rand_number')
       					)

checkEvenDF = streamDF.withColumn(
    								'Is_Even',
                                  	f.col('rand_number').cast('int') % 2 == 0 
                                )

## Write stream  

In [9]:
randNum=str(randint(0,10000))
q1name = "queryNumber"+randNum
q2name = "queryCheckEven"+randNum
stream_writer1 = (
    				streamDF.writeStream\
    						.queryName(q1name)\
              				.trigger(processingTime="5 seconds")\
                      		.outputMode("append")\
                            .format("memory")
                )

stream_writer2 = (
					checkEvenDF.writeStream\
								.queryName(q2name)\
								.trigger(processingTime="5 seconds")\
								.outputMode("append")\
								.format("memory")
                )

query1 = stream_writer1.start()
query2 = stream_writer2.start()

23/11/23 19:44:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/n4/b5xh84d97qzb6sptmp370phr0000gn/T/temporary-d43838d6-a27f-40fd-b2e3-688747e0851a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/23 19:44:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/11/23 19:44:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/n4/b5xh84d97qzb6sptmp370phr0000gn/T/temporary-7e5f9590-caba-4dcb-84b9-33e6d3dd8235. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folde

                                                                                

## View streaming result

In [12]:
for x in range(2000):
        try:
                print("Showing live view refreshed every 5 seconds")
                print(f"Seconds passed: {x*5}")
                result1 = spark.sql(f'SELECT * from {query1.name}')
                result2 = spark.sql(f'SELECT * from {query2.name}')
                display(result1.toPandas())
                display(result2.toPandas())
                sleep(5)
                clear_output(wait = True)
        except KeyboardInterrupt:
                print("break")
                break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 10


                                                                                

Unnamed: 0,topic,offset,rand_number
0,RandomNumber,1920,1
1,RandomNumber,1921,1
2,RandomNumber,1922,1
3,RandomNumber,1923,1
4,RandomNumber,1924,1
5,RandomNumber,1925,1
6,RandomNumber,1926,1
7,RandomNumber,1927,1
8,RandomNumber,1928,1
9,RandomNumber,1929,1


Unnamed: 0,topic,offset,rand_number,Is_Even
0,RandomNumber,1920,1,False
1,RandomNumber,1921,1,False
2,RandomNumber,1922,1,False
3,RandomNumber,1923,1,False
4,RandomNumber,1924,1,False
5,RandomNumber,1925,1,False
6,RandomNumber,1926,1,False
7,RandomNumber,1927,1,False
8,RandomNumber,1928,1,False
9,RandomNumber,1929,1,False


break
Live view ended...


                                                                                