In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time

# Kafka connection settings used when the stream is opened.
kafka_topic_name = "meetuprsvptopic"        # topic to subscribe to
kafka_bootstrap_servers = "localhost:9092"  # broker address as host:port

In [4]:
# 1) Start the Spark session that every DataFrame in this notebook is built from.
#
# The Kafka connector and its dependencies are loaded from local jar files.
# The list is written once and joined with the separator each setting uses,
# instead of repeating the same four paths inside every config value.
_kafka_jar_uris = [
    "file:///C://spark_dependency_jars//commons-pool2-2.8.1.jar",
    "file:///C://spark_dependency_jars//spark-sql-kafka-0-10_2.12-3.0.1.jar",
    "file:///C://spark_dependency_jars//kafka-clients-2.6.0.jar",
    "file:///C://spark_dependency_jars//spark-streaming-kafka-0-10-assembly_2.12-3.0.1.jar",
]
_jars_csv = ",".join(_kafka_jar_uris)        # spark.jars takes a comma-separated list
_jars_classpath = ":".join(_kafka_jar_uris)  # same ':'-joined value this notebook has always passed

# The former "spark.executor.extraLibrary" entry was removed: it is not a
# documented Spark property (the documented key is
# spark.executor.extraLibraryPath, which is for native libraries, not jars).
#
# NOTE(review): the extraClassPath values are ':'-separated file: URIs, while
# the paths are Windows paths, where the JVM classpath separator is ';' --
# confirm these two settings are doing anything beyond what spark.jars does.
spark = (
    SparkSession.builder
    .appName("PySpark Structured Streaming with Kafka and Message Format as JSON")
    .master("local[*]")  # run locally, using all available cores
    .config("spark.jars", _jars_csv)
    .config("spark.executor.extraClassPath", _jars_classpath)
    .config("spark.driver.extraClassPath", _jars_classpath)
    .getOrCreate()
)

In [5]:
# Set the Spark log level to ERROR for this session.
spark.sparkContext.setLogLevel("ERROR")

In [6]:
# The Kafka topic is a continuous (unbounded) source, so it is read with
# Structured Streaming, Spark's DataFrame/SQL-based streaming API.
# Reference: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#overview
#
# The chain is wrapped in parentheses so comments can sit on their own lines:
# a comment (or even trailing spaces) after a backslash continuation is a
# SyntaxError in Python.
df = (
    spark.readStream
    # Kafka *source*: read records from the topic on the Kafka server.
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic_name)
    # Start from the latest offsets, i.e. records that arrive after the query starts.
    .option("startingOffsets", "latest")
    .load()
)

In [7]:
type(df)
# readStream returns an ordinary pyspark.sql DataFrame, so the DataFrame/SQL API is used on the stream.

pyspark.sql.dataframe.DataFrame

In [8]:
# Inspect the column names and types of the Kafka source DataFrame.
print("Printing Schema/structure of _df: ")
df.printSchema()   # 7 columns; key and value arrive as binary

Printing Schema/structure of _df: 
root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [34]:
df.columns  # names of the columns in the Kafka source DataFrame

['key', 'value', 'topic', 'partition', 'offset', 'timestamp', 'timestampType']

In [11]:
# Keep only the message payload and its timestamp.
# selectExpr() takes SQL expression strings and returns a new DataFrame;
# the first expression converts the binary `value` column to a string.
df1 = df.selectExpr(
    "CAST(value AS string)",
    "CAST(timestamp AS TIMESTAMP)",
)
   

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [29]:
# method1: print the stream to the console.
# NOTE: this rebinds df1 to `value` taken straight from `df`, where the column
# is still binary, so the console shows raw bytes rather than text.
df1 = df.select("value")
console_query = (
    df1.writeStream
    .format("console")  # console sink
    .start()  # nothing executes until start() is called
)
# Keep the StreamingQuery handle so the query can be stopped later with
# console_query.stop(); leaving it as the last expression still displays it.
console_query

<pyspark.sql.streaming.StreamingQuery at 0x125d5e84be0>

In [31]:
# method2: write the stream into an in-memory table and query it with SQL.
# The chain is parenthesised because a comment after a backslash continuation
# is a SyntaxError.
d = (
    df.writeStream
    .queryName("aggregates1")  # the query name becomes the in-memory table name
    .format("memory")  # memory sink
    .start()
)
# The table only fills as the running query processes records, so a read issued
# right after start() can come back empty; re-run this line later to see rows.
spark.sql("select * from aggregates1").show()

+---+-----+-----+---------+------+---------+-------------+
|key|value|topic|partition|offset|timestamp|timestampType|
+---+-----+-----+---------+------+---------+-------------+
+---+-----+-----+---------+------+---------+-------------+



In [None]:
# method 3: register the streaming DataFrame as a temporary view named "updates"
# so it can be referenced from SQL.
df.createOrReplaceTempView("updates")
# NOTE(review): this only builds a DataFrame over the view and displays the
# object; presumably it still needs writeStream...start() to produce rows -- confirm.
spark.sql("select * from updates")

In [72]:
# Write the final result to the console for debugging.
# NOTE(review): `df8` is not defined in any cell visible in this notebook; it
# has to be created before this cell can run on a fresh kernel.
# The chain is parenthesised because trailing spaces or a comment after a
# backslash continuation are a SyntaxError.
# Despite its name, df7 holds the value returned by start() (the running
# query), which a later cell waits on.
df7 = (
    df8.writeStream
    .trigger(processingTime="5 seconds")  # trigger processing every 5 seconds
    .outputMode("update")
    .option("truncate", "false")  # do not truncate long column values
    .format("console")  # console sink
    .start()
)



In [None]:
# Block this cell until the streaming query held in df7 terminates.
df7.awaitTermination()