In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, window

In [2]:
# Submit arguments for the PySpark shell: request the Kafka source package for
# Structured Streaming (artifact built for Scala 2.11 / Spark 2.4.5).
# NOTE(review): presumably this must be set before the SparkSession is created
# and must match the installed Spark/Scala versions — confirm.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 pyspark-shell',
})

In [3]:
# Kafka connection settings consumed by the stream reader.
# NOTE(review): the topic is called 'bitcoin', yet the schema parsed later in
# this notebook describes vehicles — confirm this is the intended topic.
bootstrap_servers = "broker:9094"  # passed as kafka.bootstrap.servers
topic = "bitcoin"                  # passed as the 'subscribe' option

In [4]:
# Get (or create) the SparkSession used by every cell that follows.
spark = (
    SparkSession.builder
    .appName('StructuredStreamingKafka')
    .getOrCreate()
)

In [9]:
# Streaming DataFrame reading from Kafka, subscribed to `topic` on the
# broker(s) listed in `bootstrap_servers`.
kafka_raw_stream = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', bootstrap_servers)
    .option('subscribe', topic)
    .load()
)

In [8]:
# Keep only the record key and value, each cast to a string.
kafka_stream = kafka_raw_stream.selectExpr(
    'CAST(key as String)',
    'CAST(value as String)',
)

In [None]:
# Schema applied to the JSON payload of each message. Every field is nullable.
# NOTE(review): 'time' is later divided by 1000 before being cast to a
# timestamp, so it is presumably epoch milliseconds — confirm with the producer.
vehicle_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('type',    StringType()),
        ('line',    StringType()),
        ('brigade', StringType()),
        ('lat',     DoubleType()),
        ('lon',     DoubleType()),
        ('bearing', DoubleType()),
        ('time',    LongType()),
    )
])

In [None]:
# Parse the JSON string in `value` with `vehicle_schema` into a struct column,
# then flatten that struct so each field becomes its own column next to `key`.
vehicles = (
    kafka_stream
    .withColumn('vehicle', from_json('value', vehicle_schema))
    .selectExpr('key', 'vehicle.*')
)

In [None]:
# Same columns as `vehicles`, except `time` is replaced by a Timestamp obtained
# from time/1000 (the raw value is presumably epoch milliseconds — confirm).
vehicles_with_ts = vehicles.selectExpr(
    'key',
    'type',
    'line',
    'brigade',
    'lat',
    'lon',
    'bearing',
    'CAST(time/1000 as Timestamp) as time',
)

In [None]:
# Count events per tumbling window over the event-time column `time`, with a
# 10-second watermark.
# NOTE(review): the variable name says "5min" but the window is '1 second' —
# confirm which one is intended before relying on this aggregate.
event_count_5min = (
    vehicles_with_ts
    .withWatermark('time', '10 seconds')
    .groupBy(window('time', '1 second'))
    .count()
)

In [10]:
# Start a streaming query that prints each micro-batch to the console without
# truncating column values.
# NOTE(review): the sink is attached to `kafka_raw_stream`, not to any of the
# parsed/aggregated frames defined in this notebook — confirm this is intended.
query = (
    kafka_raw_stream.writeStream
    .format('console')
    .option('truncate', 'false')
    .start()
)

In [None]:
# Block this cell until the streaming query started above stops or fails.
query.awaitTermination()