In [None]:
from pyspark import SparkContext                                                                                        
from pyspark.sql import SparkSession                                                                                    
from pyspark.streaming import StreamingContext                                                                          
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.functions import *
from pyspark.sql import functions as F
import pandas as pd

from pyspark.sql import functions as sf
import json
import time
from pyspark.sql.functions import col
import yaml

In [None]:
# Create (or reuse, via getOrCreate) the Spark session that the streaming
# cells below read from. `spark.jars` ships the three local Kafka connector
# jars with the application; the SQL warehouse directory is set to the HDFS
# root on `namenode`.
ss = (
    SparkSession.Builder()
    .appName("SparkBatchStreamingKafka")
    .master("spark://speed-processing-spark-master:7077")
    .config(
        "spark.jars",
        "./spark-streaming-kafka-0-10-assembly_2.11-2.4.1.jar,./kafka-clients-0.10.1.0.jar,./spark-sql-kafka-0-10_2.11-2.4.1.jar",
    )
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/")
    .getOrCreate()
)

In [None]:
# Streaming DataFrame backed by the Kafka topic "trips", read through the
# three listed brokers.
df = (
    ss.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker-1:9093,kafka-broker-2:9093,kafka-broker-3:9093")
    # NOTE(review): this key lacks the "kafka." prefix that the Kafka source
    # uses to forward settings to the underlying consumer, so it presumably
    # has no effect here -- confirm before relying on or removing it.
    .option("partition.assignment.strategy", "none")
    .option("subscribe", "trips")
    .load()
)


In [None]:
import random

def transform_window(s):
    """Render a window's end time as whole epoch seconds, as a string.

    Args:
        s: object exposing a datetime-like ``end`` attribute, e.g.
            Row(start=datetime.datetime(2020, 12, 21, 17, 9, 30),
                end=datetime.datetime(2020, 12, 21, 17, 9, 40)).

    Returns:
        str: ``s.end.timestamp()`` truncated to an integer, then stringified.
    """
    window_end = s.end
    # int() drops any fractional seconds before the value is stringified.
    epoch_seconds = int(window_end.timestamp())
    return str(epoch_seconds)

def transform_count(s):
    """Add a random integer offset to a count and return it as a string.

    Args:
        s: numeric count, e.g. 941.

    Returns:
        str: ``s`` plus one draw of ``random.randint(0, 1000)`` (both bounds
        inclusive), stringified.

    NOTE(review): the random offset means the emitted figure is not the true
    count -- presumably intentional (demo/obfuscation); confirm.
    """
    offset = random.randint(0, 1000)
    adjusted = s + offset
    return str(adjusted)
# Wrap the two helpers as Spark UDFs. No return type is passed, so `udf`
# uses its default (string), which matches the str both helpers return.
udf_transform_window = udf(transform_window)
# transform_count draws from `random`, so the UDF is flagged as
# nondeterministic. Spark treats UDFs as deterministic by default and may
# then duplicate or eliminate invocations while optimizing the plan.
udf_transform_count = udf(transform_count).asNondeterministic()

In [None]:
# Count records per 5-second window (5-second slide) on the "timestamp"
# column with a 15-second watermark, then publish one message per window to
# the Kafka topic "real-time-statistic". The message value has the form
# "<window>_<count>", where both parts are the string outputs of the UDFs.
query = (
    df.withWatermark("timestamp", "15 seconds")
    .groupBy(window("timestamp", "5 seconds", "5 seconds"))
    .count()
    # Overwrite both columns with their string forms before concatenating.
    .withColumn("count", udf_transform_count("count"))
    .withColumn("window", udf_transform_window("window"))
    .withColumn('value', sf.concat(sf.col('window'), sf.lit('_'), sf.col('count')))
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker-1:9093,kafka-broker-2:9093,kafka-broker-3:9093")
    .option("topic", "real-time-statistic")
    .option("checkpointLocation", "/tmp/checkpoint")
    .outputMode("append")
    # NOTE(review): "truncate" is documented as a console-sink option;
    # presumably it is ignored by the Kafka sink -- confirm before removing.
    .option("truncate", False)
    .start()
)
# Block this cell until the streaming query stops or fails.
query.awaitTermination()