In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the Spark session for this notebook: local master with two
# worker threads, and shuffle partitions set to 2.
spark = (
    SparkSession.builder
    .appName('kafka-cleaned-spark')
    .master('local[2]')
    .config('spark.sql.shuffle.partitions',2)
    .getOrCreate()
)

In [None]:
# Streaming source: subscribe to the Kafka topic 'seedhe_maut_capturePoint'
# through the broker at localhost:9092. Nothing is consumed until a
# streaming query is started on top of this DataFrame.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'localhost:9092')
    .option('subscribe','seedhe_maut_capturePoint')
    .load()
)

In [None]:
# Print the column layout of the streaming DataFrame `df` for inspection.
df.printSchema()

In [None]:
# Transformation pipeline: raw Kafka record -> hourly hashtag counts as JSON.
# NOTE(review): no withWatermark() is applied before the windowed aggregation;
# presumably acceptable for a local demo, but confirm whether the aggregation
# state needs to be bounded for long-running use.
processing = (
    df
    # The Kafka payload column is cast to a string so it can be parsed as JSON.
    .selectExpr("CAST(value as STRING) AS value")
    # Parse the JSON text into a struct with a timestamp and a hashtag field.
    .select(from_json('value','time TIMESTAMP, hashtag STRING').alias('x'))
    # Flatten the parsed struct into top-level columns.
    .selectExpr("x.time as time","x.hashtag as hashtag")
    # Keep only values longer than one character whose first character is '#'.
    .where("length(hashtag) > 1")
    .where("substring(hashtag,1,1) = '#'")
    # Count occurrences per hashtag within each 60-minute window over `time`.
    .groupBy("hashtag", window("time", windowDuration='60 minute'))
    .agg(count('hashtag').alias('count'))
    # Pack hashtag, count and the window bounds into a single struct column...
    .selectExpr("(hashtag, count, window.start as start, window.end as end) as result")
    # ...and serialise that struct to a JSON string in a column named `value`.
    .select(to_json('result').alias('value'))
)

In [None]:
# Start the streaming query: emit `processing` to the console sink in
# 'update' output mode, with column truncation turned off. The expression is
# left unassigned so the cell still displays the value returned by start().
(
    processing.writeStream
    .outputMode('update')
    .format('console')
    .option('truncate','false')
    .start()
)

In [None]:
# Block this cell until any streaming query on this session terminates.
spark.streams.awaitAnyTermination()

In [17]:
# Shut down the Spark session used by this notebook.
spark.stop()