# Spark-kafka-Hive-Tableau

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder\
    .appName('spark-kafka-hive-tableau-2')\
    .master('local[2]')\
    .config('spark.sql.shuffle.partitions', 2)\
    .getOrCreate()

In [3]:
df = spark.readStream\
    .format('kafka')\
    .option('kafka.bootstrap.servers', 'localhost:9092')\
    .option('subscribe', 'kafka-hive-spark-intermediate')\
    .load()

In [4]:
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
processing = df.selectExpr('CAST( value as STRING) as value')\
    .select(from_json('value', 'time TIMESTAMP, hashtags STRING').alias('x'))\
    .selectExpr("x.time as time", "x.hashtags as hashtags")\
    .groupBy('hashtags', window("time", windowDuration = '60 minute')).agg(count('hashtags').alias("hashtags_count"))\
    .selectExpr("(hashtags, hashtags_count, window.start as start, window.end as end) as result")\
    .select(to_json('result').alias('value'))

In [6]:
processing.writeStream\
    .format('console')\
    .outputMode('update')\
    .option('truncate', 'false')\
    .start()

<pyspark.sql.streaming.StreamingQuery at 0x7f8b814e95f8>

In [7]:
spark.streams.awaitAnyTermination()

KeyboardInterrupt: 

In [None]:
spark.stop()