In [None]:
pip install -r requirements.txt

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    format_string,
    length,
    lit,
    split,
    struct,
    substring,
    to_json,
    upper,
    window,
)

In [None]:
# Duration string reused for the window size, the watermark delay and the
# processing-time trigger of the streaming queries below.
INTERVAL = '3 seconds'

# Set to False to run against an in-process "local[2]" master instead of the
# Spark master at localhost:5000.
USE_SPARK_CLUSTER = True
SPARK_MASTER = "spark://localhost:5000" if USE_SPARK_CLUSTER else "local[2]"
SPARK_APP_NAME = "Final - PSPD"

KAFKA_SERVER = 'localhost:9092'
# Topic the raw text lines are consumed from.
WORDS_TOPIC = 'wc'
# Topic every statistics query in this notebook publishes to.
STATS_TOPIC = 'test-elasticsearch-sink'

In [None]:
# Driver configuration: master URL, application name and the Kafka connector
# package that the "kafka" source/sink format used below requires.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    # NOTE(review): the connector coordinates (Scala 2.12 / 3.2.0) presumably have
    # to match the installed Spark build — confirm against requirements.txt.
    .set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
)

# getOrCreate() keeps this cell re-runnable: constructing a second SparkContext
# while one is already active raises an error. If a context already exists it is
# returned as-is and `conf` is not applied to it.
context = SparkContext.getOrCreate(conf=conf)
context.setLogLevel("ERROR")

In [None]:
# Obtain the SparkSession used by every streaming query in this notebook
# (an existing session is reused, otherwise a new one is created).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Streaming DataFrame subscribed to WORDS_TOPIC on the Kafka broker.
# The `value` and `timestamp` columns of each record are used downstream.
# NOTE(review): 'includeTimestamp' looks like a socket-source option; the Kafka
# source may simply ignore it — confirm before relying on it.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", WORDS_TOPIC)
    .option('includeTimestamp', 'true')
    .load()
)

In [None]:
# Split each incoming line into words, one output row per word, carrying the
# record timestamp along so the words can be windowed later.
#  * The Kafka `value` column is cast to string explicitly instead of relying
#    on implicit coercion inside split().
#  * The pattern is a raw string: "\s" in a normal string literal is an invalid
#    escape sequence and triggers a warning on recent Python versions.
words = lines.select(
    explode(split(lines.value.cast("string"), r"\s+")).alias("word"),
    lines.timestamp,
)

# Leading/trailing whitespace (or an empty message) makes split() emit empty
# tokens; drop them so they are not counted as words. Then normalise to upper case.
words = (
    words
    .filter(words.word != "")
    .select(upper(words.word).alias('word'), words.timestamp)
)

In [None]:
# Occurrences of each word per time window. The window duration and the slide
# are both INTERVAL, and the same value is used as the watermark delay on the
# `timestamp` column.
wordCounts = (
    words
    .withWatermark("timestamp", INTERVAL)
    .groupBy(window(words.timestamp, INTERVAL, INTERVAL), "word")
    .count()
)

In [None]:
# Shape the per-window word counts into Kafka records: a constant key '1' and a
# JSON object {"word": ..., "count": ...} as the value.
# to_json(struct(...)) is used instead of a hand-written format string so that
# words containing quotes or backslashes are escaped and the payload is always
# valid JSON (the serialised form has no spaces after ':' and ',').
allWords = wordCounts.select(
    lit('1').alias("key"),
    to_json(struct(col("word"), col("count"))).alias("value"),
)

# Sinks

In [None]:
# Start the streaming query that publishes the word-count records to STATS_TOPIC.
# It runs in "update" output mode with one micro-batch per INTERVAL and keeps its
# checkpoint state under /tmp/spark/wc-stats.
qAllWords = (
    allWords.writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/wc-stats')
    .trigger(processingTime=INTERVAL)
    .start()
)

In [None]:
# Shut everything down. Streaming queries that are still running are stopped
# first so they terminate cleanly instead of being cut off by the session stop.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()
context.stop()

# Others

In [None]:
# Count, per time window, the words whose length is 6, 8 or 11.
# The result is shaped into Kafka records: a constant key '1' and a JSON value
# {"stat": "<length>", "count": <n>}.
lengths = (
    words
    .filter(length(words.word).isin([6, 8, 11]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        length(words.word).alias("stat"),
    )
    .count()
    .select(
        # The constant column must be named "key" to be used as the Kafka record
        # key (as in `allWords`); it was previously aliased "stat", which left
        # the records without a key.
        lit('1').alias("key"),
        format_string("{\"stat\": \"%d\", \"count\": %d}", col("stat"), col("count")).alias("value"),
    )
)

In [None]:
# Start the streaming query that publishes the word-length statistics to
# STATS_TOPIC ("update" mode, one micro-batch per INTERVAL, checkpoint under
# /tmp/spark/len-stats).
qLen = (
    lengths.writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/len-stats')
    .trigger(processingTime=INTERVAL)
    .start()
)

In [None]:
# Count, per time window, the words that start with S, P or R.
# The result is shaped into Kafka records: a constant key '1' and a JSON value
# {"stat": "<letter>", "count": <n>}.
letters = (
    words
    # Spark's substring() is 1-based, so position 1 is the first character.
    .filter(upper(substring(words.word, 1, 1)).isin(["S", "P", "R"]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        upper(substring(words.word, 1, 1)).alias("stat"),
    )
    .count()
    .select(
        # The constant column must be named "key" to be used as the Kafka record
        # key (as in `allWords`); it was previously aliased "stat", which left
        # the records without a key.
        lit('1').alias("key"),
        format_string("{\"stat\": \"%s\", \"count\": %d}", col("stat"), col("count")).alias("value"),
    )
)

In [None]:
# Start the streaming query that publishes the first-letter statistics to
# STATS_TOPIC ("update" mode, one micro-batch per INTERVAL, checkpoint under
# /tmp/spark/let-stats).
qLet = (
    letters.writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/let-stats')
    .trigger(processingTime=INTERVAL)
    .start()
)

In [None]:
# Occurrences of each word over the whole stream (no windowing).
# Bound to its own name so it does not overwrite the windowed `wordCounts`
# aggregation that `allWords` is built from.
wordCountsOverall = words.groupBy("word").count()

# Running total of all words read so far, shaped as a Kafka record with the
# key 'TOTAL' and the count rendered as a string value.
total = (
    words
    .groupBy()
    .count()
    .selectExpr("'TOTAL' as key", "CAST(count AS STRING) as value")
)

In [None]:
# Start the streaming query that publishes the running word total to STATS_TOPIC.
# It uses "complete" output mode and, unlike the other sinks, sets no explicit
# trigger; the checkpoint lives under /tmp/spark/total-stats.
qT = (
    total.writeStream
    .outputMode("complete")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/total-stats')
    .start()
)

In [None]:
# Stop the four streaming queries started in this notebook, one after another.
# Each name must already be bound (i.e. its cell must have been run); otherwise
# the corresponding line raises NameError and the remaining stops are skipped.
qAllWords.stop()
qT.stop()
qLen.stop()
qLet.stop()