In [1]:
pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    format_string,
    length,
    lit,
    split,
    struct,
    substring,
    to_json,
    upper,
    window,
)

In [3]:
# Streaming configuration shared by every cell below.
INTERVAL = '3 seconds'  # watermark delay, window size and trigger period

# Flip to False to run on an in-process local[2] master instead of the spark:// master.
USE_SPARK_CLUSTER = True
SPARK_MASTER = "spark://localhost:5000" if USE_SPARK_CLUSTER else "local[2]"
SPARK_APP_NAME = "Final - PSPD"

KAFKA_SERVER = 'localhost:9092'
WORDS_TOPIC = 'wc'  # topic the raw text lines are read from
STATS_TOPIC = 'test-elasticsearch-sink'  # topic every statistics query writes to

In [4]:
# Spark configuration: master, application name and the Kafka connector package
# that is resolved through spark.jars.packages.
# NOTE(review): the captured output shows a spark-3.2.2 install while the connector
# is pinned to 3.2.0 -- consider aligning the two versions.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    .set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
)

# Start the context and only log messages at ERROR level.
context = SparkContext(conf=conf)
context.setLogLevel("ERROR")



:: loading settings :: url = jar:file:/home/rcleydsonr/spark-3.2.2-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/rcleydsonr/.ivy2/cache
The jars for the packages stored in: /home/rcleydsonr/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7f044886-9c26-45d2-9443-e696fc69b4b3;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in

In [5]:
# Obtain the session; getOrCreate() returns the existing session or builds a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [6]:
# Streaming DataFrame over the Kafka topic that carries the raw text lines.
# The Kafka source always exposes a `timestamp` column, so no extra option is
# needed for it (`includeTimestamp` belongs to the socket source and was a no-op here).
lines = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", WORDS_TOPIC)
    .load()
)

In [7]:
# Split every Kafka record into whitespace-separated tokens, keeping the record timestamp.
# The Kafka `value` column is binary, so it is decoded to a string explicitly, and the
# pattern is a raw string so that `\s` is not treated as a (invalid) Python escape.
tokens = lines.select(
    explode(split(lines.value.cast("string"), r"\s+")).alias("word"),
    lines.timestamp,
)

# split() keeps the empty strings produced by leading/trailing whitespace and by
# empty messages; drop them so they are not counted as words, then upper-case.
words = (
    tokens
    .filter(col("word") != "")
    .select(upper(col("word")).alias("word"), col("timestamp"))
)

In [11]:
# Count each word per window of INTERVAL (window size and slide are both INTERVAL).
windowed_word_counts = (
    words
    .withWatermark("timestamp", INTERVAL)
    .groupBy(window(words.timestamp, INTERVAL, INTERVAL), "word")
    .count()
)

# Shape the rows as Kafka key/value records. to_json escapes quotes and backslashes
# inside the word; JSON assembled with format_string became invalid for tokens
# such as `"HELLO"`.
wordCounts = windowed_word_counts.select(
    lit('1').alias("key"),
    to_json(struct(col("word"), col("count"))).alias("value"),
)

# Sinks

In [12]:
# Publish the per-word counts to the statistics topic, emitting updated rows
# once per INTERVAL.
qWc = (
    wordCounts
    .writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/wc-stats')
    .trigger(processingTime=INTERVAL)
    .start()
)

# Others

In [13]:
# Per-window count of the words that are 6, 8 or 11 characters long.
word_length = length(words.word)

lengths = (
    words
    .filter(word_length.isin([6, 8, 11]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        word_length.alias("length"),
    )
    .count()
    .select(
        lit('1').alias("key"),
        format_string("{\"stat\": \"%s\", \"count\": %d}", col("length"), col("count")).alias("value"),
    )
)

In [14]:
# Publish the word-length counts to the statistics topic, emitting updated rows
# once per INTERVAL.
qLen = (
    lengths
    .writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/len-stats')
    .trigger(processingTime=INTERVAL)
    .start()
)

In [15]:
# Per-window count of the words that start with S, P or R.
# Spark string positions are 1-based, so the first character is at position 1.
first_letter = upper(substring(words.word, 1, 1))

letters = (
    words
    .filter(first_letter.isin(["S", "P", "R"]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        first_letter.alias("stat"),
    )
    .count()
    .select(
        lit('1').alias("key"),
        format_string("{\"stat\": \"%s\", \"count\": %d}", col("stat"), col("count")).alias("value"),
    )
)

In [16]:
# Publish the first-letter counts to the statistics topic, emitting updated rows
# once per INTERVAL.
qLet = (
    letters
    .writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/let-stats')
    .trigger(processingTime=INTERVAL)
    .start()
)

In [17]:
# Running total of all words read so far (global aggregation, no window).
total_count = words.groupBy().count()

total = total_count.select(
    lit('1').alias("key"),
    format_string("{\"stat\": \"total\", \"count\": %d}", col("count")).alias("value"),
)

In [18]:
# Publish the running total to the statistics topic; "complete" mode re-emits the
# whole (single-row) result on every micro-batch.
qT = (
    total
    .writeStream
    .outputMode("complete")
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .option('checkpointLocation', '/tmp/spark/total-stats')
    .start()
)

                                                                                

In [None]:
# Stop every streaming query, in the same order as before.
for running_query in (qWc, qT, qLen, qLet):
    running_query.stop()

In [None]:
# Tear down Spark: the session first, then the context.
for spark_handle in (spark, context):
    spark_handle.stop()