In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import length, explode, split, substring, upper, window, format_string, col, lit, udf
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
# Runtime settings. Every value except the application name can be overridden
# through an environment variable and otherwise falls back to the default here.
SPARK_APP_NAME = "Final - PSPD"

# Spark cluster master URL, Kafka bootstrap server and the Spark core cap.
SPARK_MASTER = os.environ.get("SPARK_MASTER", "spark://gpu3.esw:7077")
KAFKA_SERVER = os.environ.get("KAFKA_SERVER", "localhost:9092")
SPARK_CORES_MAX = os.environ.get("SPARK_CORES_MAX", "2")

# Kafka topic the stream subscribes to, and the topic the statistics are written to.
WORDS_TOPIC = os.environ.get("WORDS_TOPIC", "wc")
STATS_TOPIC = os.environ.get("STATS_TOPIC", "test-elasticsearch-sink")

# Duration string shared by the window size, watermark delay and trigger period.
INTERVAL = os.environ.get("INTERVAL", "10 seconds")

In [3]:
# Spark configuration: cluster master, application name, the Kafka connector
# package (resolved when the context starts) and the maximum number of cores.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    # NOTE(review): the connector's Scala (2.12) and Spark (3.2.0) versions
    # presumably have to match the installed PySpark - confirm when upgrading.
    .set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
    # Use the configurable SPARK_CORES_MAX (default "2") instead of a
    # hard-coded literal, so the environment setting actually takes effect.
    .set("spark.cores.max", SPARK_CORES_MAX)
)

context = SparkContext(conf=conf)
# Keep the notebook output readable: only log errors from here on.
context.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/home/thiago/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/thiago/.ivy2/cache
The jars for the packages stored in: /home/thiago/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3cee7a7c-2b3c-43df-99b4-9223a0d6b2cb;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

22/09/18 22:44:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/18 22:44:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# SparkSession used for all streaming reads and writes in this notebook.
# NOTE(review): getOrCreate() presumably reuses the SparkContext created
# earlier (and therefore its configuration) instead of starting a new one - confirm.
spark = SparkSession.builder.getOrCreate()

In [5]:
# Streaming DataFrame over the Kafka topic that carries the incoming text.
# NOTE(review): 'includeTimestamp' may not be an option the Kafka source
# recognises (the Kafka schema already exposes a `timestamp` column) - confirm
# against the Kafka integration guide before relying on it.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", WORDS_TOPIC)
    .option("includeTimestamp", "true")
    .load()
)

In [6]:
## Cleaner
import re

# Character class of punctuation/symbols that are stripped from every token.
# The hyphen is escaped on purpose: an unescaped "+-:" is a *range* from "+"
# to ":" and would silently match the digits 0-9 as well.
CLEAN_REGEX = r"[.,/\\\[\]\{\}`~^&!@#\$%*\)\(\'\"<>=+\-:;?“]"


def cleaner(sentence):
    """Return ``sentence`` with every character matched by CLEAN_REGEX removed.

    Args:
        sentence: Text to clean, or ``None``.

    Returns:
        The cleaned string. ``None`` is passed through unchanged so that a
        null input does not raise when this function runs as a Spark UDF.
    """
    if sentence is None:
        return None
    return re.sub(CLEAN_REGEX, '', sentence)

# Column-level wrapper so `cleaner` can be applied to a DataFrame string column.
# The function is passed directly; wrapping it in a lambda added nothing.
cleaner_col = udf(cleaner, StringType())

In [7]:
# Split each Kafka message into whitespace-separated tokens, then normalise
# every token: strip punctuation with the cleaner UDF and upper-case it.
words = (
    lines
    .select(
        # The Kafka value is binary, so cast it to text explicitly.
        # The pattern is a raw string: "\s" is not a valid Python escape and
        # triggers an invalid-escape warning in a normal string literal.
        explode(split(col("value").cast("string"), r"\s+")).alias("word"),
        lines.timestamp,
    )
    .select(
        upper(cleaner_col(col("word"))).alias("word"),
        col("timestamp"),
    )
    # Tokens made only of stripped characters (or produced by leading
    # whitespace) end up empty; they are not words and must not be counted.
    .filter(length(col("word")) > 0)
)

In [8]:
# Word frequencies, serialised as one JSON document per row for the Kafka sink.
# NOTE(review): the grouping key contains the raw "timestamp" in addition to
# the window, so there is one group per distinct event timestamp and word -
# confirm that this granularity is what the consumer expects.
wc_json_template = '{"word": "%s", "count": %d, "timestamp": %d}'

wordCounts = (
    words
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        "timestamp",
        "word",
    )
    .count()
    .select(
        # Constant key: every record is published under the same Kafka key.
        lit('1').alias("key"),
        format_string(
            wc_json_template,
            col("word"),
            col("count"),
            col("timestamp"),
        ).alias("value"),
    )
)

# Sinks

# Sinks

In [9]:
# Start the streaming query that publishes the word counts to the statistics
# topic in "update" output mode, triggered once per INTERVAL.
qWc = (
    wordCounts.writeStream
    .format("kafka")
    .outputMode("update")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_SERVER,
        "topic": STATS_TOPIC,
        "checkpointLocation": "/tmp/spark/wc-stats",
    })
    .trigger(processingTime=INTERVAL)
    .start()
)

# Stats

In [10]:
# Count the words whose length is 6, 8 or 11
word_length = length(words.word)
length_json_template = '{"stat": "%s", "count": %d, "timestamp": %d}'

lengths = (
    words
    .filter(word_length.isin([6, 8, 11]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        "timestamp",
        word_length.alias("length"),
    )
    .count()
    .select(
        # Constant key: every record is published under the same Kafka key.
        lit('1').alias("key"),
        # The word length itself is reported as the "stat" name.
        format_string(
            length_json_template,
            col("length"),
            col("count"),
            col("timestamp"),
        ).alias("value"),
    )
)

In [11]:
# Start the streaming query that publishes the word-length statistics to the
# statistics topic in "update" output mode, triggered once per INTERVAL.
qLen = (
    lengths.writeStream
    .format("kafka")
    .outputMode("update")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_SERVER,
        "topic": STATS_TOPIC,
        "checkpointLocation": "/tmp/spark/len-stats",
    })
    .trigger(processingTime=INTERVAL)
    .start()
)

In [12]:
# Count the words that start with S, P or R
# Spark's substring() position is 1-based, so the first character is at
# position 1 (position 0 only happens to be accepted as an alias for it).
# The expression is built once and reused for both the filter and the group key.
first_letter = upper(substring(words.word, 1, 1))

letters = (
    words
    .filter(first_letter.isin(["S", "P", "R"]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        "timestamp",
        first_letter.alias("stat"),
    )
    .count()
    .select(
        # Constant key: every record is published under the same Kafka key.
        lit('1').alias("key"),
        format_string(
            "{\"stat\": \"%s\", \"count\": %d, \"timestamp\": %d}",
            col("stat"),
            col("count"),
            col("timestamp"),
        ).alias("value"),
    )
)

In [13]:
# Start the streaming query that publishes the first-letter statistics to the
# statistics topic in "update" output mode, triggered once per INTERVAL.
qLet = (
    letters.writeStream
    .format("kafka")
    .outputMode("update")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_SERVER,
        "topic": STATS_TOPIC,
        "checkpointLocation": "/tmp/spark/let-stats",
    })
    .trigger(processingTime=INTERVAL)
    .start()
)

In [14]:
# Count the total number of words read (global aggregate: no window, no keys)
total_json_template = '{"stat": "total", "count": %d}'

total = (
    words
    .groupBy()
    .count()
    .select(
        # Constant key: every record is published under the same Kafka key.
        lit('1').alias("key"),
        format_string(total_json_template, col("count")).alias("value"),
    )
)

In [15]:
# Start the streaming query that publishes the running total to the statistics
# topic. It uses "complete" output mode and no explicit trigger is configured.
qT = (
    total.writeStream
    .format("kafka")
    .outputMode("complete")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_SERVER,
        "topic": STATS_TOPIC,
        "checkpointLocation": "/tmp/spark/total-stats",
    })
    .start()
)



In [16]:
# Stop the four streaming queries, in the same order as before.
for running_query in (qWc, qT, qLen, qLet):
    running_query.stop()

22/09/18 22:48:19 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@6189b2e1 is aborting.
22/09/18 22:48:19 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@6189b2e1 aborted.


In [17]:
# Shut down the Spark session first, then the SparkContext created at the top.
# NOTE(review): SparkSession.stop() presumably already stops the underlying
# context, which would make the second call a harmless no-op - confirm.
spark.stop()
context.stop()