In [13]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, explode, split, substring, upper, window

In [39]:
# Tunable settings for the streaming job.

# Used as window length, watermark delay and trigger period further down.
INTERVAL = '3 seconds'

# Flip to False to run against an in-process "local" master instead of the
# standalone cluster (previously a hard-coded `if True`, which left the
# "local" branch unreachable without editing the expression itself).
USE_CLUSTER = True
SPARK_MASTER = "spark://localhost:5000" if USE_CLUSTER else "local"
SPARK_APP_NAME = "Final - PSPD"

# Kafka broker plus the topic that is read (words) and the one that is written (statistics).
KAFKA_SERVER = 'localhost:9093'
WORDS_TOPIC = 'wc'
STATS_TOPIC = 'statistics'

In [40]:
# Spark configuration: master URL, application name and the Kafka connector
# package, all set on the conf before any context is created.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    # NOTE(review): connector coordinates (Scala 2.12 / Spark 3.2.0) presumably
    # match the installed Spark build -- confirm.
    .set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
)

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell in a live kernel no longer raises
# "Cannot run multiple SparkContexts at once".
context = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [41]:
# Streaming DataFrame over the words topic on the configured Kafka broker.
# NOTE(review): 'includeTimestamp' looks like a socket-source option; the
# Kafka source presumably exposes a `timestamp` column regardless -- confirm
# before removing it.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", WORDS_TOPIC)
    .option('includeTimestamp', 'true')
    .load()
)

In [42]:
# Split every Kafka record into whitespace-separated tokens, one row per token,
# carrying the record timestamp along for the windowed aggregations.
# - The Kafka source delivers `value` as binary, so it is cast to a string
#   explicitly instead of relying on an implicit cast inside split().
# - The pattern is a raw string: "\s" is not a valid Python escape sequence.
tokens = lines.select(
    explode(split(lines.value.cast("string"), r"\s+")).alias("word"),
    lines.timestamp,
)

# Leading/trailing whitespace (or an empty message) makes split() emit empty
# strings; drop them so they are not counted as words. Then normalise to
# upper case so that counting is case-insensitive.
words = tokens \
    .filter(tokens.word != "") \
    .select(upper(tokens.word).alias('word'), tokens.timestamp)

In [43]:
# Running number of occurrences of each distinct word.
wordCounts = words.groupBy("word").count()

# Running count of every word read so far (global aggregate, no grouping
# columns), shaped as a key/value pair of strings for the Kafka sink.
total = (
    words
    .groupBy()
    .count()
    .selectExpr("'TOTAL' as key", "CAST(count AS STRING) as value")
)

In [44]:
# Count, per INTERVAL-long tumbling window, the words that start with S, P or R.
#
# Spark's substring() is documented as 1-based, so the first character lives
# at position 1 (the previous position 0 only worked by accident). `words.word`
# is already upper-cased when `words` is built, so no extra upper() is needed.
# The expression is built once and reused for both the filter and the key.
first_letter = substring(words.word, 1, 1)

letters = (
    words
    .filter(first_letter.isin(["S", "P", "R"]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        first_letter.alias("key"),
    )
    .count()
    # Keep only the key/value string pair that is written to Kafka; the
    # window column is dropped here.
    .selectExpr("key", "CAST(count AS STRING) as value")
)

In [45]:
# Tally, per INTERVAL-long tumbling window, the words that are exactly
# 6, 8 or 11 characters long. The length expression is shared by the
# filter and the grouping key.
word_length = length(words.word)

lengths = (
    words
    .filter(word_length.isin([6, 8, 11]))
    .withWatermark("timestamp", INTERVAL)
    .groupBy(
        window(words.timestamp, INTERVAL, INTERVAL),
        word_length.alias("key"),
    )
    .count()
    # Both columns are cast to strings before being handed to the Kafka sink.
    .selectExpr("CAST(key AS STRING)", "CAST(count AS STRING) as value")
)

In [46]:
# Sinks

In [55]:
# Debug sink: print the complete per-word count table to the console on
# every micro-batch.
console_writer = wordCounts.writeStream.format("console").outputMode("complete")
qW = console_writer.start()

22/09/15 21:48:08 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-9625a6fd-5169-4657-9ac0-4b776a0df835. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/09/15 21:48:08 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


[Stage 3:>                                                        (0 + 0) / 200]

22/09/15 21:48:14 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 3:>  (0 + 0) / 200][Stage 4:>    (0 + 0) / 1][Stage 5:>    (0 + 0) / 1]1]

22/09/15 21:48:29 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:48:44 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:48:59 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:49:14 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 3:>  (0 + 0) / 200][Stage 4:>    (0 + 0) / 1][Stage 5:>    (0 + 0) / 1]

22/09/15 21:49:29 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:49:44 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:49:59 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:50:14 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 3:>  (0 + 0) / 200][Stage 4:>    (0 + 0) / 1][Stage 5:>    (0 + 0) / 1]

22/09/15 21:50:29 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/09/15 21:50:44 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

22/09/15 21:51:11 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 173991 milliseconds


                                                                                

22/09/15 21:51:17 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 179043 milliseconds


                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-----+
|  word|count|
+------+-----+
|  PPPP|    1|
|   OLA|    1|
| MUNDO|    1|
| KKKKK|    2|
|RRRRRR|    1|
| SSSSS|    1|
|  AAAA|    1|
+------+-----+



                                                                                

22/09/15 21:51:26 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 14999 milliseconds




In [56]:
# Publish the running grand total to the statistics topic, in complete
# output mode, with its own checkpoint directory.
qT = (
    total.writeStream
    .format("kafka")
    .outputMode("complete")
    .option('checkpointLocation', '/tmp/spark/total-stats')
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .start()
)

22/09/15 21:51:30 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/09/15 21:51:30 WARN StreamingQueryManager: Stopping existing streaming query [id=4332b4ba-a805-40e3-b714-25c39cb7b2a5, runId=9db3a083-5b7e-4878-9983-61d6e93897bb], as a new run is being started.


                                                                                

22/09/15 21:51:31 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 14360 milliseconds




In [57]:
# Publish the per-window word-length counts to the statistics topic, in
# update output mode, triggering once per INTERVAL.
qLen = (
    lengths.writeStream
    .format("kafka")
    .outputMode("update")
    .trigger(processingTime=INTERVAL)
    .option('checkpointLocation', '/tmp/spark/len-stats')
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .start()
)

22/09/15 21:51:32 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/09/15 21:51:32 WARN StreamingQueryManager: Stopping existing streaming query [id=b7bd206a-2793-401e-a716-7adc5a94f5c5, runId=682b95f4-9987-47cd-b6ac-dc9c94735a89], as a new run is being started.
22/09/15 21:51:32 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@9b34d46 is aborting.
22/09/15 21:51:32 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@9b34d46 aborted.


[Stage 17:=====>         (74 + 4) / 200][Stage 21:>               (0 + 0) / 200]

In [58]:
# Publish the per-window first-letter counts to the statistics topic, in
# update output mode, triggering once per INTERVAL.
qLet = (
    letters.writeStream
    .format("kafka")
    .outputMode("update")
    .trigger(processingTime=INTERVAL)
    .option('checkpointLocation', '/tmp/spark/let-stats')
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option('topic', STATS_TOPIC)
    .start()
)

22/09/15 21:51:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/09/15 21:51:34 WARN StreamingQueryManager: Stopping existing streaming query [id=4600f568-b579-4317-b07e-9e492273573e, runId=140e9069-473c-4269-8fce-5089e25ce127], as a new run is being started.
22/09/15 21:51:34 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@7ec54ee0 is aborting.
22/09/15 21:51:34 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@7ec54ee0 aborted.




22/09/15 21:51:34 WARN TaskSetManager: Lost task 87.0 in stage 17.0 (TID 1294) (127.0.0.1 executor 156): TaskKilled (Stage cancelled)


[Stage 21:>                                                       (0 + 4) / 200]

22/09/15 21:51:34 WARN TaskSetManager: Lost task 90.0 in stage 17.0 (TID 1297) (127.0.0.1 executor 156): TaskKilled (Stage cancelled)
22/09/15 21:51:34 WARN TaskSetManager: Lost task 89.0 in stage 17.0 (TID 1296) (127.0.0.1 executor 156): TaskKilled (Stage cancelled)
22/09/15 21:51:34 WARN TaskSetManager: Lost task 88.0 in stage 17.0 (TID 1295) (127.0.0.1 executor 156): TaskKilled (Stage cancelled)


                                                                                

22/09/15 21:51:38 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 6067 milliseconds


                                                                                

22/09/15 21:51:42 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 8002 milliseconds


                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-----+
|  word|count|
+------+-----+
|  PPPP|    2|
|   OLA|    1|
| MUNDO|    1|
| KKKKK|    2|
|RRRRRR|    1|
| SSSSS|    1|
|  AAAA|    2|
+------+-----+



                                                                                

22/09/15 21:51:50 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 8649 milliseconds


                                                                                

22/09/15 21:51:54 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 12178 milliseconds


                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------+-----+
|  word|count|
+------+-----+
|  PPPP|    2|
|   OLA|    1|
| MUNDO|    1|
| KKKKK|    3|
|RRRRRR|    1|
| SSSSS|    1|
|  AAAA|    2|
|  RRRR|    1|
+------+-----+



                                                                                

22/09/15 21:52:01 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 10781 milliseconds


                                                                                

22/09/15 21:52:05 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 10806 milliseconds


                                                                                

22/09/15 21:52:08 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 3000 milliseconds, but spent 3362 milliseconds


In [59]:
# Shut down the four streaming queries in the order they were started.
for running_query in (qW, qT, qLen, qLet):
    running_query.stop()

In [60]:
# Tear down the session first, then the context it was created on.
for handle in (spark, context):
    handle.stop()