In [23]:
import os

from pyspark.sql import SparkSession

# Request the Kafka connector for Structured Streaming through the
# spark-submit arguments; this is set before the session below is created.
# https://jupyter-docker-stacks.readthedocs.io/en/latest/using/recipes.html#using-local-spark-jars
# https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#deploying
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0 pyspark-shell'

# Connect to the standalone master (or reuse an already-running session)
# with a small executor memory footprint.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("pyspark-notebook-kafka-structured-streaming")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

In [24]:
# Streaming source: subscribe to the "test" topic on the broker and begin at
# the earliest offsets.
# https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "broker:9092",
        "subscribe": "test",
        "startingOffsets": "earliest",
    })
    .load()
)

# Preview only: the projection is displayed as the cell result and is not
# assigned, so kafka_df itself is left unchanged.
kafka_df.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)

DataFrame[key: string, value: string]

In [25]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Layout of the JSON document expected in each message value.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("date", DateType())
    .add("title", StringType())
    .add("description", StringType())
)

# Cast the message value to a string, parse it against the schema, then lift
# the parsed fields out of the "data" struct into top-level columns.
# NOTE: kafka_df is rebound here, so it no longer exposes the "value" column
# afterwards; re-running this cell requires re-running the source cell first.
kafka_df = (
    kafka_df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)
kafka_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)



In [26]:
from pyspark.sql.functions import year, month, dayofmonth

# Derive the calendar parts of "date" as separate integer columns.
kafka_df = (
    kafka_df
    .withColumn("year", year("date"))
    .withColumn("month", month("date"))
    .withColumn("day", dayofmonth("date"))
)
kafka_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [27]:
from pyspark.sql.functions import col

# Reorder the columns so the date parts sit right after "date".
kafka_df = kafka_df.select(
    [
        col(column_name)
        for column_name in ("id", "date", "year", "month", "day", "title", "description")
    ]
)
kafka_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)



In [28]:
# Continuously append the flattened records to HDFS as CSV files.
#
# start() returns the StreamingQuery handle. Keep that handle in `query`
# (awaitTermination() returns None) so the stream can be inspected and stopped.
query = (
    kafka_df
    .writeStream
    .format("csv")
    .option("path", "hdfs://namenode:9000/streaming")
    .option("checkpointLocation", "hdfs://namenode:9000/checkpoint")
    .outputMode("append")
    .start()
)

try:
    # Blocks the cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell only ends the wait on the Python side; stop the
    # query explicitly so it does not keep running behind the next cells.
    query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [29]:
# Display the number of items grouped by year
from pyspark.sql.functions import count

# count("id") only counts rows whose id is not null.
kafka_df_by_year = (
    kafka_df
    .groupBy("year")
    .agg(count("id").alias("count"))
)
kafka_df_by_year.printSchema()


root
 |-- year: integer (nullable = true)
 |-- count: long (nullable = false)



In [30]:
# Print the per-year counts to the console; "complete" mode re-emits the whole
# aggregated result on every trigger.
#
# start() returns the StreamingQuery handle. Keep that handle in `query`
# (awaitTermination() returns None) so the stream can be inspected and stopped.
query = (
    kafka_df_by_year
    .writeStream
    .format("console")
    .option("truncate", "false")
    .outputMode("complete")
    .start()
)

try:
    # Blocks the cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell only ends the wait on the Python side; stop the
    # query explicitly so it does not keep running behind the next cells.
    query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Compute the number of items per one-week window.
# No slide duration is passed to window(), so the windows are tumbling
# (non-overlapping) rather than rolling.

# `count` is imported here as well so this cell does not depend on an import
# made in a different cell.
from pyspark.sql.functions import count, window

# NOTE(review): window() is documented for timestamp columns while "date" is a
# DateType here; this relies on Spark coercing the column - confirm.
kafka_df_by_week = kafka_df.groupBy(window("date", "1 week")).agg(count("id").alias("count"))

# The original name said "month" although the window is one week; keep it bound
# as an alias because the streaming cell below still refers to it.
kafka_df_by_month = kafka_df_by_week
kafka_df_by_week.printSchema()


In [32]:
# Print the windowed counts to the console; "update" mode emits only the rows
# that changed since the previous trigger.
#
# start() returns the StreamingQuery handle. Keep that handle in `query`
# (awaitTermination() returns None) so the stream can be inspected and stopped.
query = (
    kafka_df_by_month
    .writeStream
    .format("console")
    .option("truncate", "false")
    .outputMode("update")
    .start()
)

try:
    # Blocks the cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell only ends the wait on the Python side; stop the
    # query explicitly so it does not keep running behind the next cells.
    query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 