In [1]:
from pyspark.sql import SparkSession as ss
from pyspark.sql.types import (
    StructType as st,
    StructField as sf,
    StringType as srt,
    DoubleType as dt,
    LongType as lt
)

from pyspark.sql.functions import (
    from_json,
    col,
    to_date,
    to_timestamp,
    current_date,
    current_timestamp,
    lit,
    map_from_arrays,
    array,
    date_format
)

Create a SparkSession named "KafkaSubscriber"  
Master is the Spark standalone cluster at 'spark-master:7077'  
Include both the spark-sql-kafka package and delta-spark

In [2]:
sprk = ss.builder \
    .appName("KafkaSubscriber") \
    .master("spark://spark-master:7077") \
    .config(
        "spark.jars.packages", 
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,"
        "io.delta:delta-spark_2.12:3.3.0"
    ) \
    .config("spark.jars.repositories", "https://repos.spark-packages.org") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

Define the schema that matches the JSON structure produced by `Kafka_Producer.ipynb`

In [3]:
json_schema = st([
    sf("Date/Time", srt(), True),
    sf("LV ActivePower (kW)", dt(), True),
    sf("Wind Speed (m/s)", dt(), True),
    sf("Theoretical_Power_Curve (KWh)", dt(), True),
    sf("Wind Direction (°)", dt(), True),
    sf("row_id", lt(), True)
])

Create a streaming DataFrame from Kafka:
- `kafka.bootstrap.servers` points to the internal Kafka address
- `subscribe` is set to `xenon-topic`
- `startingOffsets = earliest` means we read all messages from the beginning

In [4]:
kafka_df = sprk.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "xenon-topic") \
    .option("startingOffsets", "earliest") \
    .option("failOnDataLoss", "false") \
    .load()

Parse the `value` field (binary) as a string and apply the schema

In [5]:
df = kafka_df.select(
    from_json(
        col("value").cast("string"),
        json_schema
    ).alias("jsonData")
).select("jsonData.*")

Build the final DataFrame with the required columns:  
`signal_date`, `signal_ts`, `create_date`, `create_ts`, `signals`

In [6]:
final_df = df.select(
    # Convert to signal_date
    to_date(
        to_timestamp(col("Date/Time"), "dd MM yyyy HH:mm"),
        "yyyy-MM-dd"
    ).alias("signal_date"),

    # Force correct timestamp format with 'T' and store as string
    date_format(
        to_timestamp(col("Date/Time"), "dd MM yyyy HH:mm"),
        "yyyy-MMdd'T'HH:mm:ss"
    ).alias("signal_ts"),

    # Capture processing time
    current_date().alias("create_date"),
    current_timestamp().alias("create_ts"),

    # Signals as a map
    map_from_arrays(
        array(
            lit("LV ActivePower (kW)"),
            lit("Wind Speed (m/s)"),
            lit("Theoretical_Power_Curve (KWh)"),
            lit("Wind Direction (°)")
        ),
        array(
            col("LV ActivePower (kW)").cast("string"),
            col("Wind Speed (m/s)").cast("string"),
            col("Theoretical_Power_Curve (KWh)").cast("string"),
            col("Wind Direction (°)").cast("string")
        )
    ).alias("signals")
)

In [7]:
final_df.printSchema()

root
 |-- signal_date: date (nullable = true)
 |-- signal_ts: string (nullable = true)
 |-- create_date: date (nullable = false)
 |-- create_ts: timestamp (nullable = false)
 |-- signals: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



Write the stream to a Delta table in "append" mode  
`trigger(once=True)` processes all available messages and stops

In [11]:
final_df.writeStream \
    .format("delta") \
    .option("checkpointLocation", "/tmp/delta_kafka_subscriber_checkpoint") \
    .outputMode("append") \
    .start("/data/delta_output")

<pyspark.sql.streaming.query.StreamingQuery at 0x7600b0d854c0>

Block until the streaming job completes

In [12]:
df = sprk.read.format("delta").load("/data/delta_output")
df.select("signal_ts").show(5, False)

+------------------+
|signal_ts         |
+------------------+
|2018-0101T10:50:00|
|2018-0101T00:00:00|
|2018-0101T03:10:00|
|2018-0101T09:00:00|
|2018-0101T04:20:00|
+------------------+
only showing top 5 rows



Stop Spark

In [3]:
sprk.stop()