In [17]:
from pyspark.sql import SparkSession as ss
from pyspark.sql.types import (
    StructType as st,
    StructField as sf,
    StringType as srt,
    DoubleType as dt,
    LongType as lt
)

from pyspark.sql.functions import (
    from_json,
    col,
    to_date,
    to_timestamp,
    current_date,
    current_timestamp,
    lit,
    map_from_arrays,
    array,
)

In [5]:
# Build (or reuse) the Spark session shared by the rest of the notebook.
# It targets the standalone master, pulls in the Kafka source and Delta Lake
# packages, and registers the Delta SQL extension and catalog.
sprk = (
    ss.builder
    .appName("KafkaSubscriber")
    .master("spark://spark-master:7077")
    # Connector jars resolved at session start-up (comma-separated coordinates).
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,"
        "io.delta:delta-spark_2.12:3.3.0",
    )
    .config("spark.jars.repositories", "https://repos.spark-packages.org")
    # Delta Lake integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # Resource sizing for this application.
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "10g")
    .config("spark.cores.max", "4")
    .getOrCreate()
)

In [6]:
# Schema of the JSON document carried in each Kafka record value.
# Every field is declared nullable.
json_schema = (
    st()
    .add("Date/Time", srt(), True)
    .add("LV ActivePower (kW)", dt(), True)
    .add("Wind Speed (m/s)", dt(), True)
    .add("Theoretical_Power_Curve (KWh)", dt(), True)
    .add("Wind Direction (°)", dt(), True)
    .add("row_id", lt(), True)
)

In [7]:
# Streaming DataFrame over the Kafka topic, starting from the earliest
# available offsets and not failing the query when data loss is detected.
kafka_df = (
    sprk.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "xenon-topic",
        "startingOffsets": "earliest",
        "failOnDataLoss": "false",
    })
    .load()
)

In [8]:
# Decode the Kafka `value` column as a string, parse it as JSON using
# `json_schema`, then flatten the parsed struct into top-level columns.
parsed_value = from_json(col("value").cast("string"), json_schema)
df = kafka_df.select(parsed_value.alias("jsonData")).select("jsonData.*")

In [9]:
# Measurement columns folded into the `signals` map: the key is the column
# name and the value is the reading cast to string. Keeping a single list
# guarantees that keys and values stay aligned.
SIGNAL_COLUMNS = [
    "LV ActivePower (kW)",
    "Wind Speed (m/s)",
    "Theoretical_Power_Curve (KWh)",
    "Wind Direction (°)",
]

# Parse the event time once and reuse the expression for both the timestamp
# and the date column.
signal_ts_col = to_timestamp(col("Date/Time"), "dd MM yyyy HH:mm")

# Reshape each parsed record into: event date/time, ingestion date/time, and
# a string-to-string map of all signal readings.
final_df = df.select(
    # The input is already a timestamp, so no parse format is needed here.
    to_date(signal_ts_col).alias("signal_date"),
    signal_ts_col.alias("signal_ts"),
    current_date().alias("create_date"),
    current_timestamp().alias("create_ts"),
    map_from_arrays(
        array(*[lit(name) for name in SIGNAL_COLUMNS]),
        array(*[col(name).cast("string") for name in SIGNAL_COLUMNS]),
    ).alias("signals"),
)

In [10]:
# Print the column names, types and nullability of the transformed stream
# as a sanity check before it is written out.
final_df.printSchema()

root
 |-- signal_date: date (nullable = true)
 |-- signal_ts: timestamp (nullable = true)
 |-- create_date: date (nullable = false)
 |-- create_ts: timestamp (nullable = false)
 |-- signals: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [12]:
# Start the streaming write of `final_df` into the Delta table at
# /data/delta_output in append mode. The StreamingQuery handle is kept in a
# variable so the query can later be inspected (`.status`, `.lastProgress`)
# or stopped (`.stop()`), rather than being reachable only through `Out[...]`.
# NOTE(review): the checkpoint lives under /tmp. If that directory is not
# persistent, the stored offsets are lost and the query may re-read the topic
# from the earliest offsets, appending duplicates — confirm this is acceptable.
delta_sink_query = (
    final_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/delta_kafka_subscriber_checkpoint")
    .outputMode("append")
    .start("/data/delta_output")
)

# Display the handle as the cell output.
delta_sink_query

<pyspark.sql.streaming.query.StreamingQuery at 0x762bb4356f90>

In [16]:
# Batch-read the Delta table the stream is writing to and preview a few rows.
# A dedicated name is used so the streaming `df` defined earlier is not
# overwritten (re-running the transformation cell would otherwise break).
# This cell needs a live session: run it before the cell that calls
# `sprk.stop()`, otherwise the load fails with
# "No active or default Spark session found".
delta_snapshot_df = sprk.read.format("delta").load("/data/delta_output")
delta_snapshot_df.show(5)

Py4JJavaError: An error occurred while calling o161.load.
: java.lang.IllegalStateException: No active or default Spark session found
	at org.apache.spark.sql.SparkSession$.$anonfun$active$2(SparkSession.scala:1202)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession$.$anonfun$active$1(SparkSession.scala:1202)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession$.active(SparkSession.scala:1201)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.getTable(DeltaDataSource.scala:70)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:92)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.loadV2Source(DataSourceV2Utils.scala:140)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:210)
	at scala.Option.flatMap(Option.scala:271)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


In [15]:
# Stop any streaming queries that are still running before tearing down the
# session, so they terminate cleanly instead of being killed with the context.
for running_query in sprk.streams.active:
    running_query.stop()

# Shut down the session. Every cell that uses `sprk` (including the Delta
# read-back) must be run before this one.
sprk.stop()