In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, TimestampType

In [2]:
# Initialize a Spark session
spark = SparkSession.builder \
    .appName("WindPowerData") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0,io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.debug.maxToStringFields", "2000") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/xs519-hargup/.ivy2/cache
The jars for the packages stored in: /home/xs519-hargup/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a39fda0e-b32e-49cc-9098-cfe144812ccc;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	fou

In [3]:
kafka_bootstrap_servers = "localhost:9092"  # Replace with your Kafka broker
kafka_topic = "wind-data"  # Replace with your Kafka topic name

In [4]:
df_kafka = spark.readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
            .option("subscribe", kafka_topic) \
            .option("startingOffsets", "earliest") \
            .load()

In [5]:
# Extract the message value (assuming it’s in string format)
df_kafka_value = df_kafka.selectExpr("CAST(value AS STRING)")

In [6]:
# Define the schema of the JSON message (example schema)
json_schema = StructType([
    StructField("Date/Time", StringType(), True),
    StructField("LV Active Power (kW)", DoubleType(), True),
    StructField("Wind Speed (m/s)", DoubleType(), True),
    StructField("Theoretical_Power_Curve (kWh)", DoubleType(), True),
    StructField("Wind Direction (°)", DoubleType(), True)
])

# Parse the JSON message
df_parsed = df_kafka_value.withColumn("jsonData", F.from_json(F.col("value"), json_schema))
df_final = df_parsed.select("jsonData.*")

In [7]:
df_final

DataFrame[Date/Time: string, LV Active Power (kW): double, Wind Speed (m/s): double, Theoretical_Power_Curve (kWh): double, Wind Direction (°): double]

In [8]:
df_final.printSchema()

root
 |-- Date/Time: string (nullable = true)
 |-- LV Active Power (kW): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Theoretical_Power_Curve (kWh): double (nullable = true)
 |-- Wind Direction (°): double (nullable = true)



In [9]:
print(df_kafka_value.isStreaming)

True


In [10]:
# # Sample DataFrame creation
# data = [
#     ("2025-03-25 17:00:00", 100.5, 8.2, 50.0, 45.0),
#     ("2025-03-26 18:30:00", 200.0, 10.5, 75.0, 90.0)
# ]
# columns = ["Date/Time", "LV Active Power (kW)", "Wind Speed (m/s)", "Theoretical_Power_Curve (kWh)", "Wind Direction (°)"]

# df = spark.createDataFrame(data, columns)

# Step 1: Parse signal_date and signal_ts
df_final = df_final.withColumn("signal_date", F.to_date(F.expr("substring(`Date/Time`, 1, 10)"), "yyyy-MM-dd")) \
       .withColumn("signal_ts", F.date_format(F.to_timestamp(F.col("Date/Time"), "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd'T'HH:mm:ss"))

# Step 2: Add create_date and create_ts
df_final = df_final.withColumn("create_date", F.current_date().cast("date")) \
       .withColumn("create_ts", F.date_format(F.current_timestamp(), "yyyy-MM-dd'T'HH:mm:ss"))

# # Step 3: Construct the signals column
# signal_columns = ["LV Active Power (kW)", "Wind Speed (m/s)", "Theoretical_Power_Curve (kWh)", "Wind Direction (°)"]

# keys = F.array([F.lit(c) for c in signal_columns])  # Array of column names
# values = F.array([F.col(c).cast("string") for c in signal_columns])  # Array of column values

# Use map_from_arrays with keys and values arrays
df_final = df_final.withColumn("signals", F.map_from_arrays(
        array(
            lit("LV ActivePower (kW)"),
            lit("Wind Speed (m/s)"),
            lit("Theoretical_Power_Curve (KWh)"),
            lit("Wind Direction (°)")
        ),
        array(
            col("LV ActivePower (kW)").cast("string"),
            col("Wind Speed (m/s)").cast("string"),
            col("Theoretical_Power_Curve (KWh)").cast("string"),
            col("Wind Direction (°)").cast("string")
        )))

# Select required columns
delta_df = df_final.select("signal_date", "signal_ts", "create_date", "create_ts", "signals")


In [11]:
delta_df.printSchema()

root
 |-- signal_date: date (nullable = true)
 |-- signal_ts: string (nullable = true)
 |-- create_date: date (nullable = false)
 |-- create_ts: string (nullable = false)
 |-- signals: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [12]:
delta_table_path = "./Delta-Table"  

In [13]:
# Writing the streaming data to Delta table
delta_df.writeStream \
 .format("delta") \
 .outputMode("append") \
 .option("checkpointLocation", "./Checkpoint-Delta/") \
 .start(delta_table_path)

25/03/26 10:46:03 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x79ee70b02050>

25/03/26 10:46:03 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

In [14]:
delta_table_df = spark.read.format("delta").load("./Delta-Table")

In [15]:
delta_table_df.show(truncate=False)

+-----------+---------+-----------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|signal_date|signal_ts|create_date|create_ts          |signals                                                                                                                                            |
+-----------+---------+-----------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|null       |null     |2025-03-26 |2025-03-26T10:46:04|{LV Active Power (kW) -> null, Wind Speed (m/s) -> 5.31133604049682, Theoretical_Power_Curve (kWh) -> null, Wind Direction (°) -> 259.994903564453}|
|null       |null     |2025-03-26 |2025-03-26T10:46:04|{LV Active Power (kW) -> null, Wind Speed (m/s) -> 5.67216682434082, Theoretical_Power_Curve (kWh) -> null, Wind Direction (°) ->

In [None]:
spark.stop()