In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    countDistinct,
    hour,
    avg,
    when,
    lit,
    col,
    explode,
    create_map,
    broadcast
)

from pyspark.sql.types import (
    StructType as st,
    StructField as sf,
    StringType as srt,
)

Create a SparkSession with Delta Lake support:  
- Master is set to `spark://spark-master:7077`  
- Additional configs (SQL extension and catalog) enable Delta Lake integration

In [2]:
# Session builder for the "Analysis" app on the standalone master, configured
# step by step: first identity/master, then the Delta SQL extension, then the
# Delta catalog. Nothing is started here; the session is created later from it.
builder = SparkSession.builder.appName("Analysis")
builder = builder.master("spark://spark-master:7077")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the delta helper decorate the builder, then create (or reuse) the session.
delta_builder = configure_spark_with_delta_pip(builder)
sprk = delta_builder.getOrCreate()

# Only warnings and errors from here on.
sprk.sparkContext.setLogLevel("WARN")

Path to the Delta table created by the Kafka subscriber

In [4]:
# Filesystem path of the Delta table; passed to the Delta reader below.
fp = "/data/delta_output"

In [None]:
# Read the Delta table into a DataFrame

In [5]:
# Load the table stored at `fp` using the Delta data source.
delta_reader = sprk.read.format("delta")
df = delta_reader.load(fp)

In [6]:
# Print the column names and types of the loaded table.
df.printSchema()

root
 |-- signal_date: date (nullable = true)
 |-- signal_ts: timestamp (nullable = true)
 |-- create_date: date (nullable = true)
 |-- create_ts: timestamp (nullable = true)
 |-- signals: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [7]:
# Preview five rows; truncate=False keeps the full `signals` map visible.
df.show(5, truncate=False)

+-----------+-------------------+-----------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|signal_date|signal_ts          |create_date|create_ts              |signals                                                                                                                                                                   |
+-----------+-------------------+-----------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2018-01-02 |2018-01-02 13:00:00|2025-03-25 |2025-03-25 04:32:19.771|{LV ActivePower (kW) -> 1278.89501953125, Wind Speed (m/s) -> 9.0723066329956, Theoretical_Power_Curve (KWh) -> 2190.70476314143, Wind Direction (°) -> 198.236404418945} |
|2018-01-01 |2018-01-01 04:20:00|202

1) Calculate the number of distinct `signal_ts` values per day

In [8]:
# One output row per `signal_date`, carrying how many different `signal_ts`
# values were recorded on that date.
per_day = df.groupBy(col("signal_date"))
dts = per_day.agg(
    countDistinct(col("signal_ts")).alias("distinct_ts_count")
)

In [9]:
# Display the per-day distinct timestamp counts (show() prints at most 20 rows by default).
dts.show()

+-----------+-----------------+
|signal_date|distinct_ts_count|
+-----------+-----------------+
| 2018-03-17|              144|
| 2018-03-23|              144|
| 2018-01-23|              144|
| 2018-02-26|              144|
| 2018-01-11|              144|
| 2018-02-22|              144|
| 2018-01-18|              144|
| 2018-01-08|              144|
| 2018-03-08|              144|
| 2018-02-11|              144|
| 2018-01-14|              144|
| 2018-03-13|              144|
| 2018-01-13|              144|
| 2018-01-05|              144|
| 2018-02-01|              144|
| 2018-02-04|              144|
| 2018-01-16|              144|
| 2018-02-23|              144|
| 2018-03-29|              144|
| 2018-01-12|              143|
+-----------+-----------------+
only showing top 20 rows



2) Calculate average value of all signals per hour  
Extract columns from the `signals` map

In [10]:
def _signal_as_double(signal_name):
    """Return the `signals` map entry named `signal_name`, cast to double."""
    return col("signals")[signal_name].cast("double")


# Column expressions for the four map entries averaged in the next step.
lv_active_power_col = _signal_as_double("LV ActivePower (kW)")
wind_speed_col = _signal_as_double("Wind Speed (m/s)")
theoretical_col = _signal_as_double("Theoretical_Power_Curve (KWh)")
wind_dir_col = _signal_as_double("Wind Direction (°)")

Group by `signal_date` and hour, then compute averages

In [11]:
# Grouping keys: the calendar date plus the hour extracted from the timestamp.
hourly_keys = [
    col("signal_date"),
    hour(col("signal_ts")).alias("hour_of_day"),
]

# One average per signal expression, each with its output column name.
hourly_averages = [
    avg(lv_active_power_col).alias("avg_active_power"),
    avg(wind_speed_col).alias("avg_wind_speed"),
    avg(theoretical_col).alias("avg_theoretical_power_curve"),
    avg(wind_dir_col).alias("avg_wind_direction"),
]

grouped = df.groupBy(*hourly_keys).agg(*hourly_averages)

In [12]:
# Preview ten of the hourly averages without truncating cell contents.
grouped.show(10, truncate=False)

+-----------+-----------+------------------+------------------+---------------------------+------------------+
|signal_date|hour_of_day|avg_active_power  |avg_wind_speed    |avg_theoretical_power_curve|avg_wind_direction|
+-----------+-----------+------------------+------------------+---------------------------+------------------+
|2018-01-04 |23         |789.4284159342433 |6.417605241139726 |783.0409379176934          |26.087587992350205|
|2018-03-15 |14         |0.0               |1.9426068266232763|0.0                        |198.64052168528198|
|2018-02-23 |8          |0.0               |8.636883974075305 |1946.994138122557          |64.31345494588211 |
|2018-03-26 |20         |2629.914794921873 |10.433696746826149|2950.8269442512315         |207.18973286946584|
|2018-04-03 |18         |185.30679575602164|4.584054390589391 |240.37376040014553         |50.887859344482365|
|2018-02-02 |0          |3602.460367838538 |20.155463218688915|3600.0                     |197.15895334879497|
|

3) Add `generation_indicator` column based on avg_active_power  
If < 200 => Low, 200 to < 600 => Medium, 600 to < 1000 => High, >= 1000 => Exceptional

In [13]:
# Label each hourly bucket by its average active power:
#   < 200 -> "Low", [200, 600) -> "Medium", [600, 1000) -> "High",
#   >= 1000 -> "Exceptional".
# The `when` branches are tried in order, so each one only needs its upper
# bound; the lower bound is implied by the branches before it.
# "Exceptional" is an explicit condition rather than `.otherwise(...)`: when
# the average is NULL every comparison is NULL, and an `.otherwise` fallback
# would mislabel such rows as "Exceptional". With no fallback they keep a NULL
# indicator instead.
avg_power = col("avg_active_power")
gen_indicator = grouped.withColumn(
    "generation_indicator",
    when(avg_power < 200, "Low")
    .when(avg_power < 600, "Medium")
    .when(avg_power < 1000, "High")
    .when(avg_power >= 1000, "Exceptional"),
)

In [14]:
# Preview ten rows including the new `generation_indicator` column.
gen_indicator.show(10, truncate=False)

+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
|signal_date|hour_of_day|avg_active_power  |avg_wind_speed    |avg_theoretical_power_curve|avg_wind_direction|generation_indicator|
+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
|2018-01-04 |23         |789.4284159342433 |6.417605241139726 |783.0409379176934          |26.087587992350205|High                |
|2018-03-15 |14         |0.0               |1.9426068266232763|0.0                        |198.64052168528198|Low                 |
|2018-02-23 |8          |0.0               |8.636883974075305 |1946.994138122557          |64.31345494588211 |Low                 |
|2018-03-26 |20         |2629.914794921873 |10.433696746826149|2950.8269442512315         |207.18973286946584|Exceptional         |
|2018-04-03 |18         |185.30679575602164|4.584054390589391 |240.373760400

4) Create a new DataFrame with JSON-based signal mapping and do broadcast join  
This simulates a small lookup table for signal names

In [15]:
# Lookup rows as (raw signal name, mapped name) tuples, in declaration order.
data = list(
    {
        "LV ActivePower (kW)": "active_power_average",
        "Wind Speed (m/s)": "wind_speed_average",
        "Theoretical_Power_Curve (KWh)": "theo_power_curve_average",
        "Wind Direction (°)": "wind_direction_average",
    }.items()
)

In [16]:
# Two nullable string columns: the raw signal name and its mapped name.
schema = st(
    [sf(field_name, srt(), True) for field_name in ("sig_name", "sig_mapping_name")]
)

In [23]:
# Build the small lookup DataFrame from the (sig_name, sig_mapping_name) rows.
# NOTE(review): the AttributeError recorded under this cell presumably comes
# from running it after `sprk.stop()` (its execution count is higher than the
# stop cell's); on a fresh top-to-bottom run the session should still be live
# at this point. Confirm with Restart & Run All.
new_df = sprk.createDataFrame(data, schema)

AttributeError: 'NoneType' object has no attribute 'sc'

Transform columns into a map -> explode it -> join with new_df to rename

In [18]:
# Columns carried through unchanged on every output row.
row_keys = ["signal_date", "hour_of_day", "generation_indicator"]

# (raw signal name, averaged column) pairs, flattened into the alternating
# key/value argument list that create_map expects.
metric_sources = [
    ("LV ActivePower (kW)", "avg_active_power"),
    ("Wind Speed (m/s)", "avg_wind_speed"),
    ("Theoretical_Power_Curve (KWh)", "avg_theoretical_power_curve"),
    ("Wind Direction (°)", "avg_wind_direction"),
]
metrics_map = create_map(
    *[
        part
        for signal_name, avg_column in metric_sources
        for part in (lit(signal_name), col(avg_column))
    ]
).alias("metrics")

# Wide -> long: pack the averages into one map column, then explode it so each
# (sig_name, value) entry becomes its own row.
with_metrics = gen_indicator.select(*row_keys, metrics_map)
formatted_df = with_metrics.select(
    *row_keys,
    explode(col("metrics")).alias("sig_name", "value"),
)

Perform a broadcast join to replace `sig_name` with `sig_mapping_name`

In [19]:
# Mark the lookup table for broadcast, then left-join on `sig_name` so every
# metric row is kept and picks up its `sig_mapping_name`.
signal_lookup = broadcast(new_df)
with_mapping = formatted_df.join(signal_lookup, on="sig_name", how="left")

# Fix the output column order.
joined_df = with_mapping.select(
    "signal_date", "hour_of_day", "generation_indicator",
    "sig_name", "sig_mapping_name", "value",
)

In [20]:
# Preview twenty rows of the joined result without truncating cell contents.
joined_df.show(20, truncate=False)

+-----------+-----------+--------------------+-----------------------------+------------------------+------------------+
|signal_date|hour_of_day|generation_indicator|sig_name                     |sig_mapping_name        |value             |
+-----------+-----------+--------------------+-----------------------------+------------------------+------------------+
|2018-01-04 |23         |High                |LV ActivePower (kW)          |active_power_average    |789.4284159342433 |
|2018-01-04 |23         |High                |Wind Speed (m/s)             |wind_speed_average      |6.417605241139726 |
|2018-01-04 |23         |High                |Theoretical_Power_Curve (KWh)|theo_power_curve_average|783.0409379176934 |
|2018-01-04 |23         |High                |Wind Direction (°)           |wind_direction_average  |26.087587992350205|
|2018-03-15 |14         |Low                 |LV ActivePower (kW)          |active_power_average    |0.0               |
|2018-03-15 |14         |Low    

In [21]:
# Stop the Spark session. Keep this as the last cell executed: cells that use
# `sprk` need the session to be running.
sprk.stop()