In [1]:
from delta import configure_spark_with_delta_pip as cdp
from pyspark.sql import SparkSession as ss
from pyspark.sql.functions import (
    countDistinct,
    hour,
    avg,
    when,
    lit,
    col,
    explode,
    create_map,
    broadcast
)

from pyspark.sql.types import (
    StructType as st,
    StructField as sf,
    StringType as srt,
)

In [2]:
# SparkSession builder (nothing is started here): targets the standalone master,
# registers the Delta Lake SQL extension and catalog, and sets executor sizing.
# The surrounding parentheses already allow the chain to span lines, so no
# backslash continuations are needed.
scb = (
    ss.builder
    .appName("Analysis")
    .master("spark://spark-master:7077")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "10g")
    .config("spark.cores.max", "4")
)

In [3]:
# Pass the builder through delta's configure_spark_with_delta_pip helper, then
# get the existing SparkSession or create a new one from it.
sprk = cdp(scb).getOrCreate()

In [4]:
# Path of the Delta table that is loaded for this analysis.
fp = "/data/delta_output"

In [5]:
# Read the Delta table stored at `fp` into a DataFrame.
df = sprk.read.format("delta").load(fp)

In [6]:
# Inspect the loaded table: column types first, then the first five rows.
df.printSchema()
df.show(5)

root
 |-- signal_date: date (nullable = true)
 |-- signal_ts: timestamp (nullable = true)
 |-- create_date: date (nullable = true)
 |-- create_ts: timestamp (nullable = true)
 |-- signals: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-----------+-------------------+-----------+--------------------+--------------------+
|signal_date|          signal_ts|create_date|           create_ts|             signals|
+-----------+-------------------+-----------+--------------------+--------------------+
| 2018-01-01|2018-01-01 00:00:00| 2025-03-27|2025-03-27 03:00:...|{LV ActivePower (...|
| 2018-01-01|2018-01-01 00:00:00| 2025-03-27|2025-03-27 03:00:...|{LV ActivePower (...|
| 2018-01-01|2018-01-01 00:10:00| 2025-03-27|2025-03-27 03:00:...|{LV ActivePower (...|
| 2018-01-01|2018-01-01 00:10:00| 2025-03-27|2025-03-27 03:00:...|{LV ActivePower (...|
| 2018-01-01|2018-01-01 00:20:00| 2025-03-27|2025-03-27 03:00:...|{LV ActivePower (...|
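+-----------+-------------------+-----------+--------------------+--------------------+
only showing top 5 rows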

In [7]:
# For each calendar day, count how many distinct signal timestamps were recorded.
dts = df.groupBy("signal_date").agg(
    countDistinct("signal_ts").alias("distinct_ts_count")
)


In [8]:
# Preview five of the per-day distinct-timestamp counts.
dts.show(5)

+-----------+-----------------+
|signal_date|distinct_ts_count|
+-----------+-----------------+
| 2018-08-10|              144|
| 2018-05-28|              144|
| 2018-03-17|              144|
| 2018-06-06|              144|
| 2018-06-26|              141|
+-----------+-----------------+
only showing top 5 rows



In [9]:
# Column expressions that look up one entry of the `signals` map (string values
# per the printed schema) and cast it to double so it can be averaged.
lv_active_power_col = col("signals").getItem("LV ActivePower (kW)").cast("double")
wind_speed_col = col("signals").getItem("Wind Speed (m/s)").cast("double")
theoretical_col = col("signals").getItem("Theoretical_Power_Curve (KWh)").cast("double")
wind_dir_col = col("signals").getItem("Wind Direction (°)").cast("double")

In [10]:
# Average each signal per (signal_date, hour of signal_ts) bucket.
# The aggregates are generated from (expression, output name) pairs so the
# four averages are declared in one table.
grouped = df.groupBy(
    "signal_date",
    hour("signal_ts").alias("hour_of_day"),
).agg(
    *[
        avg(signal_expr).alias(output_name)
        for signal_expr, output_name in (
            (lv_active_power_col, "avg_active_power"),
            (wind_speed_col, "avg_wind_speed"),
            (theoretical_col, "avg_theoretical_power_curve"),
            (wind_dir_col, "avg_wind_direction"),
        )
    ]
)

In [11]:
# Preview five of the hourly averages.
grouped.show(5)

+-----------+-----------+------------------+------------------+---------------------------+------------------+
|signal_date|hour_of_day|  avg_active_power|    avg_wind_speed|avg_theoretical_power_curve|avg_wind_direction|
+-----------+-----------+------------------+------------------+---------------------------+------------------+
| 2018-01-01|         20|3205.9163411458317|11.479263305664015|          3406.212312613832|  197.523200988769|
| 2018-01-04|         23| 789.4284159342432| 6.417605241139726|          783.0409379176934|26.087587992350205|
| 2018-01-14|         12|               0.0|  8.70398680369059|         1958.5847292657152|33.799146970113064|
| 2018-01-15|          4|               0.0| 6.353489796320594|          754.0327970967259| 73.85050710042314|
| 2018-01-21|         15| 3527.467529296873|13.921771685282335|                     3600.0|177.95908610026004|
+-----------+-----------+------------------+------------------+---------------------------+------------------+
only showing top 5 rows

In [12]:
# Label every hourly bucket by its average active power:
#   < 200 -> "Low", < 600 -> "Medium", < 1000 -> "High", otherwise "Exceptional".
# `when` branches are evaluated in order, so each branch only needs its upper
# bound; the lower bound is implied by the previous branch not matching.
#
# A NULL average (e.g. no castable readings in that hour) makes every comparison
# evaluate to NULL, which previously fell through to "Exceptional". It is now
# mapped to a NULL indicator so missing data is not reported as peak generation.
avg_power = col("avg_active_power")
gen_indicator = grouped.withColumn(
    "generation_indicator",
    when(avg_power.isNull(), lit(None).cast("string"))
    .when(avg_power < 200, "Low")
    .when(avg_power < 600, "Medium")
    .when(avg_power < 1000, "High")
    .otherwise("Exceptional"),
)

In [13]:
# Preview five rows with the new generation_indicator column.
gen_indicator.show(5)

+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
|signal_date|hour_of_day|  avg_active_power|    avg_wind_speed|avg_theoretical_power_curve|avg_wind_direction|generation_indicator|
+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
| 2018-01-01|         20|3205.9163411458317|11.479263305664015|          3406.212312613832|  197.523200988769|         Exceptional|
| 2018-01-04|         23| 789.4284159342432| 6.417605241139726|          783.0409379176934|26.087587992350205|                High|
| 2018-01-14|         12|               0.0|  8.70398680369059|         1958.5847292657152|33.799146970113064|                 Low|
| 2018-01-15|          4|               0.0| 6.353489796320594|          754.0327970967259| 73.85050710042314|                 Low|
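| 2018-01-21|         15| 3527.467529296873|13.921771685282335|                     3600.0|177.95908610026004|         Exceptional|
+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
only showing top 5 rows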

In [14]:
# Lookup rows pairing each raw signal key with the name used for its average.
# Written as a mapping and flattened to (sig_name, sig_mapping_name) tuples;
# dict insertion order keeps the rows in the order listed here.
data = list(
    {
        "LV ActivePower (kW)": "active_power_average",
        "Wind Speed (m/s)": "wind_speed_average",
        "Theoretical_Power_Curve (KWh)": "theo_power_curve_average",
        "Wind Direction (°)": "wind_direction_average",
    }.items()
)

In [15]:
# Schema of the signal-name lookup table: two nullable string columns.
schema = st(
    [
        sf(field_name, srt(), True)
        for field_name in ("sig_name", "sig_mapping_name")
    ]
)

In [16]:
# Materialise the (sig_name, sig_mapping_name) lookup rows as a Spark DataFrame.
new_df = sprk.createDataFrame(data, schema)

In [17]:
# Reshape the wide hourly averages into long form: one row per
# (signal_date, hour_of_day, signal).
# Step 1 packs the four averages into a map keyed by the raw signal name.
# Step 2 explodes that map so each entry becomes its own (sig_name, value) row.
metrics_map = create_map(
    *[
        map_part
        for signal_key, avg_column in (
            ("LV ActivePower (kW)", "avg_active_power"),
            ("Wind Speed (m/s)", "avg_wind_speed"),
            ("Theoretical_Power_Curve (KWh)", "avg_theoretical_power_curve"),
            ("Wind Direction (°)", "avg_wind_direction"),
        )
        # create_map expects alternating key, value columns.
        for map_part in (lit(signal_key), col(avg_column))
    ]
).alias("metrics")

formatted_df = gen_indicator.select(
    "signal_date",
    "hour_of_day",
    "generation_indicator",
    metrics_map,
).select(
    "signal_date",
    "hour_of_day",
    "generation_indicator",
    explode(col("metrics")).alias("sig_name", "value"),
)

In [18]:
# Attach the mapped signal name to every long-form row. The lookup table is
# marked with a broadcast hint, and the left join keeps rows whose sig_name has
# no mapping. The final select fixes the output column order.
joined_df = formatted_df.join(
    broadcast(new_df),
    "sig_name",
    "left",
).select(
    *[
        "signal_date",
        "hour_of_day",
        "generation_indicator",
        "sig_name",
        "sig_mapping_name",
        "value",
    ]
)

In [19]:
# Display the joined long-form result (show() with no argument prints up to 20 rows).
joined_df.show()

+-----------+-----------+--------------------+--------------------+--------------------+------------------+
|signal_date|hour_of_day|generation_indicator|            sig_name|    sig_mapping_name|             value|
+-----------+-----------+--------------------+--------------------+--------------------+------------------+
| 2018-01-01|         20|         Exceptional| LV ActivePower (kW)|active_power_average|3205.9163411458317|
| 2018-01-01|         20|         Exceptional|    Wind Speed (m/s)|  wind_speed_average|11.479263305664015|
| 2018-01-01|         20|         Exceptional|Theoretical_Power...|theo_power_curve_...| 3406.212312613832|
| 2018-01-01|         20|         Exceptional|  Wind Direction (°)|wind_direction_av...|  197.523200988769|
| 2018-01-04|         23|                High| LV ActivePower (kW)|active_power_average| 789.4284159342432|
| 2018-01-04|         23|                High|    Wind Speed (m/s)|  wind_speed_average| 6.417605241139726|
| 2018-01-04|         23|   

In [20]:
# Stop the SparkSession now that the analysis is finished.
sprk.stop()