In [29]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    countDistinct,
    hour,
    avg,
    when,
    lit,
    col,
    create_map,
    explode,
    broadcast
)

from pyspark.sql.types import (
    StructType as st,
    StructField as sf,
    StringType as srt
)

In [2]:
# Assemble the session builder one setting at a time.
builder = SparkSession.builder.appName("Task3-Analysis")
builder = builder.master("spark://spark-master:7077")

# Register Delta Lake's SQL extension and its catalog implementation on the session.
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through delta's pip configuration helper, then get or create the session.
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

# Set the Spark context's log level to WARN.
spark_context = spark.sparkContext
spark_context.setLogLevel("WARN")

In [4]:
# Location of the Delta table that this notebook loads; adjust if the table lives elsewhere.
delta_path = "/data/delta_output"

In [5]:
# Read the Delta table at `delta_path` into a DataFrame.
df = spark.read.load(delta_path, format="delta")

In [6]:
# Print the column names, types and nullability of the loaded table.
df.printSchema()

root
 |-- signal_date: date (nullable = true)
 |-- signal_ts: timestamp (nullable = true)
 |-- create_date: date (nullable = true)
 |-- create_ts: timestamp (nullable = true)
 |-- signals: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [7]:
# Preview the first five rows without truncating wide cell values.
df.show(n=5, truncate=False)

+-----------+-------------------+-----------+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|signal_date|signal_ts          |create_date|create_ts              |signals                                                                                                                                                                  |
+-----------+-------------------+-----------+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2018-01-02 |2018-01-02 16:30:00|2025-03-24 |2025-03-24 09:18:12.661|{LV ActivePower (kW) -> 3589.85888671875, Wind Speed (m/s) -> 13.6978597640991, Theoretical_Power_Curve (KWh) -> 3600.0, Wind Direction (°) -> 204.802307128906}         |
|2018-01-02 |2018-01-02 22:20:00|2025-03

In [15]:
# Count the distinct signal timestamps observed on each signal date.
distinct_ts_per_day = countDistinct(col("signal_ts")).alias("distinct_ts_count")
daily_distinct_ts = df.groupBy(col("signal_date")).agg(distinct_ts_per_day)

In [16]:
# Display the per-day counts (show() defaults written out: 20 rows, truncated cells).
daily_distinct_ts.show(n=20, truncate=True)

+-----------+-----------------+
|signal_date|distinct_ts_count|
+-----------+-----------------+
| 2018-01-11|              144|
| 2018-01-08|              144|
| 2018-01-14|               90|
| 2018-01-13|              144|
| 2018-01-05|              144|
| 2018-01-12|              143|
| 2018-01-04|              127|
| 2018-01-02|              144|
| 2018-01-03|              144|
| 2018-01-09|              144|
| 2018-01-06|              140|
| 2018-01-01|              144|
| 2018-01-07|              144|
| 2018-01-10|              144|
+-----------+-----------------+



In [17]:
def _signal_as_double(signal_name):
    """Return the `signals` map entry named `signal_name`, cast to double."""
    return col("signals")[signal_name].cast("double")


# One numeric column expression per signal stored in the `signals` map.
lv_active_power_col = _signal_as_double("LV ActivePower (kW)")
wind_speed_col = _signal_as_double("Wind Speed (m/s)")
theoretical_col = _signal_as_double("Theoretical_Power_Curve (KWh)")
wind_dir_col = _signal_as_double("Wind Direction (°)")

In [18]:
# Grouping keys: the signal date plus the hour extracted from the signal timestamp.
hourly_keys = [
    col("signal_date"),
    hour(col("signal_ts")).alias("hour_of_day"),
]

# Mean of each signal within a (date, hour) group.
hourly_means = [
    avg(lv_active_power_col).alias("avg_active_power"),
    avg(wind_speed_col).alias("avg_wind_speed"),
    avg(theoretical_col).alias("avg_theoretical_power_curve"),
    avg(wind_dir_col).alias("avg_wind_direction"),
]

grouped = df.groupBy(*hourly_keys).agg(*hourly_means)

In [19]:
# Preview ten hourly aggregates without truncating the values.
grouped.show(n=10, truncate=False)

+-----------+-----------+------------------+------------------+---------------------------+------------------+
|signal_date|hour_of_day|avg_active_power  |avg_wind_speed    |avg_theoretical_power_curve|avg_wind_direction|
+-----------+-----------+------------------+------------------+---------------------------+------------------+
|2018-01-04 |23         |789.4284159342433 |6.417605241139726 |783.0409379176934          |26.087587992350205|
|2018-01-14 |12         |0.0               |8.70398680369059  |1958.5847292657147         |33.799146970113064|
|2018-01-01 |20         |3205.9163411458317|11.479263305664015|3406.212312613832          |197.523200988769  |
|2018-01-12 |21         |306.1325632731113 |4.951312621434526 |324.8236997973515          |65.52038892110185 |
|2018-01-09 |13         |210.1709823608395 |4.615528742472325 |248.0416888358253          |70.92443339029946 |
|2018-01-09 |3          |185.00654856363883|4.491092522939043 |225.67291365344818         |268.2254028320307 |
|

In [20]:
# Band each hourly average of active power into a generation indicator:
#   below 200     -> "Low"
#   [200, 600)    -> "Medium"
#   [600, 1000)   -> "High"
#   1000 or above -> "Exceptional"
# `when` branches are evaluated in order, so each branch only needs its upper bound.
# There is deliberately no `.otherwise(...)`: a NULL average (an hour with no usable
# active-power readings) matches no branch and therefore stays NULL instead of being
# labelled as the top band.
avg_power = col("avg_active_power")

gen_indicator_df = grouped.withColumn(
    "generation_indicator",
    when(avg_power < 200, "Low")
    .when(avg_power < 600, "Medium")
    .when(avg_power < 1000, "High")
    .when(avg_power >= 1000, "Exceptional"),
)

In [21]:
# Preview ten rows including the new generation_indicator column.
gen_indicator_df.show(n=10, truncate=False)

+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
|signal_date|hour_of_day|avg_active_power  |avg_wind_speed    |avg_theoretical_power_curve|avg_wind_direction|generation_indicator|
+-----------+-----------+------------------+------------------+---------------------------+------------------+--------------------+
|2018-01-04 |23         |789.4284159342433 |6.417605241139726 |783.0409379176934          |26.087587992350205|High                |
|2018-01-14 |12         |0.0               |8.70398680369059  |1958.5847292657147         |33.799146970113064|Low                 |
|2018-01-01 |20         |3205.9163411458317|11.479263305664015|3406.212312613832          |197.523200988769  |Exceptional         |
|2018-01-12 |21         |306.1325632731113 |4.951312621434526 |324.8236997973515          |65.52038892110185 |Medium              |
|2018-01-09 |13         |210.1709823608395 |4.615528742472325 |248.041688835

In [22]:
# (signal name, mapped name) rows, written as a dict and flattened to a list of tuples.
mapping_data = list(
    {
        "LV ActivePower (kW)": "active_power_average",
        "Wind Speed (m/s)": "wind_speed_average",
        "Theoretical_Power_Curve (KWh)": "theo_power_curve_average",
        "Wind Direction (°)": "wind_direction_average",
    }.items()
)

In [23]:
# Two nullable string columns: the signal name and the name it maps to.
mapping_schema = st(
    [sf(field_name, srt(), True) for field_name in ("sig_name", "sig_mapping_name")]
)

In [24]:
# Materialise the mapping rows as a DataFrame with the explicit schema.
mapping_df = spark.createDataFrame(data=mapping_data, schema=mapping_schema)

In [27]:
# Columns carried through unchanged while the averaged metrics are unpivoted.
id_columns = ["signal_date", "hour_of_day", "generation_indicator"]

# (signal name, aggregated column) pairs, in the order they enter the map.
metric_pairs = [
    ("LV ActivePower (kW)", "avg_active_power"),
    ("Wind Speed (m/s)", "avg_wind_speed"),
    ("Theoretical_Power_Curve (KWh)", "avg_theoretical_power_curve"),
    ("Wind Direction (°)", "avg_wind_direction"),
]

# create_map takes alternating key/value columns: key1, value1, key2, value2, ...
metrics_map = create_map(
    *[
        part
        for signal_name, avg_column in metric_pairs
        for part in (lit(signal_name), col(avg_column))
    ]
).alias("metrics")

# Exploding the map yields one row per entry, split into a key column and a value column.
long_format_df = (
    gen_indicator_df
    .select(*id_columns, metrics_map)
    .select(*id_columns, explode(col("metrics")).alias("sig_name", "value"))
)

In [30]:
# Final column order of the joined result.
output_columns = [
    "signal_date",
    "hour_of_day",
    "generation_indicator",
    "sig_name",
    "sig_mapping_name",
    "value",
]

# Left-join the mapping (with a broadcast hint) on the signal name, then fix the column order.
joined_df = (
    long_format_df
    .join(broadcast(mapping_df), "sig_name", "left")
    .select(*output_columns)
)

In [32]:
# Preview twenty rows of the joined long-format result without truncation.
joined_df.show(n=20, truncate=False)

+-----------+-----------+--------------------+-----------------------------+------------------------+------------------+
|signal_date|hour_of_day|generation_indicator|sig_name                     |sig_mapping_name        |value             |
+-----------+-----------+--------------------+-----------------------------+------------------------+------------------+
|2018-01-04 |23         |High                |LV ActivePower (kW)          |active_power_average    |789.4284159342433 |
|2018-01-04 |23         |High                |Wind Speed (m/s)             |wind_speed_average      |6.417605241139726 |
|2018-01-04 |23         |High                |Theoretical_Power_Curve (KWh)|theo_power_curve_average|783.0409379176934 |
|2018-01-04 |23         |High                |Wind Direction (°)           |wind_direction_average  |26.087587992350205|
|2018-01-14 |12         |Low                 |LV ActivePower (kW)          |active_power_average    |0.0               |
|2018-01-14 |12         |Low    

In [33]:
# Stop the Spark session at the end of the notebook.
spark.stop()