In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession as ss
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler as va
from pyspark.ml.regression import LinearRegression as lreg
from pyspark.ml.evaluation import RegressionEvaluator as re

```
------------------------------------------------------------------------------
1. Initialize SparkSession (with Delta support)
------------------------------------------------------------------------------
```

Configure Spark to handle Delta Lake

In [2]:
builder = (
    ss.builder
        .appName("WindPowerPrediction")
        .master("spark://spark-master:7077")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

In [3]:
sprk = configure_spark_with_delta_pip(builder).getOrCreate()

```
------------------------------------------------------------------------------
2. Read the Delta table
------------------------------------------------------------------------------
```

Path where Kafka Subscriber wrote Delta data

In [4]:
fp = "/data/delta_output"

In [5]:
df = sprk.read.format("delta").load(fp)

The `signals` column is a map that includes:  
"LV ActivePower (kW)", "Wind Speed (m/s)", etc.

```
------------------------------------------------------------------------------
3. Extract numeric columns from `signals` MapType
------------------------------------------------------------------------------
```

In [6]:
df = df.withColumn("active_power", col("signals")["LV ActivePower (kW)"].cast("double")) \
        .withColumn("wind_speed", col("signals")["Wind Speed (m/s)"].cast("double")) \
        .withColumn("theoretical_curve", col("signals")["Theoretical_Power_Curve (KWh)"].cast("double")) \
        .withColumn("wind_direction", col("signals")["Wind Direction (°)"].cast("double"))

Drop rows with nulls in critical columns we need

In [7]:
df = df.na.drop(subset=["active_power", "wind_speed", "theoretical_curve", "wind_direction"])

```
------------------------------------------------------------------------------
4. Build features & label, then split into train/test
------------------------------------------------------------------------------
```

In [8]:
feature_cols = ["wind_speed", "theoretical_curve", "wind_direction"]
assembler = va(inputCols=feature_cols, outputCol="features")

Our prediction target is `active_power`

In [9]:
assembled = assembler.transform(df).select("signal_date", "signal_ts", "features", "active_power")

Split into train and test sets (80/20)

In [10]:
train_data, test_data = assembled.randomSplit([0.8, 0.2], seed=42)

```
------------------------------------------------------------------------------
5. Train a simple Regression Model (e.g., LinearRegression)
------------------------------------------------------------------------------
```

In [11]:
lr = lreg(featuresCol="features", labelCol="active_power", maxIter=50)
model = lr.fit(train_data)

```
------------------------------------------------------------------------------
6. Evaluate on the test set
------------------------------------------------------------------------------
```

In [12]:
predictions = model.transform(test_data)

Use a RegressionEvaluator to compute RMSE and R2

In [13]:
evaluator = re(
    labelCol="active_power",
    predictionCol="prediction",
    metricName="rmse"
)

In [14]:
rmse = evaluator.evaluate(predictions)

In [15]:
r2 = re(
    labelCol="active_power",
    predictionCol="prediction",
    metricName="r2"
).evaluate(predictions)

In [16]:
print(f"Test RMSE: {rmse}")
print(f"Test R^2: {r2}")

Test RMSE: 452.8843348101753
Test R^2: 0.8838325414715553


```
------------------------------------------------------------------------------
7. Predicting Active Power for a single day (example)
------------------------------------------------------------------------------
```
We'll filter the dataset to a specific day, e.g. "2018-01-01"

In [17]:
day_to_predict = "2018-02-15"
future_df = assembled.filter(col("signal_date") == day_to_predict)

Generate predictions

In [18]:
future_preds = model.transform(future_df).select("signal_date", "signal_ts", "prediction")
future_preds.show(30, truncate=False)

+-----------+-------------------+-------------------+
|signal_date|signal_ts          |prediction         |
+-----------+-------------------+-------------------+
|2018-02-15 |2018-02-15 16:50:00|1842.2359607784429 |
|2018-02-15 |2018-02-15 02:50:00|3046.9058302870276 |
|2018-02-15 |2018-02-15 17:10:00|1962.0427332971472 |
|2018-02-15 |2018-02-15 09:10:00|-47.7184928009998  |
|2018-02-15 |2018-02-15 13:40:00|811.0757053286508  |
|2018-02-15 |2018-02-15 14:10:00|1119.8095100961873 |
|2018-02-15 |2018-02-15 02:10:00|2617.2768085733346 |
|2018-02-15 |2018-02-15 06:20:00|428.51316322533773 |
|2018-02-15 |2018-02-15 10:20:00|-146.97429046975492|
|2018-02-15 |2018-02-15 21:50:00|2508.800139550723  |
|2018-02-15 |2018-02-15 13:20:00|680.8275449625626  |
|2018-02-15 |2018-02-15 19:00:00|2295.629551533294  |
|2018-02-15 |2018-02-15 20:40:00|2664.959560086498  |
|2018-02-15 |2018-02-15 08:10:00|444.07656189057207 |
|2018-02-15 |2018-02-15 19:20:00|2416.3451641979213 |
|2018-02-15 |2018-02-15 05:3

Stop the Spark session

In [19]:
sprk.stop()