# Prepare Europe Wind Datasets

1. The Europe wind dataset is available from https://www.uni-kassel.de/eecs/ies/downloads
  on request.
2. Once obtained, extract the data into your `TORCH_DATA_DIR` directory so that the
  per-wind-farm CSV files are found at `$TORCH_DATA_DIR/EuropeWindFarm/data/wf<N>.csv`.

In [None]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
from os import environ
import h5py


# Root directory for all datasets, supplied through the TORCH_DATA_DIR
# environment variable. Fail early with an actionable message: without this
# check an unset variable reaches Path(None), which raises an opaque TypeError.
if "TORCH_DATA_DIR" not in environ:
    raise RuntimeError(
        "The TORCH_DATA_DIR environment variable is not set; point it at the "
        "directory that contains the extracted EuropeWindFarm folder."
    )
data_directory = Path(environ["TORCH_DATA_DIR"]).expanduser().resolve()
# Directory holding the per-wind-farm CSV files inside the extracted archive.
eurowind_data = data_directory / "EuropeWindFarm/data"

In [None]:
# https://developer.nvidia.com/blog/three-approaches-to-encoding-time-information-as-features-for-ml-models/
from sklego.preprocessing import RepeatingBasisFunction


def rbf_day_of_year(day_of_year: np.ndarray) -> np.ndarray:
    """Encode day-of-year values with a 12-period repeating basis function.

    Args:
        day_of_year: One day-of-year value per sample. The encoder is
            configured with the input range 1 to 365.

    Returns:
        The result of ``RepeatingBasisFunction.transform`` on the values
        (expected to hold one column per period, i.e. 12 columns).
    """
    column = "day_of_year"
    frame = pd.DataFrame({column: day_of_year})
    encoder = RepeatingBasisFunction(
        n_periods=12,
        input_range=(1, 365),
        remainder="drop",
        column=column,
    )
    # The encoder is fitted on the very same values it then transforms.
    encoder.fit(frame)
    return encoder.transform(frame)


def rbf_hour_of_day(hour_of_day: np.ndarray) -> np.ndarray:
    """Encode hour-of-day values with a 6-period repeating basis function.

    Args:
        hour_of_day: One hour value per sample. The encoder is configured
            with the input range 0 to 23.

    Returns:
        The result of ``RepeatingBasisFunction.transform`` on the values
        (expected to hold one column per period, i.e. 6 columns).
    """
    column = "hour_of_day"
    frame = pd.DataFrame({column: hour_of_day})
    # NOTE(review): if the encoder treats input_range as one full period,
    # hours 0 and 23 would receive the same encoding - confirm whether
    # (0, 24) was intended.
    encoder = RepeatingBasisFunction(
        n_periods=6,
        input_range=(0, 23),
        remainder="drop",
        column=column,
    )
    # The encoder is fitted on the very same values it then transforms.
    encoder.fit(frame)
    return encoder.transform(frame)


def rbf_forecasting_hour(forecasting_hour: np.ndarray) -> np.ndarray:
    """Encode forecasting-hour values with a 6-period repeating basis function.

    Args:
        forecasting_hour: One forecasting-hour value per sample. The encoder
            is configured with the input range 0 to 48.

    Returns:
        The result of ``RepeatingBasisFunction.transform`` on the values
        (expected to hold one column per period, i.e. 6 columns).
    """
    column = "forecasting_hour"
    frame = pd.DataFrame({column: forecasting_hour})
    encoder = RepeatingBasisFunction(
        n_periods=6,
        input_range=(0, 48),
        remainder="drop",
        column=column,
    )
    # The encoder is fitted on the very same values it then transforms.
    encoder.fit(frame)
    return encoder.transform(frame)

In [None]:
def process_wf_df(
    df: pd.DataFrame, windfarm_id: int, windfarm_count: int
) -> pd.DataFrame:
    """Turn one raw wind farm frame into a feature frame.

    The frame is modified in place and also returned. Columns added, in order:

    * ``rbf_day_of_year_00`` .. ``rbf_day_of_year_11``
    * ``rbf_hour_of_day_00`` .. ``rbf_hour_of_day_05``
    * ``rbf_forecasting_time_00`` .. ``rbf_forecasting_time_05``
    * ``windfarm_id_00`` .. one indicator column per index below
      ``windfarm_count``; only the column for ``windfarm_id`` is set to 1
    * ``year``
    * ``timestamp`` (hours since 1970-01-01T00, used for chronological sorting)

    The ``Time`` and ``ForecastingTime`` columns are dropped.

    Args:
        df: Raw frame with a ``Time`` column whose entries ``np.datetime64``
            can parse at hour resolution, and a ``ForecastingTime`` column.
        windfarm_id: Zero-based index of this wind farm.
        windfarm_count: Number of indicator columns to create.

    Returns:
        The same ``df`` object, extended as described above.
    """
    dates = np.array(df["Time"].map(lambda x: np.datetime64(x, "h")))
    as_months = dates.astype("datetime64[M]")

    year = dates.astype("datetime64[Y]").astype(int) + 1970
    month = as_months.astype(int) % 12 + 1
    day_of_month = (dates.astype("datetime64[D]") - as_months + 1).astype(int)
    timestamp = dates.astype("datetime64[h]").astype(int)
    hour_of_day = timestamp % 24

    # Day of year on a fixed 365-day (non-leap) calendar, matching the 1..365
    # input range of the day-of-year encoder. A vectorised lookup replaces the
    # per-row ``datetime.datetime(1, month, day)`` construction, which raised
    # ValueError for Feb 29 because year 1 is not a leap year. Feb 29 now maps
    # to day 60 (the same index as Mar 1); all other dates are unchanged.
    days_before_month = np.array(
        [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]
    )
    day_of_year = days_before_month[month - 1] + day_of_month

    # (column prefix, encoded values, number of encoded columns to copy)
    encodings = (
        ("rbf_day_of_year", rbf_day_of_year(day_of_year), 12),
        ("rbf_hour_of_day", rbf_hour_of_day(hour_of_day), 6),
        (
            "rbf_forecasting_time",
            rbf_forecasting_hour(np.array(df["ForecastingTime"])),
            6,
        ),
    )
    for prefix, encoded, n_columns in encodings:
        for i in range(n_columns):
            df[f"{prefix}_{i:02d}"] = encoded[:, i]

    # Add dummy variables for windfarm_id
    for i in range(windfarm_count):
        df[f"windfarm_id_{i:02d}"] = 0
    df[f"windfarm_id_{windfarm_id:02d}"] = 1

    df["year"] = year
    df.drop(columns=["Time", "ForecastingTime"], inplace=True)
    df["timestamp"] = timestamp
    return df

In [None]:
# Width of the one-hot wind farm encoding handed to process_wf_df.
# The loop reads wf1.csv .. wf45.csv and gives them the zero-based ids 0..44,
# so the last indicator column (index 45) is never set to 1.
windfarm_count = 46
wf_df = []
for windfarm_id in range(1, windfarm_count):
    csv = eurowind_data.joinpath(f"wf{windfarm_id:d}.csv")
    wf_df += [process_wf_df(pd.read_csv(csv), windfarm_id - 1, windfarm_count)]

In [None]:
# Combine all the datasets into one by chronological order.
# Every frame in wf_df is used: slicing to ``wf_df[:3]`` here would keep only
# the first three wind farms and contradict the "all datasets" intent.
# The stable sort keeps rows that share a timestamp in their concatenation
# order, so the resulting row order is reproducible.
df_full = pd.concat(wf_df).sort_values(by="timestamp", kind="stable")
# The timestamp was only needed for ordering and is not a model feature.
df_full = df_full.drop(columns=["timestamp"])
# Print the positional index of every remaining column for reference.
for i, feature in enumerate(df_full.columns):
    print(i, feature)

In [None]:
from torch import Tensor

# Split the frame into the feature matrix and the PowerGeneration target.
x_dataset = Tensor(df_full.drop(columns=["PowerGeneration"]).values)
y_dataset = Tensor(df_full["PowerGeneration"].values)

# Standardize the features and the target: subtract the mean and divide by the
# standard deviation along dim 0, then hand the results back to NumPy as
# float32 arrays.
x_dataset = x_dataset.sub(x_dataset.mean(dim=0)).div(x_dataset.std(dim=0))
y_dataset = y_dataset.sub(y_dataset.mean(dim=0)).div(y_dataset.std(dim=0))
x_dataset = x_dataset.numpy().astype(np.float32)
y_dataset = y_dataset.numpy().astype(np.float32)

print("x_dataset.shape:", x_dataset.shape)
print("y_dataset.shape:", y_dataset.shape)
# Report how many NaNs the standardization left behind.
print("Number of NaNs in x_dataset:", np.count_nonzero(np.isnan(x_dataset)))
print("Number of NaNs in y_dataset:", np.count_nonzero(np.isnan(y_dataset)))

# A column with zero standard deviation becomes NaN (0 / 0) above; replace the
# NaNs in the features with 0.0. The target array is left as is.
x_dataset = np.nan_to_num(x_dataset, nan=0.0)

In [None]:
# Write the prepared arrays to a single HDF5 file inside the data directory.
hdf5_filename = data_directory / "eurowind.hdf5"

with h5py.File(hdf5_filename, "w") as f:
    # Both datasets are stored as gzip-compressed float32.
    storage_options = {"dtype": "float32", "compression": "gzip"}
    f.create_dataset("x_features", data=x_dataset, **storage_options)
    # The targets are stored as a single-column 2-D array.
    f.create_dataset(
        "y_targets", data=y_dataset.reshape(-1, 1), **storage_options
    )