# Generating Synthetic PV Power Time Series

In [7]:
import os
import yaml
import numpy as np
import pandas as pd
import pvlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

import utils, clean_data, generate_pv

In [8]:
config_path = "config.yaml"
config = utils.load_config(config_path)

dir = os.path.join(config['data']['raw_dir'], 'solar')
features = config['features']
params = config['wind_params']

pv_features, _ = clean_data.relevant_features(features=features)

plot_names = ['Total', 'Direct', 'Diffuse']

In [9]:
files = os.listdir(dir)
file = files[0]

data = pd.read_csv(os.path.join(dir, file))
data['timestamp'] = pd.to_datetime(data['timestamp'])
#df['timestamp'] = df['timestamp'].dt.tz_localize("UTC").dt.tz_convert("Europe/Berlin")
data.set_index('timestamp', inplace=True)

IndexError: list index out of range

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 79200 entries, 2023-05-26 00:00:00 to 2024-11-25 23:50:00
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   STATIONS_ID    79200 non-null  int64  
 1   PP_10          79200 non-null  float64
 2   TT_10          79200 non-null  float64
 3   TM5_10         79200 non-null  float64
 4   RF_10          79200 non-null  float64
 5   TD_10          79200 non-null  float64
 6   FF_10          79200 non-null  float64
 7   DD_10          79200 non-null  int64  
 8   DS_10          79200 non-null  float64
 9   GS_10          79200 non-null  float64
 10  SD_10          79200 non-null  float64
 11  LS_10          79200 non-null  int64  
 12  Stationshoehe  79200 non-null  float64
 13  geoBreite      79200 non-null  float64
 14  geoLaenge      79200 non-null  float64
dtypes: float64(12), int64(3)
memory usage: 9.7 MB


In [None]:
irrelevant_features = ['STATIONS_ID', 'Stationshoehe', 'geoBreite', 'geoLaenge']

stations_id = data[irrelevant_features[0]].iloc[0]
stations_height = data[irrelevant_features[1]].iloc[0]
latitude = data[irrelevant_features[2]].iloc[0]
longitude = data[irrelevant_features[3]].iloc[0]

data.drop(irrelevant_features, axis=1, inplace=True)

### Analysis of missing values

In [None]:
data.isna().sum()

In [None]:
missing_per_column_per_day = data.groupby(data.index.date).apply(lambda x: x.isna().sum())
missing_per_column_per_day[(missing_per_column_per_day != 0).any(axis=1)].drop(['STATIONS_ID', 'Stationshoehe', 'geoBreite', 'geoLaenge'], axis=1)

We can try to impute with KNNImputer

To help KNNImputer estimating the temporal saisonalities we add encoded temporal features.

In [None]:
data['hour_sin'] = np.sin(2 * np.pi * data.index.hour / 24)
data['hour_cos'] = np.cos(2 * np.pi * data.index.hour / 24)
data['month_sin'] = np.sin(2 * np.pi * data.index.month / 12)
data['month_cos'] = np.cos(2 * np.pi * data.index.month / 12)

We have to determine the optimal number of neighbors in beforehand.

In [None]:
# to be done

In [None]:
imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(data)
df = pd.DataFrame(scaler.inverse_transform(imputer.fit_transform(df_scaled)), columns=data.columns, index=data.index)
df.drop(['hour_sin', 'hour_cos', 'month_sin', 'month_cos'], axis=1, inplace=True)

In [None]:
df

In [None]:
def plot_power_and_features(day: str,
                            plot_names: list,
                            features: list,
                            power: pd.Series,
                            synchronize_axes=True,
                            save_fig=False
                            ):

    day = pd.Timestamp(day)
    index_0 = power.index.get_loc(day)
    index_1 = power.index.get_loc(day + pd.Timedelta(days=1))
    date = str(features[0].index[index_0:index_1][0].date())

    fig, ax1 = plt.subplots(figsize=(10, 6))
    fontsize = 14
    lines = []
    title_suffix = ''

    # plot power
    line1, = ax1.plot(
    power[index_0:index_1],
    label="Power Output (W)",
    color="black",
    linewidth=2.0
    )
    lines.append(line1)

    # configure secondary y-axis
    ax1.set_xlabel("Time", fontsize=fontsize)
    ax1.set_ylabel("Power Output (W)", fontsize=fontsize)
    ax1.tick_features(axis='y', labelsize=fontsize)
    ax1.tick_features(axis='x', labelsize=fontsize-2)

    ax2 = ax1.twinx()
    # plot irradiance components
    for name, series in zip(plot_names, features):
        line, = ax2.plot(
            series[index_0:index_1],
            label=f"{name} (W/m$^2$)",
            linestyle='--',
            linewidth=2.0
        )
        lines.append(line)

    # configure primary y-axis
    ax2.set_ylabel("Energy flux density (W/m$^2$)", fontsize=fontsize)
    ax2.tick_features(axis='y', labelsize=fontsize)

    # Format x-axis to show only hours (HH)
    ax1.xaxis.set_major_locator(mdates.HourLocator(interval=1))
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H'))
    ticks = ax1.get_xticks()
    ax1.set_xticks(ticks[1:-1])


    # Synchronize y-axes
    if synchronize_axes:
        title_suffix = '(synched axes)'
        all_ghi_min = min([series[index_0:index_1].min() for series in features])
        all_ghi_max = max([series[index_0:index_1].max() for series in features])
        y_min = min(all_ghi_min, power[index_0:index_1].min())
        y_max = max(all_ghi_max, power[index_0:index_1].max())
        ax1.set_ylim(y_min, y_max)
        ax2.set_ylim(y_min, y_max)

    # legend
    lines.append(lines.pop(0))
    labels = [line.get_label() for line in lines]
    ax1.legend(lines, labels, loc="upper left", fontsize=fontsize)

    plt.title(f"Irradiance and Power Output on {date} {title_suffix}", fontsize=fontsize)
    fig.tight_layout()
    #plt.grid(True)
    if save_fig:
        save_path = 'figs/PV'
        os.makedirs(save_path, exist_ok=True)
        save_file = os.path.join(save_path, f'{date}.png')
        plt.savefig(save_file, dpi=300)
        plt.close()
    else:
        plt.show()

In [None]:
total_irradiance, cell_temperature = get_features(data=df,
                                                  features=features,
                                                  params=params)
total = total_irradiance['poa_global']
direct = total_irradiance['poa_direct']
diffuse = total_irradiance['poa_diffuse']
#sky_dhi = total_irradiance['poa_sky_diffuse']
#ground_dhi = total_irradiance['poa_ground_diffuse']
features = [total, direct, diffuse]

power = generate_pv_power(total_irradiance=total,
                          cell_temperature=cell_temperature,
                          params=params)

In [58]:
day = '2023-06-04'

plot_power_and_features(day=day,
                        plot_names=plot_names,
                        features=features,
                        power=power,
                        synchronize_axes=True,
                        save_fig=True)

plot_power_and_features(day=day,
                        plot_names=plot_names,
                        features=features,
                        power=power,
                        synchronize_axes=False,
                        save_fig=True)

In [None]:
for day in np.unique(df.index.date):
    plot_power_and_features(day=str(day),
                            plot_names=plot_names,
                            features=features,
                            power=power,
                            synchronize_axes=False,
                            save_fig=True)