# Generating Synthetic PV Power Time Series

In [1]:
import os
import yaml
import numpy as np
import pandas as pd
import pvlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

import utils, clean_data, generate_pv

In [2]:
# Load the project configuration; every name bound in this cell is used by later cells.
config_path = "config.yaml"
config = utils.load_config(config_path)

# Directory holding the raw solar station files.
# NOTE(review): `dir` shadows the Python builtin of the same name; renaming it
# would require touching every cell that reads it.
dir = os.path.join(config['data']['raw_dir'], 'solar')
features = config['features']
# NOTE(review): this notebook generates PV power but reads the 'wind_params'
# section, and the recorded run of the PV generation cell fails with
# KeyError: 'latitude' - confirm that this is the intended config section.
params = config['wind_params']

# Only the first return value (the PV-relevant features) is kept.
pv_features, _ = clean_data.relevant_features(features=features)

# Legend names for the irradiance series, paired positionally with the series passed to the plot function.
plot_names = ['Total', 'Direct', 'Diffuse']

In [3]:
# os.listdir returns entries in arbitrary order, so sort the listing to make
# the choice of the "first" station file reproducible across runs and machines.
files = sorted(os.listdir(dir))
if not files:
    raise FileNotFoundError(f"no input files found in {dir!r}")
file = files[0]

# Read the station file and index it by its parsed timestamps.
# NOTE(review): timestamps are kept exactly as parsed (the recorded
# data.info() output shows a UTC-aware index) - confirm that no conversion
# to local time is wanted.
data = pd.read_csv(os.path.join(dir, file))
data['timestamp'] = pd.to_datetime(data['timestamp'])
data = data.set_index('timestamp')

In [4]:
# Summarize the loaded frame: index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 94176 entries, 2023-07-24 00:00:00+00:00 to 2025-05-07 23:50:00+00:00
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   station_id      94176 non-null  int64  
 1   ghi             94119 non-null  float64
 2   dhi             94118 non-null  float64
 3   temperature_2m  94176 non-null  float64
 4   wind_speed      94115 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 4.3 MB


### Analysis of missing values

In [5]:
# Count the missing values in each column.
data.isna().sum()

station_id         0
ghi               57
dhi               58
temperature_2m     0
wind_speed        61
dtype: int64

In [6]:
# Count the NaNs per calendar day and column, then display only the days
# on which at least one value is missing.
missing_per_column_per_day = data.isna().groupby(data.index.date).sum()
missing_per_column_per_day.loc[missing_per_column_per_day.sum(axis=1) > 0]

Unnamed: 0,station_id,ghi,dhi,temperature_2m,wind_speed
2023-09-07,0,0,1,0,0
2023-09-11,0,0,0,0,3
2023-09-12,0,57,57,0,58


Only three days contain gaps, so we impute the missing values with scikit-learn's `KNNImputer`.

To help the `KNNImputer` capture daily and yearly seasonality, we add cyclically encoded (sine/cosine) hour and month features.

In [7]:
# Map hour-of-day and month-of-year onto angles so that the sine/cosine
# pair places the end of each cycle next to its start.
hour_angle = 2 * np.pi * data.index.hour / 24
month_angle = 2 * np.pi * data.index.month / 12

data['hour_sin'] = np.sin(hour_angle)
data['hour_cos'] = np.cos(hour_angle)
data['month_sin'] = np.sin(month_angle)
data['month_cos'] = np.cos(month_angle)

The optimal number of neighbors has to be determined beforehand.

In [8]:
# TODO: determine the optimal number of neighbors for the KNNImputer.

In [9]:
# The cyclical helper columns only give the imputer temporal context and are
# removed again once the gaps are filled.
helper_columns = ['hour_sin', 'hour_cos', 'month_sin', 'month_cos']

imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()

# Standardize before imputing so that the neighbor distances are not dominated
# by the columns with the largest numeric range.
df_scaled = scaler.fit_transform(data)
imputed = pd.DataFrame(scaler.inverse_transform(imputer.fit_transform(df_scaled)),
                       columns=data.columns,
                       index=data.index)

# Fill only the missing cells. Taking the whole frame from the
# scale -> impute -> inverse-scale round trip would replace every observed
# value with a floating-point reconstruction and cast the integer station_id
# column to float.
df = data.fillna(imputed).drop(columns=helper_columns)

assert not df.isna().any().any(), "imputation left missing values behind"

In [10]:
# Display the imputed frame (the temporal helper columns have been dropped) for a visual check.
df

Unnamed: 0_level_0,station_id,ghi,dhi,temperature_2m,wind_speed
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-07-24 00:00:00+00:00,7374.0,0.0,0.0,18.0,3.6
2023-07-24 00:10:00+00:00,7374.0,0.0,0.0,17.9,3.9
2023-07-24 00:20:00+00:00,7374.0,0.0,0.0,17.9,4.1
2023-07-24 00:30:00+00:00,7374.0,0.0,0.0,17.8,3.4
2023-07-24 00:40:00+00:00,7374.0,0.0,0.0,17.9,3.4
...,...,...,...,...,...
2025-05-07 23:10:00+00:00,7374.0,0.0,0.0,10.7,2.4
2025-05-07 23:20:00+00:00,7374.0,0.0,0.0,10.6,2.1
2025-05-07 23:30:00+00:00,7374.0,0.0,0.0,10.3,2.4
2025-05-07 23:40:00+00:00,7374.0,0.0,0.0,10.4,2.5


In [11]:
def _select_day(series: pd.Series, day) -> pd.Series:
    """Return the samples of ``series`` inside the 24-hour window starting at ``day``."""
    start = pd.Timestamp(day)
    index_tz = getattr(series.index, 'tz', None)
    # A tz-naive day cannot be compared with a tz-aware index, so interpret
    # the day in the time zone of the index.
    if index_tz is not None and start.tzinfo is None:
        start = start.tz_localize(index_tz)
    end = start + pd.Timedelta(days=1)
    in_window = (series.index >= start) & (series.index < end)
    return series.loc[in_window]


def plot_power_and_features(day: str,
                            plot_names: list,
                            features: list,
                            power: pd.Series,
                            synchronize_axes=True,
                            save_fig=False
                            ):
    """Plot one day of power output together with the given feature series.

    The power series is drawn on the primary (left) y-axis and each feature
    series as a dashed line on a secondary (right) y-axis.

    Parameters
    ----------
    day : str
        Start of the 24-hour window to plot; anything ``pd.Timestamp``
        accepts. A tz-naive value is interpreted in the time zone of the
        series index.
    plot_names : list
        Legend names, paired positionally with ``features``.
    features : list
        Time-indexed ``pd.Series`` objects to draw on the secondary axis.
    power : pd.Series
        Time-indexed power series to draw on the primary axis.
    synchronize_axes : bool, default True
        If True, both y-axes get the same limits, spanning all plotted series.
    save_fig : bool, default False
        If True, the figure is written to ``figs/PV`` and closed instead of
        being shown. Synchronized plots get a ``_synched`` file name suffix so
        that they do not overwrite the unsynchronized plot of the same day.

    Raises
    ------
    KeyError
        If ``power`` has no samples in the requested window.
    """
    # Select the window by timestamp comparison rather than by looking up the
    # exact start and end timestamps: the last day of the data has no
    # "next midnight" sample to look up.
    power_day = _select_day(power, day)
    if power_day.empty:
        raise KeyError(f"no power samples available for day {day!r}")
    feature_days = [_select_day(series, day) for series in features]
    date = str(pd.Timestamp(day).date())

    fontsize = 14
    title_suffix = ''
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # primary y-axis: power
    power_line, = ax1.plot(
        power_day,
        label="Power Output (W)",
        color="black",
        linewidth=2.0
    )
    ax1.set_xlabel("Time", fontsize=fontsize)
    ax1.set_ylabel("Power Output (W)", fontsize=fontsize)
    ax1.tick_params(axis='y', labelsize=fontsize)
    ax1.tick_params(axis='x', labelsize=fontsize - 2)

    # secondary y-axis: feature series (irradiance components)
    ax2 = ax1.twinx()
    feature_lines = []
    for name, series in zip(plot_names, feature_days):
        line, = ax2.plot(
            series,
            label=f"{name} (W/m$^2$)",
            linestyle='--',
            linewidth=2.0
        )
        feature_lines.append(line)
    ax2.set_ylabel("Energy flux density (W/m$^2$)", fontsize=fontsize)
    ax2.tick_params(axis='y', labelsize=fontsize)

    # Format x-axis to show only hours (HH); the outermost ticks are dropped
    ax1.xaxis.set_major_locator(mdates.HourLocator(interval=1))
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H'))
    ticks = ax1.get_xticks()
    ax1.set_xticks(ticks[1:-1])

    # Synchronize y-axes: one common range covering every plotted series
    if synchronize_axes:
        title_suffix = '(synched axes)'
        plotted = feature_days + [power_day]
        y_min = min(series.min() for series in plotted)
        y_max = max(series.max() for series in plotted)
        ax1.set_ylim(y_min, y_max)
        ax2.set_ylim(y_min, y_max)

    # legend: feature series first, power last
    lines = feature_lines + [power_line]
    labels = [line.get_label() for line in lines]
    ax1.legend(lines, labels, loc="upper left", fontsize=fontsize)

    ax1.set_title(f"Irradiance and Power Output on {date} {title_suffix}".rstrip(),
                  fontsize=fontsize)
    fig.tight_layout()

    if save_fig:
        save_path = 'figs/PV'
        os.makedirs(save_path, exist_ok=True)
        # Separate file names keep the synchronized and the unsynchronized
        # plot of the same day from overwriting each other.
        file_stem = f'{date}_synched' if synchronize_axes else date
        save_file = os.path.join(save_path, f'{file_stem}.png')
        fig.savefig(save_file, dpi=300)
        plt.close(fig)
    else:
        plt.show()

In [12]:
# Read the feature configuration from `config` instead of the notebook-level
# `features` name: this cell rebinds `features` to a list of Series further
# down, so a second run would otherwise pass that list back into get_features.
# NOTE(review): the recorded run of this cell failed with KeyError: 'latitude';
# `params` is loaded from config['wind_params'] in the setup cell - confirm
# that this is the right configuration section for PV generation.
total_irradiance, cell_temperature = generate_pv.get_features(data=df,
                                                  features=config['features'],
                                                  params=params)

# Plane-of-array irradiance components used for plotting.
total = total_irradiance['poa_global']
direct = total_irradiance['poa_direct']
diffuse = total_irradiance['poa_diffuse']
# Order matches plot_names = ['Total', 'Direct', 'Diffuse'].
features = [total, direct, diffuse]

power = generate_pv.generate_pv_power(total_irradiance=total,
                          cell_temperature=cell_temperature,
                          params=params)

KeyError: 'latitude'

In [58]:
# The loaded station data spans 2023-07-24 to 2025-05-07 (see the data.info()
# output), so the example day has to lie inside that range; the previous value
# '2023-06-04' precedes the first sample.
day = '2024-06-04'

# Produce the figure once with synchronized and once with independent y-axes.
for synchronize_axes in (True, False):
    plot_power_and_features(day=day,
                            plot_names=plot_names,
                            features=features,
                            power=power,
                            synchronize_axes=synchronize_axes,
                            save_fig=True)

In [None]:
# Save one figure per calendar day present in the imputed data; every call
# shares the same series and styling options, only the day changes.
daily_plot_kwargs = dict(plot_names=plot_names,
                         features=features,
                         power=power,
                         synchronize_axes=False,
                         save_fig=True)

for day in np.unique(df.index.date):
    plot_power_and_features(day=str(day), **daily_plot_kwargs)