In [1]:
import pandas as pd
import numpy as np

In [41]:
# external vars:
df_train_variables = pd.read_parquet('df_train_variables.parquet')
df_test_variables = pd.read_parquet('df_test_variables.parquet')
df_train_variables = df_train_variables.drop(columns='delivery_time')
df_test_variables = df_test_variables.drop(columns='delivery_time')
print(df_train_variables.shape)
print(df_test_variables.shape)

(103968, 15)
(22556, 15)


In [42]:
# main table:
df_2023 = pd.read_parquet('train_2023.parquet', engine='pyarrow')
df_2024 = pd.read_parquet('test_2024.parquet', engine='pyarrow')
print(df_2023.shape)
print(df_2024.shape)

(2712765, 8)
(1712420, 8)


In [44]:
df_2023['ExecutionTime'] = df_2023['ExecutionTime'].dt.tz_localize(None)
df_2024['ExecutionTime'] = df_2024['ExecutionTime'].dt.tz_localize(None)
df_2023 = df_2023.groupby(['ExecutionTime','delivery_time'], as_index=False).mean(numeric_only=True)
df_2024 = df_2024.groupby(['ExecutionTime','delivery_time'], as_index=False).mean(numeric_only=True)
print(df_2023.shape)
print(df_2024.shape)

(2712451, 6)
(1712420, 6)


In [45]:

df_2023_long = pd.merge(df_2023, df_train_variables, left_on='delivery_time', right_on='timestamp', how='left')
df_2024_long = pd.merge(df_2024, df_test_variables, left_on='delivery_time', right_on='timestamp', how='left')

print(df_2023_long.shape)
print(df_2024_long.shape)

(2712451, 21)
(1712420, 21)


In [27]:
pd.set_option('display.max_columns', None)
df_2023_long.head()

Unnamed: 0,ExecutionTime,delivery_time,high,low,close,volume,timestamp,day_ahead_price,load_forecast,load_actual,generation_solar_actual,generation_wind_onshore_actual,generation_wind_offshore_actual,generation_wind_onshore_forecast,generation_wind_offshore_forecast,generation_solar_forecast,temperature_rounded,imported,exported,imported_is_zero,exported_is_zero
0,2023-01-01,2023-01-01 00:15:00,2.01,-8.83,0.1,0.0,2023-01-01 00:15:00,-29.95,42577.0,38771.0,2.0,28938.0,2974.0,35870.0,3477.0,0.0,10.0725,0.0,5902.8,0,0
1,2023-01-01,2023-01-01 00:45:00,-3.74,-14.39,-10.0,0.0,2023-01-01 00:45:00,-30.0,41302.0,38457.0,2.0,28786.0,3258.0,35659.0,3486.0,0.0,10.2975,0.0,5902.8,0,0
2,2023-01-01,2023-01-01 01:00:00,4.8,-5.51,-3.96,0.0,2023-01-01 01:00:00,26.81,40841.0,38119.0,2.0,29510.0,3142.0,35545.0,3382.0,0.0,10.41,0.0,7699.4,1,0
3,2023-01-01,2023-01-01 01:30:00,-4.8,-16.09,-5.31,48.4,2023-01-01 01:30:00,-29.94,39746.0,38419.0,2.0,29392.0,3775.0,35365.0,3395.0,0.0,10.58,0.0,7699.4,0,0
4,2023-01-01,2023-01-01 01:45:00,-1.09,-15.97,-2.2,47.7,2023-01-01 01:45:00,-29.95,39249.0,38072.0,2.0,29923.0,3833.0,35319.0,3397.0,0.0,10.665,0.0,7699.4,0,0


In [None]:
# Lasso & Lightgbm
# target: target hour price, target_hour_+1h, target_hour_+2h,target_hour_+3h
# feature: 
# 1. price: lag1, lag2-lag24 price_range, lag5-lag24 price_trend, lag5-lag24 price_std
# 2. volume: lag1, lag2-lag24 price_range, lag5-lag24 price_trend, lag5-lag24 price_std
# 3. wind/load/solar forecast: static feature: delivery_time T forecast, (T-1d actual - T-1d forecast), T-2 ~ T-4 trend


In [None]:
from numba import njit
@njit
def slope_numba(x):
    n = 0
    # 统计非 NaN
    for i in range(x.size):
        if not np.isnan(x[i]): n += 1
    if n < 2: 
        return np.nan
    # 收集有效 y 与等间距 x=0..n-1
    y = np.empty(n, dtype=np.float64)
    k = 0
    for i in range(x.size):
        if not np.isnan(x[i]):
            y[k] = x[i]; k += 1
    t = np.arange(n, dtype=np.float64)
    tm = (n - 1) / 2.0
    ym = y.mean()
    sxx = ((n*(n**2 - 1))/12.0) 
    if sxx == 0:
        return 0.0
    num = np.sum((t - tm) * (y - ym))
    return num / sxx

In [143]:
def make_lasso_wide(
    df_raw,
    price_col='close',
    vol_col='volume',
    exec_col='ExecutionTime',
    dtime_col='delivery_time',
    min_history_hours=6,    # 最少历史小时
    horizon_hours=3         # 预测未来多少小时
):
    lag_points = min_history_hours * 4  # 15min 粒度 → 每小时4个点
    
    df = fill_missing_quarters(df_raw, exec_col=exec_col, dtime_col=dtime_col)
    df[exec_col]  = pd.to_datetime(df[exec_col])
    df[dtime_col] = pd.to_datetime(df[dtime_col])

    out_list = []

    for dtime, g in df.sort_values(exec_col).groupby(dtime_col, sort=False):
        g = g.sort_values(exec_col).reset_index(drop=True)

        p = g[price_col].astype(float)
        v = g[vol_col].astype(float)

        # lag1
        g['price_lag1']  = p.shift(1)
        g['volume_lag1'] = v.shift(1)

        # range: lag2 ~ lag_points
        g['price_range']  = p.shift(2).rolling(lag_points - 1, min_periods=int(0.5 * lag_points))\
                               .agg(lambda x: np.nanmax(x) - np.nanmin(x))
        g['volume_range'] = v.shift(2).rolling(lag_points - 1, min_periods=int(0.5 * lag_points))\
                               .agg(lambda x: np.nanmax(x) - np.nanmin(x))

        # std: lag2 ~ lag_points
        g['price_std']  = p.shift(2).rolling(lag_points - 1, min_periods=int(0.5 * lag_points)).std()
        g['volume_std'] = v.shift(2).rolling(lag_points - 1, min_periods=int(0.5 * lag_points)).std()

        # trend: lag2 ~ lag_points
        win = lag_points - 1
        mp  = max(1, 10)
        g['price_trend']  = p.shift(2).rolling(win, min_periods=mp)\
                              .apply(slope_numba, raw=True, engine='numba')
        g['volume_trend'] = v.shift(2).rolling(win, min_periods=mp)\
                              .apply(slope_numba, raw=True, engine='numba')

        # 样本过滤：≥ min_history_hours 且整点
        g['is_hour'] = g[exec_col].dt.minute.eq(0)
        first_exec  = g[exec_col].min()
        cutoff_time = (first_exec.floor('h') + pd.Timedelta(hours=min_history_hours))
        samples = g[g['is_hour'] & (g[exec_col] >= cutoff_time)].copy()
        if samples.empty:
            continue

        # ===== 新增：同小时“其他三个合同”在 t-15min 的价格（无聚合、无“最近”优先）=====
        samples = samples.reset_index(drop=True)
        samples['row_id'] = np.arange(len(samples))
        samples['merge_exec'] = samples[exec_col] - pd.Timedelta(minutes=15)

        # 找同一小时内的“其它合同”（排除当前 dtime）
        same_hour_others = df[
            (df[dtime_col].dt.floor('h') == dtime.floor('h')) &
            (df[dtime_col] != dtime)
        ][[exec_col, dtime_col, price_col]].rename(
            columns={exec_col: 'lookup_exec', dtime_col: 'nbr_dtime', price_col: 'nbr_price'}
        )

        # 以 t-15 的时刻与邻近合同的执行时刻精确匹配
        tmp = samples[['row_id', dtime_col, 'merge_exec']].merge(
            same_hour_others, left_on='merge_exec', right_on='lookup_exec', how='left'
        )

        if not tmp.empty:
            # 按邻近合同的 delivery_time 升序，给出一个稳定的列位次 0/1/2
            tmp = tmp.sort_values(['row_id', 'nbr_dtime'])
            tmp['k'] = tmp.groupby('row_id').cumcount()  # 0,1,2,...
            # 只取前 3 个（理论上就 3 个）
            tmp = tmp[tmp['k'] <= 2]
            # 展开成三列
            pivot = tmp.pivot_table(index='row_id', columns='k', values='nbr_price', aggfunc='first')
            pivot = pivot.reindex(columns=[0,1,2])  # 若不足3个，补列
            pivot.columns = ['nbr_price_t15_1','nbr_price_t15_2','nbr_price_t15_3']
            samples = samples.join(pivot, on='row_id')
        else:
            samples[['nbr_price_t15_1','nbr_price_t15_2','nbr_price_t15_3']] = np.nan

        samples = samples.drop(columns=['row_id','merge_exec'])

        # 目标列（小时级）
        hourly = g[g['is_hour']][[exec_col, price_col]].sort_values(exec_col).reset_index(drop=True)
        hourly = hourly.rename(columns={price_col: 'target_hour_price'})
        for k in range(1, horizon_hours + 1):
            hourly[f'target_hour_+{k}h'] = hourly['target_hour_price'].shift(-k)

        feat = samples.merge(hourly, on=exec_col, how='left')

        # 最终保留列
        keep_cols = [
            dtime_col, exec_col,
            'price_lag1','volume_lag1',
            'price_range','volume_range',
            'price_trend','volume_trend',
            'price_std','volume_std',
            'nbr_price_t15_1','nbr_price_t15_2','nbr_price_t15_3',  # 新增三列
            'target_hour_price'
        ] + [f'target_hour_+{k}h' for k in range(1, horizon_hours + 1)]

        feat = feat[keep_cols]
        out_list.append(feat)

    wide = pd.concat(out_list, ignore_index=True) if out_list else pd.DataFrame()
    target_cols = ['target_hour_price'] + [f'target_hour_+{k}h' for k in range(1, horizon_hours + 1)]
    wide = wide.dropna(subset=target_cols)
    return wide


In [144]:
df_2023_wide = make_lasso_wide(df_2023_long)

In [145]:
df_2023_wide

Unnamed: 0,delivery_time,ExecutionTime,price_lag1,volume_lag1,price_range,volume_range,price_trend,volume_trend,price_std,volume_std,nbr_price_t15_1,nbr_price_t15_2,nbr_price_t15_3,target_hour_price,target_hour_+1h,target_hour_+2h,target_hour_+3h
0,2023-01-01 11:15:00,2023-01-01 06:00:00,-1.54,0.400,2.76,1.725,0.002692,0.068407,0.982816,0.487356,-4.86,-1.90,2.62,-10.60,-8.41,-0.20,-1.77
1,2023-01-01 11:15:00,2023-01-01 07:00:00,-2.14,5.050,10.51,23.575,-0.426319,0.847940,3.083928,6.604848,8.69,-3.11,-0.12,-8.41,-0.20,-1.77,-10.00
2,2023-01-01 11:15:00,2023-01-01 08:00:00,-8.81,33.425,10.51,56.300,-0.335643,1.527143,3.213064,14.938830,-11.23,3.93,3.93,-0.20,-1.77,-10.00,-10.00
6,2023-01-01 21:00:00,2023-01-01 06:00:00,52.03,0.000,0.00,0.375,0.000000,-0.004076,0.000000,0.078193,50.00,38.09,29.46,52.03,52.03,45.80,45.50
7,2023-01-01 21:00:00,2023-01-01 07:00:00,52.03,0.000,0.00,0.000,0.000000,0.000000,0.000000,0.000000,50.00,38.09,29.46,52.03,45.80,45.50,53.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493659,2023-12-24 07:00:00,2023-12-24 02:00:00,-3.40,3.775,7.99,23.850,0.010068,0.059274,2.420001,5.196885,0.00,4.90,11.27,-4.00,-4.00,-1.01,-3.80
493663,2023-12-24 09:00:00,2023-12-24 02:00:00,-0.99,0.100,10.55,19.575,0.377312,-0.225877,3.598979,4.721563,4.15,4.84,4.23,-1.81,-5.47,-3.07,-0.23
493664,2023-12-24 09:00:00,2023-12-24 03:00:00,-1.88,0.025,10.55,19.575,0.315267,-0.200272,3.459151,4.573748,0.01,6.44,8.29,-5.47,-3.07,-0.23,1.51
493665,2023-12-24 09:00:00,2023-12-24 04:00:00,-1.96,0.000,6.02,15.550,-0.083043,0.081744,1.208526,3.892475,1.81,4.64,3.23,-3.07,-0.23,1.51,-1.42


In [149]:
df_2024_wide = make_lasso_wide(df_2024_long)

In [55]:
df_2023_long.columns

Index(['ExecutionTime', 'delivery_time', 'high', 'low', 'close', 'volume',
       'timestamp', 'day_ahead_price', 'load_forecast', 'load_actual',
       'generation_solar_actual', 'generation_wind_onshore_actual',
       'generation_wind_offshore_actual', 'generation_wind_onshore_forecast',
       'generation_wind_offshore_forecast', 'generation_solar_forecast',
       'temperature_rounded', 'imported', 'exported', 'imported_is_zero',
       'exported_is_zero'],
      dtype='object')

In [59]:
df_2023_long['wind_forecast'] = df_2023_long['generation_wind_offshore_forecast'] + df_2023_long['generation_wind_onshore_forecast']
df_2023_long['wind_actual'] = df_2023_long['generation_wind_offshore_actual'] + df_2023_long['generation_wind_onshore_actual']
df_2023_long = df_2023_long.rename(columns={"generation_solar_forecast": "solar_forecast"})

df_2024_long['wind_forecast'] = df_2024_long['generation_wind_offshore_forecast'] + df_2024_long['generation_wind_onshore_forecast']
df_2024_long['wind_actual'] = df_2024_long['generation_wind_offshore_actual'] + df_2024_long['generation_wind_onshore_actual']
df_2024_long = df_2024_long.rename(columns={"generation_solar_forecast": "solar_forecast"})

In [62]:
df_train_variables['wind_forecast'] = df_train_variables['generation_wind_offshore_forecast'] + df_train_variables['generation_wind_onshore_forecast']
df_train_variables['wind_actual'] = df_train_variables['generation_wind_offshore_actual'] + df_train_variables['generation_wind_onshore_actual']
df_train_variables = df_train_variables.rename(columns={"generation_solar_forecast": "solar_forecast"})

df_test_variables['wind_forecast'] = df_test_variables['generation_wind_offshore_forecast'] + df_test_variables['generation_wind_onshore_forecast']
df_test_variables['wind_actual'] = df_test_variables['generation_wind_offshore_actual'] + df_test_variables['generation_wind_onshore_actual']
df_test_variables = df_test_variables.rename(columns={"generation_solar_forecast": "solar_forecast"})

In [64]:
df_train_variables_2023 = df_train_variables[df_train_variables['timestamp']>='2023-01-01 00:00:00']


In [65]:
df_train_variables_2023

Unnamed: 0,timestamp,day_ahead_price,load_forecast,load_actual,generation_solar_actual,generation_wind_onshore_actual,generation_wind_offshore_actual,generation_wind_onshore_forecast,generation_wind_offshore_forecast,solar_forecast,temperature_rounded,imported,exported,imported_is_zero,exported_is_zero,wind_forecast,wind_actual
69504,2023-01-01 00:00:00,75.01,43046.0,39096.0,2.0,28886.0,2737.0,34784.0,3469.0,0.0,9.9600,0.0,5902.8,1,0,38253.0,31623.0
69505,2023-01-01 00:15:00,-29.95,42577.0,38771.0,2.0,28938.0,2974.0,35870.0,3477.0,0.0,10.0725,0.0,5902.8,0,0,39347.0,31912.0
69506,2023-01-01 00:30:00,-29.98,41937.0,38640.0,1.0,29346.0,3268.0,35785.0,3481.0,0.0,10.1850,0.0,5902.8,0,0,39266.0,32614.0
69507,2023-01-01 00:45:00,-30.00,41302.0,38457.0,2.0,28786.0,3258.0,35659.0,3486.0,0.0,10.2975,0.0,5902.8,0,0,39145.0,32044.0
69508,2023-01-01 01:00:00,26.81,40841.0,38119.0,2.0,29510.0,3142.0,35545.0,3382.0,0.0,10.4100,0.0,7699.4,1,0,38927.0,32652.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103963,2023-12-25 22:45:00,5.04,43088.0,43530.0,3.0,28958.0,6813.0,33823.0,5298.0,0.0,10.0775,0.0,1269.6,0,0,39121.0,35771.0
103964,2023-12-25 23:00:00,32.77,42401.0,43107.0,3.0,29217.0,6896.0,34167.0,5272.0,0.0,10.0800,0.0,2689.4,1,0,39439.0,36113.0
103965,2023-12-25 23:15:00,15.08,41926.0,42284.0,3.0,29826.0,6832.0,34548.0,5260.0,0.0,10.0625,0.0,2689.4,0,0,39808.0,36658.0
103966,2023-12-25 23:30:00,5.07,41316.0,41532.0,3.0,29831.0,6691.0,34919.0,5258.0,0.0,10.0450,0.0,2689.4,0,0,40177.0,36522.0


In [None]:
def make_static_weather_features(df_raw, dtime_col='timestamp'):

    df = df_raw.copy()
    df[dtime_col] = pd.to_datetime(df[dtime_col])
    df = df.sort_values(dtime_col)

    # 用时间索引便于 freq 位移
    dfi = df.set_index(dtime_col)

    # 初始化输出（按原始 index 对齐，不补齐）
    out = pd.DataFrame(index=dfi.index)
    
    base_cols = [
        'day_ahead_price',
        'temperature_rounded',
        'imported', 'exported',
        'imported_is_zero', 'exported_is_zero',
        'load_forecast', 'wind_forecast', 'solar_forecast'
    ]
    for c in base_cols:
        if c in dfi.columns:
            out[c] = dfi[c]

    # 2) T-1d 的 (actual - forecast)，用“时间位移 1 天”的方式，避免依赖等间隔
    #    注意：这里不补齐，如果缺少前一日同刻，就会是 NaN（符合“无填充”的要求）
    for var in ['load', 'wind', 'solar']:
        act_col = f'{var}_actual'
        fc_col  = f'{var}_forecast'
        if act_col in dfi.columns and fc_col in dfi.columns:
            diff = (dfi[act_col].astype('float64') - dfi[fc_col].astype('float64'))
            out[f'{var}_act_minus_fc_prevday'] = diff.shift(1, freq='1D')
        else:
            out[f'{var}_act_minus_fc_prevday'] = np.nan

    # 3) lag2~lag4 趋势：对 *_forecast 做 shift(2) 后 rolling(3) 斜率
    #    不补齐 → 窗口里不足2个有效点就会是 NaN
    for var in ['load', 'wind', 'solar']:
        fc_col = f'{var}_forecast'
        out_col = f'{var}_forecast_trend_lag2_4'
        if fc_col in dfi.columns:
            s2 = dfi[fc_col].astype('float64').shift(2)  # t-15min 开始，拿 t-15, t-30, t-45/60 三个点
            trend = s2.rolling(3, min_periods=2).apply(slope_numba, raw=True, engine='numba')
            out[out_col] = trend
        else:
            out[out_col] = np.nan

    return out.reset_index().rename(columns={dtime_col: dtime_col})


In [None]:
df_2023_variables = make_static_weather_features(df_train_variables_2023,
                                 dtime_col='timestamp')
df_2024_variables = make_static_weather_features(df_test_variables,
                                 dtime_col='timestamp')

In [70]:
df_2023_variables

Unnamed: 0,timestamp,day_ahead_price,temperature_rounded,imported,exported,imported_is_zero,exported_is_zero,load_forecast,wind_forecast,solar_forecast,load_act_minus_fc_prevday,wind_act_minus_fc_prevday,solar_act_minus_fc_prevday,load_forecast_trend_lag2_4,wind_forecast_trend_lag2_4,solar_forecast_trend_lag2_4
0,2023-01-01 00:00:00,75.01,9.9600,0.0,5902.8,1,0,43046.0,38253.0,0.0,,,,,,
1,2023-01-01 00:15:00,-29.95,10.0725,0.0,5902.8,0,0,42577.0,39347.0,0.0,,,,,,
2,2023-01-01 00:30:00,-29.98,10.1850,0.0,5902.8,0,0,41937.0,39266.0,0.0,,,,,,
3,2023-01-01 00:45:00,-30.00,10.2975,0.0,5902.8,0,0,41302.0,39145.0,0.0,,,,-469.0,1094.0,0.0
4,2023-01-01 01:00:00,26.81,10.4100,0.0,7699.4,1,0,40841.0,38927.0,0.0,,,,-554.5,506.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34459,2023-12-25 22:45:00,5.04,10.0775,0.0,1269.6,0,0,43088.0,39121.0,0.0,1578.0,-3602.0,,-35.0,255.5,0.0
34460,2023-12-25 23:00:00,32.77,10.0800,0.0,2689.4,1,0,42401.0,39439.0,0.0,1757.0,-4269.0,,-295.0,437.5,0.0
34461,2023-12-25 23:15:00,15.08,10.0625,0.0,2689.4,0,0,41926.0,39808.0,0.0,1454.0,-5094.0,,-499.5,385.0,0.0
34462,2023-12-25 23:30:00,5.07,10.0450,0.0,2689.4,0,0,41316.0,40177.0,0.0,1233.0,-5790.0,,-703.5,327.5,0.0


In [150]:
df_2023_wide = df_2023_wide.merge(df_2023_variables, left_on='delivery_time', right_on='timestamp', how='left')
df_2024_wide = df_2024_wide.merge(df_2024_variables, left_on='delivery_time', right_on='timestamp', how='left')


In [115]:
def add_temproal_features(df):
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['delivery_time'].dt.dayofyear  / 365)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['delivery_time'].dt.dayofyear / 365)
    df['delivery_hour'] = df['delivery_time'].dt.hour
    df['delivery_hour_sin'] = np.sin(df['delivery_time'].dt.hour * (2 * np.pi / 24))
    df['delivery_hour_cos'] = np.cos(df['delivery_time'].dt.hour * (2 * np.pi / 24))
    df['delivery_weekday_sin'] = np.sin(df['delivery_time'].dt.weekday * (2 * np.pi / 7))
    df['delivery_weekday_cos'] = np.cos(df['delivery_time'].dt.weekday * (2 * np.pi / 7))
    df['delivery_month_sin'] = np.sin(df['delivery_time'].dt.month * (2 * np.pi / 12))
    df['delivery_month_cos'] = np.cos(df['delivery_time'].dt.month * (2 * np.pi / 12))
    df['time_to_expiry_h'] = (df['delivery_time'] - df['ExecutionTime']).dt.total_seconds() / 3600.0
    return df

In [151]:
df_2023_wide = add_temproal_features(df_2023_wide)
df_2024_wide = add_temproal_features(df_2024_wide)

In [152]:
df_2023_wide = df_2023_wide.drop(columns='solar_act_minus_fc_prevday')
df_2024_wide = df_2024_wide.drop(columns='solar_act_minus_fc_prevday')

In [117]:
df_2023_wide.columns

Index(['delivery_time', 'ExecutionTime', 'price_lag1', 'volume_lag1',
       'price_range', 'volume_range', 'price_trend', 'volume_trend',
       'price_std', 'volume_std', 'target_hour_price', 'target_hour_+1h',
       'target_hour_+2h', 'target_hour_+3h', 'timestamp', 'day_ahead_price',
       'temperature_rounded', 'imported', 'exported', 'imported_is_zero',
       'exported_is_zero', 'load_forecast', 'wind_forecast', 'solar_forecast',
       'load_act_minus_fc_prevday', 'wind_act_minus_fc_prevday',
       'load_forecast_trend_lag2_4', 'wind_forecast_trend_lag2_4',
       'solar_forecast_trend_lag2_4', 'day_of_week', 'delivery_hour',
       'delivery_hour_sin', 'delivery_hour_cos', 'delivery_weekday_sin',
       'delivery_weekday_cos', 'delivery_month_sin', 'delivery_month_cos',
       'time_to_expiry_h', 'day_of_year_sin', 'day_of_year_cos'],
      dtype='object')

In [85]:
df_2023_wide.isnull().sum()

delivery_time                    0
ExecutionTime                    0
price_lag1                     148
volume_lag1                    148
price_range                    127
volume_range                   127
price_trend                     78
volume_trend                    78
price_std                      127
volume_std                     127
target_hour_price                0
target_hour_+1h                  0
target_hour_+2h                  0
target_hour_+3h                  0
timestamp                        0
day_ahead_price                  0
temperature_rounded              0
imported                         0
exported                         0
imported_is_zero                 0
exported_is_zero                 0
load_forecast                    0
wind_forecast                    0
solar_forecast                   0
load_act_minus_fc_prevday      443
wind_act_minus_fc_prevday      443
load_forecast_trend_lag2_4       0
wind_forecast_trend_lag2_4       0
solar_forecast_trend

In [153]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

# ---------- Robust scaler for price-like features ----------
    
class MedianMadArcsinh75(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        arr = X.to_numpy(dtype=float)
        arr_filled = np.where(np.isnan(arr), 0, arr)
        self.median_ = np.median(arr_filled, axis=0)
        mad_raw = np.median(np.abs(arr_filled - self.median_), axis=0)
        c75 = 0.67448975
        mad_adj = mad_raw / c75
        mad_adj[mad_adj == 0] = 1.0
        self.mad_ = mad_adj
        # 记录输入列名，供 get_feature_names_out 使用
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        else:
            self.feature_names_in_ = np.asarray([f"x{i}" for i in range(arr.shape[1])], dtype=object)
        return self
        
    def transform(self, X):
        arr = X.to_numpy(dtype=float)
        arr = np.where(np.isnan(arr), 0, arr)
        Z = (arr - self.median_) / self.mad_
        return np.arcsinh(Z)

    # 关键：让 ColumnTransformer 能拿到列名
    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)
        if input_features is None:
            input_features = [f"x{i}" for i in range(len(self.median_))]
        return np.asarray([f"arcsinh_mad__{c}" for c in input_features], dtype=object)


# ---------- Preprocessor (no model inside) ----------
def make_preprocessor(df, target_cols):
    raw_time_cols = [c for c in ['delivery_time','ExecutionTime','timestamp'] if c in df.columns]

    passthrough_cols = [c for c in [
        'imported_is_zero','exported_is_zero',
        'delivery_hour_sin','delivery_hour_cos',
        'delivery_weekday_sin','delivery_weekday_cos',
        'delivery_month_sin','delivery_month_cos',
        'time_to_expiry_h','day_of_year_sin','day_of_year_cos'
    ] if c in df.columns]

    price_feature_cols = [c for c in ['price_lag1','day_ahead_price', 'nbr_price_t15_1', 'nbr_price_t15_2','nbr_price_t15_3'] if c in df.columns]

    drop_set = set(target_cols) | set(passthrough_cols) | set(price_feature_cols) | set(raw_time_cols)
    numeric_cols = [c for c in df.columns if c not in drop_set and pd.api.types.is_numeric_dtype(df[c])]

    transformers = []
    if price_feature_cols:
        transformers.append(('price_mad', MedianMadArcsinh75(), price_feature_cols))
    if numeric_cols:
        transformers.append(('zscore', StandardScaler(), numeric_cols))
    if passthrough_cols:
        transformers.append(('keep', 'passthrough', passthrough_cols))

    pre = ColumnTransformer(transformers=transformers, remainder='drop', sparse_threshold=0.0)
    try:
        pre.set_output(transform='pandas')   # 保留列名
    except Exception:
        pass
    return pre



In [154]:
df_train_lasso = df_2023_wide.dropna(subset=target_cols)  # 目标列 NaN 直接 drop
df_train_lasso = df_train_lasso.fillna(0)  # 其他 NaN 填 0
df_test_lasso = df_2024_wide.dropna(subset=target_cols)  # 目标列 NaN 直接 drop
df_test_lasso = df_test_lasso.fillna(0)



In [None]:
# ---------- Metrics ----------
def smape(y_true, y_pred):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

def mape(y_true, y_pred):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    return 100 * np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8)))

# ---------- Time split (8:2 by delivery date) ----------
def train_val_split_by_date(df, date_col='delivery_time', train_ratio=0.8):
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    dates = np.sort(df[date_col].dt.normalize().unique())
    split_idx = int(len(dates) * train_ratio)
    tr_dates, va_dates = dates[:split_idx], dates[split_idx:]  # 修正：不要 +1
    return (
        df[df[date_col].dt.normalize().isin(tr_dates)],
        df[df[date_col].dt.normalize().isin(va_dates)]
    )

# ---------- Per-target metrics ----------
def eval_multioutput(y_true_df, y_pred_nd, target_cols):
    out = {}
    for i, col in enumerate(target_cols):
        yt = y_true_df.iloc[:, i].to_numpy()
        yp = y_pred_nd[:, i]
        out[col] = {
            'SMAPE': smape(yt, yp),
            'MAE':   mean_absolute_error(yt, yp),
            'MAPE':  mape(yt, yp)
        }
    res = pd.DataFrame(out).T
    avg = res.mean(numeric_only=True).to_frame().T
    avg.index = ['AVERAGE']
    return pd.concat([res, avg], axis=0)

# ---------- Feature importance ----------
def get_feature_importance(transformed_feature_names, estimator, target_cols):
    names = list(transformed_feature_names)
    coef_matrix = estimator.coef_  # [n_targets, n_features]
    if len(names) != coef_matrix.shape[1]:  # 兜底
        names = [f"f{i}" for i in range(coef_matrix.shape[1])]
    imp = pd.DataFrame(coef_matrix.T, index=names, columns=target_cols)
    imp["mean_importance"] = imp.mean(axis=1)
    return imp.sort_values("mean_importance", ascending=False)



# ---------- Train (no-CV), pick alpha on validation, final test ----------
def train_val_test_lasso_manual_alpha(
    df_trainvalid, df_test, target_cols, preprocess_pipeline,
    alphas=None, max_iter=5000
):
    if alphas is None:
        alphas = np.concatenate([np.logspace(-3, 0, 7), np.array([1.5, 2.0, 3.0])])

    train_df, val_df = train_val_split_by_date(df_trainvalid, 'delivery_time', 0.8)

    X_train, y_train = train_df.drop(columns=target_cols), train_df[target_cols]
    X_val,   y_val   = val_df.drop(columns=target_cols),   val_df[target_cols]

    pre = clone(preprocess_pipeline)
    pre.fit(X_train)
    Xtr = pre.transform(X_train)
    Xva = pre.transform(X_val)

    best_alpha, best_score, best_est = None, np.inf, None
    for a in alphas:
        est = Lasso(alpha=a, max_iter=max_iter)
        est.fit(Xtr, y_train)               # multi-output
        score = smape(y_val.values, est.predict(Xva))
        if score < best_score:
            best_alpha, best_score, best_est = a, score, est

    tv_df = pd.concat([train_df, val_df], axis=0)
    X_tv, y_tv = tv_df.drop(columns=target_cols), tv_df[target_cols]
    pre_final = clone(preprocess_pipeline)
    pre_final.fit(X_tv)
    Xtvt = pre_final.transform(X_tv)

    final_est = Lasso(alpha=best_alpha, max_iter=max_iter)
    final_est.fit(Xtvt, y_tv)

    X_test, y_test = df_test.drop(columns=target_cols), df_test[target_cols]
    Xte = pre_final.transform(X_test)
    y_pred_test = final_est.predict(Xte)

    val_results  = eval_multioutput(y_val, best_est.predict(Xva), target_cols)
    test_results = eval_multioutput(y_test, y_pred_test, target_cols)
    
    tf_names = Xtvt.columns if hasattr(Xtvt, "columns") else [f"f{i}" for i in range(Xtvt.shape[1])]
    fi_df = get_feature_importance(tf_names, final_est, target_cols)

    class FittedPipeline:
        def __init__(self, pre, est, target_cols):
            self.pre = pre; self.est = est; self.target_cols = target_cols
        def predict(self, X_df):
            return self.est.predict(self.pre.transform(X_df))

    return {
        'best_alpha': best_alpha,
        'val_results': val_results,
        'test_results': test_results,
        'feature_importance': fi_df,
        'model': FittedPipeline(pre_final, final_est, target_cols)
    }

In [156]:
df_train_lasso.columns

Index(['delivery_time', 'ExecutionTime', 'price_lag1', 'volume_lag1',
       'price_range', 'volume_range', 'price_trend', 'volume_trend',
       'price_std', 'volume_std', 'nbr_price_t15_1', 'nbr_price_t15_2',
       'nbr_price_t15_3', 'target_hour_price', 'target_hour_+1h',
       'target_hour_+2h', 'target_hour_+3h', 'timestamp', 'day_ahead_price',
       'temperature_rounded', 'imported', 'exported', 'imported_is_zero',
       'exported_is_zero', 'load_forecast', 'wind_forecast', 'solar_forecast',
       'load_act_minus_fc_prevday', 'wind_act_minus_fc_prevday',
       'load_forecast_trend_lag2_4', 'wind_forecast_trend_lag2_4',
       'solar_forecast_trend_lag2_4', 'day_of_year_sin', 'day_of_year_cos',
       'delivery_hour', 'delivery_hour_sin', 'delivery_hour_cos',
       'delivery_weekday_sin', 'delivery_weekday_cos', 'delivery_month_sin',
       'delivery_month_cos', 'time_to_expiry_h'],
      dtype='object')

In [158]:
df_test_lasso.columns

Index(['delivery_time', 'ExecutionTime', 'price_lag1', 'volume_lag1',
       'price_range', 'volume_range', 'price_trend', 'volume_trend',
       'price_std', 'volume_std', 'nbr_price_t15_1', 'nbr_price_t15_2',
       'nbr_price_t15_3', 'target_hour_price', 'target_hour_+1h',
       'target_hour_+2h', 'target_hour_+3h', 'timestamp', 'day_ahead_price',
       'temperature_rounded', 'imported', 'exported', 'imported_is_zero',
       'exported_is_zero', 'load_forecast', 'wind_forecast', 'solar_forecast',
       'load_act_minus_fc_prevday', 'wind_act_minus_fc_prevday',
       'load_forecast_trend_lag2_4', 'wind_forecast_trend_lag2_4',
       'solar_forecast_trend_lag2_4', 'day_of_year_sin', 'day_of_year_cos',
       'delivery_hour', 'delivery_hour_sin', 'delivery_hour_cos',
       'delivery_weekday_sin', 'delivery_weekday_cos', 'delivery_month_sin',
       'delivery_month_cos', 'time_to_expiry_h'],
      dtype='object')

In [157]:
df_train_lasso.to_parquet('df_train_2023.parquet', index=False)
df_test_lasso.to_parquet('df_test_2023.parquet', index=False)


In [159]:
target_cols = ['target_hour_price','target_hour_+1h','target_hour_+2h','target_hour_+3h']
preprocess_pipeline = make_preprocessor(df_train_lasso, target_cols)

# 4) 训练+调参（用验证集挑 alpha），并在测试集上做最终评估
out = train_val_test_lasso_manual_alpha(
    df_trainvalid=df_train_lasso,
    df_test=df_test_lasso,
    target_cols=target_cols,
    preprocess_pipeline=preprocess_pipeline,
    alphas=np.r_[np.linspace(0.005, 0.05, 6)]
)


In [161]:
print("Best alpha:", out['best_alpha'])
print(out['val_results'])
print(out['test_results'])


Best alpha: 0.05
                       SMAPE        MAE          MAPE
target_hour_price  14.899224   8.083722  2.173961e+07
target_hour_+1h    15.749533   8.638833  1.843724e+07
target_hour_+2h    16.976187   9.443975  1.562411e+07
target_hour_+3h    18.432069  10.352698  5.007716e+07
AVERAGE            16.514253   9.129807  2.646953e+07
                       SMAPE        MAE          MAPE
target_hour_price  25.891339  10.191759  1.271703e+07
target_hour_+1h    26.844310  11.027016  1.473885e+07
target_hour_+2h    28.139026  12.081313  2.160005e+07
target_hour_+3h    29.422156  13.182703  3.474173e+07
AVERAGE            27.574208  11.620698  2.094941e+07


In [162]:
out['feature_importance']

Unnamed: 0,target_hour_price,target_hour_+1h,target_hour_+2h,target_hour_+3h,mean_importance
price_mad__arcsinh_mad__price_lag1,58.529641,57.315236,56.740439,56.475825,57.265285
zscore__price_std,2.586033,2.390702,3.081184,3.67599,2.933477
keep__delivery_hour_sin,1.804116,1.805932,1.694509,1.773727,1.769571
zscore__wind_forecast,1.761877,1.67335,1.669436,1.524743,1.657351
price_mad__arcsinh_mad__nbr_price_t15_1,0.819638,1.155233,1.394962,1.514629,1.221115
zscore__exported,1.290107,1.155125,1.095011,0.962847,1.125773
keep__imported_is_zero,0.970596,0.865021,0.798501,0.719115,0.838308
keep__delivery_month_sin,1.081922,0.996217,0.744428,0.499883,0.830613
zscore__load_forecast_trend_lag2_4,0.911575,0.841181,0.786304,0.677182,0.804061
zscore__solar_forecast_trend_lag2_4,0.709826,0.718217,0.679537,0.64139,0.687243


In [163]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
from lightgbm import LGBMRegressor


def eval_multioutput(y_true_df, y_pred_nd, target_cols):
    out = {}
    for i, col in enumerate(target_cols):
        yt = y_true_df.iloc[:, i].to_numpy()
        yp = y_pred_nd[:, i]
        out[col] = {
            "SMAPE": smape(yt, yp),
            "MAE":   mean_absolute_error(yt, yp),
            "MAPE":  mape(yt, yp)
        }
    res = pd.DataFrame(out).T
    res.loc["AVERAGE"] = res.mean(numeric_only=True)
    return res

# ====== main ======
def train_lightgbm_multistep(
    df_trainvalid, df_test, target_cols,
    date_col="delivery_time", cv_splits=4,
    param_grid=None, random_state=42
):
    # --- 清洗（按你规则） ---
    df_trainvalid = df_trainvalid.dropna(subset=target_cols).fillna(0).copy()
    df_test       = df_test.dropna(subset=target_cols).fillna(0).copy()

    # --- 按时间排序（TimeSeriesSplit按索引顺序切） ---
    for d in (df_trainvalid, df_test):
        d[date_col] = pd.to_datetime(d[date_col])
    df_trainvalid = df_trainvalid.sort_values(date_col)
    df_test       = df_test.sort_values(date_col)

    # --- 组装特征：去掉原始时间列与目标列 ---
    raw_time_cols = [c for c in ["delivery_time","ExecutionTime","timestamp"] if c in df_trainvalid.columns]
    feat_cols = [c for c in df_trainvalid.columns if c not in set(target_cols) | set(raw_time_cols)]
    X_tv, y_tv = df_trainvalid[feat_cols], df_trainvalid[target_cols]
    X_te, y_te = df_test[feat_cols], df_test[target_cols]

    # --- 基础模型（不标准化） ---
    base = LGBMRegressor(
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        n_jobs=-1
    )
    est = MultiOutputRegressor(base)

    # --- CV: TimeSeriesSplit=4；用 SMAPE(越小越好) 作为评分 ---
    if param_grid is None:
        param_grid = {
            "estimator__n_estimators": [300, 500],
            "estimator__num_leaves": [31, 63],
            "estimator__max_depth": [-1, 8],
        }
    scorer = make_scorer(smape, greater_is_better=False)  # 负的SMAPE，越大越好 => 越小越好
    tscv = TimeSeriesSplit(n_splits=cv_splits)

    gs = GridSearchCV(
        estimator=est,
        param_grid=param_grid,
        scoring=scorer,
        cv=tscv,
        n_jobs=-1,
        verbose=0
    )
    gs.fit(X_tv, y_tv)

    best_model = gs.best_estimator_

    # --- Test集评估 ---
    y_pred_test = best_model.predict(X_te)
    test_results = eval_multioutput(y_te, y_pred_test, target_cols)

    # --- Feature importance（按 gain；每个目标一个模型） ---
    # 对于 MultiOutputRegressor，best_model.estimators_ 是每个目标一个 LGBMRegressor
    def _one_imp(m):
        # 'gain' 比 'split' 更有意义
        imp = m.booster_.feature_importance(importance_type="gain")
        return pd.Series(imp, index=m.booster_.feature_name())

    imps = []
    for i, col in enumerate(target_cols):
        s = _one_imp(best_model.estimators_[i]).rename(col)
        imps.append(s)
    fi = pd.concat(imps, axis=1).fillna(0.0)
    fi["mean_importance"] = fi.mean(axis=1)
    fi = fi.sort_values("mean_importance", ascending=False)

    return {
        "best_params": gs.best_params_,
        "test_results": test_results,
        "feature_importance": fi,
        "model": best_model,
        "feature_cols": feat_cols
    }




In [164]:
target_cols = ['target_hour_price','target_hour_+1h','target_hour_+2h','target_hour_+3h']
out = train_lightgbm_multistep(df_train_lasso, df_test_lasso, target_cols)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6193
[LightGBM] [Info] Total Bins 6193
[LightGBM] [Info] Number of data points in the train set: 78000, number of used features: 35
[LightGBM] [Info] Number of data points in the train set: 78000, number of used features: 35
[LightGBM] [Info] Start training from score 131.507467
[LightGBM] [Info] Start training from score 131.507467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.177188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tota

In [165]:
print("Best params:", out['best_params'])
print(out['test_results'])

Best params: {'estimator__max_depth': -1, 'estimator__n_estimators': 300, 'estimator__num_leaves': 31}
                       SMAPE       MAE          MAPE
target_hour_price   7.488316  2.980666  8.287948e+06
target_hour_+1h    12.194096  5.261275  1.412353e+07
target_hour_+2h    15.600613  7.145418  2.740987e+07
target_hour_+3h    18.547931  9.076653  4.784033e+07
AVERAGE            13.457739  6.116003  2.441542e+07


In [166]:
out['feature_importance'].head(20)

Unnamed: 0,target_hour_price,target_hour_+1h,target_hour_+2h,target_hour_+3h,mean_importance
price_lag1,8563141000.0,8440549000.0,8378226000.0,8407257000.0,8447293000.0
day_ahead_price,2100953000.0,2082300000.0,2091012000.0,2130956000.0,2101305000.0
nbr_price_t15_2,856944700.0,865997900.0,869186700.0,911418500.0,875887000.0
nbr_price_t15_1,103685100.0,105477400.0,105226900.0,143398900.0,114447100.0
price_range,124520300.0,107987900.0,90633910.0,47339310.0,92620350.0
wind_forecast,65567600.0,64845380.0,75090600.0,113107600.0,79652800.0
time_to_expiry_h,14930670.0,29673930.0,50345200.0,167950100.0,65724980.0
volume_lag1,6382610.0,8244902.0,26685140.0,212016000.0,63332160.0
day_of_year_cos,51241610.0,42441720.0,47061350.0,70077290.0,52705490.0
load_forecast,35730810.0,39262040.0,46706870.0,69038940.0,47684670.0


In [167]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer


def permutation_importance_for_horizon(out, df_test, target_cols,
                                       horizon_col='target_hour_+3h',
                                       date_col='delivery_time',
                                       n_repeats=10, random_state=42):
    """
    在测试集上对指定 horizon (默认 target_hour_+3h) 计算置换重要性（SMAPE）。
    依赖 train_lightgbm_multistep 的输出 out：包含 model、feature_cols。
    返回按重要性降序排列的 DataFrame。
    """
    # 取特征与该目标的真实值
    feat_cols = out['feature_cols']
    X_te = df_test.copy()
    X_te[date_col] = pd.to_datetime(X_te[date_col])
    X_te = X_te.sort_values(date_col)[feat_cols]
    y_te = df_test.sort_values(date_col)[horizon_col].to_numpy()

    # 选择对应的单输出 LightGBM 模型
    idx = target_cols.index(horizon_col)
    est_single = out['model'].estimators_[idx]

    # 置换重要性（越大代表置换后 SMAPE 下降越多 → 该特征更重要）
    scorer = make_scorer(smape, greater_is_better=False)  # 负SMAPE，越大越好=误差越小
    result = permutation_importance(
        est_single, X_te, y_te,
        scoring=scorer,
        n_repeats=n_repeats,
        random_state=random_state,
        n_jobs=-1
    )

    pi_df = pd.DataFrame({
        'feature': feat_cols,
        'perm_importance_mean': result.importances_mean,
        'perm_importance_std':  result.importances_std
    }).sort_values('perm_importance_mean', ascending=False).reset_index(drop=True)

    return pi_df


In [168]:
pi_df = permutation_importance_for_horizon(out, df_test_lasso, target_cols,
                                       horizon_col='target_hour_+3h',
                                       date_col='delivery_time',
                                       n_repeats=10, random_state=42)

In [169]:
pi_df

Unnamed: 0,feature,perm_importance_mean,perm_importance_std
0,price_lag1,47.886501,0.043475
1,day_ahead_price,3.696296,0.024338
2,nbr_price_t15_2,2.993313,0.024806
3,temperature_rounded,0.610088,0.013885
4,nbr_price_t15_1,0.269038,0.013105
5,solar_forecast,0.155771,0.007034
6,time_to_expiry_h,0.155134,0.016358
7,volume_std,0.132545,0.007569
8,day_of_year_sin,0.131745,0.011791
9,volume_range,0.122858,0.004191


## Timexer

In [173]:
df_2023_long = df_2023_long.rename(columns={"generation_solar_actual":"solar_actual"})
df_2023_long

Unnamed: 0,ExecutionTime,delivery_time,high,low,close,volume,timestamp,day_ahead_price,load_forecast,load_actual,solar_actual,generation_wind_onshore_actual,generation_wind_offshore_actual,generation_wind_onshore_forecast,generation_wind_offshore_forecast,solar_forecast,temperature_rounded,imported,exported,imported_is_zero,exported_is_zero,wind_forecast,wind_actual,generation_wind_forecast,generation_wind_actual
0,2023-01-01 00:00:00,2023-01-01 00:15:00,2.01,-8.83,0.10,0.000,2023-01-01 00:15:00,-29.95,42577.0,38771.0,2.0,28938.0,2974.0,35870.0,3477.0,0.0,10.0725,0.0,5902.8,0,0,39347.0,31912.0,39347.0,31912.0
1,2023-01-01 00:00:00,2023-01-01 00:45:00,-3.74,-14.39,-10.00,0.000,2023-01-01 00:45:00,-30.00,41302.0,38457.0,2.0,28786.0,3258.0,35659.0,3486.0,0.0,10.2975,0.0,5902.8,0,0,39145.0,32044.0,39145.0,32044.0
2,2023-01-01 00:00:00,2023-01-01 01:00:00,4.80,-5.51,-3.96,0.000,2023-01-01 01:00:00,26.81,40841.0,38119.0,2.0,29510.0,3142.0,35545.0,3382.0,0.0,10.4100,0.0,7699.4,1,0,38927.0,32652.0,38927.0,32652.0
3,2023-01-01 00:00:00,2023-01-01 01:30:00,-4.80,-16.09,-5.31,48.400,2023-01-01 01:30:00,-29.94,39746.0,38419.0,2.0,29392.0,3775.0,35365.0,3395.0,0.0,10.5800,0.0,7699.4,0,0,38760.0,33167.0,38760.0,33167.0
4,2023-01-01 00:00:00,2023-01-01 01:45:00,-1.09,-15.97,-2.20,47.700,2023-01-01 01:45:00,-29.95,39249.0,38072.0,2.0,29923.0,3833.0,35319.0,3397.0,0.0,10.6650,0.0,7699.4,0,0,38716.0,33756.0,38716.0,33756.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712446,2023-12-24 22:45:00,2023-12-25 22:30:00,2.76,2.76,2.76,0.825,2023-12-25 22:30:00,12.56,43808.0,44287.0,3.0,28924.0,6975.0,33363.0,5421.0,0.0,10.0750,0.0,1269.6,0,0,38784.0,35899.0,38784.0,35899.0
2712447,2023-12-24 22:45:00,2023-12-25 22:45:00,-0.56,-2.99,-2.69,1.050,2023-12-25 22:45:00,5.04,43088.0,43530.0,3.0,28958.0,6813.0,33823.0,5298.0,0.0,10.0775,0.0,1269.6,0,0,39121.0,35771.0,39121.0,35771.0
2712448,2023-12-24 22:45:00,2023-12-25 23:00:00,19.80,18.27,19.66,0.000,2023-12-25 23:00:00,32.77,42401.0,43107.0,3.0,29217.0,6896.0,34167.0,5272.0,0.0,10.0800,0.0,2689.4,1,0,39439.0,36113.0,39439.0,36113.0
2712449,2023-12-24 22:45:00,2023-12-25 23:15:00,7.66,7.66,7.66,0.000,2023-12-25 23:15:00,15.08,41926.0,42284.0,3.0,29826.0,6832.0,34548.0,5260.0,0.0,10.0625,0.0,2689.4,0,0,39808.0,36658.0,39808.0,36658.0


In [189]:
df_timexer_2023 = df_2023_long.merge(df_2023_variables[['timestamp','load_act_minus_fc_prevday','wind_act_minus_fc_prevday','solar_act_minus_fc_prevday','load_forecast_trend_lag2_4','wind_forecast_trend_lag2_4','solar_forecast_trend_lag2_4']], left_on='delivery_time', right_on='timestamp', how='left')
df_timexer_2023

Unnamed: 0,ExecutionTime,delivery_time,high,low,close,volume,timestamp_x,day_ahead_price,load_forecast,load_actual,solar_actual,generation_wind_onshore_actual,generation_wind_offshore_actual,generation_wind_onshore_forecast,generation_wind_offshore_forecast,solar_forecast,temperature_rounded,imported,exported,imported_is_zero,exported_is_zero,wind_forecast,wind_actual,generation_wind_forecast,generation_wind_actual,timestamp_y,load_act_minus_fc_prevday,wind_act_minus_fc_prevday,solar_act_minus_fc_prevday,load_forecast_trend_lag2_4,wind_forecast_trend_lag2_4,solar_forecast_trend_lag2_4
0,2023-01-01 00:00:00,2023-01-01 00:15:00,2.01,-8.83,0.10,0.000,2023-01-01 00:15:00,-29.95,42577.0,38771.0,2.0,28938.0,2974.0,35870.0,3477.0,0.0,10.0725,0.0,5902.8,0,0,39347.0,31912.0,39347.0,31912.0,2023-01-01 00:15:00,,,,,,
1,2023-01-01 00:00:00,2023-01-01 00:45:00,-3.74,-14.39,-10.00,0.000,2023-01-01 00:45:00,-30.00,41302.0,38457.0,2.0,28786.0,3258.0,35659.0,3486.0,0.0,10.2975,0.0,5902.8,0,0,39145.0,32044.0,39145.0,32044.0,2023-01-01 00:45:00,,,,-469.0,1094.0,0.0
2,2023-01-01 00:00:00,2023-01-01 01:00:00,4.80,-5.51,-3.96,0.000,2023-01-01 01:00:00,26.81,40841.0,38119.0,2.0,29510.0,3142.0,35545.0,3382.0,0.0,10.4100,0.0,7699.4,1,0,38927.0,32652.0,38927.0,32652.0,2023-01-01 01:00:00,,,,-554.5,506.5,0.0
3,2023-01-01 00:00:00,2023-01-01 01:30:00,-4.80,-16.09,-5.31,48.400,2023-01-01 01:30:00,-29.94,39746.0,38419.0,2.0,29392.0,3775.0,35365.0,3395.0,0.0,10.5800,0.0,7699.4,0,0,38760.0,33167.0,38760.0,33167.0,2023-01-01 01:30:00,,,,-548.0,-169.5,0.0
4,2023-01-01 00:00:00,2023-01-01 01:45:00,-1.09,-15.97,-2.20,47.700,2023-01-01 01:45:00,-29.95,39249.0,38072.0,2.0,29923.0,3833.0,35319.0,3397.0,0.0,10.6650,0.0,7699.4,0,0,38716.0,33756.0,38716.0,33756.0,2023-01-01 01:45:00,,,,-515.5,-162.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712446,2023-12-24 22:45:00,2023-12-25 22:30:00,2.76,2.76,2.76,0.825,2023-12-25 22:30:00,12.56,43808.0,44287.0,3.0,28924.0,6975.0,33363.0,5421.0,0.0,10.0750,0.0,1269.6,0,0,38784.0,35899.0,38784.0,35899.0,2023-12-25 22:30:00,1800.0,-3276.0,,15.0,215.5,0.0
2712447,2023-12-24 22:45:00,2023-12-25 22:45:00,-0.56,-2.99,-2.69,1.050,2023-12-25 22:45:00,5.04,43088.0,43530.0,3.0,28958.0,6813.0,33823.0,5298.0,0.0,10.0775,0.0,1269.6,0,0,39121.0,35771.0,39121.0,35771.0,2023-12-25 22:45:00,1578.0,-3602.0,,-35.0,255.5,0.0
2712448,2023-12-24 22:45:00,2023-12-25 23:00:00,19.80,18.27,19.66,0.000,2023-12-25 23:00:00,32.77,42401.0,43107.0,3.0,29217.0,6896.0,34167.0,5272.0,0.0,10.0800,0.0,2689.4,1,0,39439.0,36113.0,39439.0,36113.0,2023-12-25 23:00:00,1757.0,-4269.0,,-295.0,437.5,0.0
2712449,2023-12-24 22:45:00,2023-12-25 23:15:00,7.66,7.66,7.66,0.000,2023-12-25 23:15:00,15.08,41926.0,42284.0,3.0,29826.0,6832.0,34548.0,5260.0,0.0,10.0625,0.0,2689.4,0,0,39808.0,36658.0,39808.0,36658.0,2023-12-25 23:15:00,1454.0,-5094.0,,-499.5,385.0,0.0


In [185]:
df_2023_long

Unnamed: 0,ExecutionTime,delivery_time,high,low,close,volume,timestamp,day_ahead_price,load_forecast,load_actual,solar_actual,generation_wind_onshore_actual,generation_wind_offshore_actual,generation_wind_onshore_forecast,generation_wind_offshore_forecast,solar_forecast,temperature_rounded,imported,exported,imported_is_zero,exported_is_zero,wind_forecast,wind_actual,generation_wind_forecast,generation_wind_actual
0,2023-01-01 00:00:00,2023-01-01 00:15:00,2.01,-8.83,0.10,0.000,2023-01-01 00:15:00,-29.95,42577.0,38771.0,2.0,28938.0,2974.0,35870.0,3477.0,0.0,10.0725,0.0,5902.8,0,0,39347.0,31912.0,39347.0,31912.0
1,2023-01-01 00:00:00,2023-01-01 00:45:00,-3.74,-14.39,-10.00,0.000,2023-01-01 00:45:00,-30.00,41302.0,38457.0,2.0,28786.0,3258.0,35659.0,3486.0,0.0,10.2975,0.0,5902.8,0,0,39145.0,32044.0,39145.0,32044.0
2,2023-01-01 00:00:00,2023-01-01 01:00:00,4.80,-5.51,-3.96,0.000,2023-01-01 01:00:00,26.81,40841.0,38119.0,2.0,29510.0,3142.0,35545.0,3382.0,0.0,10.4100,0.0,7699.4,1,0,38927.0,32652.0,38927.0,32652.0
3,2023-01-01 00:00:00,2023-01-01 01:30:00,-4.80,-16.09,-5.31,48.400,2023-01-01 01:30:00,-29.94,39746.0,38419.0,2.0,29392.0,3775.0,35365.0,3395.0,0.0,10.5800,0.0,7699.4,0,0,38760.0,33167.0,38760.0,33167.0
4,2023-01-01 00:00:00,2023-01-01 01:45:00,-1.09,-15.97,-2.20,47.700,2023-01-01 01:45:00,-29.95,39249.0,38072.0,2.0,29923.0,3833.0,35319.0,3397.0,0.0,10.6650,0.0,7699.4,0,0,38716.0,33756.0,38716.0,33756.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712446,2023-12-24 22:45:00,2023-12-25 22:30:00,2.76,2.76,2.76,0.825,2023-12-25 22:30:00,12.56,43808.0,44287.0,3.0,28924.0,6975.0,33363.0,5421.0,0.0,10.0750,0.0,1269.6,0,0,38784.0,35899.0,38784.0,35899.0
2712447,2023-12-24 22:45:00,2023-12-25 22:45:00,-0.56,-2.99,-2.69,1.050,2023-12-25 22:45:00,5.04,43088.0,43530.0,3.0,28958.0,6813.0,33823.0,5298.0,0.0,10.0775,0.0,1269.6,0,0,39121.0,35771.0,39121.0,35771.0
2712448,2023-12-24 22:45:00,2023-12-25 23:00:00,19.80,18.27,19.66,0.000,2023-12-25 23:00:00,32.77,42401.0,43107.0,3.0,29217.0,6896.0,34167.0,5272.0,0.0,10.0800,0.0,2689.4,1,0,39439.0,36113.0,39439.0,36113.0
2712449,2023-12-24 22:45:00,2023-12-25 23:15:00,7.66,7.66,7.66,0.000,2023-12-25 23:15:00,15.08,41926.0,42284.0,3.0,29826.0,6832.0,34548.0,5260.0,0.0,10.0625,0.0,2689.4,0,0,39808.0,36658.0,39808.0,36658.0


In [184]:
df_train_variables.columns

Index(['timestamp', 'day_ahead_price', 'load_forecast', 'load_actual',
       'generation_solar_actual', 'generation_wind_onshore_actual',
       'generation_wind_offshore_actual', 'generation_wind_onshore_forecast',
       'generation_wind_offshore_forecast', 'solar_forecast',
       'temperature_rounded', 'imported', 'exported', 'imported_is_zero',
       'exported_is_zero', 'wind_forecast', 'wind_actual'],
      dtype='object')

In [None]:
df_timexer_2023 = df_2023_long[['ExecutionTime', 'delivery_time', 'high', 'low', 'close', 'volume',
                               'day_ahead_price', 'load_forecast', ]]

In [177]:
df_time_2023.columns

Index(['ExecutionTime', 'delivery_time', 'high', 'low', 'close', 'volume',
       'timestamp', 'day_ahead_price', 'load_forecast', 'load_actual',
       'solar_actual', 'generation_wind_onshore_actual',
       'generation_wind_offshore_actual', 'generation_wind_onshore_forecast',
       'generation_wind_offshore_forecast', 'solar_forecast',
       'temperature_rounded', 'imported', 'exported', 'imported_is_zero',
       'exported_is_zero', 'wind_forecast', 'wind_actual',
       'generation_wind_forecast', 'generation_wind_actual',
       'wind_forecast_error', 'solar_forecast_error', 'load_forecast_error',
       'residual_load'],
      dtype='object')

In [None]:
def to_nixtla_tables(df, uid='delivery_time', ds='ExecutionTime', y_col='target_price_15m',
                     hist_cols=None, futr_cols=None, stat_cols=None):

    # 目标
    y_df = df[[uid, ds, y_col]].rename(columns={uid:'unique_id', ds:'ds', y_col:'y'})

    # 历史已知
    hist_df = df[[uid, ds] + hist_cols].rename(columns={uid:'unique_id', ds:'ds'})

    # 未来已知
    futr_df = df[[uid, ds] + futr_cols].rename(columns={uid:'unique_id', ds:'ds'})

    # 静态
    stat_df = (df[[uid] + stat_cols]
               .drop_duplicates(subset=[uid])
               .rename(columns={uid:'unique_id'}))

    return y_df, hist_df, futr_df, stat_df
