In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

In [15]:
def check_NaN(data):
    # Select only numeric columns for spline interpolation
    numeric_columns = data.select_dtypes(include=[np.number]).columns

    print("Number of NaN values:\n", data[numeric_columns].isna().sum())

In [3]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [6]:
# Load data from the Excel file
excel_file_path = "../../DATAFORMODELtrain200824.xlsx"
actual_data = pd.read_excel(excel_file_path)

In [7]:
# Convert 'Date' to datetime format and round to the nearest hour
actual_data['Date'] = pd.to_datetime(actual_data['Date'], format='%m/%d/%Y %H:%M').dt.round('h')

In [8]:
# Define the date range
start_date = pd.to_datetime('2017-01-01 00:00')
cutoff_date = pd.to_datetime("2024-08-20 23:00")

In [9]:
# Filter data within the date range and drop unnecessary columns
columns_to_drop = [
    'Y', 'M', 'Day', 'H', 'Y2016',	'Y2017',	'Y2018',	'Y2019',	'Y2020',	'Y2021',	'Y2022',	'Y2023',	'Y2024',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'M10', 'M11', 'M12',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9', 'h10',
    'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18', 'h19',
    'h20', 'h21', 'h22', 'h23', 'h24',
    'PriceCZ', 'PriceSK', 'PriceRO', 'WDAY'
]
data = actual_data[(actual_data['Date'] >= start_date) & (actual_data['Date'] <= cutoff_date)].drop(columns=columns_to_drop)

# Time Based Features

In [10]:
# Step 1: Time-based Features
data['hour'] = data['Date'].dt.hour
data['day_of_week'] = data['Date'].dt.dayofweek
data['month'] = data['Date'].dt.month

In [11]:
# Cyclical time-based features
data['sin_hour'] = np.sin(2 * np.pi * data['hour'] / 24)
data['cos_hour'] = np.cos(2 * np.pi * data['hour'] / 24)
data['sin_day_of_week'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
data['cos_day_of_week'] = np.cos(2 * np.pi * data['day_of_week'] / 7)

In [12]:
# Step 2: Lagged Features
lags = [1, 2, 3, 6, 12, 24, 48, 72, 168]  # Example lags (adjust as needed)
for lag in lags:
    data[f'lag_{lag}'] = data['PriceHU'].shift(lag)

In [13]:
# Step 3: Rolling Window Features
windows = [3, 6, 12, 24, 168]  # Example windows (adjust as needed)
for window in windows:
    data[f'rolling_mean_{window}h'] = data['PriceHU'].rolling(window).mean()
    data[f'rolling_std_{window}h'] = data['PriceHU'].rolling(window).std()

In [14]:
# Step 4: Exponential Moving Average
ema_windows = [12, 24, 168]  # Example EMA windows (adjust as needed)
for window in ema_windows:
    data[f'ema_{window}h'] = data['PriceHU'].ewm(span=window).mean()

In [18]:
check_NaN(data)

Number of NaN values:
 PriceHU              0
PMIHU                0
GAS                  0
COAL                 0
CO2                  0
COALTOGAS            0
DOFFHU               0
DOFFSK               0
DOFFRO               0
WND                  0
T2MALL               0
T2MALLMAX            0
T2MALLMIN            0
T2MALLAV             0
T2MALLNONLIN         0
T2MALLP              0
T2MALLPCO2           0
T2MALLPROR           0
T2MHUNORM            0
THUDEVNORM           0
T2MHUPSQ             0
T2MHUP               0
T2MHU                0
PRECHU               0
WS10MHU              0
ALLSKY1HU            0
T2MSKPSQ             0
T2MSKP               0
T2MSK                0
PRECSK               0
WS10MSK              0
ALLSKY1SK            0
T2MROPSQ             0
T2MROP               0
T2MRO                0
PRECRO               0
WS10MRO              0
ALLSKY1RO            0
UNAVNUCHU            0
UNAVGASHU            0
UNAVLIGNHU           0
UNAVTPPHU            0
UNAVNUCSK  

In [17]:
# Step 5: Drop rows with NaN values created by lagging and rolling features
data.dropna(inplace=True)

In [19]:
data.head()

Unnamed: 0,Date,PriceHU,PMIHU,GAS,COAL,CO2,COALTOGAS,DOFFHU,DOFFSK,DOFFRO,WND,T2MALL,T2MALLMAX,T2MALLMIN,T2MALLAV,T2MALLNONLIN,T2MALLP,T2MALLPCO2,T2MALLPROR,T2MHUNORM,THUDEVNORM,T2MHUPSQ,T2MHUP,T2MHU,PRECHU,WS10MHU,ALLSKY1HU,T2MSKPSQ,T2MSKP,T2MSK,PRECSK,WS10MSK,ALLSKY1SK,T2MROPSQ,T2MROP,T2MRO,PRECRO,WS10MRO,ALLSKY1RO,UNAVNUCHU,UNAVGASHU,UNAVLIGNHU,UNAVTPPHU,UNAVNUCSK,UNAVGASSK,UNAVLIGNSK,UNAVTPPSK,UNAVHYDRSK,UNAVNUCRO,UNAVGASRO,UNAVLIGNRO,UNAVTPPRO,UNAVHYDRRO,UNAVNUCCZ,UNAVGASCZ,UNAVLIGNCZ,UNAVTPPCZ,UNAVNUCBG,UNAVGASBG,UNAVLIGNBG,UNAVTPPBG,UNAVHYDRBG,UNAVNUCSL,UNAVGASSL,UNAVLIGNSL,UNAVTPPSL,UNAVHYDRSL,UNAVNUCFR,UNAVHYDRFR,UNAVNUCALL,UNAVGASALL,UNAVLIGNALL,UNAVTPPALL,UNAVHYDRALL,RORRO,RORSE,RORDE,DEWINDGEN,ROSOLGEN,HUSOLGEN,RHSOLGEN,UA_HU,UA_SK,UA_RO,UA_EU,AT_HU,PL_SK,RORRO_HP,RORSE_HP,RORDE_HP,UADEM,HIGHIMIMP,HIGHTEMIMP,SOLMAX,UNAVGASGR,UNAVTPPGR,UNAVHYDRGR,BGSOLGEN,RHBSOLGEN,RORAT_HP,ATWINDGEN,UNAVALLFR,UNAVALL,hour,day_of_week,month,sin_hour,cos_hour,sin_day_of_week,cos_day_of_week,lag_1,lag_2,lag_3,lag_6,lag_12,lag_24,lag_48,lag_72,lag_168,rolling_mean_3h,rolling_std_3h,rolling_mean_6h,rolling_std_6h,rolling_mean_12h,rolling_std_12h,rolling_mean_24h,rolling_std_24h,rolling_mean_168h,rolling_std_168h,ema_12h,ema_24h,ema_168h
8952,2017-01-08 00:00:00,36.13,52.2,18.2,85.15,4.59,4.678571,0,0,0,1,-12.133333,-6.48,-12.946667,-11.221667,-22.695792,2.866667,13.158,885.8,-3.373333,-14.246667,1.9044,1.38,-17.62,0.0,2.21,0.0,49.7025,7.05,-8.95,0.02,1.25,0.0,124.7689,11.17,-9.83,0.1,2.34,0.0,0.0,350.0,200.0,550.0,235.0,0.0,0.0,110.0,0.0,0,408.0,1881.0,2484.0,194,470,0,1765.0,2326.0,0,0,0,0,0,0,0,787,787,0,7990,2901,705.0,758.0,4633.0,6257.0,194.0,309.0,841.0,2368.0,7375.0,0.0,0.0,0.0,650,0,-34,616,600.0,410,595.064267,561.197033,2438.24425,0.0,0.0,-4.748889,49.0,0,1111,0,0.0,0.0,2441.301798,5840,10891,6096.0,0,6,1,0.0,1.0,-0.781831,0.62349,38.16,44.08,60.08,67.83,56.94,43.67,40.68,34.2,57.25,39.456667,4.130573,51.54,13.648735,55.403333,10.92141,49.919583,12.802923,52.129881,15.57768,51.370489,51.997034,52.875787
8953,2017-01-08 01:00:00,33.5,52.2,18.2,85.15,4.59,4.678571,0,0,0,1,-12.23,-6.48,-12.946667,-11.216111,-22.690236,2.77,12.7143,991.66,-3.4525,-14.2175,1.7689,1.33,-17.67,0.0,2.01,0.0,50.5521,7.11,-8.89,0.04,1.24,0.0,118.1569,10.87,-10.13,0.08,2.86,0.0,0.0,350.0,400.0,750.0,235.0,0.0,0.0,110.0,0.0,0,408.0,1881.0,2484.0,194,470,0,1765.0,2326.0,0,0,0,0,0,0,0,787,787,0,7990,2901,705.0,758.0,4833.0,6457.0,194.0,358.0,670.0,2404.0,6543.0,0.0,0.0,0.0,650,0,-34,616,600.0,410,587.183328,562.162682,2424.018585,0.0,0.0,-4.739167,49.0,0,1111,0,0.0,0.0,2442.736807,5288,10891,6296.0,1,6,1,0.258819,0.965926,-0.781831,0.62349,36.13,38.16,44.08,66.5,53.92,35.96,40.1,33.5,50.21,35.93,2.336429,46.04,13.050516,53.701667,12.630615,49.817083,12.928771,52.030417,15.643221,48.621183,50.51727,52.611573
8954,2017-01-08 02:00:00,33.55,52.2,18.2,85.15,4.59,4.678571,0,0,0,1,-12.3,-6.48,-12.946667,-11.199167,-22.673292,2.7,12.393,958.5,-3.505833,-14.124167,1.8769,1.37,-17.63,0.0,1.85,0.0,51.6961,7.19,-8.81,0.06,1.34,0.0,111.0916,10.54,-10.46,0.07,3.29,0.0,0.0,619.0,400.0,1019.0,235.0,0.0,0.0,110.0,0.0,0,408.0,2000.0,2603.0,194,470,0,1947.0,2508.0,0,0,0,0,0,0,0,787,787,0,7990,2901,705.0,1027.0,5134.0,7027.0,194.0,355.0,554.0,2444.0,5318.0,0.0,0.0,0.0,650,0,-24,626,600.0,435,582.266874,563.51285,2412.886431,0.0,0.0,-4.708056,49.0,0,1111,0,0.0,0.0,2444.758782,4453,10891,6866.0,2,6,1,0.5,0.866025,-0.781831,0.62349,33.5,36.13,38.16,64.29,52.02,34.5,38.5,31.78,44.04,34.393333,1.504205,40.916667,10.168836,52.1625,13.914316,49.7775,12.979062,51.967976,15.696152,46.30254,49.159888,52.35211
8955,2017-01-08 03:00:00,32.7,52.2,18.2,85.15,4.59,4.678571,0,0,0,1,-12.333333,-6.48,-12.946667,-11.168056,-22.642181,2.666667,12.24,946.666667,-3.538333,-14.021667,2.0736,1.44,-17.56,0.0,1.69,0.0,52.9984,7.28,-8.72,0.1,1.56,0.0,105.6784,10.28,-10.72,0.06,3.57,0.0,0.0,619.0,400.0,1019.0,235.0,0.0,0.0,110.0,0.0,0,408.0,2042.0,2645.0,194,470,0,1947.0,2508.0,0,0,0,0,0,0,0,787,787,0,7990,2901,705.0,1027.0,5176.0,7069.0,194.0,355.0,379.0,2415.0,4280.0,0.0,0.0,0.0,650,0,-24,626,600.0,450,580.711305,565.441987,2405.019745,0.0,0.0,-4.673889,49.0,0,1111,0,0.0,0.0,2447.417429,3632,10891,6908.0,3,6,1,0.707107,0.707107,-0.781831,0.62349,33.55,33.5,36.13,60.08,56.16,34.0,37.36,31.49,32.81,33.25,0.47697,36.353333,4.297961,50.2075,14.913797,49.723333,13.050288,51.967321,15.696959,44.209841,47.843096,52.085084
8956,2017-01-08 04:00:00,33.0,52.2,18.2,85.15,4.59,4.678571,0,0,0,1,-12.44,-6.48,-12.946667,-11.13125,-22.605375,2.56,11.7504,870.4,-3.629167,-14.000833,1.8769,1.37,-17.63,0.0,1.53,0.0,51.6961,7.19,-8.81,0.13,1.83,0.0,102.4144,10.12,-10.88,0.07,3.67,0.0,0.0,619.0,813.0,1432.0,235.0,0.0,0.0,110.0,0.0,0,408.0,2001.0,2604.0,194,470,0,1947.0,2508.0,0,0,0,0,0,0,0,787,787,0,7990,2901,705.0,1027.0,5548.0,7441.0,194.0,340.0,401.0,2400.0,3628.0,0.0,0.0,0.0,650,0,-24,626,597.0,450,582.660504,568.141264,2400.901617,0.0,0.0,-4.666944,49.0,0,1111,0,0.0,0.0,2450.762172,3181,10891,7280.0,4,6,1,0.866025,0.5,-0.781831,0.62349,32.7,33.55,33.5,44.08,60.39,33.17,37.4,33.98,28.41,33.083333,0.431084,34.506667,2.165398,47.925,15.30458,49.71625,13.059706,51.994643,15.659661,42.48525,46.655648,51.826217


# Co Relations

In [21]:
# Step 1: Correlation Analysis
correlation_matrix = data.corr()
correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:  # Set threshold for dropping highly correlated features
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
data.drop(columns=correlated_features, inplace=True)

In [22]:
# Step 2: Mutual Information for Feature Selection
X = data.drop(['PriceHU', 'Date'], axis=1)
y = data['PriceHU']
mi_scores = mutual_info_regression(X, y)
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

In [23]:
# Retain top features based on mutual information
top_features_mi = mi_scores[mi_scores > 0.01].index.tolist()  # Adjust threshold as needed
X_filtered_mi = X[top_features_mi]

In [24]:
# Step 3: Recursive Feature Elimination (RFE)
rfe_selector = RFE(estimator=RandomForestRegressor(), n_features_to_select=20)  # Adjust the number of features
rfe_selector.fit(X_filtered_mi, y)
rfe_features = X_filtered_mi.columns[rfe_selector.support_].tolist()

In [None]:
# Step 4: PCA for Dimensionality Reduction
pca = PCA(n_components=0.95)  # Keep components that explain 95% variance
X_pca = pca.fit_transform(X_filtered_mi)

In [None]:
# Final list of selected features
selected_features = rfe_features

In [None]:
# Print selected features
print("Selected Features for NBEATSModel:", selected_features)