In [1]:
import time
import numpy as np
import pandas as pd
import plotly.express as px
from xgboost import XGBRegressor

In [14]:
def pinball_score(y, q, alpha):
    """Element-wise pinball (quantile) loss of the prediction ``q`` for target ``y``.

    Entries with ``y >= q`` contribute ``alpha * (y - q)``; entries with
    ``y < q`` contribute ``(1 - alpha) * (q - y)``. No reduction is applied:
    callers average the result themselves.
    """
    # Penalty where the prediction is at or below the observed value.
    below_target = (y - q) * alpha * (y >= q)
    # Penalty where the prediction is above the observed value.
    above_target = (q - y) * (1 - alpha) * (y < q)
    return below_target + above_target

def remove_upperbound(merged_table_features, percentage=0.04, columns=None):
    """Drop the rows that hold the largest values of selected feature columns.

    For each column, the ``round(len(df) * percentage)`` largest entries are
    located with ``Series.nlargest``. The union of their index labels is
    dropped and the remaining rows are renumbered from 0.

    Parameters
    ----------
    merged_table_features : pandas.DataFrame
        Table containing every column named in ``columns``.
    percentage : float, default 0.04
        Fraction of the row count that is trimmed per column.
    columns : iterable of str, optional
        Columns whose upper tail is trimmed. ``None`` (the default) keeps the
        original fixed set of four weather features.

    Returns
    -------
    pandas.DataFrame
        A new frame without the trimmed rows, re-indexed with
        ``reset_index(drop=True)``.

    Notes
    -----
    Rows are removed by index label, so the frame is expected to have a unique
    index; with duplicated labels every row sharing a selected label is dropped.
    """
    if columns is None:
        columns = ['SolarDownwardRadiation', 'temp_hornsea', 'temp_solar', 'WindSpeedPCA']

    # Number of rows to trim per column (Python's round-half-to-even applies).
    n = round(len(merged_table_features) * percentage)

    # A set de-duplicates rows that are extreme in more than one column.
    indexes = set()
    for col in columns:
        indexes.update(merged_table_features[col].nlargest(n).index)

    # Pass an explicit list: DataFrame.drop documents list-like labels.
    return merged_table_features.drop(list(indexes)).reset_index(drop=True)

def remove_long_forecasts(df, length=51.5):
    """Return the rows of ``df`` whose ``hours_after`` value is at most ``length``.

    The original index labels are kept (no re-indexing is performed).
    """
    within_horizon = df.hours_after.le(length)
    return df.loc[within_horizon]

In [15]:
# Load the pre-computed train / test feature tables from parquet.
# NOTE(review): the absolute "/content/..." paths look like a Colab working
# directory; confirm before running elsewhere.
df_train = pd.read_parquet("/content/df_train_pca.parquet")
df_test = pd.read_parquet("/content/df_test_pca.parquet")

In [9]:
# Keep only test rows with hours_after <= 51.5 (the default `length` of
# remove_long_forecasts). Both training-side filters are currently disabled.
# NOTE(review): this cell's recorded execution count is lower than that of the
# data-loading cell, so the filter may not have been applied to the frames used
# further down. Re-run top to bottom to confirm.
# df_train = remove_long_forecasts(df_train)
df_test = remove_long_forecasts(df_test)
# df_train = remove_upperbound(df_train)

In [16]:
# Target column and feature set of the active (wind) configuration.
label = "Wind_MWh_credit"
# columns = ['RelativeHumidity', 'temp_hornsea', 'WindDirection', 'WindDirection:100', 'WindDirectionPCA',
#            'WindSpeed', 'WindSpeed:100', 'WindSpeedPCA','hours_after', 'CloudCover', 'SolarDownwardRadiation',
#            'temp_solar', 'year', 'month', 'day', 'hour', 'adjusted_solar_radiation', 'temp_x_solar_interaction',
#            'temp_y_solar_interaction', 'wind_interaction', 'wind_interaction_100', 'humidity_wind_interaction',
#            'wind_gradient', 'CloudCover_lag_1h','cloud_cover_change']
columns = ["temp_hornsea", "WindDirectionPCA", "WindSpeedPCA", "hours_after", 'month', 'day', 'hour']
# columns = ["WindSpeedPCA"]

# Alternative (solar) configuration, kept for switching targets by hand.
# label = "Solar_MWh_credit"
# columns = ["temp_solar", "CloudCover", "SolarDownwardRadiation", "RelativeHumidity", "hours_after", 'month', 'day', 'hour']
# columns = ["SolarDownwardRadiation"]


def _labelled_arrays(frame, feature_columns, target):
    """Split ``frame`` into ``(missing_index, X, y)`` using only rows with a target.

    ``missing_index`` holds the index labels of the rows whose ``target`` is
    NaN. ``X`` and ``y`` are NumPy arrays built from the remaining rows.
    Rows are selected with a boolean mask rather than ``drop(labels)``, so the
    frame is filtered once and a non-unique index cannot remove labelled rows.
    """
    missing = frame[target].isna()
    labelled = frame.loc[~missing]
    return (
        frame.index[missing.to_numpy()],
        labelled[feature_columns].to_numpy(),
        labelled[target].to_numpy(),
    )


# `index` / `index_test` are kept for inspection of the discarded rows.
index, x, y = _labelled_arrays(df_train, columns, label)
index_test, x_test, y_test = _labelled_arrays(df_test, columns, label)

In [6]:
# Sanity check: (labelled training rows, number of feature columns).
x.shape

(1184170, 7)

In [None]:
# Exhaustive grid search over the XGBoost quantile-regression hyperparameters.
# Every configuration is fitted on (x, y) and scored with the mean pinball
# loss, averaged over all quantile levels, on both the train and test arrays.
# NOTE(review): configurations are compared on the test set itself; consider a
# separate validation split if the winning setup is reported as a test score.

# Single source of truth for the quantile levels: passed to the model and
# paired with the prediction columns when scoring.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

nestimators = [200, 300]
depths = [2, 3, 4, 5, 6, 7, 8, 10]
etas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
gammas = [0]
results = []

for ne in nestimators:
    print(f"Nest: {ne}")
    for d in depths:
        print(f"Depth: {d}")
        for e in etas:
            for gamma in gammas:
                model = XGBRegressor(device="cuda", gamma=gamma, eta=e, max_depth=d, n_estimators=ne,
                                     objective="reg:quantileerror", quantile_alpha=quantiles)

                # perf_counter is monotonic, so the duration cannot be skewed
                # by wall-clock adjustments during a long fit.
                start = time.perf_counter()
                model.fit(x, y)
                end = time.perf_counter()

                # One prediction column per quantile level, in `quantiles` order.
                preds_train = model.predict(x)
                preds = model.predict(x_test)
                pinball_train = np.array([pinball_score(y, pred, alpha).mean()
                                          for alpha, pred in zip(quantiles, preds_train.T)]).mean()
                pinball_test = np.array([pinball_score(y_test, pred, alpha).mean()
                                         for alpha, pred in zip(quantiles, preds.T)]).mean()

                results.append((ne, d, e, gamma, end - start, pinball_train, pinball_test))

# Build the report from the records directly so integer hyperparameters keep
# their dtype. "Zeit" is the fit duration in seconds.
scores = pd.DataFrame(
    results,
    columns=["nestimators", "depth", "eta", "gamma", "Zeit", "pinball_train", "pinball_test"],
).round(2)
scores.to_excel("scores.xlsx")

Nest: 200
Depth: 2
Depth: 3
Depth: 4
Depth: 5
Depth: 6
Depth: 7
Depth: 8
Depth: 10
Nest: 300
Depth: 2
Depth: 3
Depth: 4
Depth: 5
Depth: 6
Depth: 7
Depth: 8
Depth: 10


Baseline values (Basewerte), presumably the train / test pinball loss of the model below:
6.37
10.8
model = XGBRegressor(device="cuda", seed=123, reg_lambda=1, reg_alpha=0, gamma=0, eta=0.6, max_depth=10, n_estimators=200, objective="reg:quantileerror", quantile_alpha=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [7]:
# Fit one fixed configuration and report the mean pinball loss over the nine
# quantile levels (0.1 ... 0.9) on the training and test arrays.
model = XGBRegressor(
    device="cuda",
    seed=123,
    reg_lambda=1,
    reg_alpha=0,
    gamma=0,
    eta=0.3,
    max_depth=2,
    n_estimators=200,
    objective="reg:quantileerror",
    quantile_alpha=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
)
model.fit(x, y)

preds_train = model.predict(x)
preds = model.predict(x_test)

# Column k of the prediction matrix belongs to quantile level (k + 1) / 10.
train_losses = [pinball_score(y, column, (k + 1) / 10).mean() for k, column in enumerate(preds_train.T)]
test_losses = [pinball_score(y_test, column, (k + 1) / 10).mean() for k, column in enumerate(preds.T)]
pinball_train = np.array(train_losses).mean()
pinball_test = np.array(test_losses).mean()

print(f"Trainscore: {round(pinball_train, 2)}\nTestscore: {round(pinball_test, 2)}")



Trainscore: 35.61
Testscore: 24.83


In [12]:
# Mean test pinball loss over the nine quantile levels, recomputed from fresh
# predictions of the current `model` on the current `x_test` / `y_test`.
np.array([
    pinball_score(y_test, column, (k + 1) / 10).mean()
    for k, column in enumerate(model.predict(x_test).T)
]).mean()

18.66313098499053

In [13]:
# Persist the fitted model to a JSON file in the current working directory.
model.save_model("model_wind_241024.json")

In [17]:
# Re-score the current `model` on the current train / test arrays: mean pinball
# loss over the nine quantile levels (column k <-> level (k + 1) / 10).
preds_train = model.predict(x)
preds = model.predict(x_test)

train_losses = [pinball_score(y, column, (k + 1) / 10).mean() for k, column in enumerate(preds_train.T)]
test_losses = [pinball_score(y_test, column, (k + 1) / 10).mean() for k, column in enumerate(preds.T)]
pinball_train = np.array(train_losses).mean()
pinball_test = np.array(test_losses).mean()


In [19]:
# Display the mean training pinball loss computed in the previous cell.
pinball_train

35.60507123941789

In [26]:
# Raw quantile predictions for the test rows (one column per quantile level).
preds = model.predict(x_test)


In [28]:
# Sort each row in place so the predicted quantiles are non-decreasing from the
# lowest to the highest level (removes any quantile crossing).
# This mutates `preds`; re-run the predict cell to recover the raw output.
preds.sort(axis=1)

In [29]:
# Inspect the row-sorted quantile predictions (NumPy truncates the display).
preds

array([[ 26.828949,  31.607107,  38.549175, ...,  60.46124 ,  81.30625 ,
        100.38426 ],
       [ 26.828949,  31.607107,  32.694786, ...,  60.46124 ,  68.14519 ,
        100.38426 ],
       [ 26.828949,  31.607107,  32.694786, ...,  56.82568 ,  68.14519 ,
        100.38426 ],
       ...,
       [106.80937 , 180.69781 , 246.6882  , ..., 430.6432  , 501.51776 ,
        545.5533  ],
       [106.80937 , 180.69781 , 243.48657 , ..., 430.6432  , 501.51776 ,
        545.5533  ],
       [106.80937 , 180.69781 , 243.48657 , ..., 392.14825 , 501.74716 ,
        547.83435 ]], dtype=float32)

5% = 23.608
model = XGBRegressor(device="cuda", gamma=0, eta=0.1, max_depth=5, n_estimators=200, objective="reg:quantileerror", quantile_alpha=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])


In [None]:
# Feature names, in the column order used to build `x` / `x_test`.
columns

['temp_hornsea',
 'WindDirectionPCA',
 'WindSpeedPCA',
 'hours_after',
 'month',
 'day',
 'hour']

In [None]:
# Importance of each feature for the fitted model, in the same order as `columns`.
model.feature_importances_

array([0.02186771, 0.02763267, 0.8329607 , 0.03293702, 0.05183614,
       0.02510126, 0.00766461], dtype=float32)