In [None]:
# --- Run configuration -------------------------------------------------------
# Calendar year held out for prediction; it also names the output file.
YEAR = 2021

# Input CSV, the frequency string the series is re-indexed to, and the
# destination of the predictions file.
DATA_PATH = 'data/no_exogenous/data.csv'
FREQ = 'h'
OUTPUT_PATH = f'data/predictions_rf_{YEAR}.csv'

# Random-forest hyper-parameters.
N_ESTIMATORS = 1500
MAX_DEPTH = 200

In [None]:
import pandas as pd
import multiprocessing as mp
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Predictor columns fed to the model. 'year' is derived from the index below;
# the other columns are expected to already exist in the CSV.
FEATURE_COLUMNS = ['hour', 'day_of_week', 'month', 'year', 'is_weekend', 'quarter', 'is_holiday']

# Load the series indexed by timestamp, conform it to the FREQ grid, drop any
# row containing a NaN, and add the calendar year of each row.
data = (
    pd.read_csv(DATA_PATH, parse_dates=['timestamp'])
      .set_index('timestamp')
      .asfreq(FREQ)
      .dropna()
      .assign(year=lambda frame: frame.index.year)
)

# Training window: the years strictly between YEAR - 5 and YEAR
# (i.e. YEAR - 4 .. YEAR - 1). The test set is YEAR itself.
in_train_window = (data['year'] > YEAR - 5) & (data['year'] < YEAR)
in_test_year = data['year'] == YEAR

train = data.loc[in_train_window]
test = data.loc[in_test_year]

X_train, y_train = train[FEATURE_COLUMNS], train['value']
X_test = test[FEATURE_COLUMNS]

In [None]:
# Use about half of the available cores, but never fewer than one worker:
# on a single-core host `cpu_count() // 2` is 0, and joblib rejects
# n_jobs == 0 with a ValueError.
n_workers = max(1, mp.cpu_count() // 2)

# Bagged random forest; oob_score=True additionally computes the
# out-of-bag score during fitting (exposed as `model.oob_score_`).
model = RandomForestRegressor(
    max_depth=MAX_DEPTH,
    n_estimators=N_ESTIMATORS,
    bootstrap=True,
    oob_score=True,
    n_jobs=n_workers,
    random_state=42  # fixed seed so repeated runs give the same forest
)
model.fit(X_train, y_train)

In [None]:
# Predict the target for every row of the held-out year; the result is
# positionally aligned with the rows of X_test.
y_pred = model.predict(X_test)


In [None]:
# Copy the test rows and overwrite the observed 'value' with the prediction.
out = test.copy()
out['value'] = y_pred

# Write the timestamp index to the file. The remaining columns carry hour,
# month and year but no day of month, so without the index the predictions
# could not be mapped back to the points in time they refer to.
out.to_csv(OUTPUT_PATH, index=True, index_label='timestamp')