## Import and Load Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import matplotlib.pyplot as plt

data = pd.read_csv("../../data/ethusd_group_project.csv",parse_dates=['time'], index_col='time')
data = data.dropna()

## Data cleaning
### Features engineering

Now we will also take note of the weekly and daily rv

In [4]:
daily_rv = data['realized_variance']
weekly_rv = daily_rv.resample('W').mean()
monthly_rv = daily_rv.resample('ME').mean()

Make it log RV

In [5]:
log_daily_rv = np.log(daily_rv)
log_weekly_rv = np.log(weekly_rv)
log_monthly_rv = np.log(monthly_rv)

### Train-test split
We are mindful that the data is a time series data, so we will not randomise the data for the train test split. We will use split according the paper done by Dudek et al.

In [None]:
# Define the initial training period
train_start = '2016-03-10' # First data point
train_end = '2018-12-31' # Let the data train for more than a year

# Create the training sample with additional features
X_train = pd.DataFrame({
    'ln_RV_d_t_minus_1': log_daily_rv.shift(1)[train_start:train_end],
    'ln_RV_w_t_minus_1': log_weekly_rv.shift(1)[train_start:train_end],
    'ln_RV_m_t_minus_1': log_monthly_rv.shift(1)[train_start:train_end],
    'open': data['open'][train_start:train_end],
    'close': data['close'][train_start:train_end],
    'high': data['high'][train_start:train_end],
    'low': data['low'][train_start:train_end],
    'volume': data['volume'][train_start:train_end],
    'daily_return': data['daily_return'][train_start:train_end]
})
y_train = log_daily_rv[train_start:train_end]

# Drop rows with NaN values
X_train.dropna(inplace=True)
y_train = y_train[X_train.index]


## Random Forest
We will use a RF model to predict the realised variance

### Training the RF model

In [8]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model
rf_model.fit(X_train, y_train)

Creating a one-day-ahead forecast

In [9]:
ln_rv_forecast = rf_model.predict(X_train.iloc[[-1]])[0]

### Updating the training sample
This is done by removing the oldest one and adding the latest one

### Out of sample performance
Now repeat the above mentioned steps from 2019 to 2023. This part will be the OOS section, and will be used to evaluate the model's performance.

Do note that our OOS begins in 2019.

In [None]:
forecasts = []

forecast_start = '2019-01-01'
forecast_end = '2023-10-08'

# Initialize the first forecast (use the last observation in the training set)
ln_rv_forecast = rf_model.predict(X_train.iloc[[-1]])[0]

# Loop through the forecasting period
for date in pd.date_range(forecast_start, forecast_end):
    # Update the training sample
    new_observation = pd.DataFrame({
        'ln_RV_d_t_minus_1': [ln_rv_forecast],
        'ln_RV_w_t_minus_1': [log_weekly_rv.asof(date)],
        'ln_RV_m_t_minus_1': [log_monthly_rv.asof(date)],
        'open': [data['open'].asof(date)],
        'close': [data['close'].asof(date)],
        'high': [data['high'].asof(date)],
        'low': [data['low'].asof(date)],
        'volume': [data['volume'].asof(date)],
        'daily_return': [data['daily_return'].asof(date)]
    }, index=[date])

    # Concatenate and drop the oldest observation
    X_train = pd.concat([X_train, new_observation])
    X_train.drop(X_train.index[0], inplace=True)  # Remove the oldest observation

    # Align y_train with X_train using reindex
    y_train = log_daily_rv.reindex(X_train.index)

    # Drop rows with NaN values
    y_train.dropna(inplace=True)
    X_train = X_train.loc[y_train.index]  # Ensure X_train matches y_train's index

    # For debugging purposes
    if X_train.shape[0] == y_train.shape[0]:
        rf_model.fit(X_train, y_train)
    else:
        raise ValueError("Error: Inconsistent number of samples. Cannot re-train the model.")

    # Generate the forecast
    ln_rv_forecast = rf_model.predict(new_observation)[0]
    rv_forecast = np.exp(ln_rv_forecast)

    # Store the forecast
    forecasts.append(rv_forecast)

Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)
Updated X_train shape: (3, 9)
y_train shape: (3,)


### Performance of the RF
We will use RMSE to  evaluate the results

In [15]:
forecast_dates = pd.date_range(start=forecast_start, end=forecast_end)
forecasts_series = pd.Series(forecasts, index=forecast_dates)
actual_rv = data['realized_variance'][forecast_start:forecast_end]
forecasts_series = forecasts_series.reindex(actual_rv.index)

# Calculate RMSFE
errors = forecasts_series - actual_rv
rmsfe = np.sqrt(np.mean(errors**2))
print("RMSFE:", rmsfe)


RMSFE: 0.0023384340017124678


### Feature Importance
Notice how the features importance is 0. This is because we are using a rolling window approach, meaning that we are taking around 1-3 observation per window and then training out model. This means that this si too 

In [22]:
importances = rf_model.feature_importances_

# You can create a DataFrame to neatly display the feature names alongside their importances
feature_importances_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("Feature Importances:")
print(feature_importances_df)


Feature Importances:
             feature  importance
0  ln_RV_d_t_minus_1         0.0
1  ln_RV_w_t_minus_1         0.0
2  ln_RV_m_t_minus_1         0.0
3               open         0.0
4              close         0.0
5               high         0.0
6                low         0.0
7             volume         0.0
8       daily_return         0.0
       ln_RV_d_t_minus_1  ln_RV_w_t_minus_1  ln_RV_m_t_minus_1    open  \
count           1.000000           1.000000           1.000000     1.0   
mean           -9.962319          -7.981488          -8.286138  1634.5   
std                  NaN                NaN                NaN     NaN   
min            -9.962319          -7.981488          -8.286138  1634.5   
25%            -9.962319          -7.981488          -8.286138  1634.5   
50%            -9.962319          -7.981488          -8.286138  1634.5   
75%            -9.962319          -7.981488          -8.286138  1634.5   
max            -9.962319          -7.981488          -8.2

### Plotting out