# Random Forest
The RF performed well in our group project when it came to predicting the RV of ETH. Now we will use RF to predict the RV of other coins.

## Import the libraries and data
To obtain the data, go to notebooks/data_preprocessing, run data_import.ipynb, and then run data_preprocessing.ipynb. This will produce data/processed_data.csv, which is loaded below.

In [9]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
import numpy as np


# Load the preprocessed dataset, parse the timestamps, and discard incomplete rows
data = (
    pd.read_csv('../../data/processed_data.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .dropna()
)

# Show which columns are available for modelling
print(data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker',
       'ln_hourly_return', 'ln_3_hourly_return', 'ln_hourly_rv',
       'ln_3_hourly_rv', 'ln_daily_rv', 'ln_weekly_rv', 'ln_monthly_rv',
       'ln_daily_rv_lag1', 'ln_daily_rv_lag2', 'ln_weekly_rv_lag1',
       'ln_weekly_rv_lag2', 'ln_monthly_rv_lag1', 'ln_monthly_rv_lag2',
       'ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_hourly_rv_lag2',
       'ln_3_hourly_rv_lag2', 'ln_hourly_return_lag1',
       'ln_3_hourly_return_lag1', 'ln_hourly_return_lag2',
       'ln_3_hourly_return_lag2', 'Risk'],
      dtype='object')


## Train test split
Now we will use a different train-test split from the group project:
- Group project: 80/20 split
- Individual: use the first year of data for training, then evaluate the remaining data with a rolling window

In [10]:
# Train-test split
# Train data: the first calendar year of observations
# Test data: everything after the first year
# Sort the data by date so that positional slicing follows time order
data = data.sort_values('Date')

# Date range covered by the data
min_date = data['Date'].min()
max_date = data['Date'].max()

# Total time span of the data (informational only)
total_time_span = max_date - min_date

# The training period ends one calendar year after the first observation
first_year_end = min_date + pd.DateOffset(years=1)

# Rows that fall inside the first year (the boundary timestamp is included)
first_year_data = data[data['Date'] <= first_year_end]

# Share of rows that belongs to the first year (informational only)
percentage_first_year = len(first_year_data) / len(data)
train_split = percentage_first_year

# Split on the row count itself. Converting the count to a fraction and back
# with int(fraction * len(data)) can truncate to one row too few because of
# floating-point rounding, which would move a training row into the test set.
n_train = len(first_year_data)
train_data = data[:n_train]
test_data = data[n_train:]

# Print train and test data date
print(train_data['Date'].min(), train_data['Date'].max())
print(test_data['Date'].min(), test_data['Date'].max())

2023-05-03 00:00:00+00:00 2024-05-03 00:00:00+00:00
2024-05-03 01:00:00+00:00 2025-03-10 23:00:00+00:00


### Further split the data based on the risk level
There are low, medium, and high risk models.

In [11]:
# Short risk key -> label stored in the 'Risk' column
risk_labels = {'low': 'Low Risk', 'medium': 'Medium Risk', 'high': 'High Risk'}

# One training DataFrame per risk group
train_data_split = {
    key: train_data[train_data['Risk'] == label]
    for key, label in risk_labels.items()
}

# One test DataFrame per risk group
test_data_split = {
    key: test_data[test_data['Risk'] == label]
    for key, label in risk_labels.items()
}

# Train the model based on their classifications

### Training data

This will give us one model per frequency and risk group (3 frequencies × 3 risk groups = 9 models), stored as models[frequency][risk_group]. We will then use these models on the test data to evaluate them.

In [12]:
def train_models_by_frequency_and_risk(train_data, frequencies):
    """Fit one random forest per (frequency, risk group) pair.

    Args:
        train_data: Dict mapping 'low', 'medium' and 'high' to the training
            DataFrame of that risk group.
        frequencies: Iterable of frequency names; each must be one of
            'hourly', '3hourly' or 'daily'.

    Returns:
        A tuple ``(models, model_summary)``. ``models[freq][risk_group]`` is
        the fitted RandomForestRegressor. ``model_summary`` is a DataFrame
        with one row per fitted model holding the in-sample MSE and the three
        most important features with their importance scores.

    Raises:
        KeyError: If a frequency is unknown or a risk group is missing from
            ``train_data``.
        ValueError: If none of the candidate features for a frequency exist
            in a risk group's data.
    """
    summary_columns = ['Frequency', 'Risk Group', 'MSE (Train)', 'Top Features (Importance)']

    # Candidate predictors per frequency. Candidates that are absent from the
    # data are skipped silently further down.
    feature_map = {
        'hourly': [
            'ln_hourly_rv_lag1', 'ln_hourly_rv_lag2',
            'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2',
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_hourly_return_lag1', 'ln_hourly_return_lag2',
            'ln_3_hourly_return_lag1', 'ln_3_hourly_return_lag2'
        ],
        '3hourly': [
            'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2',
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_weekly_rv_lag1', 'ln_weekly_rv_lag2',
            'ln_3_hourly_return_lag1', 'ln_3_hourly_return_lag2',
            'ln_daily_return_lag1', 'ln_daily_return_lag2'
        ],
        'daily': [
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_weekly_rv_lag1', 'ln_weekly_rv_lag2',
            'ln_monthly_rv_lag1', 'ln_monthly_rv_lag2',
            'ln_daily_return_lag1', 'ln_daily_return_lag2',
            'ln_weekly_return_lag1', 'ln_weekly_return_lag2'
        ]
    }

    # Column predicted at each frequency
    target_map = {
        'hourly': 'ln_hourly_rv',
        '3hourly': 'ln_3_hourly_rv',
        'daily': 'ln_daily_rv'
    }

    models = {}
    # Rows are collected in a list and turned into a DataFrame once at the
    # end; concatenating onto an initially empty DataFrame inside the loop is
    # quadratic and triggers a pandas FutureWarning.
    summary_rows = []

    for freq in frequencies:
        target = target_map[freq]
        features = feature_map[freq]
        models[freq] = {}

        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]

            available_features = [f for f in features if f in group_data.columns]
            if not available_features:
                raise ValueError(f"No valid features for {freq}-{risk_group}")

            # Keep only rows where every feature AND the target are present,
            # so the model is never fitted on a NaN target.
            complete_rows = group_data[available_features + [target]].dropna()
            X_train = complete_rows[available_features]
            y_train = complete_rows[target]

            model = RandomForestRegressor(
                n_estimators=100,
                random_state=42,
                max_depth=5
            )
            model.fit(X_train, y_train)
            models[freq][risk_group] = model

            # In-sample error: the model is scored on the rows it was fitted on
            mse_train = mean_squared_error(y_train, model.predict(X_train))

            # Three most important features, formatted as "name: score"
            importances = pd.Series(model.feature_importances_, index=available_features)
            top_three = importances.sort_values(ascending=False).head(3)
            top_features_str = ", ".join(
                f"{feature}: {importance:.4f}" for feature, importance in top_three.items()
            )

            summary_rows.append({
                'Frequency': freq,
                'Risk Group': risk_group,
                'MSE (Train)': mse_train,
                'Top Features (Importance)': top_features_str
            })

    model_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    return models, model_summary

# Fit the models for every frequency on the risk-split training data
models, model_summary = train_models_by_frequency_and_risk(
    train_data=train_data_split,
    frequencies=['hourly', '3hourly', 'daily'],
)

  model_summary = pd.concat([


### Model summary of training data

In [13]:

# Display the training summary (one row per frequency and risk group)
model_summary

Unnamed: 0,Frequency,Risk Group,MSE (Train),Top Features (Importance)
0,hourly,low,5.277956,"ln_3_hourly_rv_lag1: 0.4415, ln_hourly_rv_lag1..."
1,hourly,medium,5.764612,"ln_3_hourly_rv_lag1: 0.3081, ln_daily_rv_lag1:..."
2,hourly,high,5.660812,"ln_daily_rv_lag1: 0.6906, ln_3_hourly_rv_lag1:..."
3,3hourly,low,1.69928,"ln_3_hourly_rv_lag1: 0.5821, ln_3_hourly_rv_la..."
4,3hourly,medium,1.765836,"ln_3_hourly_rv_lag1: 0.5011, ln_daily_rv_lag1:..."
5,3hourly,high,1.610396,"ln_daily_rv_lag1: 0.6437, ln_3_hourly_rv_lag1:..."
6,daily,low,0.571216,"ln_daily_rv_lag1: 0.3858, ln_weekly_rv_lag1: 0..."
7,daily,medium,0.660345,"ln_daily_rv_lag1: 0.5175, ln_weekly_rv_lag2: 0..."
8,daily,high,0.506235,"ln_daily_rv_lag1: 0.7162, ln_weekly_rv_lag1: 0..."


### Implement rolling window
Now implement rolling window for the test data

In [None]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    """Predict along the test set and pair each prediction with a later actual.

    For every origin ``i`` from ``window_size`` up to
    ``len(X_test) - step_ahead``, the feature row at position ``i - 1`` (the
    last row of the window ``[i - window_size, i)``) is fed to the model. The
    prediction is paired with the target and date at position
    ``i + step_ahead - 1``. The model is not refitted; the window only
    determines how many leading rows are skipped.

    NOTE(review): the actual value sits ``step_ahead`` rows after the feature
    row, whereas the training code in this file fits the target of the same
    row as the features. Confirm this offset is intended.

    Args:
        X_test: DataFrame with a 'Date' column plus the feature columns the
            model expects, in time order.
        y_test: Series of target values aligned positionally with ``X_test``.
        model: Fitted estimator exposing ``predict``.
        window_size: Number of rows in each window; must be at least 1.
        step_ahead: Row offset between the feature row and the actual value;
            must be non-negative.

    Returns:
        A tuple ``(predictions, actuals, dates)`` of equally long lists. All
        three are empty when there are too few rows for a single prediction.

    Raises:
        ValueError: If ``window_size`` is below 1, ``step_ahead`` is negative,
            or ``y_test`` has fewer rows than ``X_test``.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if step_ahead < 0:
        raise ValueError("step_ahead must be non-negative")

    n_rows = len(X_test)
    if len(y_test) < n_rows:
        raise ValueError("y_test must have at least as many rows as X_test")

    # Last origin that still has an actual value step_ahead rows later
    last_origin = n_rows - step_ahead
    if last_origin < window_size:
        return [], [], []

    # Drop 'Date' once instead of once per iteration, and predict all rows in
    # a single call: only the last row of each window is ever fed to the
    # model, so the windows collapse to the contiguous rows
    # window_size - 1 .. last_origin - 1.
    features = X_test.drop(columns=['Date'])
    feature_rows = features.iloc[window_size - 1:last_origin]
    predictions = list(model.predict(feature_rows))

    # Positions i + step_ahead - 1 for every origin i
    target_rows = slice(window_size + step_ahead - 1, n_rows)
    actuals = list(y_test.iloc[target_rows])
    dates = list(X_test['Date'].iloc[target_rows])

    return predictions, actuals, dates

In [15]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    """Score every fitted model on the test data, one ticker at a time.

    Args:
        test_data_split: Dict mapping 'low', 'medium' and 'high' to the test
            DataFrame of that risk group. Each frame needs 'Date' and
            'Ticker' columns.
        models: Nested dict where ``models[freq][risk_group]`` is the fitted
            model for that combination.
        frequencies: Iterable of frequency names; each must be one of
            'hourly', '3hourly' or 'daily'.
        window_size: Window length passed on to
            ``rolling_window_predictions``.

    Returns:
        A tuple ``(evaluation_summary, detailed_results)``.
        ``evaluation_summary`` has one row per (frequency, risk group, ticker)
        with the test MSE and R². ``detailed_results`` has one row per
        prediction with its date, ticker, risk group, frequency, predicted
        value and actual value.

    Raises:
        KeyError: If a frequency is unknown, or a model or risk group is
            missing from the inputs.
        ValueError: If none of the candidate features for a frequency exist
            in a risk group's data.
    """
    summary_columns = ['Frequency', 'Risk Group', 'Ticker', 'MSE (Test)', 'R²']
    detail_columns = ['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual']

    # Candidate predictors per frequency; must stay identical to the feature
    # sets used in train_models_by_frequency_and_risk so the models receive
    # the columns they were fitted on.
    feature_map = {
        'hourly': [
            'ln_hourly_rv_lag1', 'ln_hourly_rv_lag2',
            'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2',
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_hourly_return_lag1', 'ln_hourly_return_lag2',
            'ln_3_hourly_return_lag1', 'ln_3_hourly_return_lag2'
        ],
        '3hourly': [
            'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2',
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_weekly_rv_lag1', 'ln_weekly_rv_lag2',
            'ln_3_hourly_return_lag1', 'ln_3_hourly_return_lag2',
            'ln_daily_return_lag1', 'ln_daily_return_lag2'
        ],
        'daily': [
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_weekly_rv_lag1', 'ln_weekly_rv_lag2',
            'ln_monthly_rv_lag1', 'ln_monthly_rv_lag2',
            'ln_daily_return_lag1', 'ln_daily_return_lag2',
            'ln_weekly_return_lag1', 'ln_weekly_return_lag2'
        ]
    }

    # Column predicted at each frequency
    target_map = {
        'hourly': 'ln_hourly_rv',
        '3hourly': 'ln_3_hourly_rv',
        'daily': 'ln_daily_rv'
    }

    # Row offset between features and the actual value per frequency
    # (presumably counted in hourly rows, i.e. 24 rows = one day; verify
    # against the sampling of the input data).
    step_ahead_map = {
        'hourly': 1,
        '3hourly': 3,
        'daily': 24
    }

    # Results are collected in lists and assembled once at the end;
    # concatenating onto initially empty DataFrames inside the loop is
    # quadratic and triggers a pandas FutureWarning.
    summary_rows = []
    detail_frames = []

    for freq in frequencies:
        target = target_map[freq]
        step_ahead = step_ahead_map[freq]
        features = feature_map[freq]

        for risk_group in ['low', 'medium', 'high']:
            model = models[freq][risk_group]
            group_data = test_data_split[risk_group]

            # Candidates that are absent from the data are skipped silently
            available_features = [f for f in features if f in group_data.columns]
            if not available_features:
                raise ValueError(f"No valid features for {freq}-{risk_group}")

            for ticker in group_data['Ticker'].unique():
                ticker_data = group_data[group_data['Ticker'] == ticker]

                # The features were already checked against these columns,
                # so only the target can still be missing.
                if target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing features or target for {freq}-{risk_group}")
                    continue

                # Keep only rows where every feature AND the target are
                # present, so no NaN reaches the metric calculation.
                complete_rows = ticker_data[['Date'] + available_features + [target]].dropna()
                X_test = complete_rows[['Date'] + available_features]
                y_test = complete_rows[target]

                # Ensure sufficient data for window and step_ahead
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue

                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model, window_size=window_size, step_ahead=step_ahead
                )

                if len(predictions) == 0:
                    continue

                summary_rows.append({
                    'Frequency': freq,
                    'Risk Group': risk_group,
                    'Ticker': ticker,
                    'MSE (Test)': mean_squared_error(actuals, predictions),
                    'R²': r2_score(actuals, predictions)
                })

                detail_frames.append(pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group,
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals
                }))

    evaluation_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    if detail_frames:
        detailed_results = pd.concat(detail_frames, ignore_index=True)
    else:
        # Nothing was evaluated: still return the documented column layout
        detailed_results = pd.DataFrame(columns=detail_columns)

    return evaluation_summary, detailed_results

# Evaluate every model on the held-out data using the default window size
evaluation_summary, detailed_results = evaluate_models_on_test_data(
    test_data_split=test_data_split,
    models=models,
    frequencies=['hourly', '3hourly', 'daily'],
)

# Display the per-ticker test metrics
evaluation_summary

  evaluation_summary = pd.concat([
  detailed_results = pd.concat([detailed_results, ticker_df], ignore_index=True)


Unnamed: 0,Frequency,Risk Group,Ticker,MSE (Test),R²
0,hourly,low,BTC-USD,6.050921,0.054462
1,hourly,medium,ETH-USD,5.818246,0.047285
2,hourly,medium,XRP-USD,5.946979,0.16592
3,hourly,high,DOGE-USD,5.924037,0.063279
4,hourly,high,SOL-USD,5.907248,0.02967
5,3hourly,low,BTC-USD,2.173255,0.092026
6,3hourly,medium,ETH-USD,1.895229,0.125925
7,3hourly,medium,XRP-USD,2.031394,0.341096
8,3hourly,high,DOGE-USD,1.817135,0.172729
9,3hourly,high,SOL-USD,1.6626,0.076874


In [17]:
# Save the detailed results
from pathlib import Path

# Create the results directory first: to_csv does not create missing parent
# folders and fails when the directory does not exist.
results_path = Path('../../results/rf.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
detailed_results.to_csv(results_path, index=False)