# Random Forest
The RF performed well in our group project when it came to predicting the RV of ETH. Now we will use RF to predict other coins.

## Import the libraries and data
To obtain the data, go to notebooks/data_preprocessing and run data_import.ipynb, then run data_preprocessing.ipynb. This will give you data/processed_data.csv

In [1]:
# Import the necessary libraries
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Load the three input tables, one CSV per sampling frequency.
# The paths are relative, so they resolve against the current working directory.
hourly_data = pd.read_csv('../../data/hourly_data.csv')
three_hourly_data = pd.read_csv('../../data/three_hourly_data.csv')
daily_data = pd.read_csv('../../data/daily_data.csv')


## Train test split
Now we will use a different train-test split from the group project:
- Group project: 80/20 split
- Individual: use 1 year of training data, then use a rolling window

In [2]:

# Train-test split
# Parse the 'Date' column of every frame so the cutoff comparisons below are
# made on timestamps rather than on raw strings.
hourly_data['Date'] = pd.to_datetime(hourly_data['Date'])
three_hourly_data['Date'] = pd.to_datetime(three_hourly_data['Date'])
daily_data['Date'] = pd.to_datetime(daily_data['Date'])

# Sort every frame by date, not only the hourly one: the rolling-window
# evaluation further down walks the rows in order and therefore relies on
# each frequency being chronological.
hourly_data = hourly_data.sort_values('Date')
three_hourly_data = three_hourly_data.sort_values('Date')
daily_data = daily_data.sort_values('Date')

# Determine when the first year ends, and use it as train data.
# The rest of the data is used as test data.
# The cutoff is derived from the hourly frame and then applied to all three
# frequencies so that they share the same train/test boundary.
min_date = hourly_data['Date'].min()
max_date = hourly_data['Date'].max()

# Total time span covered by the hourly data (informational only).
total_time_span = max_date - min_date

# End of the first year of data; rows up to and including this timestamp
# form the training set.
first_year_end = min_date + pd.DateOffset(years=1)

# Hourly rows that fall inside the first year.
first_year_data = hourly_data[hourly_data['Date'] <= first_year_end]

# Fraction (0-1) of hourly rows that fall inside the first year.
percentage_first_year = (len(first_year_data) / len(hourly_data))

train_split = percentage_first_year

# NOTE: despite the `X_` prefix, the frames below keep every column
# (features, target, 'Risk', ...); column selection happens later.

# Hourly data train-test split
X_train_hourly = hourly_data[hourly_data['Date'] <= first_year_end]
X_test_hourly = hourly_data[hourly_data['Date'] > first_year_end]

# Three hourly data train-test split
X_train_three_hourly = three_hourly_data[three_hourly_data['Date'] <= first_year_end]
X_test_three_hourly = three_hourly_data[three_hourly_data['Date'] > first_year_end]

# Daily data train-test split
X_train_daily = daily_data[daily_data['Date'] <= first_year_end]
X_test_daily = daily_data[daily_data['Date'] > first_year_end]

### Further split the data based on the risk level
There are low, medium, and high risk models.

In [3]:
# Short key used throughout the notebook -> label stored in the 'Risk' column.
_RISK_LABELS = {
    'low': 'Low Risk',
    'medium': 'Medium Risk',
    'high': 'High Risk',
}


def _split_by_risk(frame):
    """Return {'low': ..., 'medium': ..., 'high': ...} with the rows of
    *frame* whose 'Risk' column equals the corresponding label."""
    return {
        key: frame[frame['Risk'] == label]
        for key, label in _RISK_LABELS.items()
    }


# One dict of per-risk-level frames for each (split, frequency) combination.
train_date_split_hourly = _split_by_risk(X_train_hourly)
test_date_split_hourly = _split_by_risk(X_test_hourly)

train_date_split_three_hourly = _split_by_risk(X_train_three_hourly)
test_date_split_three_hourly = _split_by_risk(X_test_three_hourly)

train_date_split_daily = _split_by_risk(X_train_daily)
test_date_split_daily = _split_by_risk(X_test_daily)


# Train the models based on their risk classification

### Training data

This will give us 3 models to work with: model_low, model_medium, and model_high. We will subsequently use these models on the test data to evaluate them.

In [4]:
def train_models_by_frequency_and_risk(train_data, frequencies):
    """Fit one random-forest regressor per (frequency, risk group) pair.

    Parameters
    ----------
    train_data : dict
        Maps the keys 'low', 'medium' and 'high' to a DataFrame holding the
        feature columns and the target column of every requested frequency.
    frequencies : iterable of str
        Any of 'hourly', '3hourly', 'daily'.

    Returns
    -------
    models : dict
        ``models[freq][risk_group]`` is the fitted ``RandomForestRegressor``.
    model_summary : pandas.DataFrame
        One row per fitted model with the columns 'Frequency', 'Risk Group',
        'MSE (Train)' (in-sample error) and 'Top Features (Importance)'
        (the three most important features with their scores).

    Raises
    ------
    KeyError
        If a frequency is unknown, a risk group is missing from
        ``train_data``, or the target column is absent.
    ValueError
        If none of the expected feature columns is present, or if no row has
        all features and the target populated.
    """
    summary_columns = ['Frequency', 'Risk Group', 'MSE (Train)', 'Top Features (Importance)']

    feature_map = {
        'hourly': [
            'ln_hourly_rv_lag1', 'ln_hourly_rv_lag2',
            'ln_hourly_rv_lag3', 'ln_hourly_rv_lag8',
            'ln_hourly_rv_lag24',
        ],
        '3hourly': [
            'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2',
            'ln_3_hourly_rv_lag3', 'ln_3_hourly_rv_lag4',
            'ln_3_hourly_rv_lag8',
        ],
        'daily': [
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_daily_rv_lag3', 'ln_weekly_rv', 'ln_weekly_rv_lag1',
            'ln_monthly_rv', 'ln_monthly_rv_lag1',
        ],
    }
    target_map = {
        'hourly': 'ln_hourly_rv',
        '3hourly': 'ln_3_hourly_rv',
        'daily': 'ln_daily_rv',
    }

    models = {}
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; concatenating onto an empty frame inside the loop is slow and
    # triggers a pandas FutureWarning.
    summary_rows = []

    for freq in frequencies:
        target = target_map[freq]
        features = feature_map[freq]
        models[freq] = {}

        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]

            available_features = [f for f in features if f in group_data.columns]
            if not available_features:
                raise ValueError(f"No valid features for {freq}-{risk_group}")

            # Drop incomplete rows on features AND target together. This keeps
            # X and y aligned by construction (also with duplicate index
            # labels) and keeps NaN targets out of the fit.
            complete_rows = group_data[available_features + [target]].dropna()
            if complete_rows.empty:
                raise ValueError(f"No complete training rows for {freq}-{risk_group}")

            X_train = complete_rows[available_features]
            y_train = complete_rows[target]

            model = RandomForestRegressor(
                n_estimators=100,
                random_state=42,
                max_depth=5
            )
            model.fit(X_train, y_train)
            models[freq][risk_group] = model

            # In-sample error only; out-of-sample evaluation is done elsewhere.
            mse_train = mean_squared_error(y_train, model.predict(X_train))

            # Three most important features, formatted "feature: 0.1234, ...".
            feature_importance = pd.Series(model.feature_importances_, index=available_features)
            top_three = feature_importance.sort_values(ascending=False).head(3)
            top_features_str = ", ".join(
                f"{feature}: {importance:.4f}" for feature, importance in top_three.items()
            )

            summary_rows.append({
                'Frequency': freq,
                'Risk Group': risk_group,
                'MSE (Train)': mse_train,
                'Top Features (Importance)': top_features_str,
            })

    model_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    return models, model_summary

# Fit the low/medium/high-risk models separately for each sampling frequency.
# Each call returns (models, summary) for the single frequency it is given.
model_hourly, summary_hourly = train_models_by_frequency_and_risk(
    train_data=train_date_split_hourly,
    frequencies=['hourly'],
)
model_three_hourly, summary_three_hourly = train_models_by_frequency_and_risk(
    train_data=train_date_split_three_hourly,
    frequencies=['3hourly'],
)
model_daily, summary_daily = train_models_by_frequency_and_risk(
    train_data=train_date_split_daily,
    frequencies=['daily'],
)

  model_summary = pd.concat([
  model_summary = pd.concat([
  model_summary = pd.concat([


### Model summary of training data

In [5]:
# Stack the per-frequency training summaries into a single table and show it.
training_summaries = [summary_hourly, summary_three_hourly, summary_daily]
combined_summary = pd.concat(training_summaries, ignore_index=True)
print("Combined Model Summary:")
print(combined_summary)

Combined Model Summary:
  Frequency Risk Group  MSE (Train)  \
0    hourly        low     5.745561   
1    hourly     medium     5.760332   
2    hourly       high     5.959230   
3   3hourly        low     1.823590   
4   3hourly     medium     1.634486   
5   3hourly       high     1.685722   
6     daily        low     0.415582   
7     daily     medium     0.347178   
8     daily       high     0.334317   

                           Top Features (Importance)  
0  ln_hourly_rv_lag1: 0.4985, ln_hourly_rv_lag2: ...  
1  ln_hourly_rv_lag1: 0.4237, ln_hourly_rv_lag2: ...  
2  ln_hourly_rv_lag1: 0.4593, ln_hourly_rv_lag2: ...  
3  ln_3_hourly_rv_lag1: 0.5774, ln_3_hourly_rv_la...  
4  ln_3_hourly_rv_lag1: 0.4729, ln_3_hourly_rv_la...  
5  ln_3_hourly_rv_lag1: 0.5508, ln_3_hourly_rv_la...  
6  ln_weekly_rv: 0.6168, ln_weekly_rv_lag1: 0.120...  
7  ln_weekly_rv: 0.5829, ln_weekly_rv_lag1: 0.152...  
8  ln_weekly_rv: 0.8055, ln_weekly_rv_lag1: 0.076...  


### Implement rolling window
Now implement rolling window for the test data

In [6]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    """Walk forward through the test rows and pair each prediction with a target.

    For every window end ``i`` from ``window_size`` up to
    ``len(X_test) - step_ahead`` (inclusive), the model predicts from the
    features in row ``i - 1`` (the last row of the window) and the prediction
    is paired with the target and date in row ``i - 1 + step_ahead``. Only
    the last row of each window is passed to the model, so ``window_size``
    effectively sets how many leading rows are skipped.

    NOTE(review): features in row k are scored against the target in row
    k + step_ahead. If the feature columns are already lags of the same-row
    target (as their ``_lag`` names suggest), ``step_ahead=1`` may compare
    each forecast with the following period's value - confirm that this
    alignment is intended.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature columns plus a 'Date' column, in evaluation order.
    y_test : pandas.Series
        Targets, positionally aligned with ``X_test``.
    model : object
        Fitted estimator exposing ``predict``.
    window_size : int, default 24
        Length of the rolling window; must be at least 1.
    step_ahead : int, default 1
        Row offset between the feature row and the target row; must be >= 0.

    Returns
    -------
    predictions, actuals, dates : list, list, list
        Three lists of equal length; all empty when there are fewer than
        ``window_size + step_ahead`` rows.

    Raises
    ------
    ValueError
        If ``window_size < 1`` or ``step_ahead < 0``.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if step_ahead < 0:
        raise ValueError("step_ahead must be non-negative")

    first_feature_row = window_size - 1
    feature_stop = len(X_test) - step_ahead  # exclusive end of the feature rows
    if feature_stop <= first_feature_row:
        # Not enough rows for a single window plus its look-ahead target.
        return [], [], []

    # Drop 'Date' once and predict every row in a single call instead of
    # re-dropping the column and calling predict for each window.
    feature_rows = X_test.drop(columns=['Date']).iloc[first_feature_row:feature_stop]
    predictions = list(model.predict(feature_rows))

    # Targets and dates sit `step_ahead` rows after their feature row.
    target_rows = slice(first_feature_row + step_ahead, feature_stop + step_ahead)
    actuals = y_test.iloc[target_rows].tolist()
    dates = X_test['Date'].iloc[target_rows].tolist()

    return predictions, actuals, dates

In [7]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    """Score the fitted models on the test data, ticker by ticker.

    For every requested frequency and each risk group, the rows of each
    ticker are sorted by date, incomplete rows are dropped, and
    ``rolling_window_predictions`` is used to produce one-step-ahead
    predictions that are scored with MSE and R².

    Parameters
    ----------
    test_data_split : dict
        Maps 'low', 'medium' and 'high' to a DataFrame with the columns
        'Date', 'Ticker', the feature columns and the target column.
    models : dict
        ``models[freq][risk_group]`` is the fitted estimator to evaluate.
    frequencies : iterable of str
        Any of 'hourly', '3hourly', 'daily'.
    window_size : int, default 24
        Rolling-window length forwarded to ``rolling_window_predictions``.

    Returns
    -------
    evaluation_summary : pandas.DataFrame
        One row per evaluated ticker with the columns 'Frequency',
        'Risk Group', 'Ticker', 'MSE (Test)' and 'R²'.
    detailed_results : pandas.DataFrame
        One row per prediction with the columns 'Date', 'Ticker',
        'Risk Group', 'Frequency', 'Predicted' and 'Actual'.

    Raises
    ------
    KeyError
        If a frequency is unknown or a model / risk group is missing.
    ValueError
        If none of the expected feature columns is present for a group.
    """
    summary_columns = ['Frequency', 'Risk Group', 'Ticker', 'MSE (Test)', 'R²']
    detail_columns = ['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual']

    # Must list the same feature columns the models were trained with.
    feature_map = {
        'hourly': [
            'ln_hourly_rv_lag1', 'ln_hourly_rv_lag2',
            'ln_hourly_rv_lag3', 'ln_hourly_rv_lag8',
            'ln_hourly_rv_lag24',
        ],
        '3hourly': [
            'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2',
            'ln_3_hourly_rv_lag3', 'ln_3_hourly_rv_lag4',
            'ln_3_hourly_rv_lag8',
        ],
        'daily': [
            'ln_daily_rv_lag1', 'ln_daily_rv_lag2',
            'ln_daily_rv_lag3', 'ln_weekly_rv', 'ln_weekly_rv_lag1',
            'ln_monthly_rv', 'ln_monthly_rv_lag1',
        ],
    }
    target_map = {
        'hourly': 'ln_hourly_rv',
        '3hourly': 'ln_3_hourly_rv',
        'daily': 'ln_daily_rv',
    }
    # An earlier method needed 1, 3 and 24 steps here; every frequency now
    # predicts exactly one step ahead.
    step_ahead_map = {
        'hourly': 1,
        '3hourly': 1,
        'daily': 1,
    }

    # Results are gathered in lists and assembled once at the end;
    # concatenating onto an empty frame inside the loop is quadratic and
    # triggers a pandas FutureWarning.
    summary_rows = []
    detail_frames = []

    for freq in frequencies:
        target = target_map[freq]
        step_ahead = step_ahead_map[freq]
        features = feature_map[freq]

        for risk_group in ['low', 'medium', 'high']:
            model = models[freq][risk_group]
            group_data = test_data_split[risk_group]

            available_features = [f for f in features if f in group_data.columns]
            if not available_features:
                raise ValueError(f"No valid features for {freq}-{risk_group}")

            for ticker in group_data['Ticker'].unique():
                ticker_data = group_data[group_data['Ticker'] == ticker]

                # Features are present by construction, so only the target
                # can still be missing at this point.
                if target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing features or target for {freq}-{risk_group}")
                    continue

                # The rolling evaluation walks rows in order, so enforce
                # chronological order rather than relying on the caller.
                ticker_data = ticker_data.sort_values('Date', kind='stable')

                # Drop incomplete rows on features and target together so a
                # NaN target cannot reach the metric functions.
                complete_rows = ticker_data[['Date'] + available_features + [target]].dropna()
                X_test = complete_rows[['Date'] + available_features]
                y_test = complete_rows[target]

                # Ensure sufficient data for window and step_ahead
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue

                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model, window_size=window_size, step_ahead=step_ahead
                )
                if len(predictions) == 0:
                    continue

                summary_rows.append({
                    'Frequency': freq,
                    'Risk Group': risk_group,
                    'Ticker': ticker,
                    'MSE (Test)': mean_squared_error(actuals, predictions),
                    'R²': r2_score(actuals, predictions),
                })
                detail_frames.append(pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group,
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals,
                }))

    evaluation_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    if detail_frames:
        detailed_results = pd.concat(detail_frames, ignore_index=True)
    else:
        # Keep the expected columns even when nothing could be evaluated.
        detailed_results = pd.DataFrame(columns=detail_columns)

    return evaluation_summary, detailed_results


In [8]:
from pathlib import Path

# Evaluate each frequency's models on its own test split. The same window
# length (24 rows) is used for every frequency.
evaluation_summary_hourly, detailed_results_hourly = evaluate_models_on_test_data(
    test_date_split_hourly, model_hourly, ['hourly'], window_size=24
)
evaluation_summary_three_hourly, detailed_results_three_hourly = evaluate_models_on_test_data(
    test_date_split_three_hourly, model_three_hourly, ['3hourly'], window_size=24
)
evaluation_summary_daily, detailed_results_daily = evaluate_models_on_test_data(
    test_date_split_daily, model_daily, ['daily'], window_size=24
)

# NOTE: `combined_summary` is rebound here and from now on holds the test
# evaluation summary.
combined_summary = pd.concat([evaluation_summary_hourly, evaluation_summary_three_hourly, evaluation_summary_daily], ignore_index=True)
combined_details = pd.concat([detailed_results_hourly, detailed_results_three_hourly, detailed_results_daily], ignore_index=True)

# to_csv does not create missing directories, so make sure the results
# folder exists before writing the per-prediction details.
results_path = Path('../../results/rf.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
combined_details.to_csv(results_path, index=False)

print("Evaluation Summary:")
print(combined_summary)

  evaluation_summary = pd.concat([
  detailed_results = pd.concat([detailed_results, ticker_df], ignore_index=True)
  evaluation_summary = pd.concat([
  detailed_results = pd.concat([detailed_results, ticker_df], ignore_index=True)
  evaluation_summary = pd.concat([
  detailed_results = pd.concat([detailed_results, ticker_df], ignore_index=True)


Evaluation Summary:
   Frequency Risk Group    Ticker  MSE (Test)        R²
0     hourly        low   BTC-USD    6.137385  0.032154
1     hourly        low   ETH-USD    6.250909 -0.020190
2     hourly     medium   XRP-USD    6.189495  0.117544
3     hourly       high  DOGE-USD    6.086399  0.025906
4     hourly       high   SOL-USD    5.986613  0.009177
5    3hourly        low   BTC-USD    2.128615  0.099694
6    3hourly        low   ETH-USD    2.030861  0.057515
7    3hourly     medium   XRP-USD    2.085522  0.307492
8    3hourly       high  DOGE-USD    1.852628  0.149222
9    3hourly       high   SOL-USD    1.663300  0.092660
10     daily        low   BTC-USD    0.995994  0.054637
11     daily        low   ETH-USD    0.817043  0.112460
12     daily     medium   XRP-USD    1.189057  0.387768
13     daily       high  DOGE-USD    0.828767  0.140296
14     daily       high   SOL-USD    0.676554  0.088763
