# GARCH Model
For each risk classification, we will fit a model to predict the RV.

## Import the libraries and data
To obtain the data, go to notebooks/data_preprocessing and run data_import.ipynb, followed by data_preprocessing.ipynb. This produces data/processed_data.csv.

In [1]:

# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from arch import arch_model
import numpy as np

# Load the preprocessed dataset, parse the timestamp column and
# discard every row that contains at least one missing value
data = (
    pd.read_csv('../../data/processed_data.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .dropna()
)

# Show which columns are available
print(data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker',
       'ln_hourly_return', 'ln_3_hourly_return', 'hourly_rv', '3_hourly_rv',
       'ln_hourly_rv', 'ln_3_hourly_rv', 'ln_daily_rv', 'weekly_rv',
       'ln_weekly_rv', 'monthly_rv', 'ln_monthly_rv', 'ln_daily_rv_lag1',
       'ln_daily_rv_lag2', 'ln_weekly_rv_lag1', 'ln_weekly_rv_lag2',
       'ln_monthly_rv_lag1', 'ln_monthly_rv_lag2', 'ln_hourly_rv_lag1',
       'ln_3_hourly_rv_lag1', 'ln_hourly_rv_lag2', 'ln_3_hourly_rv_lag2',
       'ln_hourly_return_lag1', 'ln_3_hourly_return_lag1',
       'ln_hourly_return_lag2', 'ln_3_hourly_return_lag2', 'hourly_rv_lag1',
       'hourly_rv_lag2', 'three_hourly_rv_lag1', 'three_hourly_rv_lag2',
       'daily_rv', 'Risk'],
      dtype='object')


## Train test split
Here we use a different train-test split from the group project:

- Group project: 80/20 split.
- Individual: use the first year of data for training, then forecast the remaining data with a rolling window.

In [2]:
# Train-test split
# Sort the data chronologically so the split respects time order
data = data.sort_values('Date')

# The first calendar year of observations is the training sample;
# everything after it is the test sample.
min_date = data['Date'].min()
max_date = data['Date'].max()

# Total time span covered by the data (informational only)
total_time_span = max_date - min_date

# Cut-off timestamp: one calendar year after the first observation
first_year_end = min_date + pd.DateOffset(years=1)

# Split directly on the cut-off date. Converting the row count to a fraction
# and back (int(fraction * len(data))) can truncate one row short because of
# floating-point rounding, which would move a first-year row into the test set.
in_first_year = data['Date'] <= first_year_end
first_year_data = data[in_first_year]
train_data = data[in_first_year]
test_data = data[~in_first_year]

# Share of rows that fall in the first year (kept for reference)
percentage_first_year = (len(first_year_data) / len(data))
train_split = percentage_first_year

# Print train and test data date ranges
print(train_data['Date'].min(), train_data['Date'].max())
print(test_data['Date'].min(), test_data['Date'].max())

2023-05-03 00:00:00+00:00 2024-05-03 00:00:00+00:00
2024-05-03 01:00:00+00:00 2025-03-10 23:00:00+00:00


### Further split the data based on the risk level
There are low, medium, and high risk models.

In [3]:

# Partition the train and test frames by the label stored in the 'Risk' column.
# Each short key ('low', 'medium', 'high') maps to the rows carrying that label.
train_data_split = {
    key: train_data[train_data['Risk'] == label]
    for key, label in (('low', 'Low Risk'), ('medium', 'Medium Risk'), ('high', 'High Risk'))
}

test_data_split = {
    key: test_data[test_data['Risk'] == label]
    for key, label in (('low', 'Low Risk'), ('medium', 'Medium Risk'), ('high', 'High Risk'))
}

# Train the model based on their classifications

### Training data

This will give us one model per frequency and risk group (3 frequencies × 3 risk groups), stored as `models[frequency][risk_group]`. We will then use these models on the test data to evaluate them.

In [4]:
def train_models_by_frequency_and_risk(train_data, frequencies):
    """Fit one constant-mean GARCH(1,1) model per frequency and risk group.

    Parameters
    ----------
    train_data : dict
        Maps each risk group key ('low', 'medium', 'high') to the DataFrame
        holding the training rows of that group.
    frequencies : iterable of str
        Frequencies to fit; each must be 'hourly', '3hourly' or 'daily'.

    Returns
    -------
    models : dict
        ``models[freq][risk_group]`` is a dict with the fit result ('model'),
        its parameters ('params'), the last in-sample conditional variance
        ('last_var') and the optimizer status ('convergence_flag').
        Combinations that were skipped have no entry.
    model_summary : pandas.DataFrame
        One row per fitted model with columns 'Frequency', 'Risk Group',
        'Omega', 'Alpha[1]', 'Beta[1]' and 'Mean'.

    Raises
    ------
    ValueError
        If a frequency is not one of the supported values.
    """
    # Target column fitted for each supported frequency
    target_columns = {
        'hourly': 'hourly_rv',
        '3hourly': '3_hourly_rv',
        'daily': 'daily_rv',
    }
    summary_columns = ['Frequency', 'Risk Group', 'Omega', 'Alpha[1]', 'Beta[1]', 'Mean']

    models = {}
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating onto an empty frame inside the loop is quadratic and
    # triggers a pandas FutureWarning.
    summary_rows = []

    for freq in frequencies:
        if freq not in target_columns:
            raise ValueError(f"Unsupported frequency: {freq}")
        target_col = target_columns[freq]
        models[freq] = {}

        for risk_group in ['low', 'medium', 'high']:
            df_train = train_data[risk_group]
            if target_col not in df_train.columns:
                print(f"Missing {target_col} for {freq}-{risk_group}, skipping.")
                continue

            # Drop missing values
            series = df_train[target_col].dropna()
            if len(series) < 2:
                print(f"Insufficient data for {freq}-{risk_group}, skipping.")
                continue

            # Fit GARCH(1,1) with constant mean.
            # NOTE(review): rescale=False keeps the series in its original
            # units; if the target values are very small the optimizer may
            # fail to converge - check 'convergence_flag' below.
            am = arch_model(series, mean='Constant', vol='GARCH', p=1, q=1, dist='normal', rescale=False)
            res = am.fit(disp='off')
            pars = res.params

            # Store the fitted model and its key parameters.
            # 'last_var' is the last in-sample conditional variance in level
            # space (squared conditional volatility, not in log).
            models[freq][risk_group] = {
                'model': res,
                'params': pars,  # Contains omega, alpha[1], beta[1], mu
                'last_var': res.conditional_volatility.iloc[-1]**2,
                # Optimizer status as reported by the fit result; a non-zero
                # value means the optimizer did not report success.
                'convergence_flag': getattr(res, 'convergence_flag', None),
            }

            # Save parameters for summary
            summary_rows.append({
                'Frequency': freq,
                'Risk Group': risk_group,
                'Omega': pars.get('omega', np.nan),
                'Alpha[1]': pars.get('alpha[1]', np.nan),
                'Beta[1]': pars.get('beta[1]', np.nan),
                'Mean': pars.get('mu', np.nan)
            })

    model_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    return models, model_summary



# Fit one model for every (frequency, risk group) combination of the training split
models, model_summary = train_models_by_frequency_and_risk(
    train_data=train_data_split,
    frequencies=['hourly', '3hourly', 'daily'],
)

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

  model_summary = pd.concat([model_summary, pd.DataFrame([row])], ignore_index=True)
Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



### Model summary

In [5]:
# Display the fitted parameters (omega, alpha, beta, mean) per frequency and risk group
model_summary

Unnamed: 0,Frequency,Risk Group,Omega,Alpha[1],Beta[1],Mean
0,hourly,low,8.93735e-10,0.1,0.8,2.3e-05
1,hourly,medium,8.627475e-08,0.2,0.5,4.6e-05
2,hourly,high,5.343839e-09,0.1,0.88,-0.033973
3,3hourly,low,6.962347e-10,0.2,0.78,6.9e-05
4,3hourly,medium,1.503358e-07,0.2,0.7,0.000137
5,3hourly,high,2.098017e-08,0.2,0.78,0.000296
6,daily,low,1.089036e-08,0.2,0.78,0.000549
7,daily,medium,3.090891e-07,0.2,0.78,0.001093
8,daily,high,3.660865e-07,0.2,0.78,0.002368


### Testing data

#### Implementing rolling window
A rolling window is used to produce one-step-ahead forecasts: at every step, the lagged inputs are updated with the most recent observation.

In [6]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    """Roll a GARCH(1,1)-style variance recursion over a test series.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test rows; only the 'Date' column is read. Must be aligned row by row
        with ``y_test``.
    y_test : pandas.Series
        Observed target values in LOG space. They are exponentiated here, so
        passing level-space values produces wrong forecasts and actuals.
    model : dict
        Must provide 'params' (a mapping supporting ``.get`` with the keys
        'mu', 'omega', 'alpha[1]', 'beta[1]') and 'last_var' (the starting
        variance in level space).
    window_size : int, default 24
        Position of the first forecast origin; earlier rows only serve as
        history. Must be at least 1 because the recursion reads the
        observation just before the origin.
    step_ahead : int, default 1
        Number of recursion steps applied per forecast. Must be at least 1.

    Returns
    -------
    predictions, actuals, dates : numpy.ndarray
        Forecasted values (level space), the matching observed values (level
        space) and their dates. All three are empty when the test sample is
        shorter than ``window_size + step_ahead``.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_ahead`` is smaller than 1.
    """
    # With window_size == 0 the recursion would index position -1, which
    # silently wraps around to the LAST test observation.
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if step_ahead < 1:
        raise ValueError("step_ahead must be at least 1")

    params = model['params']
    mu = params.get('mu', 0.0)
    omega = params.get('omega', np.nan)
    alpha = params.get('alpha[1]', np.nan)
    beta = params.get('beta[1]', np.nan)

    # Convert the whole test series from log space to level once, instead of
    # calling .iloc and np.exp for every element inside the loops.
    levels = np.exp(np.asarray(y_test, dtype=float))
    date_column = X_test['Date']

    # Start with the last observed variance from training (in level space)
    sigma_sq_prev = model['last_var']

    predictions = []
    actuals = []
    dates = []

    # Loop over the test sample, starting from the window_size index.
    max_index = len(X_test) - step_ahead
    for i in range(window_size, max_index + 1):
        sigma_sq = sigma_sq_prev

        # Forecast step-by-step in level space, feeding in the observation
        # that precedes each step.
        for h in range(step_ahead):
            sigma_sq = omega + alpha * (levels[i + h - 1] - mu)**2 + beta * sigma_sq

        target_index = i + step_ahead - 1
        predictions.append(sigma_sq)
        actuals.append(levels[target_index])
        dates.append(date_column.iloc[target_index])

        # NOTE(review): the state for the next origin is re-anchored to the
        # observed value at i rather than to the recursed variance - confirm
        # this is the intended update.
        sigma_sq_prev = levels[i]

    return np.array(predictions), np.array(actuals), np.array(dates)


#### Implementing an evaluation function
This function evaluates the forecasts for each ticker and collects the results in DataFrames.

In [None]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    """Forecast every ticker with its risk group's model and score the result.

    Parameters
    ----------
    test_data_split : dict
        Maps each risk group key ('low', 'medium', 'high') to the DataFrame of
        test rows for that group. The 'Ticker', 'Date' and target columns are
        read.
    models : dict
        ``models[freq][risk_group]`` is the model dict handed to
        ``rolling_window_predictions``. Risk groups without an entry are
        skipped.
    frequencies : iterable of str
        Frequencies to evaluate; each must be 'hourly', '3hourly' or 'daily'.
    window_size : int, default 24
        Forwarded to ``rolling_window_predictions``.

    Returns
    -------
    evaluation_summary : pandas.DataFrame
        One row per (frequency, risk group, ticker) with the MSE and R²
        computed on log-transformed forecasts and actuals.
    detailed_results : pandas.DataFrame
        One row per forecast with the predicted and actual value in level
        space.

    Raises
    ------
    ValueError
        If a frequency is not one of the supported values.
    """
    # (target column, forecast horizon in rows) for each supported frequency
    frequency_settings = {
        'hourly': ('hourly_rv', 1),
        '3hourly': ('3_hourly_rv', 3),
        'daily': ('daily_rv', 24),
    }
    summary_columns = ['Frequency', 'Risk Group', 'Ticker', 'MSE', 'R²']
    detail_columns = ['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual']

    # Collected in lists and assembled once at the end; concatenating onto an
    # empty frame inside the loop is quadratic and triggers a FutureWarning.
    summary_rows = []
    detail_frames = []

    for freq in frequencies:
        if freq not in frequency_settings:
            raise ValueError(f"Unsupported frequency: {freq}")
        target, step_ahead = frequency_settings[freq]

        for risk_group in ['low', 'medium', 'high']:
            if risk_group not in models[freq]:
                continue
            model_info = models[freq][risk_group]
            group_data = test_data_split[risk_group]
            if target not in group_data.columns:
                continue

            for ticker in group_data['Ticker'].unique():
                ticker_data = group_data[group_data['Ticker'] == ticker]

                # Prepare test data with 'Date' and target column
                X_test = ticker_data[['Date', target]].dropna()
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data for {freq}-{risk_group}")
                    continue

                # rolling_window_predictions exponentiates y_test, i.e. it
                # expects LOG-space input, while the target column is in level
                # space. Passing the level values directly would turn every
                # observation into exp(rv). Zero targets become -inf here and
                # map back to 0 inside the forecast function.
                with np.errstate(divide='ignore', invalid='ignore'):
                    y_test = np.log(X_test[target])

                # Obtain forecasts using our rolling window prediction function
                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model_info, window_size, step_ahead
                )
                predictions = np.asarray(predictions, dtype=float)
                actuals = np.asarray(actuals, dtype=float)

                # Align lengths if necessary
                min_len = min(len(predictions), len(actuals))
                predictions = predictions[:min_len]
                actuals = actuals[:min_len]
                dates = dates[:min_len]

                # Keep only pairs whose logarithm is finite: the metrics are
                # computed in log space, so NaN, infinite or non-positive
                # values cannot be scored.
                with np.errstate(invalid='ignore'):
                    valid_idx = (
                        np.isfinite(predictions) & np.isfinite(actuals)
                        & (predictions > 0) & (actuals > 0)
                    )
                predictions = predictions[valid_idx]
                actuals = actuals[valid_idx]
                dates = dates[valid_idx]
                if len(predictions) == 0:
                    continue

                # Convert both predictions and actuals to log-space for metric calculation.
                log_predictions = np.log(predictions)
                log_actuals = np.log(actuals)

                summary_rows.append({
                    'Frequency': freq,
                    'Risk Group': risk_group.capitalize(),
                    'Ticker': ticker,
                    'MSE': mean_squared_error(log_actuals, log_predictions),
                    'R²': r2_score(log_actuals, log_predictions)
                })

                detail_frames.append(pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group.capitalize(),
                    'Frequency': freq,
                    'Predicted': predictions,  # in level (variance) space
                    'Actual': actuals          # in level (variance) space
                }))

    evaluation_summary = pd.DataFrame(summary_rows, columns=summary_columns)
    if detail_frames:
        detailed_results = pd.concat(detail_frames, ignore_index=True)
    else:
        detailed_results = pd.DataFrame(columns=detail_columns)

    return evaluation_summary, detailed_results


#### Evaluate the test data

In [None]:

# Frequencies to evaluate
frequencies = ['hourly', '3hourly', 'daily']

# Score every frequency / risk group / ticker using the default window size
evaluation_summary, detailed_results = evaluate_models_on_test_data(
    test_data_split=test_data_split,
    models=models,
    frequencies=frequencies,
)

# Preview the per-forecast results
print(detailed_results.head())

# Persist the per-forecast results, then report the summary metrics
detailed_results.to_csv('../../results/garch.csv', index=False)
print("Evaluation Summary:", evaluation_summary, sep="\n")