# HAR Model
For each risk classficiation, we will train a model to fit to predict the RV model

## Import the libraries and data
To obtain the data, please go to notebooks/data_preprocessing, and then run data_import.ipynb and then run data_preprocessing.ipynb. This will give you data/processed_data.csv

In [1]:
# Import the necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

hourly_data = pd.read_csv('../..//data/hourly_data.csv')
three_hourly_data = pd.read_csv('../../data/three_hourly_data.csv')
daily_data = pd.read_csv('../../data/daily_data.csv')

print(three_hourly_data.columns)

Index(['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
       'ln_3_hourly_return', '3_hourly_rv', 'ln_3_hourly_rv',
       'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag2', 'ln_3_hourly_rv_lag3',
       'ln_3_hourly_rv_lag4', 'ln_3_hourly_rv_lag8', 'Risk'],
      dtype='object')


## Train test split
Now we will use a different train-test split from the group project
Group project: 80/20 split
Individual: Use 1 year of training data, then use rolling window 

In [2]:

# Train-test split
# Sort the data by date
hourly_data['Date'] = pd.to_datetime(hourly_data['Date'])
three_hourly_data['Date'] = pd.to_datetime(three_hourly_data['Date'])
daily_data['Date'] = pd.to_datetime(daily_data['Date'])

hourly_data = hourly_data.sort_values('Date')

# Determine when the first year ends, and use it as train data
# The rest of the data is used as test data
min_date = hourly_data['Date'].min()
max_date = hourly_data['Date'].max()

# Calculate the total time span of the data
total_time_span = max_date - min_date

# Define the first year of data
first_year_end = min_date + pd.DateOffset(years=1)

# Filter data for the first year
first_year_data = hourly_data[hourly_data['Date'] <= first_year_end]

# Calculate the percentage of data in the first year
percentage_first_year = (len(first_year_data) / len(hourly_data))

train_split = percentage_first_year

# Hourly data train-test split
X_train_hourly = hourly_data[hourly_data['Date'] <= first_year_end]
X_test_hourly = hourly_data[hourly_data['Date'] > first_year_end]

# Three hourly data train-test split
X_train_three_hourly = three_hourly_data[three_hourly_data['Date'] <= first_year_end]
X_test_three_hourly = three_hourly_data[three_hourly_data['Date'] > first_year_end]

# Daily data train-test split
X_train_daily = daily_data[daily_data['Date'] <= first_year_end]
X_test_daily = daily_data[daily_data['Date'] > first_year_end]

### Further split the data based on the risk level
There are low, medium, and high risk models.

In [3]:
train_date_split_hourly = {
    'low': X_train_hourly[X_train_hourly['Risk'] == 'Low Risk'],
    'medium': X_train_hourly[X_train_hourly['Risk'] == 'Medium Risk'],
    'high': X_train_hourly[X_train_hourly['Risk'] == 'High Risk']
}

test_date_split_hourly = {
    'low': X_test_hourly[X_test_hourly['Risk'] == 'Low Risk'],
    'medium': X_test_hourly[X_test_hourly['Risk'] == 'Medium Risk'],
    'high': X_test_hourly[X_test_hourly['Risk'] == 'High Risk']
}

train_date_split_three_hourly = {
    'low': X_train_three_hourly[X_train_three_hourly['Risk'] == 'Low Risk'],
    'medium': X_train_three_hourly[X_train_three_hourly['Risk'] == 'Medium Risk'],
    'high': X_train_three_hourly[X_train_three_hourly['Risk'] == 'High Risk']
}

test_date_split_three_hourly = {
    'low': X_test_three_hourly[X_test_three_hourly['Risk'] == 'Low Risk'],
    'medium': X_test_three_hourly[X_test_three_hourly['Risk'] == 'Medium Risk'],
    'high': X_test_three_hourly[X_test_three_hourly['Risk'] == 'High Risk']
}

train_date_split_daily = {
    'low': X_train_daily[X_train_daily['Risk'] == 'Low Risk'],
    'medium': X_train_daily[X_train_daily['Risk'] == 'Medium Risk'],
    'high': X_train_daily[X_train_daily['Risk'] == 'High Risk']
}

test_date_split_daily = {
    'low': X_test_daily[X_test_daily['Risk'] == 'Low Risk'],
    'medium': X_test_daily[X_test_daily['Risk'] == 'Medium Risk'],
    'high': X_test_daily[X_test_daily['Risk'] == 'High Risk']
}


# Train the model based on their classifications

### Training data

This wil give us 3 models to work with: model_low, model_medium, and model_high. We will use these subsequent models on the test data to evaluate the models

In [4]:
def train_models_by_frequency_and_risk(train_data, frequencies):
    models = {}
    model_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Intercept'] +
                                 ['ln_hourly_rv_lag1', 'ln_hourly_rv_lag8', 'ln_hourly_rv_lag24',
                                  'ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag4', 'ln_3_hourly_rv_lag8',
                                  'ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1'])
    
    for freq in frequencies:
        models[freq] = {}
        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]
            
            # Select features and target based on frequency
            if freq == 'hourly':
                # Use hourly, 8-hourly, and 24-hourly lag features for hourly frequency
                features = ['ln_hourly_rv_lag1', 'ln_hourly_rv_lag8', 'ln_hourly_rv_lag24']
                target = 'ln_hourly_rv'
            elif freq == '3hourly':
                # Use 3 hourly, 12 hourly, and 24 hourly lag features for 3 hourly frequency
                features = ['ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag4', 'ln_3_hourly_rv_lag8']
                target = 'ln_3_hourly_rv'
            elif freq == 'daily':
                # For daily, use the daily, weekly, and monthly lags.
                features = ['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']
                target = 'ln_daily_rv'
            else:
                raise ValueError(f"Unsupported frequency: {freq}")
            
            # Ensure that group_data contains all required features
            missing_features = [f for f in features if f not in group_data.columns]
            if missing_features:
                print(f"Missing features {missing_features} for {freq}-{risk_group}, skipping.")
                continue
            
            # Create a mask for rows with no NaN in the selected features and target
            mask = group_data[features + [target]].notnull().all(axis=1)
            X_train = group_data.loc[mask, features]
            y_train = group_data.loc[mask, target]
            
            # Check for sufficient data after dropping NaNs
            if len(X_train) < 2:
                print(f"Insufficient data for {freq}-{risk_group} after dropping NaNs, skipping.")
                continue
            
            model = LinearRegression()
            model.fit(X_train, y_train)
            models[freq][risk_group] = model
            
            # Build a summary of coefficients and intercept
            coef_dict = {
                'Frequency': freq,
                'Risk Group': risk_group.capitalize(),
                'Intercept': model.intercept_
            }
            for feature in features:
                coef_dict[feature] = model.coef_[features.index(feature)]
            
            coef_df = pd.DataFrame([coef_dict])
            model_summary = pd.concat([model_summary, coef_df], ignore_index=True)
    
    return models, model_summary

# Define frequencies
frequencies = ['hourly', '3hourly', 'daily']

# Train models using your risk-split training dictionaries (ensure these dicts contain DataFrames, not boolean masks)
model_hourly, summary_hourly = train_models_by_frequency_and_risk(train_date_split_hourly, ['hourly'])
model_three_hourly, summary_three_hourly = train_models_by_frequency_and_risk(train_date_split_three_hourly, ['3hourly'])
model_daily, summary_daily = train_models_by_frequency_and_risk(train_date_split_daily, ['daily'])




  model_summary = pd.concat([model_summary, coef_df], ignore_index=True)
  model_summary = pd.concat([model_summary, coef_df], ignore_index=True)
  model_summary = pd.concat([model_summary, coef_df], ignore_index=True)


### Model summary

In [5]:
# Merge summaries for an overall overview
combined_summary = pd.concat([summary_hourly, summary_three_hourly, summary_daily], ignore_index=True)
print("Combined Model Summary:")
print(combined_summary)

Combined Model Summary:
  Frequency Risk Group  Intercept  ln_hourly_rv_lag1  ln_hourly_rv_lag8  \
0    hourly        Low  -8.158565           0.189469           0.092771   
1    hourly     Medium  -8.651461           0.145536           0.073005   
2    hourly       High  -6.556667           0.189176           0.116818   
3   3hourly        Low  -3.803306                NaN                NaN   
4   3hourly     Medium  -4.217920                NaN                NaN   
5   3hourly       High  -2.498676                NaN                NaN   
6     daily        Low  -1.217280                NaN                NaN   
7     daily     Medium  -3.398506                NaN                NaN   
8     daily       High  -0.708454                NaN                NaN   

   ln_hourly_rv_lag24  ln_3_hourly_rv_lag1  ln_3_hourly_rv_lag4  \
0            0.084990                  NaN                  NaN   
1            0.076400                  NaN                  NaN   
2            0.131807   

  combined_summary = pd.concat([summary_hourly, summary_three_hourly, summary_daily], ignore_index=True)


### Testing data

#### Implementing rolling window
Rolling window is used for a one step ahead forecast. So we constantly update the lagged data with an update lagged data

In [6]:
def rolling_window_predictions(X_test, y_test, model,features, window_size=24, step_ahead=1):
    predictions = []
    actuals = []
    dates = []
    
    # Ensure we have enough rows to forecast.
    max_index = len(X_test) - step_ahead + 1
    for i in range(window_size, max_index):
        # Use double brackets to return a DataFrame with the correct column names.
        current_features = X_test.iloc[[i - 1]][features]
        forecast_value = model.predict(current_features)[0]
        predictions.append(forecast_value)
        
        # Actual value is taken at i + step_ahead - 1.
        actual_index = i + step_ahead - 1
        actuals.append(y_test.iloc[actual_index])
        dates.append(X_test['Date'].iloc[actual_index])
    
    return predictions, actuals, dates


#### Implementing an evaluatation function
This function evaluates the findings and puts it in a df for each ticker

In [7]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    evaluation_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Ticker', 'MSE', 'R²'])
    detailed_results = pd.DataFrame(columns=['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual'])
    
    for freq in frequencies:
        for risk_group in ['low', 'medium', 'high']:
            # Check if a model exists for this frequency and risk group
            if risk_group not in models[freq]:
                continue
            har_model = models[freq][risk_group]
            group_data = test_data_split[risk_group].copy()
            
            # Define features, target, and forecast horizon based on frequency
            if freq == 'hourly':
                # Use hourly, 8-hourly, and 24-hourly lag features for hourly frequency
                features = ['ln_hourly_rv_lag1', 'ln_hourly_rv_lag8', 'ln_hourly_rv_lag24']
                target = 'ln_hourly_rv'
            elif freq == '3hourly':
                # Use 3 hourly, 12 hourly, and 24 hourly lag features for 3 hourly frequency
                features = ['ln_3_hourly_rv_lag1', 'ln_3_hourly_rv_lag4', 'ln_3_hourly_rv_lag8']
                target = 'ln_3_hourly_rv'
            elif freq == 'daily':
                # For daily, use the daily, weekly, and monthly lags.
                features = ['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']
                target = 'ln_daily_rv'
            else:
                raise ValueError(f"Unsupported frequency: {freq}")
            
            unique_tickers = group_data['Ticker'].unique()
            for ticker in unique_tickers:
                ticker_data = group_data[group_data['Ticker'] == ticker].copy()
                
                # Validate that required columns exist
                if not all(f in ticker_data.columns for f in features) or target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing features or target for {freq} - {risk_group}")
                    continue
                
                # Prepare test data using the required features and Date.
                X_test = ticker_data[['Date'] + features].dropna()
                y_test = ticker_data.loc[X_test.index, target]
                
                if len(X_test) < window_size + 1:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue
                
                # Generate forecasts using the rolling-window prediction function.
                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, har_model, features=features,
                    window_size=window_size
                )
                if len(predictions) == 0:
                    continue
                                
                # Calculate evaluation metrics.
                mse = mean_squared_error(actuals, predictions)
                r2 = r2_score(actuals, predictions)
                
                summary_row = pd.DataFrame({
                    'Frequency': [freq],
                    'Risk Group': [risk_group.capitalize()],
                    'Ticker': [ticker],
                    'MSE': [mse],
                    'R²': [r2]
                })
                evaluation_summary = pd.concat([evaluation_summary, summary_row], ignore_index=True)
                
                ticker_results = pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group.capitalize(),
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals
                })
                detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
    
    return evaluation_summary, detailed_results


#### Evaluate the test data

In [8]:
# Define frequencies
frequencies = ['hourly', '3hourly', 'daily']

# Evaluate models on test data for each frequency.
evaluation_summary_hourly, detailed_results_hourly = evaluate_models_on_test_data(
    test_date_split_hourly, model_hourly, ['hourly'], window_size=24
)
evaluation_summary_three_hourly, detailed_results_three_hourly = evaluate_models_on_test_data(
    test_date_split_three_hourly, model_three_hourly, ['3hourly'], window_size=24
)
evaluation_summary_daily, detailed_results_daily = evaluate_models_on_test_data(
    test_date_split_daily, model_daily, ['daily'], window_size=24
)

# Combine the evaluation summaries across frequencies.
combined_summary = pd.concat([evaluation_summary_hourly, evaluation_summary_three_hourly, evaluation_summary_daily], ignore_index=True)
combined_details = pd.concat([detailed_results_hourly, detailed_results_three_hourly, detailed_results_daily], ignore_index=True)
combined_details.to_csv('../../results/har.csv', index=False)
print("Evaluation Summary:")
print(combined_summary)

  evaluation_summary = pd.concat([evaluation_summary, summary_row], ignore_index=True)
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
  evaluation_summary = pd.concat([evaluation_summary, summary_row], ignore_index=True)
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
  evaluation_summary = pd.concat([evaluation_summary, summary_row], ignore_index=True)
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)


Evaluation Summary:
   Frequency Risk Group    Ticker       MSE        R²
0     hourly        Low   BTC-USD  6.285184  0.008847
1     hourly        Low   ETH-USD  6.420979 -0.047947
2     hourly     Medium   XRP-USD  6.625459  0.055388
3     hourly       High  DOGE-USD  6.256185 -0.001267
4     hourly       High   SOL-USD  6.124855 -0.013703
5    3hourly        Low   BTC-USD  2.188624  0.074313
6    3hourly        Low   ETH-USD  2.081993  0.033786
7    3hourly     Medium   XRP-USD  2.079133  0.309614
8    3hourly       High  DOGE-USD  1.920552  0.118030
9    3hourly       High   SOL-USD  1.709443  0.067489
10     daily        Low   BTC-USD  1.100746 -0.044790
11     daily        Low   ETH-USD  0.897307  0.025271
12     daily     Medium   XRP-USD  1.323332  0.318631
13     daily       High  DOGE-USD  0.885850  0.081082
14     daily       High   SOL-USD  0.726315  0.021742
