# HAR Model
For each risk classification, we train a HAR model to predict realized volatility (RV).

## Import the libraries and data
To obtain the data, please go to notebooks/data_preprocessing, and then run data_import.ipynb and then run data_preprocessing.ipynb. This will give you data/processed_data.csv

In [25]:

# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Load the preprocessed dataset, parse the timestamps, and discard any row
# that still contains a missing value.
data = (
    pd.read_csv('../../data/processed_data.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .dropna()
)

# Show which columns are available for modelling
print(data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker',
       'ln_hourly_return', 'ln_3_hourly_return', 'ln_hourly_rv',
       'ln_3_hourly_rv', 'ln_daily_rv', 'ln_weekly_rv', 'ln_monthly_rv',
       'ln_daily_rv_lag1', 'ln_daily_rv_lag2', 'ln_weekly_rv_lag1',
       'ln_weekly_rv_lag2', 'ln_monthly_rv_lag1', 'ln_monthly_rv_lag2',
       'ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_hourly_rv_lag2',
       'ln_3_hourly_rv_lag2', 'ln_hourly_return_lag1',
       'ln_3_hourly_return_lag1', 'ln_hourly_return_lag2',
       'ln_3_hourly_return_lag2', 'Risk'],
      dtype='object')


## Train test split
This notebook uses a different train-test split from the group project:

- **Group project:** 80/20 split.
- **Individual:** use the first year of data for training, then evaluate on the remaining data with a rolling window.

In [26]:
# Train-test split
# Train data: every observation within the first year of the sample
# Test data: every observation after the first year
# Sort the data by date so that positional slicing is chronological
data = data.sort_values('Date')

# Date range covered by the data
min_date = data['Date'].min()
max_date = data['Date'].max()

# Calculate the total time span of the data
total_time_span = max_date - min_date

# The training period ends one calendar year after the first observation
first_year_end = min_date + pd.DateOffset(years=1)

# Rows that fall inside the first year (boundary timestamp included)
first_year_data = data[data['Date'] <= first_year_end]

# Share of the data that falls in the first year (kept for reporting)
percentage_first_year = (len(first_year_data) / len(data))
train_split = percentage_first_year

# Slice on the exact row count rather than int(train_split * len(data)):
# the float round-trip (k / n) * n can land just below k and truncate to
# k - 1, which would push one first-year row into the test set.
n_train = len(first_year_data)
train_data = data.iloc[:n_train]
test_data = data.iloc[n_train:]

# Print train and test data date
print(train_data['Date'].min(), train_data['Date'].max())
print(test_data['Date'].min(), test_data['Date'].max())

2023-05-03 00:00:00+00:00 2024-05-03 00:00:00+00:00
2024-05-03 01:00:00+00:00 2025-03-10 23:00:00+00:00


### Further split the data based on the risk level
There are low, medium, and high risk models.

In [27]:

# Short key used throughout the notebook -> label stored in the 'Risk' column
risk_labels = {
    'low': 'Low Risk',
    'medium': 'Medium Risk',
    'high': 'High Risk',
}

# Partition the training rows by risk group
train_data_split = {
    key: train_data[train_data['Risk'] == label]
    for key, label in risk_labels.items()
}

# Partition the test rows by risk group
test_data_split = {
    key: test_data[test_data['Risk'] == label]
    for key, label in risk_labels.items()
}

# Train the model based on their classifications

### Training data

This will give us one model per frequency and risk group (3 frequencies × 3 risk groups = 9 models), stored as `models[frequency][risk_group]`. We will then use these models on the test data to evaluate them.

In [28]:

# Function to train models and summarize them
def train_models_by_frequency_and_risk(train_data, frequencies):
    """Fit one linear regression per (frequency, risk group) pair.

    Parameters
    ----------
    train_data : dict
        Maps the risk keys 'low', 'medium' and 'high' to the training
        DataFrame of that group. Each frame must contain the feature and
        target columns of every requested frequency.
    frequencies : iterable of str
        Any of 'hourly', '3hourly', 'daily'.

    Returns
    -------
    models : dict
        Nested dict ``models[frequency][risk_group]`` holding the fitted
        ``LinearRegression`` objects.
    model_summary : pandas.DataFrame
        One row per fitted model with its frequency, risk group, intercept
        and coefficients. Coefficient columns for features a model does not
        use are NaN.

    Raises
    ------
    ValueError
        If a requested frequency is not one of the supported values.
    """
    # Per frequency: (lagged regressors, target column)
    specs = {
        'hourly': (
            ['ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1'],
            'ln_hourly_rv',
        ),
        '3hourly': (
            ['ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1', 'ln_weekly_rv_lag1'],
            'ln_3_hourly_rv',
        ),
        'daily': (
            ['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1'],
            'ln_daily_rv',
        ),
    }
    summary_columns = ['Frequency', 'Risk Group', 'Intercept',
                       'ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1',
                       'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']

    models = {}
    # Collect plain dict rows and build the summary frame once at the end;
    # growing a DataFrame with pd.concat inside the loop is quadratic and
    # warns when the starting frame is empty.
    summary_records = []

    for freq in frequencies:
        # Fail loudly instead of silently reusing the previous frequency's
        # features/target (or hitting an unbound local on the first pass).
        if freq not in specs:
            raise ValueError(
                f"Unknown frequency {freq!r}; expected one of {list(specs)}"
            )
        features, target = specs[freq]

        models[freq] = {}
        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]

            # Train the model
            model = LinearRegression()
            model.fit(group_data[features], group_data[target])
            models[freq][risk_group] = model

            # Record intercept and coefficients; coef_ follows feature order
            record = {
                'Frequency': freq,
                'Risk Group': risk_group,
                'Intercept': model.intercept_,
            }
            record.update(zip(features, model.coef_))
            summary_records.append(record)

    # `columns=` fixes the column order and fills unused features with NaN
    model_summary = pd.DataFrame(summary_records, columns=summary_columns)

    return models, model_summary


# Forecast horizons for which a separate regression is fitted
frequencies = ['hourly', '3hourly', 'daily']

# Fit one model per (frequency, risk group) pair and collect the coefficient table
models, model_summary = train_models_by_frequency_and_risk(
    train_data=train_data_split,
    frequencies=frequencies,
)

  model_summary = pd.concat([model_summary, coef_df], ignore_index=True)
  model_summary = pd.concat([model_summary, coef_df], ignore_index=True)
  model_summary = pd.concat([model_summary, coef_df], ignore_index=True)


### Model summary

In [29]:
# Display the intercept and coefficients of every (frequency, risk group) model;
# NaN marks a feature that the model for that frequency does not use.
model_summary

Unnamed: 0,Frequency,Risk Group,Intercept,ln_hourly_rv_lag1,ln_3_hourly_rv_lag1,ln_daily_rv_lag1,ln_weekly_rv_lag1,ln_monthly_rv_lag1
0,hourly,low,-5.607595,0.108701,0.317332,0.301972,,
1,hourly,medium,-5.851581,0.096119,0.255364,0.361124,,
2,hourly,high,-4.668929,0.079614,0.245319,0.533769,,
3,3hourly,low,-2.633203,,0.364147,0.186414,0.356292,
4,3hourly,medium,-3.397419,,0.283795,0.233489,0.307239,
5,3hourly,high,-2.481452,,0.259835,0.325078,0.353614,
6,daily,low,-0.857442,,,0.293044,0.234213,0.402497
7,daily,medium,-2.039618,,,0.326618,0.307719,0.120866
8,daily,high,-0.656858,,,0.436706,0.278315,0.212104


### Testing data

#### Implementing rolling window
A rolling window is used to produce one-step-ahead forecasts: at each step the window moves forward by one observation, so every prediction is made from the most recent lagged values.

In [30]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    """Walk a window over the test set and collect forecasts with their actuals.

    For every window end position ``i`` from ``window_size`` up to
    ``len(X_test) - step_ahead`` (inclusive), the model predicts from the
    last feature row of the window (row ``i - 1``) and that prediction is
    paired with the target and date at row ``i + step_ahead - 1``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature columns plus a 'Date' column. 'Date' is removed before the
        rows are handed to the model.
    y_test : pandas.Series
        Target values, positionally aligned with ``X_test``.
    model : object
        Fitted estimator exposing ``predict``.
    window_size : int, default 24
        Number of leading rows skipped before the first prediction. Only the
        last row of each window is passed to the model.
    step_ahead : int, default 1
        Offset between the window end and the row used as the actual value.

    Returns
    -------
    tuple of (list, list, list)
        Predictions, actual values, and the dates of the actual values.

    Notes
    -----
    NOTE(review): the prediction uses the feature row ``i - 1`` but is scored
    against the target at row ``i + step_ahead - 1``. The training code fits
    the target on the features of the *same* row, so if the ``*_lag1`` columns
    are already shifted relative to the target, this scores each forecast
    ``step_ahead`` rows later than the model was trained for. Confirm that
    this offset is intended.
    """
    predictions = []
    actuals = []
    dates = []

    # Loop-invariant work hoisted out of the loop: dropping 'Date' copies the
    # whole frame, so doing it on every iteration made the loop quadratic.
    feature_frame = X_test.drop(columns=['Date'])
    date_column = X_test['Date']

    # Maximum index to avoid out-of-bounds when reading the actual value
    max_index = len(X_test) - step_ahead

    for i in range(window_size, max_index + 1):
        # Rolling window of feature rows ending just before position i
        X_window = feature_frame.iloc[i - window_size:i]

        # Predict from the most recent row of the window
        y_pred = model.predict(X_window.tail(1))[0]

        # Row holding the actual value (and its date) for this forecast
        actual_index = i + step_ahead - 1

        predictions.append(y_pred)
        actuals.append(y_test.iloc[actual_index])
        dates.append(date_column.iloc[actual_index])

    return predictions, actuals, dates

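#### Implementing an evaluation function
This function evaluates each model on the test data, ticker by ticker. It returns a summary DataFrame with one row of metrics (MSE and R²) per frequency, risk group, and ticker, plus a detailed DataFrame of every prediction and its actual value.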

In [31]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    """Score every fitted model on the test data, one ticker at a time.

    Parameters
    ----------
    test_data_split : dict
        Maps the risk keys 'low', 'medium' and 'high' to the test DataFrame
        of that group. Each frame needs 'Date', 'Ticker' and the feature and
        target columns of the requested frequencies.
    models : dict
        Nested dict ``models[frequency][risk_group]`` of fitted estimators.
    frequencies : iterable of str
        Any of 'hourly', '3hourly', 'daily'.
    window_size : int, default 24
        Window size forwarded to ``rolling_window_predictions``.

    Returns
    -------
    evaluation_summary : pandas.DataFrame
        One row per (frequency, risk group, ticker) with MSE and R².
    detailed_results : pandas.DataFrame
        One row per prediction with its date, ticker, risk group, frequency,
        predicted value and actual value.

    Raises
    ------
    ValueError
        If a requested frequency is not one of the supported values.

    Notes
    -----
    Tickers with missing columns or fewer than ``window_size + step_ahead``
    usable rows are skipped and a message is printed.
    """
    # Per frequency: (lagged regressors, target column, rows ahead to score)
    specs = {
        'hourly': (
            ['ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1'],
            'ln_hourly_rv',
            1,   # Predict 1 hour ahead
        ),
        '3hourly': (
            ['ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1', 'ln_weekly_rv_lag1'],
            'ln_3_hourly_rv',
            3,   # Predict 3 hours ahead
        ),
        'daily': (
            ['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1'],
            'ln_daily_rv',
            24,  # Predict 24 hours (1 day) ahead
        ),
    }
    summary_columns = ['Frequency', 'Risk Group', 'Ticker', 'MSE', 'R²']
    detail_columns = ['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual']

    # Accumulate results and assemble the frames once at the end; repeated
    # pd.concat onto an initially empty frame is quadratic and emits warnings.
    summary_records = []
    detail_frames = []

    for freq in frequencies:
        # Reject unknown frequencies instead of reusing stale settings
        if freq not in specs:
            raise ValueError(
                f"Unknown frequency {freq!r}; expected one of {list(specs)}"
            )
        features, target, step_ahead = specs[freq]

        for risk_group in ['low', 'medium', 'high']:
            model = models[freq][risk_group]
            group_data = test_data_split[risk_group]

            for ticker in group_data['Ticker'].unique():
                ticker_data = group_data[group_data['Ticker'] == ticker]

                # Validate features and target
                if not all(f in ticker_data.columns for f in features) or target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing features or target for {freq}-{risk_group}")
                    continue

                # Prepare test data (features + Date); the target is aligned
                # to the rows that survive the NaN filter
                X_test = ticker_data[['Date'] + features].dropna()
                y_test = ticker_data.loc[X_test.index, target]

                # Check data sufficiency
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue

                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model, window_size=window_size, step_ahead=step_ahead
                )

                if len(predictions) == 0:
                    continue  # Skip if no valid predictions

                summary_records.append({
                    'Frequency': freq,
                    'Risk Group': risk_group.capitalize(),
                    'Ticker': ticker,
                    'MSE': mean_squared_error(actuals, predictions),
                    'R²': r2_score(actuals, predictions),
                })

                detail_frames.append(pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group.capitalize(),
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals
                }))

    evaluation_summary = pd.DataFrame(summary_records, columns=summary_columns)
    if detail_frames:
        detailed_results = pd.concat(detail_frames, ignore_index=True)
    else:
        # Nothing was evaluated: still return a frame with the expected columns
        detailed_results = pd.DataFrame(columns=detail_columns)

    return evaluation_summary, detailed_results

#### Evaluate the test data

In [34]:

# Forecast horizons to evaluate
frequencies = ['hourly', '3hourly', 'daily']

# Score every fitted model on the held-out data, ticker by ticker
evaluation_summary, detailed_results = evaluate_models_on_test_data(
    test_data_split=test_data_split,
    models=models,
    frequencies=frequencies,
    window_size=24,
)

# Persist the per-observation forecasts, then report the metrics
detailed_results.to_csv('../../data/har.csv', index=False)
print("Evaluation Summary:", evaluation_summary, sep="\n")

  evaluation_summary = pd.concat([
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)


Evaluation Summary:
   Frequency Risk Group    Ticker       MSE        R²
0     hourly        Low   BTC-USD  6.070857  0.051346
1     hourly     Medium   ETH-USD  5.839966  0.043728
2     hourly     Medium   XRP-USD  5.967356  0.163062
3     hourly       High  DOGE-USD  5.946162  0.059781
4     hourly       High   SOL-USD  5.885372  0.033263
5    3hourly        Low   BTC-USD  2.137566  0.106937
6    3hourly     Medium   ETH-USD  1.911466  0.118436
7    3hourly     Medium   XRP-USD  1.927677  0.374738
8    3hourly       High  DOGE-USD  1.825432  0.168951
9    3hourly       High   SOL-USD  1.637846  0.090618
10     daily        Low   BTC-USD  1.125800 -0.061194
11     daily     Medium   ETH-USD  0.921843  0.025831
12     daily     Medium   XRP-USD  1.253558  0.366974
13     daily       High  DOGE-USD  0.895492  0.083126
14     daily       High   SOL-USD  0.735294  0.022917
