# HAR Model
For each risk classficiation, we will train a model to fit to predict the RV model

In [7]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

data = pd.read_csv('../../data/processed_data.csv')
data['Date'] = pd.to_datetime(data['Date'])

# Remove NaN values
data = data.dropna()

# Print columns
print(data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker',
       'ln_hourly_return', 'ln_3_hourly_return', 'ln_daily_return',
       'ln_hourly_rv', 'ln_3_hourly_rv', 'ln_daily_rv', 'ln_weekly_rv',
       'ln_monthly_rv', 'ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1',
       'ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1', 'Risk'],
      dtype='object')


## Train test split
Now we will use a different train-test split from the group project
Group project: 80/20 split
Individual: Use 1 year of training data, then use rolling window 

In [None]:
# Train-test split
# Train data: 1 year of data
# Find the minimum and maximum dates
min_date = data['Date'].min()
max_date = data['Date'].max()

# Calculate the total time span of the data
total_time_span = max_date - min_date

# Define the first year of data
first_year_end = min_date + pd.DateOffset(years=1)

# Filter data for the first year
first_year_data = data[data['Date'] <= first_year_end]

# Calculate the percentage of data in the first year
percentage_first_year = (len(first_year_data) / len(data)) * 100

# Print results
print(f"First year end date: {first_year_end}")
print(f"Percentage of data in the first year: {percentage_first_year:.2f}%")

# Split the data
train_data = data[data['Date'] <= first_year_end]
test_data = data[data['Date'] > first_year_end]

First year end date: 2024-04-08 00:00:00+00:00
Percentage of data in the first year: 52.89%


# Train the model based on their classifications

### Training data

In [9]:
# Train the model based on Risk classification using HAR
low_risk = train_data[train_data['Risk'] == 'Low Risk']
medium_risk = train_data[train_data['Risk'] == 'Medium Risk']
high_risk = train_data[train_data['Risk'] == 'High Risk']

# Train the model for low risk
X_low_train = low_risk[['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']]
y_low_train = low_risk['ln_daily_rv']

# Train the model for medium risk
X_medium_train = medium_risk[['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']]
y_medium_train = medium_risk['ln_daily_rv']

# Train the model for high risk
X_high_train = high_risk[['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']]
y_high_train = high_risk['ln_daily_rv']

model_low = LinearRegression()
model_low.fit(X_low_train, y_low_train)

model_medium = LinearRegression()
model_medium.fit(X_medium_train, y_medium_train)

model_high = LinearRegression()
model_high.fit(X_high_train, y_high_train)

#### Evaluating the training data

In [20]:
# Predict the training data
y_low_train_pred = model_low.predict(X_low_train)
y_medium_train_pred = model_medium.predict(X_medium_train)
y_high_train_pred = model_high.predict(X_high_train)

# Evaluate the training models
# Calculate the mean squared error
mse_low = mean_squared_error(y_low_train, y_low_train_pred)
mse_medium = mean_squared_error(y_medium_train, y_medium_train_pred)
mse_high = mean_squared_error(y_high_train, y_high_train_pred)

# Print the results
print(f"Mean squared error for low risk: {mse_low:.4f}")
print(f"Mean squared error for medium risk: {mse_medium:.4f}")
print(f"Mean squared error for high risk: {mse_high:.4f}")

Mean squared error for low risk: 1.0773
Mean squared error for medium risk: 0.9332
Mean squared error for high risk: 0.7934


### Testing data

In [None]:
# Define prediction functions for each frequency
def rolling_window_predictions(X_test, y_test, model, window_size=24):
    predictions = []
    actuals = []
    
    for i in range(len(X_test)):
        if i < window_size:
            continue
        
        X_window = X_test.iloc[i-window_size:i]
        y_pred = model.predict(X_window.tail(1))
        predictions.append(y_pred[0])
        actuals.append(y_test.iloc[i])
    
    return predictions, actuals

# Modified evaluate function with consistent feature handling
def evaluate_ticker_mse(risk_data, model, frequency, window_size=24):
    ticker_results = {}
    all_predictions = pd.DataFrame()
    unique_tickers = risk_data['Ticker'].unique()
    
    for ticker in unique_tickers:
        ticker_data = risk_data[risk_data['Ticker'] == ticker].copy()
        
        # Select features and target based on frequency
        if frequency == 'hourly':
            features = ['ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1']
            target = 'ln_hourly_rv'
        elif frequency == '3hourly':
            features = ['ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1', 'ln_weekly_rv_lag1']
            target = 'ln_3_hourly_rv'
        elif frequency == 'daily':
            features = ['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']
            target = 'ln_daily_rv'
        else:
            raise ValueError("Invalid frequency. Choose 'hourly', '3hourly', or 'daily'.")
        
        # Ensure consistent feature columns
        X_test = ticker_data[features]
        y_test = ticker_data[target]
        
        # Handle cases where there's insufficient data
        if len(X_test) < window_size + 1:
            print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
            continue
        
        # Get predictions
        predictions, actuals = rolling_window_predictions(X_test, y_test, model, window_size)
        
        # Calculate metrics
        mse = mean_squared_error(actuals, predictions)
        r2 = r2_score(actuals, predictions)
        ticker_results[ticker] = {'MSE': mse, 'R²': r2}
        
        # Prepare predictions DataFrame
        ticker_data = ticker_data.iloc[window_size:].copy()
        ticker_data['Predicted'] = predictions
        ticker_data['Frequency'] = frequency
        all_predictions = pd.concat([all_predictions, ticker_data[['Date', 'Ticker', 'Frequency', 'Predicted']]])
    
    return ticker_results, all_predictions

# Ensure models are trained with correct feature sets
def train_models_by_frequency(risk_data, frequencies):
    models = {}
    
    for freq in frequencies:
        features = []
        target = ''
        
        if freq == 'hourly':
            features = ['ln_hourly_rv_lag1', 'ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1']
            target = 'ln_hourly_rv'
        elif freq == '3hourly':
            features = ['ln_3_hourly_rv_lag1', 'ln_daily_rv_lag1', 'ln_weekly_rv_lag1']
            target = 'ln_3_hourly_rv'
        elif freq == 'daily':
            features = ['ln_daily_rv_lag1', 'ln_weekly_rv_lag1', 'ln_monthly_rv_lag1']
            target = 'ln_daily_rv'
        
        # Train separate models for each risk group and frequency
        models[freq] = {}
        for risk_group in ['low', 'medium', 'high']:
            group_data = risk_data[risk_group]  # Assuming risk_data is a dictionary of DataFrames
            
            X_train = group_data[features]
            y_train = group_data[target]
            
            model = LinearRegression()  # Replace with your model
            model.fit(X_train, y_train)
            models[freq][risk_group] = model
    
    return models

risk_data = {
    'low': low_risk,
    'medium': medium_risk,
    'high': high_risk
}

# Train models with correct feature sets
all_models = train_models_by_frequency(risk_data, frequencies=['hourly', '3hourly', 'daily'])

# Evaluation loop with proper model selection
all_predictions = pd.DataFrame()

for frequency in frequencies:
    # Get models for current frequency
    low_model = all_models[frequency]['low']
    medium_model = all_models[frequency]['medium']
    high_model = all_models[frequency]['high']
    
    # Evaluate each risk group
    low_results, low_df = evaluate_ticker_mse(risk_data['low'], low_model, frequency)
    medium_results, medium_df = evaluate_ticker_mse(risk_data['medium'], medium_model, frequency)
    high_results, high_df = evaluate_ticker_mse(risk_data['high'], high_model, frequency)
    
    # Combine predictions
    all_predictions = pd.concat([all_predictions, low_df, medium_df, high_df])

# Save results
all_predictions.to_csv('../../results/har_multi_frequency.csv', index=False)

def print_ticker_results(results, risk_level, frequency):
    print(f"\n{risk_level} Risk Ticker Performance ({frequency}):")
    for ticker, metrics in results.items():
        print(f"  {ticker}:")
        print(f"    MSE: {metrics['MSE']:.4f}")
        print(f"    R²: {metrics['R²']:.4f}")

for frequency in frequencies:
    print_ticker_results(low_results, "Low", frequency)
    print_ticker_results(medium_results, "Medium", frequency)
    print_ticker_results(high_results, "High", frequency)
     



Low Risk Ticker Performance (hourly):
  BTC-USD:
    MSE: 1.3498
    R²: 0.0272

Medium Risk Ticker Performance (hourly):
  ETH-USD:
    MSE: 1.0150
    R²: 0.1554
  XRP-USD:
    MSE: 1.2143
    R²: 0.0516

High Risk Ticker Performance (hourly):
  DOGE-USD:
    MSE: 1.1704
    R²: 0.3763
  SOL-USD:
    MSE: 0.7298
    R²: 0.2841

Low Risk Ticker Performance (3hourly):
  BTC-USD:
    MSE: 1.3498
    R²: 0.0272

Medium Risk Ticker Performance (3hourly):
  ETH-USD:
    MSE: 1.0150
    R²: 0.1554
  XRP-USD:
    MSE: 1.2143
    R²: 0.0516

High Risk Ticker Performance (3hourly):
  DOGE-USD:
    MSE: 1.1704
    R²: 0.3763
  SOL-USD:
    MSE: 0.7298
    R²: 0.2841

Low Risk Ticker Performance (daily):
  BTC-USD:
    MSE: 1.3498
    R²: 0.0272

Medium Risk Ticker Performance (daily):
  ETH-USD:
    MSE: 1.0150
    R²: 0.1554
  XRP-USD:
    MSE: 1.2143
    R²: 0.0516

High Risk Ticker Performance (daily):
  DOGE-USD:
    MSE: 1.1704
    R²: 0.3763
  SOL-USD:
    MSE: 0.7298
    R²: 0.2841
    