# EWMA Model
For each risk classficiation, we will train a model to fit to predict the RV model. EWMA has been selected to be the baseline model, given the ease of implementation and the O(1) computational cost. Traditionally, EWMA has been use for baseline RV prediction for othert ML projects as well

## Import the libraries and data
To obtain the data, please go to notebooks/data_preprocessing, and then run data_import.ipynb and then run data_preprocessing.ipynb. This will give you processed data

In [None]:

import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

hourly_data = pd.read_csv('../..//data/hourly_data.csv')
three_hourly_data = pd.read_csv('../../data/three_hourly_data.csv')
daily_data = pd.read_csv('../../data/daily_data.csv')

## Train test split
Now we will use a different train-test split from the group project
Group project: 80/20 split
Individual: Use 1 year of training data, then use rolling window 

In [None]:

# Train-test split
# Sort the data by date
hourly_data['Date'] = pd.to_datetime(hourly_data['Date'])
three_hourly_data['Date'] = pd.to_datetime(three_hourly_data['Date'])
daily_data['Date'] = pd.to_datetime(daily_data['Date'])

hourly_data = hourly_data.sort_values('Date')

# Determine when the first year ends, and use it as train data
# The rest of the data is used as test data
min_date = hourly_data['Date'].min()
max_date = hourly_data['Date'].max()

# Calculate the total time span of the data
total_time_span = max_date - min_date

# Define the first year of data
first_year_end = min_date + pd.DateOffset(years=1)

# Filter data for the first year
first_year_data = hourly_data[hourly_data['Date'] <= first_year_end]

# Calculate the percentage of data in the first year
percentage_first_year = (len(first_year_data) / len(hourly_data))

train_split = percentage_first_year

# Hourly data train-test split
X_train_hourly = hourly_data[hourly_data['Date'] <= first_year_end]
X_test_hourly = hourly_data[hourly_data['Date'] > first_year_end]

# Three hourly data train-test split
X_train_three_hourly = three_hourly_data[three_hourly_data['Date'] <= first_year_end]
X_test_three_hourly = three_hourly_data[three_hourly_data['Date'] > first_year_end]

# Daily data train-test split
X_train_daily = daily_data[daily_data['Date'] <= first_year_end]
X_test_daily = daily_data[daily_data['Date'] > first_year_end]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker,ln_hourly_return,hourly_rv,ln_hourly_rv,ln_hourly_rv_lag1,ln_hourly_rv_lag2,ln_hourly_rv_lag3,ln_hourly_rv_lag8,ln_hourly_rv_lag24,Risk
0,2023-04-01 00:00:00+00:00,28473.332031,28546.123047,28424.826172,28434.339844,0.0,BTC-USD,,,,,,,,,Low Risk
33855,2023-04-01 00:00:00+00:00,1821.704346,1827.693848,1820.090820,1822.385986,0.0,ETH-USD,,,,,,,,,Low Risk
67715,2023-04-01 00:00:00+00:00,0.538472,0.538472,0.532962,0.533647,0.0,XRP-USD,,,,,,,,,Medium Risk
16931,2023-04-01 00:00:00+00:00,0.077025,0.077359,0.077012,0.077193,3358592.0,DOGE-USD,,,,,,,,,High Risk
50785,2023-04-01 00:00:00+00:00,21.171354,21.294809,21.155914,21.167952,0.0,SOL-USD,,,,,,,,,High Risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25624,2024-04-01 00:00:00+00:00,0.220039,0.220039,0.216880,0.218250,3359488.0,DOGE-USD,-0.008003,0.000064,-9.655870,-11.611518,-11.187848,-8.687052,-10.787633,-8.136184,High Risk
42554,2024-04-01 00:00:00+00:00,3646.280273,3646.280273,3618.257812,3624.143799,225764352.0,ETH-USD,-0.006724,0.000045,-10.004098,-10.536814,-12.841648,-12.034866,-14.892331,-11.691910,Low Risk
8700,2024-04-01 00:00:00+00:00,71312.171875,71312.171875,70925.554688,71158.453125,366051328.0,BTC-USD,-0.002756,0.000008,-11.787795,-10.406212,-11.944188,-13.694175,-13.627078,-11.385404,Low Risk
59484,2024-04-01 00:00:00+00:00,202.808823,202.808823,201.075760,201.776703,27750912.0,SOL-USD,-0.006066,0.000037,-10.209947,-8.748552,-11.192625,-14.340538,-11.140290,-10.709868,High Risk


### Further split the data based on the risk level
There are low, medium, and high risk models.

In [3]:
train_date_split_hourly = {
    'low': X_train_hourly[X_train_hourly['Risk'] == 'Low Risk'],
    'medium': X_train_hourly[X_train_hourly['Risk'] == 'Medium Risk'],
    'high': X_train_hourly[X_train_hourly['Risk'] == 'High Risk']
}

test_date_split_hourly = {
    'low': X_test_hourly[X_test_hourly['Risk'] == 'Low Risk'],
    'medium': X_test_hourly[X_test_hourly['Risk'] == 'Medium Risk'],
    'high': X_test_hourly[X_test_hourly['Risk'] == 'High Risk']
}

train_date_split_three_hourly = {
    'low': X_train_three_hourly[X_train_three_hourly['Risk'] == 'Low Risk'],
    'medium': X_train_three_hourly[X_train_three_hourly['Risk'] == 'Medium Risk'],
    'high': X_train_three_hourly[X_train_three_hourly['Risk'] == 'High Risk']
}

test_date_split_three_hourly = {
    'low': X_test_three_hourly[X_test_three_hourly['Risk'] == 'Low Risk'],
    'medium': X_test_three_hourly[X_test_three_hourly['Risk'] == 'Medium Risk'],
    'high': X_test_three_hourly[X_test_three_hourly['Risk'] == 'High Risk']
}

train_date_split_daily = {
    'low': X_train_daily[X_train_daily['Risk'] == 'Low Risk'],
    'medium': X_train_daily[X_train_daily['Risk'] == 'Medium Risk'],
    'high': X_train_daily[X_train_daily['Risk'] == 'High Risk']
}

test_date_split_daily = {
    'low': X_test_daily[X_test_daily['Risk'] == 'Low Risk'],
    'medium': X_test_daily[X_test_daily['Risk'] == 'Medium Risk'],
    'high': X_test_daily[X_test_daily['Risk'] == 'High Risk']
}


# Train the model based on their classifications

### Training data

#### Forecast function

In [None]:
def compute_ewma_forecast(series, lam):
    forecasts = [series.iloc[0]]
    for i in range(1, len(series)):
        f = lam * forecasts[i-1] + (1 - lam) * series.iloc[i-1]
        forecasts.append(f)
    return np.array(forecasts)


#### Calculating optimal lambda
In theory, optimal lambda is 0.94, but given the abundance of training data, we can use gridsearch to find optimal lambda

In [5]:
def optimize_lambda(series, candidate_lambdas=np.linspace(0.90, 0.99, 100)):
    best_lambda = None
    best_mse = np.inf
    for lam in candidate_lambdas:
        forecast = compute_ewma_forecast(series, lam)
        # Exclude the first forecast because it is just the initialization
        mse = np.mean((series.values[1:] - forecast[1:])**2)
        if mse < best_mse:
            best_mse = mse
            best_lambda = lam
    return best_lambda, best_mse


This wil give us 3 models to work with: model_low, model_medium, and model_high. We will use these subsequent models on the test data to evaluate the models

In [None]:

def train_models_by_frequency_and_risk(train_data, frequencies):
    models = {}
    model_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Optimal Lambda', 'In-Sample MSE'])
    
    for freq in frequencies:
        models[freq] = {}
        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]
            
            # Select target based on frequency
            if freq == 'hourly':
                target = 'ln_hourly_rv'
            elif freq == '3hourly':
                target = 'ln_3_hourly_rv'
            elif freq == 'daily':
                target = 'ln_daily_rv'
            else:
                raise ValueError(f"Unsupported frequency: {freq}")
            
            if target not in group_data.columns:
                print(f"Missing {target} for {freq}-{risk_group}, skipping.")
                continue
            
            # Drop missing values
            series = group_data[target].dropna()
            if len(series) < 2:
                print(f"Insufficient data for {freq}-{risk_group}, skipping.")
                continue
            
            # Optimise lambda for the EWMA model
            optimal_lambda, in_sample_mse = optimize_lambda(series)
            ewma_model = {'lambda': optimal_lambda, 'initial': series.iloc[0]}
            models[freq][risk_group] = ewma_model
            
            # Append results to the summary DataFrame
            summary_row = {
                'Frequency': freq,
                'Risk Group': risk_group,
                'Optimal Lambda': optimal_lambda,
                'In-Sample MSE': in_sample_mse
            }
            model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)
    
    return models, model_summary

model_daily, summary_daily = train_models_by_frequency_and_risk(train_date_split_daily, ['daily'])
model_hourly, summary_hourly = train_models_by_frequency_and_risk(train_date_split_hourly, ['hourly'])
model_three_hourly, summary_three_hourly = train_models_by_frequency_and_risk(train_date_split_three_hourly, ['3hourly'])

# Merge the summaries
summary = pd.concat([summary_daily, summary_hourly, summary_three_hourly], ignore_index=True)

  model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)
  model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)
  model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)


### Model summary

In [None]:
summary
summary.to_csv('../../data/ewma_model_summary.csv', index=False)

### Testing data

#### Implementing rolling window
Rolling window is used for a one step ahead forecast. So we constantly update the lagged data with an update lagged data

In [None]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    predictions = []
    actuals = []
    dates = []
    lam = model['lambda']  # Get the decay parameter from the trained EWMA model
    
    max_index = len(X_test) - step_ahead  # Ensure we have data for the forecast horizon
    for i in range(window_size, max_index + 1):
        # Use the target series within the current rolling window
        window_data = y_test.iloc[i - window_size : i]
        
        # Compute the EWMA forecast using the provided decay factor
        forecast = compute_ewma_forecast(window_data, lam)
        # For EWMA, the one-step ahead forecast is the last computed value.
        y_pred = forecast[-1]
        
        # Get the actual value at the forecast time point
        actual_index = i + step_ahead - 1
        actual_value = y_test.iloc[actual_index]
        
        # Capture the corresponding date from the X_test 'Date' column
        current_date = X_test['Date'].iloc[actual_index]
        
        predictions.append(y_pred)
        actuals.append(actual_value)
        dates.append(current_date)
    
    return predictions, actuals, dates


#### Implementing an evaluatation function
This function evaluates the findings and puts it in a df for each ticker

In [None]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24, step_ahead = 1):
    evaluation_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Ticker', 'MSE', 'R²'])
    detailed_results = pd.DataFrame(columns=['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual'])
    
    for freq in frequencies:
        for risk_group in ['low', 'medium', 'high']:
            model = models[freq][risk_group]
            group_data = test_data_split[risk_group].copy()
            
            # Define target and forecast horizon based on frequency
            if freq == 'hourly':
                target = 'ln_hourly_rv'
            elif freq == '3hourly':
                target = 'ln_3_hourly_rv'
            elif freq == 'daily':
                target = 'ln_daily_rv'
            else:
                raise ValueError("Unsupported frequency")
            
            unique_tickers = group_data['Ticker'].unique()
            
            for ticker in unique_tickers:
                ticker_data = group_data[group_data['Ticker'] == ticker].copy()
                
                # Validate that the required columns exist
                if 'Date' not in ticker_data.columns or target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing Date or {target} for {freq}-{risk_group}")
                    continue
                
                # Prepare test data with Date and target column
                X_test = ticker_data[['Date', target]].dropna()
                y_test = X_test[target]
                
                # Check for sufficient data
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue
                
                # Get EWMA-based predictions via a rolling window
                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model, window_size=window_size, step_ahead=step_ahead
                )
                
                if len(predictions) == 0:
                    continue
                                
                # Calculate metrics on the log-transformed values
                mse = mean_squared_error(actuals, predictions)
                r2 = r2_score(actuals, predictions)
                
                eval_row = {
                    'Frequency': freq,
                    'Risk Group': risk_group.capitalize(),
                    'Ticker': ticker,
                    'MSE': mse,
                    'R²': r2
                }
                evaluation_summary = pd.concat(
                    [evaluation_summary, pd.DataFrame([eval_row])], ignore_index=True
                )
                
                ticker_results = pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group.capitalize(),
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals
                })
                detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
    
    return evaluation_summary, detailed_results


#### Evaluate the test data

In [None]:
evaluation_summary_hourly, detailed_results_hourly = evaluate_models_on_test_data(
    test_date_split_hourly, model_hourly, ['hourly'], window_size=24
)
evaluation_summary_three_hourly, detailed_results_three_hourly = evaluate_models_on_test_data(
    test_date_split_three_hourly, model_three_hourly, ['3hourly'], window_size=24
)
evaluation_summary_daily, detailed_results_daily = evaluate_models_on_test_data(
    test_date_split_daily, model_daily, ['daily'], window_size=24
)

# Combine the evaluation summaries across frequencies, if desired.
combined_summary = pd.concat([evaluation_summary_hourly, evaluation_summary_three_hourly, evaluation_summary_daily], ignore_index=True)
combined_details = pd.concat([detailed_results_hourly, detailed_results_three_hourly, detailed_results_daily], ignore_index=True)

# Save the detailed results to CSV
combined_details.to_csv('../../results/ewma.csv', index=False)

# Print the evaluation summaries
print("Combined Evaluation Summary:")
print(combined_summary)

# Save this for appendix
combined_summary.to_csv('../../results/appendix/ewma_summary.csv', index=False)


  evaluation_summary = pd.concat(
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
  evaluation_summary = pd.concat(
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
  evaluation_summary = pd.concat(
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)


Combined Evaluation Summary:
   Frequency Risk Group    Ticker       MSE        R²
0     hourly        Low   BTC-USD  6.100476  0.037974
1     hourly        Low   ETH-USD  5.934583  0.031436
2     hourly     Medium   XRP-USD  6.050096  0.137419
3     hourly       High  DOGE-USD  6.524614 -0.044228
4     hourly       High   SOL-USD  6.511406 -0.077680
5    3hourly        Low   BTC-USD  2.157549  0.087456
6    3hourly        Low   ETH-USD  1.889717  0.123018
7    3hourly     Medium   XRP-USD  1.900472  0.368939
8    3hourly       High  DOGE-USD  1.771056  0.186682
9    3hourly       High   SOL-USD  1.600562  0.126884
10     daily        Low   BTC-USD  1.086908 -0.031655
11     daily        Low   ETH-USD  0.857532  0.068478
12     daily     Medium   XRP-USD  1.310579  0.325197
13     daily       High  DOGE-USD  0.851503  0.116712
14     daily       High   SOL-USD  0.715991  0.035647
