# EWMA Model
For each risk classficiation, we will train a model to fit to predict the RV model

## Import the libraries and data
To obtain the data, please go to notebooks/data_preprocessing, and then run data_import.ipynb and then run data_preprocessing.ipynb. This will give you data/processed_data.csv

In [37]:

# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import numpy as np

data = pd.read_csv('../../data/processed_data.csv')
data['Date'] = pd.to_datetime(data['Date'])

# Remove NaN values
data = data.dropna()

# Print columns
print(data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker',
       'ln_hourly_return', 'ln_3_hourly_return', 'hourly_rv', '3_hourly_rv',
       'ln_hourly_rv', 'ln_3_hourly_rv', 'ln_daily_rv', 'weekly_rv',
       'ln_weekly_rv', 'monthly_rv', 'ln_monthly_rv', 'ln_daily_rv_lag1',
       'ln_daily_rv_lag2', 'ln_weekly_rv_lag1', 'ln_weekly_rv_lag2',
       'ln_monthly_rv_lag1', 'ln_monthly_rv_lag2', 'ln_hourly_rv_lag1',
       'ln_3_hourly_rv_lag1', 'ln_hourly_rv_lag2', 'ln_3_hourly_rv_lag2',
       'ln_hourly_return_lag1', 'ln_3_hourly_return_lag1',
       'ln_hourly_return_lag2', 'ln_3_hourly_return_lag2', 'hourly_rv_lag1',
       'hourly_rv_lag2', 'three_hourly_rv_lag1', 'three_hourly_rv_lag2',
       'daily_rv', 'Risk'],
      dtype='object')


## Train test split
Now we will use a different train-test split from the group project
Group project: 80/20 split
Individual: Use 1 year of training data, then use rolling window 

In [38]:
# Train-test split
# Train data: 0.8 of the data
# Test data: 0.2 of the data
# Sort the data by date
data = data.sort_values('Date')

# Determine when the first year ends, and use it as train data
# The rest of the data is used as test data
min_date = data['Date'].min()
max_date = data['Date'].max()

# Calculate the total time span of the data
total_time_span = max_date - min_date

# Define the first year of data
first_year_end = min_date + pd.DateOffset(years=1)

# Filter data for the first year
first_year_data = data[data['Date'] <= first_year_end]

# Calculate the percentage of data in the first year
percentage_first_year = (len(first_year_data) / len(data))

train_split = percentage_first_year
train_data = data[:int(train_split * len(data))]
test_data = data[int(train_split * len(data)):]

# Print train and test data date
print(train_data['Date'].min(), train_data['Date'].max())
print(test_data['Date'].min(), test_data['Date'].max())

2023-05-03 00:00:00+00:00 2024-05-03 00:00:00+00:00
2024-05-03 01:00:00+00:00 2025-03-10 23:00:00+00:00


### Further split the data based on the risk level
There are low, medium, and high risk models.

In [39]:

# Split train and test data by risk groups
train_data_split = {
    'low': train_data[train_data['Risk'] == 'Low Risk'],
    'medium': train_data[train_data['Risk'] == 'Medium Risk'],
    'high': train_data[train_data['Risk'] == 'High Risk']
}

test_data_split = {
    'low': test_data[test_data['Risk'] == 'Low Risk'],
    'medium': test_data[test_data['Risk'] == 'Medium Risk'],
    'high': test_data[test_data['Risk'] == 'High Risk']
}

# Train the model based on their classifications

### Training data

#### Forecast function

In [40]:
def compute_ewma_forecast(series, lam):
    # Initialize the forecast with the first observed value
    forecasts = [series.iloc[0]]
    for i in range(1, len(series)):
        f = lam * forecasts[i-1] + (1 - lam) * series.iloc[i-1]
        forecasts.append(f)
    return np.array(forecasts)


#### Calculating optimal lambda
In theory, optimal lambda is 0.94, but given the abundance of training data, we can use gridsearch to find optimal lambda

In [41]:
def optimize_lambda(series, candidate_lambdas=np.linspace(0.90, 0.99, 100)):
    best_lambda = None
    best_mse = np.inf
    for lam in candidate_lambdas:
        forecast = compute_ewma_forecast(series, lam)
        # Exclude the first forecast because it is just the initialization
        mse = np.mean((series.values[1:] - forecast[1:])**2)
        if mse < best_mse:
            best_mse = mse
            best_lambda = lam
    return best_lambda, best_mse


This wil give us 3 models to work with: model_low, model_medium, and model_high. We will use these subsequent models on the test data to evaluate the models

In [42]:

# Function to train models and summarize them
def train_models_by_frequency_and_risk(train_data, frequencies):
    models = {}
    model_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Optimal Lambda', 'In-Sample MSE'])
    
    for freq in frequencies:
        models[freq] = {}
        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]
            
            # Select target based on frequency
            if freq == 'hourly':
                target = 'ln_hourly_rv'
            elif freq == '3hourly':
                target = 'ln_3_hourly_rv'
            elif freq == 'daily':
                target = 'ln_daily_rv'
            else:
                raise ValueError(f"Unsupported frequency: {freq}")
            
            if target not in group_data.columns:
                print(f"Missing {target} for {freq}-{risk_group}, skipping.")
                continue
            
            # Drop missing values
            series = group_data[target].dropna()
            if len(series) < 2:
                print(f"Insufficient data for {freq}-{risk_group}, skipping.")
                continue
            
            # Optimize lambda for the EWMA model
            optimal_lambda, in_sample_mse = optimize_lambda(series)
            
            # Store the "trained" model: here just the optimal lambda and initial value
            ewma_model = {'lambda': optimal_lambda, 'initial': series.iloc[0]}
            models[freq][risk_group] = ewma_model
            
            # Append results to the summary DataFrame
            summary_row = {
                'Frequency': freq,
                'Risk Group': risk_group,
                'Optimal Lambda': optimal_lambda,
                'In-Sample MSE': in_sample_mse
            }
            model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)
    
    return models, model_summary


# Define frequencies
frequencies = ['hourly', '3hourly', 'daily']

# Train models for each frequency and risk group
models, model_summary = train_models_by_frequency_and_risk(train_data_split, frequencies)

  model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)


### Model summary

In [43]:
model_summary

Unnamed: 0,Frequency,Risk Group,Optimal Lambda,In-Sample MSE
0,hourly,low,0.915455,5.725802
1,hourly,medium,0.945455,6.099999
2,hourly,high,0.960909,6.151694
3,3hourly,low,0.9,1.649084
4,3hourly,medium,0.9,1.679407
5,3hourly,high,0.9,1.722624
6,daily,low,0.9,0.328084
7,daily,medium,0.9,0.441658
8,daily,high,0.9,0.522572


### Testing data

#### Implementing rolling window
Rolling window is used for a one step ahead forecast. So we constantly update the lagged data with an update lagged data

In [34]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    predictions = []
    actuals = []
    dates = []
    lam = model['lambda']  # Get the decay parameter from the trained EWMA model
    
    # Determine maximum index to ensure enough data for the forecast
    max_index = len(X_test) - step_ahead  # Ensure we have data for the forecast horizon
    for i in range(window_size, max_index + 1):
        # Use the target series within the current rolling window
        window_data = y_test.iloc[i - window_size : i]
        
        # Compute the EWMA forecast using the provided decay factor
        forecast = compute_ewma_forecast(window_data, lam)
        # For EWMA, the one-step ahead forecast is the last computed value.
        y_pred = forecast[-1]
        # For multi-step ahead, under basic EWMA assumptions, the forecast remains constant.
        
        # Get the actual value at the forecast time point
        actual_index = i + step_ahead - 1
        actual_value = y_test.iloc[actual_index]
        
        # Capture the corresponding date from the X_test 'Date' column
        current_date = X_test['Date'].iloc[actual_index]
        
        predictions.append(y_pred)
        actuals.append(actual_value)
        dates.append(current_date)
    
    return predictions, actuals, dates


#### Implementing an evaluatation function
This function evaluates the findings and puts it in a df for each ticker

In [44]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    evaluation_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Ticker', 'MSE', 'R²'])
    detailed_results = pd.DataFrame(columns=['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual'])
    
    for freq in frequencies:
        for risk_group in ['low', 'medium', 'high']:
            model = models[freq][risk_group]
            group_data = test_data_split[risk_group].copy()
            
            # Define target and forecast horizon based on frequency
            if freq == 'hourly':
                target = 'ln_hourly_rv'
                step_ahead = 1  # Forecast 1 step ahead
            elif freq == '3hourly':
                target = 'ln_3_hourly_rv'
                step_ahead = 3  # Forecast 3 steps ahead
            elif freq == 'daily':
                target = 'ln_daily_rv'
                step_ahead = 24  # Forecast 24 steps ahead (1 day)
            else:
                raise ValueError("Unsupported frequency")
            
            unique_tickers = group_data['Ticker'].unique()
            
            for ticker in unique_tickers:
                ticker_data = group_data[group_data['Ticker'] == ticker].copy()
                
                # Validate that the required columns exist
                if 'Date' not in ticker_data.columns or target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing Date or {target} for {freq}-{risk_group}")
                    continue
                
                # Prepare test data with Date and target column
                X_test = ticker_data[['Date', target]].dropna()
                y_test = X_test[target]
                
                # Check for sufficient data
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue
                
                # Get EWMA-based predictions via a rolling window
                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model, window_size=window_size, step_ahead=step_ahead
                )
                
                if len(predictions) == 0:
                    continue
                                
                # Calculate metrics on the log-transformed values
                mse = mean_squared_error(actuals, predictions)
                r2 = r2_score(actuals, predictions)
                
                eval_row = {
                    'Frequency': freq,
                    'Risk Group': risk_group.capitalize(),
                    'Ticker': ticker,
                    'MSE': mse,
                    'R²': r2
                }
                evaluation_summary = pd.concat(
                    [evaluation_summary, pd.DataFrame([eval_row])], ignore_index=True
                )
                
                ticker_results = pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group.capitalize(),
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals
                })
                detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
    
    return evaluation_summary, detailed_results


#### Evaluate the test data

In [45]:

# Define frequencies
frequencies = ['hourly', '3hourly', 'daily']

# Evaluate models
evaluation_summary, detailed_results = evaluate_models_on_test_data(
    test_data_split, models, frequencies, window_size=24
)

# Save results
detailed_results.to_csv('../../results/ewma.csv', index=False)
print("Evaluation Summary:")
print(evaluation_summary)

  evaluation_summary = pd.concat(
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)


Evaluation Summary:
   Frequency Risk Group    Ticker       MSE        R²
0     hourly        Low   BTC-USD  6.150042  0.038972
1     hourly     Medium   ETH-USD  6.129422 -0.003669
2     hourly     Medium   XRP-USD  6.160992  0.135904
3     hourly       High  DOGE-USD  6.719541 -0.062507
4     hourly       High   SOL-USD  6.699600 -0.100483
5    3hourly        Low   BTC-USD  2.105525  0.120324
6    3hourly     Medium   ETH-USD  1.883205  0.131470
7    3hourly     Medium   XRP-USD  1.836699  0.404248
8    3hourly       High  DOGE-USD  1.787785  0.186091
9    3hourly       High   SOL-USD  1.617237  0.102061
10     daily        Low   BTC-USD  1.398552 -0.318293
11     daily     Medium   ETH-USD  1.079920 -0.141218
12     daily     Medium   XRP-USD  1.227648  0.380058
13     daily       High  DOGE-USD  0.921870  0.056119
14     daily       High   SOL-USD  0.789207 -0.048725
