# Comparing 51/49 split to 80/20 split
In theory, an 80/20 split might be ideal to get a balanced train-test split. But given the volatile nature of cryptocurrency, it might be ideal to dedicate less data to training and more to test to prevent overfitting.

This paper uses the EWMA as the baseline, and will be using it to test the data splits.

## 51/49 Split
This paper mainly employed a 51/49 split, so it will be calling the results directly.

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score


# Load the data
main_split_results = pd.read_csv('../../results/appendix/ewma_summary.csv')
main_split_results

Unnamed: 0,Frequency,Risk Group,Ticker,MSE,R²
0,hourly,Low,BTC-USD,6.100476,0.037974
1,hourly,Low,ETH-USD,5.934583,0.031436
2,hourly,Medium,XRP-USD,6.050096,0.137419
3,hourly,High,DOGE-USD,6.524614,-0.044228
4,hourly,High,SOL-USD,6.511406,-0.07768
5,3hourly,Low,BTC-USD,2.157549,0.087456
6,3hourly,Low,ETH-USD,1.889717,0.123018
7,3hourly,Medium,XRP-USD,1.900472,0.368939
8,3hourly,High,DOGE-USD,1.771056,0.186682
9,3hourly,High,SOL-USD,1.600562,0.126884


## 80/20 Split


In [13]:
# Train-test split
hourly_data = pd.read_csv('../../data/hourly_data.csv')
three_hourly_data = pd.read_csv('../../data/three_hourly_data.csv')
daily_data = pd.read_csv('../../data/daily_data.csv')

# Sort the data by date
hourly_data['Date'] = pd.to_datetime(hourly_data['Date'])
three_hourly_data['Date'] = pd.to_datetime(three_hourly_data['Date'])
daily_data['Date'] = pd.to_datetime(daily_data['Date'])

hourly_data = hourly_data.sort_values('Date')

train_split = 0.8
test_split = 0.2

# Split the data
X_train_hourly = hourly_data.iloc[:int(len(hourly_data)*train_split)]
X_test_hourly = hourly_data.iloc[int(len(hourly_data)*train_split):]

X_train_three_hourly = three_hourly_data.iloc[:int(len(three_hourly_data)*train_split)]
X_test_three_hourly = three_hourly_data.iloc[int(len(three_hourly_data)*train_split):]

X_train_daily = daily_data.iloc[:int(len(daily_data)*train_split)]
X_test_daily = daily_data.iloc[int(len(daily_data)*train_split):]

X_train_hourly

print(len(X_train_hourly), len(X_test_hourly))
print(len(X_train_three_hourly), len(X_test_three_hourly))
print(len(X_train_daily), len(X_test_daily))


67716 16930
22582 5646
2832 708


### Further split the data based on the risk level
There are low, medium, and high risk models.

In [14]:
train_date_split_hourly = {
    'low': X_train_hourly[X_train_hourly['Risk'] == 'Low Risk'],
    'medium': X_train_hourly[X_train_hourly['Risk'] == 'Medium Risk'],
    'high': X_train_hourly[X_train_hourly['Risk'] == 'High Risk']
}

test_date_split_hourly = {
    'low': X_test_hourly[X_test_hourly['Risk'] == 'Low Risk'],
    'medium': X_test_hourly[X_test_hourly['Risk'] == 'Medium Risk'],
    'high': X_test_hourly[X_test_hourly['Risk'] == 'High Risk']
}

train_date_split_three_hourly = {
    'low': X_train_three_hourly[X_train_three_hourly['Risk'] == 'Low Risk'],
    'medium': X_train_three_hourly[X_train_three_hourly['Risk'] == 'Medium Risk'],
    'high': X_train_three_hourly[X_train_three_hourly['Risk'] == 'High Risk']
}

test_date_split_three_hourly = {
    'low': X_test_three_hourly[X_test_three_hourly['Risk'] == 'Low Risk'],
    'medium': X_test_three_hourly[X_test_three_hourly['Risk'] == 'Medium Risk'],
    'high': X_test_three_hourly[X_test_three_hourly['Risk'] == 'High Risk']
}

train_date_split_daily = {
    'low': X_train_daily[X_train_daily['Risk'] == 'Low Risk'],
    'medium': X_train_daily[X_train_daily['Risk'] == 'Medium Risk'],
    'high': X_train_daily[X_train_daily['Risk'] == 'High Risk']
}

test_date_split_daily = {
    'low': X_test_daily[X_test_daily['Risk'] == 'Low Risk'],
    'medium': X_test_daily[X_test_daily['Risk'] == 'Medium Risk'],
    'high': X_test_daily[X_test_daily['Risk'] == 'High Risk']
}


# Train the model based on their classifications

### Training data

#### Forecast function

In [15]:
def compute_ewma_forecast(series, lam):
    # Initialize the forecast with the first observed value
    forecasts = [series.iloc[0]]
    for i in range(1, len(series)):
        f = lam * forecasts[i-1] + (1 - lam) * series.iloc[i-1]
        forecasts.append(f)
    return np.array(forecasts)


#### Calculating optimal lambda
In theory, optimal lambda is 0.94, but given the abundance of training data, we can use gridsearch to find optimal lambda

In [16]:
def optimize_lambda(series, candidate_lambdas=np.linspace(0.90, 0.99, 100)):
    best_lambda = None
    best_mse = np.inf
    for lam in candidate_lambdas:
        forecast = compute_ewma_forecast(series, lam)
        # Exclude the first forecast because it is just the initialization
        mse = np.mean((series.values[1:] - forecast[1:])**2)
        if mse < best_mse:
            best_mse = mse
            best_lambda = lam
    return best_lambda, best_mse


This wil give us 3 models to work with: model_low, model_medium, and model_high. We will use these subsequent models on the test data to evaluate the models

In [None]:

# Function to train models and summarize them
def train_models_by_frequency_and_risk(train_data, frequencies):
    models = {}
    model_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Optimal Lambda', 'In-Sample MSE'])
    
    for freq in frequencies:
        models[freq] = {}
        for risk_group in ['low', 'medium', 'high']:
            group_data = train_data[risk_group]
            
            # Select target based on frequency
            if freq == 'hourly':
                target = 'ln_hourly_rv'
            elif freq == '3hourly':
                target = 'ln_3_hourly_rv'
            elif freq == 'daily':
                target = 'ln_daily_rv'
            else:
                raise ValueError(f"Unsupported frequency: {freq}")
            
            if target not in group_data.columns:
                print(f"Missing {target} for {freq}-{risk_group}, skipping.")
                continue
            
            # Drop missing values
            series = group_data[target].dropna()
            if len(series) < 2:
                print(f"Insufficient data for {freq}-{risk_group}, skipping.")
                continue
            
            # Optimize lambda for the EWMA model
            optimal_lambda, in_sample_mse = optimize_lambda(series)
            
            # Store the "trained" model: here just the optimal lambda and initial value
            ewma_model = {'lambda': optimal_lambda, 'initial': series.iloc[0]}
            models[freq][risk_group] = ewma_model
            
            # Append results to the summary DataFrame
            summary_row = {
                'Frequency': freq,
                'Risk Group': risk_group,
                'Optimal Lambda': optimal_lambda,
                'In-Sample MSE': in_sample_mse
            }
            model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)
    
    return models, model_summary

model_daily, summary_daily = train_models_by_frequency_and_risk(train_date_split_daily, ['daily'])
model_hourly, summary_hourly = train_models_by_frequency_and_risk(train_date_split_hourly, ['hourly'])
model_three_hourly, summary_three_hourly = train_models_by_frequency_and_risk(train_date_split_three_hourly, ['3hourly'])

# Merge the summaries
summary = pd.concat([summary_daily, summary_hourly, summary_three_hourly], ignore_index=True)

  model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)


Insufficient data for daily-medium, skipping.


  model_summary = pd.concat([model_summary, pd.DataFrame([summary_row])], ignore_index=True)


### Model summary

In [None]:
summary

Unnamed: 0,Frequency,Risk Group,Optimal Lambda,In-Sample MSE
0,daily,low,0.9,0.975163
1,daily,high,0.9,0.847627
2,hourly,low,0.900909,5.813577
3,hourly,medium,0.938182,5.949516
4,hourly,high,0.957273,6.051113
5,3hourly,low,0.9,2.00824
6,3hourly,high,0.9,1.725562


### Testing data

#### Implementing rolling window
Rolling window is used for a one step ahead forecast. So we constantly update the lagged data with an update lagged data

In [None]:
def rolling_window_predictions(X_test, y_test, model, window_size=24, step_ahead=1):
    predictions = []
    actuals = []
    dates = []
    lam = model['lambda']  # Get the decay parameter from the trained EWMA model
    
    max_index = len(X_test) - step_ahead  # Ensure we have data for the forecast horizon
    for i in range(window_size, max_index + 1):
        # Use the target series within the current rolling window
        window_data = y_test.iloc[i - window_size : i]
        
        # Compute the EWMA forecast using the provided decay factor
        forecast = compute_ewma_forecast(window_data, lam)
        # For EWMA, the one-step ahead forecast is the last computed value.
        y_pred = forecast[-1]
        # For multi-step ahead, under basic EWMA assumptions, the forecast remains constant.
        
        # Get the actual value at the forecast time point
        actual_index = i + step_ahead - 1
        actual_value = y_test.iloc[actual_index]
        
        # Capture the corresponding date from the X_test 'Date' column
        current_date = X_test['Date'].iloc[actual_index]
        
        predictions.append(y_pred)
        actuals.append(actual_value)
        dates.append(current_date)
    
    return predictions, actuals, dates


#### Implementing an evaluatation function
This function evaluates the findings and puts it in a df for each ticker

In [None]:
def evaluate_models_on_test_data(test_data_split, models, frequencies, window_size=24):
    evaluation_summary = pd.DataFrame(columns=['Frequency', 'Risk Group', 'Ticker', 'MSE', 'R²'])
    detailed_results = pd.DataFrame(columns=['Date', 'Ticker', 'Risk Group', 'Frequency', 'Predicted', 'Actual'])
    
    for freq in frequencies:
        for risk_group in ['low', 'medium', 'high']:
            model = models[freq][risk_group]
            group_data = test_data_split[risk_group].copy()
            
            # Define target and forecast horizon based on frequency
            if freq == 'hourly':
                target = 'ln_hourly_rv'
                step_ahead = 1  # Forecast 1 step ahead
            elif freq == '3hourly':
                target = 'ln_3_hourly_rv'
                step_ahead = 1  # Forecast 3 steps ahead
            elif freq == 'daily':
                target = 'ln_daily_rv'
                step_ahead = 1  # Forecast 24 steps ahead (1 day)
            else:
                raise ValueError("Unsupported frequency")
            
            unique_tickers = group_data['Ticker'].unique()
            
            for ticker in unique_tickers:
                ticker_data = group_data[group_data['Ticker'] == ticker].copy()
                
                # Validate that the required columns exist
                if 'Date' not in ticker_data.columns or target not in ticker_data.columns:
                    print(f"Skipping {ticker}: missing Date or {target} for {freq}-{risk_group}")
                    continue
                
                # Prepare test data with Date and target column
                X_test = ticker_data[['Date', target]].dropna()
                y_test = X_test[target]
                
                # Check for sufficient data
                if len(X_test) < window_size + step_ahead:
                    print(f"Skipping {ticker}: insufficient data ({len(X_test)} rows)")
                    continue
                
                # Get EWMA-based predictions via a rolling window
                predictions, actuals, dates = rolling_window_predictions(
                    X_test, y_test, model, window_size=window_size, step_ahead=step_ahead
                )
                
                if len(predictions) == 0:
                    continue
                                
                # Calculate metrics on the log-transformed values
                mse = mean_squared_error(actuals, predictions)
                r2 = r2_score(actuals, predictions)
                
                eval_row = {
                    'Frequency': freq,
                    'Risk Group': risk_group.capitalize(),
                    'Ticker': ticker,
                    'MSE': mse,
                    'R²': r2
                }
                evaluation_summary = pd.concat(
                    [evaluation_summary, pd.DataFrame([eval_row])], ignore_index=True
                )
                
                ticker_results = pd.DataFrame({
                    'Date': dates,
                    'Ticker': ticker,
                    'Risk Group': risk_group.capitalize(),
                    'Frequency': freq,
                    'Predicted': predictions,
                    'Actual': actuals
                })
                detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)
    
    return evaluation_summary, detailed_results


#### Evaluate the test data

In [None]:
# Define frequencies
frequencies = ['hourly', '3hourly', 'daily']

# Evaluate models on test data for each frequency.
evaluation_summary_hourly, detailed_results_hourly = evaluate_models_on_test_data(
    test_date_split_hourly, model_hourly, ['hourly'], window_size=24
)
evaluation_summary_three_hourly, detailed_results_three_hourly = evaluate_models_on_test_data(
    test_date_split_three_hourly, model_three_hourly, ['3hourly'], window_size=24
)
evaluation_summary_daily, detailed_results_daily = evaluate_models_on_test_data(
    test_date_split_daily, model_daily, ['daily'], window_size=24
)

# Combine the evaluation summaries across frequencies, if desired.
combined_summary = pd.concat([evaluation_summary_hourly, evaluation_summary_three_hourly, evaluation_summary_daily], ignore_index=True)
combined_details = pd.concat([detailed_results_hourly, detailed_results_three_hourly, detailed_results_daily], ignore_index=True)

# Save the detailed results to CSV
combined_details.to_csv('../../results/appendix/large_split_ewma.csv', index=False)

# Print the evaluation summaries
print("Combined Evaluation Summary:")
print(combined_summary)

combined_summary.to_csv('../../results/appendix/large_split_ewma_summary.csv', index=False)

  evaluation_summary = pd.concat(
  detailed_results = pd.concat([detailed_results, ticker_results], ignore_index=True)


KeyError: 'medium'