# Autoregressive Model Testing (SARIMA & SARIMAX)

In [116]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import itertools
import warnings
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from IPython.display import display, HTML
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.api import OLS, add_constant
from scipy.stats import shapiro, anderson
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [117]:
# Load data
df = pd.read_excel('data/Monthly Mastersheet.xlsx')

# Ensure date is datetime and set index
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df.index = pd.date_range(start=df.index[0], periods=len(df), freq='MS')
df.columns = df.columns.str.strip()

## Data Preparation

In [118]:
macro_list = ['LFPR', 'CPI', 'r', 'M1', 'GDP', 'IM', 'EX', 'CC']
asset_list= ['Bitcoin', 'Tether', 'Litecoin', 'XRP', 'Ethereum', 'Dogecoin', 'Cardano', 'USD Coin']
pc_list = ['PC1_macro', 'PC2_macro', 'PC1_crypto', 'PC2_crypto']
train_end = '2024-01-01'

In [119]:
ar_orders = {}
ar_orders['LFPR'] = {'p': 1, 'd': 1, 'q': 0, 'P': 1, 'D': 1, 'Q': 0, 
                     'Bitcoin_lag': 1, 'Tether_lag': 3, 'Litecoin_lag': 0, 'XRP_lag': 0, 'Ethereum_lag': 0, 'Dogecoin_lag': 0, 
                     'Cardano_lag': 2, 'USD Coin_lag': 0, 'PC1_crypto_lag': 0, 'PC2_crypto_lag': 1,'VIX_lag': 0}
ar_orders['CPI'] = {'p': 1, 'd': 2, 'q': 0, 'P': 1, 'D': 1, 'Q': 1, 
                    'Bitcoin_lag': 3, 'Tether_lag': 5, 'Litecoin_lag': 4, 'XRP_lag': 5, 'Ethereum_lag': 0, 'Dogecoin_lag': 6, 
                    'Cardano_lag': 2, 'USD Coin_lag': 2, 'PC1_crypto_lag': 6, 'PC2_crypto_lag': 2, 'VIX_lag': 0}
ar_orders['r'] = {'p': 1, 'd': 2, 'q': 0, 'P': 1, 'D': 0, 'Q': 0, 
                  'Bitcoin_lag': 6, 'Tether_lag': 5, 'Litecoin_lag': 5, 'XRP_lag': 3, 'Ethereum_lag': 0, 'Dogecoin_lag': 1, 
                  'Cardano_lag': 2, 'USD Coin_lag': 2, 'PC1_crypto_lag': 1, 'PC2_crypto_lag': 0, 'VIX_lag': 0}
ar_orders['M1'] = {'p': 1, 'd': 1, 'q': 0, 'P': 1, 'D': 0, 'Q': 1, 
                   'Bitcoin_lag': 1, 'Tether_lag': 2, 'Litecoin_lag': 2, 'XRP_lag': 5, 'Ethereum_lag': 2, 'Dogecoin_lag': 0, 
                   'Cardano_lag': 0, 'USD Coin_lag': 0, 'PC1_crypto_lag': 1, 'PC2_crypto_lag': 1, 'VIX_lag': 0}
ar_orders['GDP'] = {'p': 1, 'd': 1, 'q': 0, 'P': 1, 'D': 1, 'Q': 0, 
                            'Bitcoin_lag': 0, 'Tether_lag': 2, 'Litecoin_lag': 0, 'XRP_lag': 0, 'Ethereum_lag': 4, 'Dogecoin_lag': 1, 
                            'Cardano_lag': 0, 'USD Coin_lag': 2, 'PC1_crypto_lag': 0, 'PC2_crypto_lag': 0, 'VIX_lag': 0}
ar_orders['IM'] = {'p': 1, 'd': 1, 'q': 1, 'P': 2, 'D': 1, 'Q': 0, 
                   'Bitcoin_lag': 0, 'Tether_lag': 1, 'Litecoin_lag': 2, 'XRP_lag': 3, 'Ethereum_lag': 3, 'Dogecoin_lag': 0, 
                   'Cardano_lag': 3, 'USD Coin_lag': 1, 'PC1_crypto_lag': 2, 'PC2_crypto_lag': 0, 'VIX_lag': 0}
ar_orders['EX'] = {'p': 1, 'd': 1, 'q': 0, 'P': 1, 'D': 1, 'Q': 0, 
                   'Bitcoin_lag': 4, 'Tether_lag': 0, 'Litecoin_lag': 0, 'XRP_lag': 0, 'Ethereum_lag': 0, 'Dogecoin_lag': 0, 
                   'Cardano_lag': 0, 'USD Coin_lag': 6, 'PC1_crypto_lag': 1, 'PC2_crypto_lag': 0, 'VIX_lag': 0}
ar_orders['CC'] = {'p': 2, 'd': 1, 'q': 0, 'P': 1, 'D': 1, 'Q': 1, 
                   'Bitcoin_lag': 0, 'Tether_lag': 0, 'Litecoin_lag': 0, 'XRP_lag': 4, 'Ethereum_lag': 0, 'Dogecoin_lag': 3, 
                   'Cardano_lag': 0, 'USD Coin_lag': 4, 'PC1_crypto_lag': 0, 'PC2_crypto_lag': 0, 'VIX_lag': 0}
# ar_orders['PC1_macro'] = {'p': 1, 'd': 1, 'q': 0, 'P': 1, 'D': 1, 'Q': 0, 
#                           'Bitcoin_lag': 2, 'Tether_lag': 0, 'Litecoin_lag': 2, 'XRP_lag': 4, 'Ethereum_lag': 1, 'Dogecoin_lag': 4, 
#                           'Cardano_lag': 1, 'USD Coin_lag': 4, 'PC1_crypto_lag': 1, 'PC2_crypto_lag': 1, 'VIX_lag': 0}
# ar_orders['PC2_macro'] = {'p': 1, 'd': 1, 'q': 1, 'P': 1, 'D': 1, 'Q': 0, 
#                           'Bitcoin_lag': 0, 'Tether_lag': 0, 'Litecoin_lag': 0, 'XRP_lag': 0, 'Ethereum_lag': 0, 'Dogecoin_lag': 1, 
#                           'Cardano_lag': 1, 'USD Coin_lag': 2, 'PC1_crypto_lag': 0, 'PC2_crypto_lag': 0, 'VIX_lag': 0}

# ar_orders['CC Monthly % Change'] = {'p': 2, 'd': 0,'q': 0, 'P': 2, 'D': 1, 'Q': 0}
# ar_orders['VIX'] = {'p': 1, 'd': 0,'q': 0, 'P': 1, 'D': 1, 'Q': 0}
# ar_orders['MOVE'] = {'p': 1, 'd': 1,'q': 0, 'P': 1, 'D': 0, 'Q': 0}
# ar_orders['vol_LFPR'] = {'p': 1, 'd': 1,'q': 0}
# ar_orders['vol_CPI'] = {'p': 1, 'd': 1, 'q': 0}
# ar_orders['vol_r'] = {'p': 1, 'd': 1,'q': 0}
# ar_orders['vol_M1'] = {'p': 1, 'd': 1,'q': 0}
# ar_orders['vol_GDP'] = {'p': 1, 'd': 1, 'q': 0}
# ar_orders['vol_IM'] = {'p': 1, 'd': 2,'q': 0}
# ar_orders['vol_EX'] = {'p': 1, 'd': 1, 'q': 0}
# ar_orders['vol_CC'] = {'p': 1, 'd': 1,'q': 0}

## Checking Variable Stationarity, ACF, PACF

In [120]:
def check_stationarity(series):
    result = adfuller(series.dropna())
    p_value = result[1]
    print(f"ADF test for {series.name}: p-value = {p_value:.4f}")
    return p_value

In [121]:
# ACF and PACF
def acf(series, name = 'variable'):
    fig, ax = plt.subplots(2, 1, figsize=(10, 6))
    plot_acf(series, lags=30, ax=ax[0])
    ax[0].set_title(f'ACF of {name}')
    plot_pacf(series, lags=30, ax=ax[1])
    ax[1].set_title(f'PACF of {name}')
    plt.tight_layout()
    plt.show()

In [122]:
# variable = 'PC2_macro'
# check_stationarity(df[variable])
# series = df[variable].dropna()
# acf(series, variable)
# df[f'{variable}_diff'] = df[variable].diff()
# series = df[f'{variable}_diff'].dropna()
# acf(series, f'{variable}_diff')
# check_stationarity(df[f'{variable}_diff'])

# df[f'{variable}_diff_diff'] = df[f'{variable}_diff'].diff()
# series = df[f'{variable}_diff_diff'].dropna()
# acf(series, f'{variable}_diff_diff')
# check_stationarity(df[f'{variable}_diff_diff'])

## SARIMA(X) Model

In [123]:
def check_model_assumptions_arx(y_train, X_train, model_residuals):
    # Add constant to X
    X_const = add_constant(X_train)
    
    # 1. Breusch–Pagan test for heteroskedasticity
    ols_model = OLS(y_train, X_const).fit()
    bp_stat, bp_pvalue, _, _ = het_breuschpagan(ols_model.resid, X_const)
    
    # # 2. Durbin-Watson test for autocorrelation
    # dw_stat = durbin_watson(model_residuals)

    # 3. Breusch–Godfrey test
    bg_stat, bg_pvalue, _, _ = acorr_breusch_godfrey(ols_model, nlags=1)

    # 4. Shapiro-Wilk test for normality
    shapiro_stat, shapiro_p = shapiro(model_residuals)

    # 5. Anderson-Darling test
    ad_result = anderson(model_residuals)
    ad_stat = ad_result.statistic
    ad_crit = list(zip(ad_result.significance_level, ad_result.critical_values))

    # 6. Mean of residuals
    mean_resid = model_residuals.mean()

    # 7. VIF for predictors
    # vif_df = pd.DataFrame()
    # vif_df["feature"] = X_const.columns
    # vif_df["VIF"] = [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])]

    return {
        "Breusch-Pagan p": bp_pvalue,
        # "Durbin-Watson": dw_stat,
        "Breusch-Godfrey p": bg_pvalue,
        "Shapiro p": shapiro_p,
        "Anderson stat": ad_stat,
        "Anderson crit": ad_crit,
        "Mean resid": mean_resid,
    }

In [132]:
def run_model(df, macro, asset, plot=False):
    order_dict = ar_orders.get(macro, {'p': 1, 'd': 1, 'q': 0, 'P': 1, 'D': 1, 'Q': 0})
    
    # Unpack ARIMA and seasonal orders
    p = order_dict['p']
    d = order_dict['d']
    q = order_dict['q']
    P = order_dict['P']
    D = order_dict['D']
    Q = order_dict['Q']

    # Extract asset-specific lag
    asset_lag_key = f"{asset}_lag"
    asset_lag = order_dict.get(asset_lag_key, 0)

    ### ==== AR Data: Use only macro series ==== ###
    df_macro = df[[macro]].dropna().copy()
    target_ar = df_macro[macro]
    train_endog_ar = target_ar[:train_end]
    test_endog_ar = target_ar[train_end:]

    ### ==== ARX Data: Use macro + asset ==== ###
    df_temp = df[[macro, asset]].dropna().copy()

    # Create lagged asset columns
    for lag in range(1, asset_lag + 1):
        df_temp[f'{asset}_lag{lag}'] = df_temp[asset].shift(lag)

    exog_cols = [asset] + [f'{asset}_lag{lag}' for lag in range(1, asset_lag + 1)]
    df_temp = df_temp.dropna()

    exog = df_temp[exog_cols]
    target_arx = df_temp[macro]

    train_endog_arx = target_arx[:train_end]
    train_exog = exog[:train_end]
    test_endog_arx = target_arx[train_end:]
    test_exog = exog[train_end:]

    ### ==== Fit AR and ARX Models ==== ###
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", ConvergenceWarning)

        ar_model = SARIMAX(train_endog_ar, order=(p, d, q), seasonal_order=(P, D, Q, 12))
        ar_result = ar_model.fit(disp=False)

        arx_model = SARIMAX(train_endog_arx, exog=train_exog, order=(p, d, q), seasonal_order=(P, D, Q, 12))
        arx_result = arx_model.fit(disp=False)

        for warning in w:
            if issubclass(warning.category, ConvergenceWarning):
                print(f"[WARNING] Convergence issue in macro: {macro}, asset: {asset}")

    ### ==== Forecasts ==== ###
    pred_ar = ar_result.get_forecast(steps=len(test_endog_ar)).predicted_mean
    conf_int_ar = ar_result.get_forecast(steps=len(test_endog_ar)).conf_int()

    pred_arx = arx_result.get_forecast(steps=len(test_endog_arx), exog=test_exog).predicted_mean
    conf_int_arx = arx_result.get_forecast(steps=len(test_endog_arx), exog=test_exog).conf_int()

    # Align index for plotting
    pred_ar.index = test_endog_ar.index
    pred_arx.index = test_endog_arx.index
    conf_int_ar.index = test_endog_ar.index
    conf_int_arx.index = test_endog_arx.index

    ### ==== Plotting ==== ###
    if plot:
        plt.figure(figsize=(10, 5))
        plt.plot(target_ar, label='Actual ' + macro, color='black')
        plt.plot(pred_ar, label=f'Forecasted {macro} (AR only)', linestyle='--', color='blue')
        plt.fill_between(pred_ar.index, conf_int_ar.iloc[:, 0], conf_int_ar.iloc[:, 1], color='blue', alpha=0.1)
        plt.plot(pred_arx, label=f'Forecasted {macro} (ARX with {asset})', linestyle='--', color='red')
        plt.fill_between(pred_arx.index, conf_int_arx.iloc[:, 0], conf_int_arx.iloc[:, 1], color='red', alpha=0.1)
        plt.title("Out-of-Sample Forecast")
        plt.legend()
        plt.tight_layout()
        plt.show()

        plt.figure(figsize=(10, 5))
        plt.plot(test_endog_ar, label='Actual ' + macro, marker='o', color='black')
        plt.plot(pred_ar, label=f'AR Forecast', linestyle='--', marker='x', color='blue')
        plt.plot(pred_arx, label=f'ARX Forecast', linestyle='--', marker='s', color='red')
        plt.title("Forecast vs Actual (Test Period)")
        plt.xlabel("Date")
        plt.ylabel(macro)
        plt.legend()
        plt.tight_layout()
        plt.show()

    ### ==== Metrics ==== ###
    metrics = [
        {
            'Model': 'AR',
            'MAE': mean_absolute_error(test_endog_ar, pred_ar),
            'RMSE': np.sqrt(mean_squared_error(test_endog_ar, pred_ar)),
            'R2': r2_score(test_endog_ar, pred_ar),
            'MAPE (%)': mean_absolute_percentage_error(test_endog_ar, pred_ar) * 100,
            'Order': f'({p},{d},{q})'
        },
        {
            'Model': 'ARX',
            'MAE': mean_absolute_error(test_endog_arx, pred_arx),
            'RMSE': np.sqrt(mean_squared_error(test_endog_arx, pred_arx)),
            'R2': r2_score(test_endog_arx, pred_arx),
            'MAPE (%)': mean_absolute_percentage_error(test_endog_arx, pred_arx) * 100,
            'Order': f'({p},{d},{q})'
        }
    ]
    assumption_results = check_model_assumptions_arx(train_endog_ar, train_exog, arx_result.resid)
    # print(f"==== Model Assumption Tests for {macro} + {asset} ====")
    # print(f"Breusch–Pagan p: {assumption_results['Breusch-Pagan p']:.4f}")
    # print(f"Durbin–Watson: {assumption_results['Durbin-Watson']:.3f}")
    # print(f"Breusch–Godfrey p: {assumption_results['Breusch-Godfrey p']:.4f}")
    # print(f"Shapiro-Wilk p: {assumption_results['Shapiro p']:.4f}")
    # print(f"Anderson-Darling stat: {assumption_results['Anderson stat']:.4f}")
    # print(f"Mean residual: {assumption_results['Mean resid']:.2e}")
    # print("VIF:")
    # print(assumption_results['VIF'])

    return pd.DataFrame(metrics).set_index('Model'), assumption_results 

In [133]:
results_assumptions = []
for macro in macro_list:
    for asset in asset_list:
        try:
            metrics_df, assumptions = run_model(df.copy(), macro, asset, plot=False)

            # Add identifiers
            assumptions['Macro'] = macro
            assumptions['Asset'] = asset
            results_assumptions.append(assumptions)

        except Exception as e:
            print(f"Failed for {macro} + {asset}: {e}")
       
# Merge all metrics
# Create a DataFrame for assumption tests
final_assumptions_df = pd.DataFrame(results_assumptions)

# Clean columns
final_assumptions_df = final_assumptions_df[
    ["Macro", "Asset", "Breusch-Pagan p", "Breusch-Godfrey p",
     "Shapiro p", "Anderson stat", "Mean resid"]
]

Failed for LFPR + Bitcoin: The indices for endog and exog are not aligned
Failed for LFPR + Tether: The indices for endog and exog are not aligned
Failed for LFPR + Cardano: The indices for endog and exog are not aligned
Failed for LFPR + USD Coin: The indices for endog and exog are not aligned
Failed for CPI + Bitcoin: The indices for endog and exog are not aligned
Failed for CPI + Tether: The indices for endog and exog are not aligned
Failed for CPI + Litecoin: The indices for endog and exog are not aligned
Failed for CPI + XRP: The indices for endog and exog are not aligned
Failed for CPI + Dogecoin: The indices for endog and exog are not aligned
Failed for CPI + Cardano: The indices for endog and exog are not aligned
Failed for CPI + USD Coin: The indices for endog and exog are not aligned
Failed for r + Bitcoin: The indices for endog and exog are not aligned
Failed for r + Tether: The indices for endog and exog are not aligned
Failed for r + Litecoin: The indices for endog and exo

KeyboardInterrupt: 

In [126]:
pd.set_option('display.max_columns', None)

# Show all rows (if needed)
pd.set_option('display.max_rows', None)

# Prevent column content from being truncated
pd.set_option('display.max_colwidth', None)
final_assumptions_df

Unnamed: 0,Macro,Asset,Breusch-Pagan p,Breusch-Godfrey p,Shapiro p,Anderson stat,Mean resid
0,LFPR,Litecoin,0.002675,0.710209,0.07577693,0.498606,-1.5e-05
1,LFPR,XRP,0.168421,0.938885,0.02092583,0.616034,-0.000124
2,LFPR,Ethereum,4e-06,0.684582,0.03620032,0.464898,-2.4e-05
3,LFPR,Dogecoin,0.913813,0.946632,5.204856e-07,3.419606,-5.9e-05
4,CPI,Ethereum,0.001297,3e-06,0.08447897,0.447834,-1.1e-05
5,r,Ethereum,5e-06,0.000174,2.630657e-08,3.570641,0.001817
6,M1,Dogecoin,0.753973,0.384419,1.24e-17,22.801929,5.5e-05
7,GDP,Bitcoin,0.005616,0.051299,1.169698e-06,2.308755,0.000197
8,GDP,Litecoin,0.009478,0.097212,6.719411e-05,1.300407,0.000112
9,GDP,XRP,0.247827,0.121223,2.069693e-06,2.048091,-0.000135


## All Macro & Crypto Combination

In [127]:
results_list = []
asset = 'Bitcoin'
for macro in macro_list:
        # Run model, plot=False to skip plotting in batch run
        metrics_df, assumptions = run_model(df.copy(), macro, asset, plot=False)
        
        # metrics_df is a DataFrame with index Model (AR, ARX) and columns MAE, RMSE, R2, MAPE, Order
        # Add macro and asset columns for clarity
        metrics_df['Macro'] = macro
        metrics_df['Asset'] = asset
        
        results_list.append(metrics_df.reset_index())
# Combine all results into one DataFrame
final_results = pd.concat(results_list, ignore_index=True)

# Rearrange columns to show Model, Macro, Asset, and errors only
final_results = final_results[['Model', 'Macro', 'Asset', 'MAE', 'RMSE', 'R2', 'MAPE (%)']]

# Format float columns for better readability
float_cols = ['MAE', 'RMSE', 'R2']
final_results[float_cols] = final_results[float_cols]

ValueError: The indices for endog and exog are not aligned

In [None]:
final_results

Unnamed: 0,Model,Macro,Asset,MAE,RMSE,R2,MAPE (%)
0,AR,LFPR,Bitcoin,0.003778,0.004105,-8.622085,964123000000000.0
1,ARX,LFPR,Bitcoin,0.002853,0.003377,-5.51306,867280400000000.0
2,AR,CPI,Bitcoin,0.015724,0.017328,-170.301196,3923.958
3,ARX,CPI,Bitcoin,0.008226,0.009123,-46.484506,2178.907
4,AR,r,Bitcoin,0.012332,0.015119,0.485414,3111206000000000.0
5,ARX,r,Bitcoin,0.180736,0.209227,-97.550132,3.867054e+16
6,AR,M1,Bitcoin,0.002195,0.002678,-1.111311,127.2761
7,ARX,M1,Bitcoin,0.054426,0.061265,-1104.186863,8606.032
8,AR,GDP,Bitcoin,0.0062,0.007104,-2.214134,482.9077
9,ARX,GDP,Bitcoin,0.00775,0.00903,-4.19348,507.0582


### Combinations Where Adding Asset Data Improves the Model

In [None]:
# Reshape for comparison
df_wide = final_results.pivot_table(
    index=['Macro', 'Asset'],
    columns='Model',
    values=['MAE', 'RMSE', 'R2', 'MAPE (%)']
)

df_wide.columns = ['_'.join(col).strip() for col in df_wide.columns.values]
df_wide.reset_index(inplace=True)

# Define better = lower RMSE, lower MAE, higher R²
df_wide['ARX_better_RMSE'] = df_wide['RMSE_ARX'] < df_wide['RMSE_AR']
df_wide['ARX_better_MAE'] = df_wide['MAE_ARX'] < df_wide['MAE_AR']
df_wide['ARX_better_MAPE'] = df_wide['MAPE (%)_ARX'] < df_wide['MAPE (%)_AR']
df_wide['ARX_better_R2']  = df_wide['R2_ARX']  > df_wide['R2_AR']

# Filter: only combinations where ARX is better by **all** metrics
better_all = df_wide[
    (df_wide['ARX_better_RMSE']) &
    (df_wide['ARX_better_MAE']) &
    (df_wide['ARX_better_MAPE']) &
    (df_wide['ARX_better_R2'])
]

# Display results
if not better_all.empty:
    print("Combinations where ARX (with asset) outperforms AR on all metrics (MAE, RMSE, MAPE, R²):")
    display(better_all[['Macro', 'Asset', 'MAE_AR', 'MAE_ARX', 'RMSE_AR', 'RMSE_ARX', 'MAPE (%)_AR', 'MAPE (%)_ARX', 'R2_AR', 'R2_ARX']])
else:
    print("No combination found where ARX beats AR across MAE, RMSE, and R².")


# Partial wins
print("\n Combinations where ARX has lower MAE:")
display(df_wide[df_wide['ARX_better_MAE']][['Macro', 'Asset', 'MAE_AR', 'MAE_ARX']])

print("\n Combinations where ARX has lower RMSE:")
display(df_wide[df_wide['ARX_better_RMSE']][['Macro', 'Asset', 'RMSE_AR', 'RMSE_ARX']])

print("\n Combinations where ARX has lower MAPE:")
display(df_wide[df_wide['ARX_better_MAPE']][['Macro', 'Asset', 'MAPE (%)_AR', 'MAPE (%)_ARX']])

print("\n Combinations where ARX has higher R²:")
display(df_wide[df_wide['ARX_better_R2']][['Macro', 'Asset', 'R2_AR', 'R2_ARX']])

Combinations where ARX (with asset) outperforms AR on all metrics (MAE, RMSE, MAPE, R²):


Unnamed: 0,Macro,Asset,MAE_AR,MAE_ARX,RMSE_AR,RMSE_ARX,MAPE (%)_AR,MAPE (%)_ARX,R2_AR,R2_ARX
1,CPI,PC2_crypto,3.149736,2.795072,3.592791,3.132072,1.000016,0.887623,-1.351612,-0.787168
3,GDP Monthly,PC2_crypto,119.33931,50.715882,126.999847,58.422681,0.511925,0.217067,0.567334,0.908439
6,M1,PC2_crypto,394.592618,176.004829,476.894472,246.869584,2.163891,0.963,-5.205698,-0.662958
7,PC1_macro,PC2_crypto,984.237581,891.729437,1143.344565,1032.183098,4.65608,4.221489,-27.964347,-22.606031
8,PC2_macro,PC2_crypto,98.854498,77.731121,106.911753,87.67715,0.481666,0.378423,0.728595,0.817468



 Combinations where ARX has lower MAE:


Unnamed: 0,Macro,Asset,MAE_AR,MAE_ARX
1,CPI,PC2_crypto,3.149736,2.795072
3,GDP Monthly,PC2_crypto,119.33931,50.715882
6,M1,PC2_crypto,394.592618,176.004829
7,PC1_macro,PC2_crypto,984.237581,891.729437
8,PC2_macro,PC2_crypto,98.854498,77.731121



 Combinations where ARX has lower RMSE:


Unnamed: 0,Macro,Asset,RMSE_AR,RMSE_ARX
1,CPI,PC2_crypto,3.592791,3.132072
3,GDP Monthly,PC2_crypto,126.999847,58.422681
4,IM,PC2_crypto,134.411491,133.493155
6,M1,PC2_crypto,476.894472,246.869584
7,PC1_macro,PC2_crypto,1143.344565,1032.183098
8,PC2_macro,PC2_crypto,106.911753,87.67715



 Combinations where ARX has lower MAPE:


Unnamed: 0,Macro,Asset,MAPE (%)_AR,MAPE (%)_ARX
1,CPI,PC2_crypto,1.000016,0.887623
3,GDP Monthly,PC2_crypto,0.511925,0.217067
6,M1,PC2_crypto,2.163891,0.963
7,PC1_macro,PC2_crypto,4.65608,4.221489
8,PC2_macro,PC2_crypto,0.481666,0.378423



 Combinations where ARX has higher R²:


Unnamed: 0,Macro,Asset,R2_AR,R2_ARX
1,CPI,PC2_crypto,-1.351612,-0.787168
3,GDP Monthly,PC2_crypto,0.567334,0.908439
4,IM,PC2_crypto,0.135274,0.14705
6,M1,PC2_crypto,-5.205698,-0.662958
7,PC1_macro,PC2_crypto,-27.964347,-22.606031
8,PC2_macro,PC2_crypto,0.728595,0.817468


## Testing Individual Combinations

In [None]:
macro = 'PC1_macro'
asset = 'MOVE'
run_model(df.copy(), macro, asset, plot=False)

Unnamed: 0_level_0,MAE,RMSE,R2,MAPE (%),Order
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AR,984.237581,1143.344565,-27.964347,4.65608,"(1,1,0)"
ARX,1051.210238,1201.981755,-31.011442,4.974656,"(1,1,0)"


## Finding Optimal Crypto Lag

In [None]:
# def find_optimal_lag(df, macro, asset, max_lag=6, verbose=False):
#     best_lag = None
#     best_improvement = np.inf
#     best_metrics = None

#     results = []

#     for lag in range(0, max_lag + 1):
#         # Temporarily override lag

#         try:
#             metrics = run_model(df, macro, asset, lag)
#             ar = metrics.loc['AR']
#             arx = metrics.loc['ARX']

#             delta_mape = arx['MAPE (%)'] - ar['MAPE (%)']
#             results.append({
#                 'Lag': lag,
#                 'ΔMAPE': delta_mape,
#             })

#             if delta_mape < best_improvement:
#                 best_lag = lag
#                 best_improvement = delta_mape
#                 best_metrics = metrics

#             if verbose:
#                 print(f"Lag {lag}: ΔMAPE = {delta_mape:.2f}")

#         except Exception as e:
#             print(f"Lag {lag}: Failed with error: {e}")
#             continue

#     results_df = pd.DataFrame(results)
#     return best_lag, best_improvement, results_df, best_metrics


In [None]:
# for macro in ar_orders.keys():
#     best_lag, _, _, _ = find_optimal_lag(df, macro, asset="VIX", max_lag=6)
    
#     if best_lag is not None:
#         ar_orders[macro]["VIX_lag"] = best_lag
#     else:
#         ar_orders[macro]["VIX_lag"] = np.nan 

In [None]:
from statsmodels.stats.diagnostic import het_breuschpagan

bp_results = [] 
for macro in macro_list:
    for asset in asset_list:
        # prepare the two series
        endog = df[macro]
        raw_exog = df[[asset]]
        data = pd.concat([endog, raw_exog], axis=1).dropna()
        if data.empty:
            continue  
        endog_clean = data[macro]
        raw_exog_clean = data[[asset]]
        exog = sm.add_constant(raw_exog_clean) 

        # ARIMA orders
        mod = sm.tsa.SARIMAX(endog_clean,
                             exog=raw_exog_clean,
                             order=(ar_orders[macro]['p'],
                                    ar_orders[macro]['d'],
                                    ar_orders[macro]['q']),
                             seasonal_order=(ar_orders[macro]['P'],
                                             ar_orders[macro]['D'],
                                             ar_orders[macro]['Q'],
                                             12),
                             enforce_stationarity=False,
                             enforce_invertibility=False)
        res = mod.fit(disp=False)

        # get residuals
        resid = res.resid

        # run Breusch–Pagan on resid**2 vs exog (you can also use fittedvalues)
        lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(resid, exog)

        # store
        bp_results.append({
            'macro':     macro,
            'asset':     asset,
            'lm_stat':   lm_stat,
            'lm_pvalue': lm_pvalue,
            'f_stat':    f_stat,
            'f_pvalue':  f_pvalue
        })
bp_df = pd.DataFrame(bp_results)
print(bp_df.pivot(index='macro', columns='asset', values='lm_pvalue'))




asset   Bitcoin   Cardano  Dogecoin  Ethereum  Litecoin    Tether  USD Coin  \
macro                                                                         
CC     0.253187  0.748053  0.432099  0.101998  0.214895  0.063781  0.000159   
CPI    0.706128  0.220021  0.971225  0.713241  0.259209  0.381213  0.031681   
EX     0.507102  0.194442  0.960435  0.175183  0.501549  0.308005  0.936657   
GDP    0.547878  0.339961  0.903414  0.702807  0.784189  0.494116  0.077344   
IM     0.181050  0.008399  0.633347  0.004128  0.147918  0.590354  0.488760   
LFPR   0.137685  0.023226  0.651570  0.228250  0.229700  0.738033  0.213649   
M1     0.913349  0.976527  0.608238  0.751588  0.430452  0.522603  0.501112   
r      0.671662  0.586845  0.681071  0.310879  0.921480  0.701285  0.000010   

asset       XRP  
macro            
CC     0.037640  
CPI    0.091271  
EX     0.372667  
GDP    0.551180  
IM     0.792139  
LFPR   0.096843  
M1     0.363004  
r      0.677552  


## Shapiro Wilk test for Normality of errors

In [None]:
from scipy.stats import shapiro

SW_results = []
for macro in macro_list:
    for asset in asset_list:
        # prepare the two series
        endog = df[macro]
        raw_exog = df[[asset]]
        data = pd.concat([endog, raw_exog], axis=1).dropna()
        
        # ARIMA orders
        mod = sm.tsa.SARIMAX(endog_clean,
                             exog=raw_exog_clean,
                             order=(ar_orders[macro]['p'],
                                    ar_orders[macro]['d'],
                                    ar_orders[macro]['q']),
                             seasonal_order=(ar_orders[macro]['P'],
                                             ar_orders[macro]['D'],
                                             ar_orders[macro]['Q'],
                                             12),
                             enforce_stationarity=False,
                             enforce_invertibility=False)
        res = mod.fit(disp=False)

        # get residuals
        resid = res.resid

        # run Shapiro-Wilk test on residuals
        sw_stat, sw_pvalue = shapiro(resid)

        # store
        SW_results.append({
            'macro':     macro,
            'asset':     asset,
            'sw_stat':   sw_stat,
            'sw_pvalue': sw_pvalue
        })
sw_df = pd.DataFrame(SW_results)
print(sw_df.pivot(index='macro', columns='asset', values='sw_pvalue'))



asset             Bitcoin  Bitcoin Price       Cardano      Dogecoin  \
macro                                                                  
CC           9.277509e-18   9.277509e-18  9.277509e-18  9.277509e-18   
CPI          1.007719e-17   1.007719e-17  1.007719e-17  1.007719e-17   
EX           3.052210e-18   3.052210e-18  3.052210e-18  3.052210e-18   
GDP Monthly  3.052210e-18   3.052210e-18  3.052210e-18  3.052210e-18   
IM           2.557650e-17   2.557650e-17  2.557650e-17  2.557650e-17   
LFPR         3.052210e-18   3.052210e-18  3.052210e-18  3.052210e-18   
M1           6.841178e-20   6.841178e-20  6.841178e-20  6.841178e-20   
PC1_macro    3.052210e-18   3.052210e-18  3.052210e-18  3.052210e-18   
PC2_macro    1.240056e-18   1.240056e-18  1.240056e-18  1.240056e-18   
r            3.144052e-19   3.144052e-19  3.144052e-19  3.144052e-19   

asset            Ethereum      Litecoin          MOVE        Tether  \
macro                                                           

In [None]:
import statsmodels.api as sm
from scipy.stats import shapiro

macro = 'LFPR'
asset = 'Bitcoin'

endog    = df[macro]
raw_exog = df[[asset]]       
exog     = sm.add_constant(raw_exog)
data = pd.concat([endog, raw_exog], axis=1).dropna()

# 2. Fit the SARIMAX (using whatever orders you’ve already tuned)
mod = sm.tsa.SARIMAX(
    endog,
    exog=raw_exog,
    order=(ar_orders[macro]['p'],
           ar_orders[macro]['d'],
           ar_orders[macro]['q']),
    seasonal_order=(ar_orders[macro]['P'],
                    ar_orders[macro]['D'],
                    ar_orders[macro]['Q'],
                    12),
    enforce_stationarity=False,
    enforce_invertibility=False
)
res = mod.fit(disp=False)

# 3. Extract residuals
resid = res.resid

# 4. Run Shapiro–Wilk
stat, pval = shapiro(resid)

print(f"Shapiro–Wilk W = {stat:.4f}, p-value = {pval:.4f}")
if pval < 0.05:
    print("→ Reject normality at α=0.05")
else:
    print("→ Cannot reject normality at α=0.05")


Shapiro–Wilk W = 0.2698, p-value = 0.0000
→ Reject normality at α=0.05
