In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import itertools
import matplotlib.pyplot as plt
import sklearn
from statsmodels.tsa.stattools import adfuller
from arch import arch_model
from sklearn.preprocessing import StandardScaler
from arch import arch_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# Load data
df = pd.read_excel('data/Monthly Mastersheet with Original Data.xlsx')
print(df.head())
print(df.columns.tolist())

# Ensure date is datetime and set index
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df.index = pd.date_range(start=df.index[0], periods=len(df), freq='MS')
df.columns = df.columns.str.strip()

       Month  Bitcoin  Litecoin       XRP  Ethereum  Dogecoin  Cardano  \
0 2017-09-01 -1.39892  -1.02849 -0.898590  -1.32094  -1.40502      NaN   
1 2017-10-01 -1.33954  -0.77151 -1.145490  -0.83798  -0.46490      NaN   
2 2017-11-01 -1.62299  -1.44830 -1.192080  -1.83787  -1.37105      NaN   
3 2017-12-01 -1.26767  -1.11575 -1.254630  -1.31040  -0.70209      NaN   
4 2018-01-01 -0.89251  -0.24164  0.038975  -0.81128  -0.33444      NaN   

    Tether  USD Coin  LFPR  ...     r      M1        IM        EX     CC  \
0 -3.82830       NaN  63.1  ...  1.15  3535.6  2916.022  2535.501   95.1   
1 -3.73861       NaN  62.7  ...  1.15  3615.0  3034.004  2479.107  100.7   
2 -4.01951       NaN  62.7  ...  1.16  3587.6  3034.004  2479.107   98.5   
3 -2.55687       NaN  62.7  ...  1.30  3630.5  3034.004  2479.107   95.9   
4 -2.75193       NaN  62.7  ...  1.41  3673.4  3093.299  2517.268   95.7   

        GDP  PC1_crypto  PC2_crypto    PC1_macro    PC2_macro  
0  19743.84         NaN         Na

In [14]:
crypto_assets = ['Bitcoin', 'Litecoin', 'XRP', 'Ethereum', 'Dogecoin', 'Cardano', 'USD Coin', 'Tether']
lags = range(7)
macro_vars = ['LFPR']
cutoff_date = pd.to_datetime('2024-01-01')

# Step 1: Generate lagged features for all crypto variables
df_crypto_lagged = pd.DataFrame(index=df.index)
for coin in crypto_assets:
    for lag in lags:
        col_name = f'{coin}_lag{lag}'
        df_crypto_lagged[col_name] = df[coin].shift(lag)

# Step 2: Loop through each macro target
for macro in macro_vars:
    df_macro = df[[macro]].copy()
    df_full = pd.concat([df_macro, df_crypto_lagged], axis=1).dropna()

    # Train-test split
    df_train = df_full[df_full.index < cutoff_date]
    df_test = df_full[df_full.index >= cutoff_date]

    if df_train.empty or df_test.empty:
        print(f"⚠️ Not enough data for macro: {macro}")
        continue

    y_train = df_train[[macro]]
    X_train = df_train.drop(columns=[macro])
    y_test = df_test[[macro]]
    X_test = df_test.drop(columns=[macro])

    # Scaling
    scaler_y = StandardScaler()
    scaler_x = StandardScaler()
    y_train_scaled = scaler_y.fit_transform(y_train).flatten()
    y_test_scaled = scaler_y.transform(y_test).flatten()
    x_train_scaled = scaler_x.fit_transform(X_train)
    x_test_scaled = scaler_x.transform(X_test)

    # Step 3: Fit full ARX-GARCH model with all 56 lagged features
    model = arch_model(
        y_train_scaled, mean='ARX', lags=1, vol='GARCH', p=1, q=1,
        x=x_train_scaled, dist='normal'
    )
    res = model.fit(disp='off')

    # Step 4: Extract p-values and identify significant lags
    pvals = res.pvalues
    exog_pvals = {
        X_train.columns[i]: p for i, (k, p) in enumerate(pvals.items()) if k.startswith('x')
    }
    significant_lags = {
        var: p for var, p in exog_pvals.items() if p < 0.05
    }

    # OPTIONAL: Group by coin and print most significant lag
    grouped = {}
    for var, p in significant_lags.items():
        coin = var.rsplit('_lag', 1)[0]
        if coin not in grouped or p < grouped[coin][1]:
            grouped[coin] = (var, p)

    print(f"\n=== Significant Lags for {macro} (p < 0.05) ===")
    for coin in crypto_assets:
        if coin in grouped:
            var, p = grouped[coin]
            print(f"{coin}: Best lag = {var.split('_lag')[-1]} (p = {p:.4f})")
        else:
            print(f"{coin}: No significant lags")


ValueError: Insufficient data, 58 regressors, 53 data points available

# fixed GARCH (1,1)

In [12]:
from itertools import combinations
from arch import arch_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from IPython.display import display, HTML

def safe_mape(actual, predicted):
    actual, predicted = np.array(actual), np.array(predicted)
    mask = actual != 0
    return np.mean(np.abs((actual[mask] - predicted[mask]) / actual[mask])) * 100 if np.any(mask) else np.nan

def rolling_forecast_ar(y_train_scaled, y_test_scaled, p, q, l):
    history = list(y_train_scaled)
    preds = []
    last_model = None
    for t in range(len(y_test_scaled)):
        model = arch_model(history, mean='AR', lags=l, vol='GARCH', p=p, q=q, dist='normal', rescale=False)
        res = model.fit(disp='off')
        forecast = res.forecast(horizon=1, method='analytic')
        pred = forecast.mean.values[-1, 0]
        preds.append(pred)
        history.append(y_test_scaled[t])
        last_model = res
    return np.array(preds), last_model

def rolling_forecast_arx(y_train_scaled, x_train_scaled, y_test_scaled, x_test_scaled, p, q, l):
    history_y = list(y_train_scaled)
    history_x = [x.reshape(1, -1) for x in x_train_scaled]
    preds = []
    models = []
    for t in range(len(y_test_scaled)):
        x_arr = np.vstack(history_x)
        model = arch_model(history_y, mean='ARX', lags=l, vol='GARCH', p=p, q=q, x=x_arr, dist='normal', rescale=False)
        res = model.fit(disp='off')
        const = res.params.get('Const', 0)
        phi = res.params.get('y[1]', 0)
        betas = np.array([res.params.get(f'x{i}', 0) for i in range(x_test_scaled.shape[1])])
        y_t = history_y[-1]
        x_t1 = np.asarray(x_test_scaled[t])
        pred = const + phi * y_t + np.dot(betas, x_t1)
        preds.append(pred)
        history_y.append(y_test_scaled[t])
        history_x.append(x_t1.reshape(1, -1))
        models.append(res)
    return np.array(preds), models[-1]  # Return last fitted model for summary

# === CONFIG ===
macro_vars = ['LFPR']
crypto_assets = ['Bitcoin', 'Litecoin', 'XRP', 'Ethereum', 'Dogecoin', 'Cardano', 'USD Coin', 'Tether']
cutoff_date = pd.to_datetime('2024-01-01')

all_results = []
best_results = []

for macro in macro_vars:
    for combo_size in range(1, 9):  # up to 8 crypto assets
        for crypto_combo in combinations(crypto_assets, combo_size):
            p, q = 1, 1
            for lag in range(0, 3):
                cols = [macro] + list(crypto_combo)
                df_temp = df[cols].copy()
                df_temp[macro] = df_temp[macro].shift(lag)
                df_temp.dropna(inplace=True)

                if len(df_temp) < 30 or df_temp[macro].var() == 0:
                    continue

                df_train = df_temp[df_temp.index < cutoff_date]
                df_test = df_temp[df_temp.index >= cutoff_date]

                if len(df_test) == 0 or len(df_train) < 10:
                    continue

                y_col = macro
                x_cols = list(crypto_combo)

                scaler_y = StandardScaler()
                scaler_x = StandardScaler()
                y_train_scaled = scaler_y.fit_transform(df_train[[y_col]]).flatten()
                y_test_scaled = scaler_y.transform(df_test[[y_col]]).flatten()
                x_train_scaled = scaler_x.fit_transform(df_train[x_cols])
                x_test_scaled = scaler_x.transform(df_test[x_cols])

                try:
                    pred_ar_scaled, _ = rolling_forecast_ar(y_train_scaled, y_test_scaled, p, q, 1)
                    pred_ar = scaler_y.inverse_transform(pred_ar_scaled.reshape(-1, 1)).flatten()
                    y_actual = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()
                    mape_ar = safe_mape(y_actual, pred_ar)

                    if mape_ar > 100:
                        continue

                    pred_arx_scaled, model_arx = rolling_forecast_arx(
                        y_train_scaled, x_train_scaled, y_test_scaled, x_test_scaled, p, q, 1
                    )
                    pred_arx = scaler_y.inverse_transform(pred_arx_scaled.reshape(-1, 1)).flatten()

                    mse_ar = mean_squared_error(y_actual, pred_ar)
                    mse_arx = mean_squared_error(y_actual, pred_arx)
                    r2_ar = r2_score(y_actual, pred_ar)
                    r2_arx = r2_score(y_actual, pred_arx)
                    mape_arx = safe_mape(y_actual, pred_arx)
                    mape_change = ((mape_arx - mape_ar) / mape_ar) * 100 if mape_ar != 0 else np.nan

                    result = {
                        'Macro': macro,
                        'Crypto_Combo': ', '.join(crypto_combo),
                        'p': p,
                        'q': q,
                        'Crypto Lag': lag,
                        'MAPE_AR': mape_ar,
                        'MAPE_ARX': mape_arx,
                        'R2_AR': r2_ar,
                        'R2_ARX': r2_arx,
                        'MAPE_Improvement(%)': mape_change,
                        'Model_ARX': model_arx
                    }
                    all_results.append(result)

                    existing = next((r for r in best_results if r['Macro'] == macro), None)
                    if existing is None or mape_change < existing['MAPE_Improvement(%)']:
                        best_results = [r for r in best_results if r['Macro'] != macro]
                        best_results.append(result)

                except Exception as e:
                    print(f"⚠️ Failed for {macro} with {crypto_combo} p={p}, q={q}, lag={lag}: {e}")

# === DISPLAY RESULTS ===
all_results_df = pd.DataFrame(all_results).drop(columns=['Model_ARX'])
best_results_df = pd.DataFrame(best_results).drop(columns=['Model_ARX'])

sorted_best = best_results_df.sort_values('MAPE_Improvement(%)')
sorted_all = all_results_df.sort_values(['Macro', 'MAPE_Improvement(%)'])

display(sorted_best)

html_all = sorted_all.to_html(index=False)
display(HTML(f'''
    <div style="max-height:500px; overflow:auto; border:1px solid #ccc; padding:10px; font-size:90%">
        {html_all}
    </div>
'''))

# === SHOW MODEL SUMMARIES FOR BEST FEW MODELS ===
print("\n\n=== ARX-GARCH Model Summaries for Top 3 Improvements ===\n")
top_models = sorted(best_results, key=lambda x: x['MAPE_Improvement(%)'])[:3]
for i, result in enumerate(top_models, 1):
    print(f"--- Model {i} ---")
    print(f"Macro Target: {result['Macro']}")
    print(f"Crypto Combo: {result['Crypto_Combo']}")
    print(f"MAPE_AR: {result['MAPE_AR']:.2f} | MAPE_ARX: {result['MAPE_ARX']:.2f}")
    print(f"R2_AR: {result['R2_AR']:.3f} | R2_ARX: {result['R2_ARX']:.3f}")
    print(result['Model_ARX'].summary())
    print("\n")


Unnamed: 0,Macro,Crypto_Combo,p,q,Crypto Lag,MAPE_AR,MAPE_ARX,R2_AR,R2_ARX,MAPE_Improvement(%)
0,LFPR,"Litecoin, Ethereum, USD Coin",1,1,2,0.142639,0.106537,-0.384333,-0.221989,-25.310236


Macro,Crypto_Combo,p,q,Crypto Lag,MAPE_AR,MAPE_ARX,R2_AR,R2_ARX,MAPE_Improvement(%)
LFPR,"Litecoin, Ethereum, USD Coin",1,1,2,0.142639,0.106537,-0.384333,-0.221989,-25.310236
LFPR,"Dogecoin, USD Coin",1,1,0,0.114846,0.086417,-0.134335,0.100381,-24.754035
LFPR,"Bitcoin, XRP, Dogecoin, USD Coin",1,1,2,0.142639,0.109738,-0.384333,-0.171662,-23.066223
LFPR,"Litecoin, XRP, USD Coin",1,1,2,0.142639,0.110885,-0.384333,-0.369454,-22.262249
LFPR,"Bitcoin, Dogecoin, USD Coin",1,1,0,0.114846,0.092922,-0.134335,-0.022168,-19.089865
LFPR,"Litecoin, XRP, Ethereum, Dogecoin, USD Coin, Tether",1,1,2,0.142639,0.115605,-0.384333,-0.431933,-18.952709
LFPR,"Litecoin, USD Coin",1,1,2,0.142639,0.116111,-0.384333,-0.427741,-18.59829
LFPR,"XRP, Cardano, USD Coin",1,1,2,0.142639,0.117767,-0.384333,-0.425187,-17.437347
LFPR,"Litecoin, XRP, Ethereum, USD Coin, Tether",1,1,2,0.142639,0.120094,-0.384333,-0.63312,-15.805873
LFPR,"Bitcoin, XRP, Ethereum, Dogecoin, USD Coin, Tether",1,1,2,0.142639,0.120117,-0.384333,-0.597284,-15.790006




=== ARX-GARCH Model Summaries for Top 3 Improvements ===

--- Model 1 ---
Macro Target: LFPR
Crypto Combo: Litecoin, Ethereum, USD Coin
MAPE_AR: 0.14 | MAPE_ARX: 0.11
R2_AR: -0.384 | R2_ARX: -0.222
                          AR-X - GARCH Model Results                          
Dep. Variable:                      y   R-squared:                       0.686
Mean Model:                      AR-X   Adj. R-squared:                  0.667
Vol Model:                      GARCH   Log-Likelihood:              -0.755776
Distribution:                  Normal   AIC:                           17.5116
Method:            Maximum Likelihood   BIC:                           35.6130
                                        No. Observations:                   71
Date:                Thu, Jul 03 2025   Df Residuals:                       66
Time:                        10:07:12   Df Model:                            5
                                  Mean Model                                  
          