In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
os.makedirs("outputs", exist_ok=True)
plt.rcParams["figure.dpi"] = 150
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
CRYPTO_LIST = ["BTC", "ETH", "LTC", "BCH"]
MARKET_CHOICES = [
    ["FIAT"],
    ["EQUITY"],
    ["GOLD"],
    ["ENERGY"],
    ["FIAT", "EQUITY"],
    ["FIAT", "GOLD"],
    ["FIAT", "EQUITY", "GOLD", "ENERGY"],
]
BASELINE_WINDOW = 12
BASELINE_EPOCHS = 25
BASELINE_HIDDEN = 4
BASELINE_LR = 0.001
BASELINE_ACT = "tanh"

In [4]:
def load_and_prepare_prices():
    base_path = "data"
    files = {
        "BTC": "CBBTCUSD.csv",
        "ETH": "CBETHUSD.csv",
        "LTC": "CBLTCUSD.csv",
        "BCH": "CBBCHUSD.csv",
        "FIAT": "DTWEXBGS.csv",
        "EQUITY": "CRSP.csv",
        "GOLD": "NASDAQQGLDI.csv",
        "ENERGY": "VDE.csv",
    }

    dfs = {}

    for name, filename in files.items():
        path = os.path.join(base_path, filename)
        if filename == "CRSP.csv":
            df = pd.read_csv(path, low_memory=False,
                             usecols=["DATE", "vwretd"])
            df.columns = ["date", "vwretd"]
            df["date"] = pd.to_datetime(
                df["date"], format="%Y%m%d", errors="coerce")
            df = df.dropna(subset=["date"])
            df = df.groupby("date")["vwretd"].first().reset_index()
            df = df.sort_values("date")
            date_col = "date"
            price_col = "vwretd"
        else:
            df = pd.read_csv(path)
            df.columns = [c.lower().strip() for c in df.columns]

            date_col = None
            for col in df.columns:
                if "date" in col:
                    date_col = col
                    break
            if date_col is None:
                raise ValueError(f"No date column found in {filename}")

            df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
            df = df.dropna(subset=[date_col])
            df = df.sort_values(date_col)

        if filename != "CRSP.csv":
            num_cols = [c for c in df.columns if c != date_col]
            if len(num_cols) == 0:
                raise ValueError(f"No price/value column found in {filename}")

            if filename == "VDE.csv":
                close_cols = [c for c in num_cols if "close" in c.lower()]
                if close_cols:
                    price_col = close_cols[0]
                else:
                    price_col = num_cols[0]
                    print(
                        f"Warning: Close column not found in VDE.csv, using {price_col}"
                    )
            else:
                price_col = num_cols[0]

        df[price_col] = pd.to_numeric(df[price_col], errors="coerce")
        df = df.dropna(subset=[price_col])

        if len(df) == 0:
            print(f"Warning: {filename} has no valid data after cleaning")
            continue

        df = df.set_index(date_col)
        df = df.resample("M").last()
        df[price_col] = pd.to_numeric(df[price_col], errors="coerce")
        df = df.dropna(subset=[price_col])

        if len(df) == 0:
            print(f"Warning: {filename} has no data after resampling")
            continue

        if filename == "CRSP.csv":
            df[name] = np.log(1 + df[price_col])
        else:
            df[name] = np.log(df[price_col]).diff()
            df = df.dropna(subset=[name])

        dfs[name] = df[[name]]

    data = pd.concat(dfs.values(), axis=1)
    data = data.dropna(how="all")
    data = data.dropna()
    data.index.name = "Date"

    return data


def split_sets(dates, X, y, ids):
    mask_train = (dates >= "2015-01-01") & (dates <= "2019-12-31")
    mask_val = (dates >= "2020-01-01") & (dates <= "2021-12-31")
    mask_test = (dates >= "2022-01-01") & (dates <= "2025-10-31")

    def subset(mask):
        idx = np.array(mask)
        return X[idx], y[idx], [ids[i] for i, v in enumerate(mask) if v]

    return subset(mask_train), subset(mask_val), subset(mask_test)


def p(msg):
    print(msg, flush=True)


def make_panel(data, market_assets, w):
    required_cols = ["BTC", "ETH", "LTC", "BCH"] + market_assets
    missing_cols = [col for col in required_cols if col not in data.columns]
    if missing_cols:
        raise ValueError(
            f"Missing columns in data: {missing_cols}. Available columns: {list(data.columns)}"
        )

    if len(data) < w + 1:
        raise ValueError(
            f"Data has only {len(data)} rows, but need at least {w + 1} rows for window size {w}"
        )

    X, y, ids = [], [], []
    for crypto in ["BTC", "ETH", "LTC", "BCH"]:
        for t in range(w, len(data) - 1):
            past_crypto = data[crypto].iloc[t - w: t].values
            past_mkts = data[market_assets].iloc[t - w: t].values.flatten()
            X.append(np.concatenate([past_crypto, past_mkts]))
            y.append(data[crypto].iloc[t + 1])
            ids.append((data.index[t + 1], crypto))
    return np.array(X), np.array(y), ids


def build_panel_for_market(data, market_assets, w):
    X, y, ids = make_panel(data, market_assets, w)

    dates = pd.to_datetime([d for d, _ in ids])
    (Xtr, ytr, ids_tr), (Xval, yval, ids_val), (Xte, yte, ids_te) = split_sets(
        dates, X, y, ids
    )
    return {
        "market_assets": market_assets,
        "w": w,
        "Xtr": Xtr,
        "ytr": ytr,
        "ids_tr": ids_tr,
        "Xval": Xval,
        "yval": yval,
        "ids_val": ids_val,
        "Xte": Xte,
        "yte": yte,
        "ids_te": ids_te,
    }


def predict_betas_df(model, X, ids, asset_label):
    if len(X) == 0:
        return pd.DataFrame(columns=["Date", "Crypto", "Asset", "Beta"])
    model.eval()
    with torch.no_grad():
        X_t = torch.tensor(X, dtype=torch.float32)
        betas = model(X_t).cpu().numpy().ravel()
    out = pd.DataFrame(
        {
            "Date": pd.to_datetime([d for d, _ in ids]),
            "Crypto": [c for _, c in ids],
            "Asset": asset_label,
            "Beta": betas,
        }
    )
    return out

In [5]:
class NeuralBetaNet(nn.Module):
    def __init__(self, in_dim, hidden, activation):
        super().__init__()
        act = {
            "linear": nn.Identity(),
            "sigmoid": nn.Sigmoid(),
            "tanh": nn.Tanh(),
            "relu": nn.ReLU(),
        }[activation]
        self.model = nn.Sequential(
            nn.Linear(in_dim, hidden), act, nn.Linear(hidden, 1))

    def forward(self, x):
        return self.model(x)


def rmse_loss(pred, target):
    return torch.sqrt(torch.mean((pred - target) ** 2))

In [6]:
def train_nn(Xtr, ytr, Xval, yval, hidden, lr, act, epochs=10):
    if len(Xtr) == 0:
        raise ValueError(
            "Training set is empty. Check data loading and panel construction."
        )
    if len(Xtr.shape) < 2:
        raise ValueError(
            f"Training set has wrong shape: {Xtr.shape}. Expected 2D array."
        )
    in_dim = Xtr.shape[1]
    model = NeuralBetaNet(in_dim, hidden, act).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_hist, val_hist = [], []

    Xtr_t = torch.tensor(Xtr, dtype=torch.float32).to(device)
    ytr_t = torch.tensor(ytr, dtype=torch.float32).unsqueeze(1).to(device)
    Xval_t = torch.tensor(Xval, dtype=torch.float32).to(device)
    yval_t = torch.tensor(yval, dtype=torch.float32).unsqueeze(1).to(device)

    for ep in range(epochs):
        model.train()
        opt.zero_grad()
        out = model(Xtr_t)
        loss = rmse_loss(out, ytr_t)
        loss.backward()
        opt.step()

        model.eval()
        with torch.no_grad():
            val_out = model(Xval_t)
            val_loss = rmse_loss(val_out, yval_t)
        loss_hist.append(loss.item())
        val_hist.append(val_loss.item())
        print(
            f"Epoch {ep + 1:03d} | train RMSE {loss.item():.6f} | val RMSE {val_loss.item():.6f}"
        )
    return model, loss_hist, val_hist


def grid_search(Xtr, ytr, Xval, yval, market_assets):
    params = {
        "hidden": [4, 8, 16],
        "lr": [0.001, 0.01, 0.1],
        "act": ["linear", "sigmoid", "tanh", "relu"],
        "w": [12, 24, 36],
    }
    results = []
    for h in params["hidden"]:
        for lr in params["lr"]:
            for act in params["act"]:
                for w in params["w"]:
                    try:
                        model, loss_hist, val_hist = train_nn(
                            Xtr, ytr, Xval, yval, hidden=h, lr=lr, act=act, epochs=10
                        )
                        val_rmse = val_hist[-1]
                        results.append(
                            [market_assets, w, h, lr, act, val_rmse])
                    except Exception as e:
                        print(f"Error {market_assets}, w={w}, act={act}: {e}")
    df = pd.DataFrame(
        results,
        columns=["MarketAssets", "Window", "Hidden",
                 "LR", "Activation", "Val_RMSE"],
    )
    return df

In [7]:
def plot_learning_curve(loss_hist, val_hist, label, out_path):
    plt.figure(figsize=(6, 4))
    plt.plot(loss_hist, label="Train RMSE")
    plt.plot(val_hist, label="Validation RMSE")
    plt.xlabel("Epoch")
    plt.ylabel("RMSE")
    plt.title(f"Learning Curve – {label}")
    plt.legend()
    plt.grid(True)
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.close()

In [8]:
def compute_descriptive_stats(betas):
    desc = {
        "N": len(betas),
        "Mean": np.mean(betas),
        "Median": np.median(betas),
        "Std": np.std(betas, ddof=1),
        "Variance": np.var(betas, ddof=1),
        "Skew": skew(betas),
        "Kurtosis": kurtosis(betas, fisher=False),
        "Min": np.min(betas),
        "P1": np.percentile(betas, 1),
        "P5": np.percentile(betas, 5),
        "P25": np.percentile(betas, 25),
        "P50": np.percentile(betas, 50),
        "P75": np.percentile(betas, 75),
        "P95": np.percentile(betas, 95),
        "P99": np.percentile(betas, 99),
        "Max": np.max(betas),
    }
    return desc

In [9]:
def plot_annual_beta_dynamics(df_betas, out_path):
    df_betas["Date"] = pd.to_datetime(df_betas["Date"], errors="coerce")
    df_betas["year"] = df_betas["Date"].dt.year
    df_betas["month"] = df_betas["Date"].dt.month

    unique_years = df_betas["year"].dropna().unique()
    if len(unique_years) == 1:
        agg = (
            df_betas.groupby(["month", "Crypto"])["Beta"]
            .agg(["mean", "std"])
            .reset_index()
        )

        plt.figure(figsize=(7, 4))
        for c in agg["Crypto"].unique():
            subset = agg[agg["Crypto"] == c]
            plt.plot(subset["month"], subset["mean"], label=c, marker="o")
        plt.xlabel("Month")
        plt.ylabel("Mean β")
        plt.title(f"Monthly Mean Neural β Dynamics ({unique_years[0]})")
        plt.xticks(range(1, 13))
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        mean_path = out_path.replace(".png", "_mean.png")
        plt.savefig(mean_path, dpi=300, bbox_inches="tight")
        plt.close()

        plt.figure(figsize=(7, 4))
        for c in agg["Crypto"].unique():
            subset = agg[agg["Crypto"] == c]
            plt.plot(subset["month"], subset["std"], label=c, marker="s")
        plt.xlabel("Month")
        plt.ylabel("Std β")
        plt.title(f"Monthly Std Neural β Dynamics ({unique_years[0]})")
        plt.xticks(range(1, 13))
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        std_path = out_path.replace(".png", "_std.png")
        plt.savefig(std_path, dpi=300, bbox_inches="tight")
        plt.close()

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 8))
        for c in agg["Crypto"].unique():
            subset = agg[agg["Crypto"] == c]
            ax1.plot(subset["month"], subset["mean"], label=c, marker="o")
            ax2.plot(subset["month"], subset["std"], label=c, marker="s")
        ax1.set_xlabel("Month")
        ax1.set_ylabel("Mean β")
        ax1.set_title(f"Monthly Mean Neural β Dynamics ({unique_years[0]})")
        ax1.set_xticks(range(1, 13))
        ax1.legend()
        ax1.grid(True)
        ax2.set_xlabel("Month")
        ax2.set_ylabel("Std β")
        ax2.set_title(f"Monthly Std Neural β Dynamics ({unique_years[0]})")
        ax2.set_xticks(range(1, 13))
        ax2.legend()
        ax2.grid(True)
        plt.tight_layout()
        plt.savefig(out_path, dpi=300, bbox_inches="tight")
        plt.close()
    else:
        agg = (
            df_betas.groupby(["year", "Crypto"])["Beta"]
            .agg(["mean", "std"])
            .reset_index()
        )

        plt.figure(figsize=(7, 4))
        for c in agg["Crypto"].unique():
            subset = agg[agg["Crypto"] == c]
            plt.plot(subset["year"], subset["mean"], label=c, marker="o")
        plt.xlabel("Year")
        plt.ylabel("Mean β")
        plt.title("Annual Mean Neural β Dynamics")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        mean_path = out_path.replace(".png", "_mean.png")
        plt.savefig(mean_path, dpi=300, bbox_inches="tight")
        plt.close()

        plt.figure(figsize=(7, 4))
        for c in agg["Crypto"].unique():
            subset = agg[agg["Crypto"] == c]
            plt.plot(subset["year"], subset["std"], label=c, marker="s")
        plt.xlabel("Year")
        plt.ylabel("Std β")
        plt.title("Annual Std Neural β Dynamics")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        std_path = out_path.replace(".png", "_std.png")
        plt.savefig(std_path, dpi=300, bbox_inches="tight")
        plt.close()

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 8))
        for c in agg["Crypto"].unique():
            subset = agg[agg["Crypto"] == c]
            ax1.plot(subset["year"], subset["mean"], label=c, marker="o")
            ax2.plot(subset["year"], subset["std"], label=c, marker="s")
        ax1.set_xlabel("Year")
        ax1.set_ylabel("Mean β")
        ax1.set_title("Annual Mean Neural β Dynamics")
        ax1.legend()
        ax1.grid(True)
        ax2.set_xlabel("Year")
        ax2.set_ylabel("Std β")
        ax2.set_title("Annual Std Neural β Dynamics")
        ax2.legend()
        ax2.grid(True)
        plt.tight_layout()
        plt.savefig(out_path, dpi=300, bbox_inches="tight")
        plt.close()

In [10]:
def add_ff_factors(data, ff_df):
    ff_df.columns = [c.strip() for c in ff_df.columns]

    if "date" not in [c.lower() for c in ff_df.columns]:
        ff_df.rename(columns={ff_df.columns[0]: "date"}, inplace=True)

    ff_df = ff_df[ff_df["date"].astype(str).str.match(r"^\d{6}$", na=False)]

    ff_df["date"] = pd.to_datetime(
        ff_df["date"].astype(str) + "01", format="%Y%m%d"
    ) + pd.offsets.MonthEnd(0)

    for col in ff_df.columns:
        if col != "date":
            ff_df[col] = pd.to_numeric(ff_df[col], errors="coerce") / 100.0

    ff_df = ff_df.set_index("date").sort_index()

    merged = data.join(ff_df, how="left").dropna()
    return merged

In [11]:
def sort_and_portfolio(df_betas, returns_df, value_weighted=False):
    if value_weighted:
        crypto_mean_abs_returns = {}
        for crypto in ["BTC", "ETH", "LTC", "BCH"]:
            if crypto in returns_df.columns:
                mean_abs_ret = abs(returns_df[crypto]).mean()
                crypto_mean_abs_returns[crypto] = mean_abs_ret
        total_weight = sum(crypto_mean_abs_returns.values())
        if total_weight > 0:
            crypto_weights_global = {
                k: v / total_weight for k, v in crypto_mean_abs_returns.items()
            }
        else:
            crypto_weights_global = {
                c: 1.0 / 4 for c in ["BTC", "ETH", "LTC", "BCH"]}
    else:
        crypto_weights_global = {
            c: 1.0 / 4 for c in ["BTC", "ETH", "LTC", "BCH"]}

    result = []
    for asset in df_betas["Asset"].unique():
        temp = df_betas[df_betas["Asset"] == asset]
        for date, grp in temp.groupby("Date"):
            ranked = grp.sort_values("Beta")
            ranked["Quartile"] = range(1, len(ranked) + 1)

            quartile_data = {}
            for q, sub in ranked.groupby("Quartile"):
                mean_b = sub["Beta"].mean()

                rets = []
                weights = []
                for c in sub["Crypto"]:
                    if (
                        c in returns_df.columns
                        and asset in returns_df.columns
                        and date in returns_df.index
                    ):
                        excess_ret = (
                            returns_df[c].loc[date] -
                            returns_df[asset].loc[date]
                        )
                        rets.append(excess_ret)
                        weights.append(
                            crypto_weights_global.get(c, 1.0 / len(sub)))

                if rets:
                    if value_weighted:
                        mean_ex = np.average(
                            rets, weights=weights[: len(rets)])
                    else:
                        mean_ex = np.nanmean(rets)
                    result.append([date, asset, q, mean_b, mean_ex])
                    quartile_data[q] = {"beta": mean_b, "ret": mean_ex}

            if 1 in quartile_data and 4 in quartile_data:
                q1_beta = quartile_data[1]["beta"]
                q4_beta = quartile_data[4]["beta"]
                q1_ret = quartile_data[1]["ret"]
                q4_ret = quartile_data[4]["ret"]
                beta_diff = q4_beta - q1_beta
                ret_diff = q4_ret - q1_ret
                result.append([date, asset, "Q4-Q1", beta_diff, ret_diff])

    return pd.DataFrame(
        result, columns=["Date", "Asset",
                         "Quartile", "MeanBeta", "ExcessReturn"]
    )

### Step 0 – Sampling

In [12]:
data = load_and_prepare_prices()
print(f"Data shape: {data.shape}")
print(f"Data columns: {data.columns.tolist()}")
print(f"Data date range: {data.index.min()} to {data.index.max()}")
print(data.head())
if len(data) == 0:
    print("WARNING: Data is empty! Check data files and date alignment.")
else:
    data.to_csv("outputs/data_monthly_log_returns.csv")
    print("Saved data successfully")

Data shape: (51, 8)
Data columns: ['BTC', 'ETH', 'LTC', 'BCH', 'FIAT', 'EQUITY', 'GOLD', 'ENERGY']
Data date range: 2018-01-31 00:00:00 to 2023-11-30 00:00:00
                 BTC       ETH       LTC       BCH      FIAT    EQUITY  \
Date                                                                     
2018-01-31 -0.316698  0.399245 -0.338720 -0.474456 -0.030520  0.049340   
2018-02-28  0.020315 -0.262385  0.221883 -0.208192  0.015064 -0.040254   
2018-04-30  0.288312  0.529274  0.243438  0.681713  0.017133  0.004668   
2018-05-31 -0.211054 -0.148979 -0.225709 -0.305480  0.023970  0.025810   
2018-07-31  0.191088 -0.047297 -0.026645  0.036910 -0.004678  0.031085   

                GOLD    ENERGY  
Date                            
2018-01-31  0.023697  0.031108  
2018-02-28 -0.018993 -0.113646  
2018-04-30 -0.010383  0.092493  
2018-05-31 -0.016046  0.035246  
2018-07-31 -0.025448  0.010855  
Saved data successfully


### Step 1 – Neural‑network architecture

In [13]:
panel_fiat = build_panel_for_market(data, ["FIAT"], BASELINE_WINDOW)
model1, loss_hist1, val_hist1 = train_nn(
    panel_fiat["Xtr"],
    panel_fiat["ytr"],
    panel_fiat["Xval"],
    panel_fiat["yval"],
    hidden=BASELINE_HIDDEN,
    lr=BASELINE_LR,
    act=BASELINE_ACT,
    epochs=BASELINE_EPOCHS,
)
torch.save(model1.state_dict(), "outputs/model_step1_fiat.pt")
plot_learning_curve(
    loss_hist1,
    val_hist1,
    "FIAT w=12 baseline",
    "outputs/step1_learning_curve_fiat_w12.png",
)
p("Saved baseline model and learning curve")

Epoch 001 | train RMSE 0.138542 | val RMSE 0.332085
Epoch 002 | train RMSE 0.136611 | val RMSE 0.332702
Epoch 003 | train RMSE 0.134753 | val RMSE 0.333349
Epoch 004 | train RMSE 0.132968 | val RMSE 0.334026
Epoch 005 | train RMSE 0.131254 | val RMSE 0.334725
Epoch 006 | train RMSE 0.129605 | val RMSE 0.335435
Epoch 007 | train RMSE 0.128016 | val RMSE 0.336147
Epoch 008 | train RMSE 0.126485 | val RMSE 0.336853
Epoch 009 | train RMSE 0.125009 | val RMSE 0.337547
Epoch 010 | train RMSE 0.123588 | val RMSE 0.338223
Epoch 011 | train RMSE 0.122219 | val RMSE 0.338873
Epoch 012 | train RMSE 0.120901 | val RMSE 0.339490
Epoch 013 | train RMSE 0.119631 | val RMSE 0.340070
Epoch 014 | train RMSE 0.118406 | val RMSE 0.340606
Epoch 015 | train RMSE 0.117224 | val RMSE 0.341096
Epoch 016 | train RMSE 0.116081 | val RMSE 0.341536
Epoch 017 | train RMSE 0.114975 | val RMSE 0.341927
Epoch 018 | train RMSE 0.113905 | val RMSE 0.342271
Epoch 019 | train RMSE 0.112868 | val RMSE 0.342570
Epoch 020 | 

### Step 2 – Hyper‑parameter tuning

In [14]:
grid_out_all = []
best_by_market = []


hidden_list = [4, 8, 16]
lr_list = [0.001, 0.01]
act_list = ["linear", "tanh", "relu"]
w_list = [12]

for market_assets in MARKET_CHOICES:
    p(f"Tuning market={'+'.join(market_assets)}")
    pnl = build_panel_for_market(data, market_assets, BASELINE_WINDOW)
    if len(pnl["Xtr"]) == 0 or len(pnl["Xval"]) == 0:
        p("Skipping due to empty train or val")
        continue
    results = []
    for h in hidden_list:
        for lr in lr_list:
            for act in act_list:
                try:
                    model_tmp, tr_h, va_h = train_nn(
                        pnl["Xtr"],
                        pnl["ytr"],
                        pnl["Xval"],
                        pnl["yval"],
                        hidden=h,
                        lr=lr,
                        act=act,
                        epochs=10,
                    )
                    val_rmse = va_h[-1]
                    results.append(
                        [
                            "+".join(market_assets),
                            BASELINE_WINDOW,
                            h,
                            lr,
                            act,
                            float(val_rmse),
                        ]
                    )
                    p(f"  h={h} lr={lr} act={act} -> val RMSE={val_rmse:.5f}")
                except Exception as e:
                    p(f"  error h={h} lr={lr} act={act}: {e}")
    if results:
        df_res = pd.DataFrame(
            results,
            columns=[
                "MarketAssets",
                "Window",
                "Hidden",
                "LR",
                "Activation",
                "Val_RMSE",
            ],
        )
        grid_out_all.append(df_res)

        best_row = df_res.sort_values(
            "Val_RMSE", ascending=True).iloc[0].to_dict()
        best_by_market.append(best_row)

if grid_out_all:
    df_all = pd.concat(grid_out_all, ignore_index=True)
    df_all.to_csv("outputs/step2_grid_results.csv", index=False)
    pd.DataFrame(best_by_market).to_csv(
        "outputs/step2_best_by_market.csv", index=False)
    p("Saved grid results and best per market CSVs")
else:
    p("No grid results created. Check data lengths or grid size.")


best_map = {}
if os.path.exists("outputs/step2_best_by_market.csv"):
    best_df = pd.read_csv("outputs/step2_best_by_market.csv")
    for _, r in best_df.iterrows():
        best_map[r["MarketAssets"]] = dict(
            w=int(r["Window"]),
            hidden=int(r["Hidden"]),
            lr=float(r["LR"]),
            act=str(r["Activation"]),
        )
else:
    for m in [
        "FIAT",
        "EQUITY",
        "GOLD",
        "ENERGY",
        "FIAT+EQUITY",
        "FIAT+GOLD",
        "FIAT+EQUITY+GOLD+ENERGY",
    ]:
        best_map[m] = dict(
            w=BASELINE_WINDOW, hidden=BASELINE_HIDDEN, lr=BASELINE_LR, act=BASELINE_ACT
        )

Tuning market=FIAT
Epoch 001 | train RMSE 0.194089 | val RMSE 0.423832
Epoch 002 | train RMSE 0.191897 | val RMSE 0.422628
Epoch 003 | train RMSE 0.189721 | val RMSE 0.421440
Epoch 004 | train RMSE 0.187562 | val RMSE 0.420260
Epoch 005 | train RMSE 0.185420 | val RMSE 0.419087
Epoch 006 | train RMSE 0.183296 | val RMSE 0.417921
Epoch 007 | train RMSE 0.181191 | val RMSE 0.416761
Epoch 008 | train RMSE 0.179105 | val RMSE 0.415608
Epoch 009 | train RMSE 0.177040 | val RMSE 0.414462
Epoch 010 | train RMSE 0.174997 | val RMSE 0.413323
  h=4 lr=0.001 act=linear -> val RMSE=0.41332
Epoch 001 | train RMSE 0.341638 | val RMSE 0.552121
Epoch 002 | train RMSE 0.339617 | val RMSE 0.550866
Epoch 003 | train RMSE 0.337588 | val RMSE 0.549608
Epoch 004 | train RMSE 0.335549 | val RMSE 0.548347
Epoch 005 | train RMSE 0.333502 | val RMSE 0.547083
Epoch 006 | train RMSE 0.331446 | val RMSE 0.545817
Epoch 007 | train RMSE 0.329381 | val RMSE 0.544548
Epoch 008 | train RMSE 0.327307 | val RMSE 0.543277

Epoch 001 | train RMSE 0.129347 | val RMSE 0.353665
Epoch 002 | train RMSE 0.128004 | val RMSE 0.352367
Epoch 003 | train RMSE 0.126690 | val RMSE 0.351086
Epoch 004 | train RMSE 0.125554 | val RMSE 0.349851
Epoch 005 | train RMSE 0.124461 | val RMSE 0.348649
Epoch 006 | train RMSE 0.123400 | val RMSE 0.347476
Epoch 007 | train RMSE 0.122371 | val RMSE 0.346325
Epoch 008 | train RMSE 0.121376 | val RMSE 0.345198
Epoch 009 | train RMSE 0.120416 | val RMSE 0.344094
Epoch 010 | train RMSE 0.119488 | val RMSE 0.343014
  h=4 lr=0.001 act=relu -> val RMSE=0.34301
Epoch 001 | train RMSE 0.143145 | val RMSE 0.314736
Epoch 002 | train RMSE 0.133481 | val RMSE 0.315607
Epoch 003 | train RMSE 0.125507 | val RMSE 0.313180
Epoch 004 | train RMSE 0.118353 | val RMSE 0.310952
Epoch 005 | train RMSE 0.112499 | val RMSE 0.310913
Epoch 006 | train RMSE 0.107665 | val RMSE 0.312882
Epoch 007 | train RMSE 0.103796 | val RMSE 0.315699
Epoch 008 | train RMSE 0.101187 | val RMSE 0.317895
Epoch 009 | train RM

### Step 3 – Learning curves

In [15]:
curves_index = []
for market_assets in MARKET_CHOICES:
    label = "+".join(market_assets)
    cfg = best_map.get(
        label,
        dict(
            w=BASELINE_WINDOW, hidden=BASELINE_HIDDEN, lr=BASELINE_LR, act=BASELINE_ACT
        ),
    )
    w = int(cfg["w"])
    pnl = build_panel_for_market(data, market_assets, w)
    if len(pnl["Xtr"]) == 0 or len(pnl["Xval"]) == 0:
        p(f"Skipping curves for {label} due to empty split")
        continue
    model_best, tr_hist, va_hist = train_nn(
        pnl["Xtr"],
        pnl["ytr"],
        pnl["Xval"],
        pnl["yval"],
        hidden=int(cfg["hidden"]),
        lr=float(cfg["lr"]),
        act=str(cfg["act"]),
        epochs=BASELINE_EPOCHS,
    )
    model_path = f"outputs/model_step3_{label.replace('+', '_')}.pt"
    torch.save(model_best.state_dict(), model_path)
    plot_path = f"outputs/step3_learning_curve_{label.replace('+', '_')}.png"
    plot_learning_curve(
        tr_hist, va_hist, f"{label} w={w} act={cfg['act']}", plot_path)
    curves_index.append(
        dict(market=label, model_path=model_path, curve_path=plot_path, cfg=cfg)
    )

with open("outputs/step3_curves_index.json", "w") as f:
    json.dump(curves_index, f, indent=2)

p("Saved learning curves")

Epoch 001 | train RMSE 0.383330 | val RMSE 0.328050
Epoch 002 | train RMSE 0.377260 | val RMSE 0.326259
Epoch 003 | train RMSE 0.371208 | val RMSE 0.324499
Epoch 004 | train RMSE 0.365174 | val RMSE 0.322771
Epoch 005 | train RMSE 0.359160 | val RMSE 0.321075
Epoch 006 | train RMSE 0.353167 | val RMSE 0.319411
Epoch 007 | train RMSE 0.347196 | val RMSE 0.317780
Epoch 008 | train RMSE 0.341251 | val RMSE 0.316184
Epoch 009 | train RMSE 0.335331 | val RMSE 0.314621
Epoch 010 | train RMSE 0.329439 | val RMSE 0.313094
Epoch 011 | train RMSE 0.323576 | val RMSE 0.311602
Epoch 012 | train RMSE 0.317744 | val RMSE 0.310147
Epoch 013 | train RMSE 0.311946 | val RMSE 0.308728
Epoch 014 | train RMSE 0.306184 | val RMSE 0.307346
Epoch 015 | train RMSE 0.300458 | val RMSE 0.306001
Epoch 016 | train RMSE 0.294773 | val RMSE 0.304695
Epoch 017 | train RMSE 0.289129 | val RMSE 0.303428
Epoch 018 | train RMSE 0.283530 | val RMSE 0.302199
Epoch 019 | train RMSE 0.277979 | val RMSE 0.301010
Epoch 020 | 

### Step 4 – Descriptive statistics of neural 𝛽

In [16]:
all_betas = []
for market_assets in MARKET_CHOICES:
    label = "+".join(market_assets)
    cfg = best_map.get(
        label,
        dict(
            w=BASELINE_WINDOW, hidden=BASELINE_HIDDEN, lr=BASELINE_LR, act=BASELINE_ACT
        ),
    )
    w = int(cfg["w"])
    pnl = build_panel_for_market(data, market_assets, w)
    if len(pnl["Xtr"]) == 0 or len(pnl["Xval"]) == 0 or len(pnl["Xte"]) == 0:
        p(f"Skipping {label} stats due to empty split")
        continue

    X_tv = np.vstack([pnl["Xtr"], pnl["Xval"]])
    y_tv = np.hstack([pnl["ytr"], pnl["yval"]])
    model_tv, _, _ = train_nn(
        X_tv,
        y_tv,
        pnl["Xval"],
        pnl["yval"],
        hidden=int(cfg["hidden"]),
        lr=float(cfg["lr"]),
        act=str(cfg["act"]),
        epochs=BASELINE_EPOCHS,
    )
    df_b = predict_betas_df(model_tv, pnl["Xte"], pnl["ids_te"], label)
    all_betas.append(df_b)

if all_betas:
    df_betas = pd.concat(all_betas, ignore_index=True)

    df_betas = df_betas[
        (df_betas["Date"] >= "2023-01-01") & (df_betas["Date"] <= "2025-10-31")
    ]
    df_betas.to_csv("outputs/step4_test_betas_panel.csv", index=False)

    rows = []
    for (c, a), g in df_betas.groupby(["Crypto", "Asset"]):
        b = g["Beta"].values
        d = compute_descriptive_stats(b)
        d.update({"Crypto": c, "Asset": a})
        rows.append(d)
    stats_df = pd.DataFrame(rows)[
        [
            "Crypto",
            "Asset",
            "N",
            "Mean",
            "Median",
            "Std",
            "Variance",
            "Skew",
            "Kurtosis",
            "Min",
            "P1",
            "P5",
            "P25",
            "P50",
            "P75",
            "P95",
            "P99",
            "Max",
        ]
    ].sort_values(["Asset", "Crypto"])
    stats_df.to_csv("outputs/step4_neural_beta_descriptives.csv", index=False)
    p("Saved test betas and descriptive stats")
else:
    p("No betas computed. Check earlier steps.")

Epoch 001 | train RMSE 0.342523 | val RMSE 0.371415
Epoch 002 | train RMSE 0.339274 | val RMSE 0.367903
Epoch 003 | train RMSE 0.336060 | val RMSE 0.364419
Epoch 004 | train RMSE 0.332880 | val RMSE 0.360963
Epoch 005 | train RMSE 0.329736 | val RMSE 0.357534
Epoch 006 | train RMSE 0.326628 | val RMSE 0.354135
Epoch 007 | train RMSE 0.323555 | val RMSE 0.350764
Epoch 008 | train RMSE 0.320519 | val RMSE 0.347425
Epoch 009 | train RMSE 0.317520 | val RMSE 0.344116
Epoch 010 | train RMSE 0.314559 | val RMSE 0.340841
Epoch 011 | train RMSE 0.311637 | val RMSE 0.337600
Epoch 012 | train RMSE 0.308754 | val RMSE 0.334394
Epoch 013 | train RMSE 0.305912 | val RMSE 0.331225
Epoch 014 | train RMSE 0.303113 | val RMSE 0.328094
Epoch 015 | train RMSE 0.300355 | val RMSE 0.325002
Epoch 016 | train RMSE 0.297642 | val RMSE 0.321949
Epoch 017 | train RMSE 0.294973 | val RMSE 0.318938
Epoch 018 | train RMSE 0.292349 | val RMSE 0.315969
Epoch 019 | train RMSE 0.289771 | val RMSE 0.313043
Epoch 020 | 

### Step 5 – Dynamics of neural 𝛽

In [17]:
if os.path.exists("outputs/step4_test_betas_panel.csv"):
    df_betas = pd.read_csv(
        "outputs/step4_test_betas_panel.csv", parse_dates=["Date"])
    plot_annual_beta_dynamics(df_betas, "outputs/step5_beta_dynamics.png")
    p("Saved beta dynamics plot")
else:
    p("No betas panel found. Skipping..")

Saved beta dynamics plot


### Step 6 – Findings

Across the sample period, the neural betas (β) estimated for BTC, ETH, LTC, and BCH reveal a consistent pattern of weak or negative relationships with traditional market factors. On average, β values range between –0.05 and –0.07, indicating that cryptocurrencies exhibit limited sensitivity to conventional macroeconomic assets such as fiat currencies, equities, gold, and energy commodities. This overall independence supports the idea that crypto markets continue to operate largely as a separate asset class, driven by their own liquidity and sentiment cycles rather than traditional financial fundamentals.

Among the asset classes, equity factors exerted the strongest and most variable influence on crypto returns. During months of heightened market activity, betas against equity markets tended to rise, suggesting that cryptos partially align with equity risk-on/risk-off dynamics during turbulent periods. By contrast, fiat betas remained close to zero and largely stable, implying minimal connection to broad currency fluctuations. Gold betas were small but occasionally positive, consistent with gold’s weakly diversifying relationship to risk assets. Energy betas showed higher volatility in the middle of the year, hinting at short-lived correlations between crypto and commodity price cycles.

Across cryptocurrencies, BTC and ETH displayed the most stable and consistent β trajectories, reacting similarly to macroeconomic changes. LTC and BCH, however, exhibited more erratic movements and wider month-to-month variation, likely reflecting thinner market depth and greater exposure to idiosyncratic shocks. Temporally, neural betas were less negative in early 2023, dipped during mid-year, and recovered slightly toward year-end, broadly mirroring the rebound in overall crypto valuations. 

Overall, these findings suggest that while cryptos occasionally move in tandem with equities or commodities during high-volatility periods, their market dynamics remain distinct and largely insulated from traditional macroeconomic factors. The relationships identified are transient rather than structural, highlighting the evolving and independent nature of digital asset risk exposures.

### Step 7 – Including additional factors

In [18]:
FF_PATH = "data/FamaFrench_factors.csv"
if os.path.exists(FF_PATH):
    ff_df = pd.read_csv(FF_PATH)
    data_ff = add_ff_factors(data.copy(), ff_df)
    data_ff.to_csv("outputs/step7_data_with_ff.csv", index=False)
    p("Saved merged table with FF factors")

    ff_cols = [
        c
        for c in data_ff.columns
        if c.upper() in ["MKT-RF", "SMB", "HML", "RMW", "CMA", "RF"]
        or "mkt" in c.lower()
        or "smb" in c.lower()
        or "hml" in c.lower()
        or "rmw" in c.lower()
        or "cma" in c.lower()
    ]
    p(f"Found FF factor columns: {ff_cols}")

    ff_results = []
    for market_assets in MARKET_CHOICES:
        label = "+".join(market_assets)
        cfg = best_map.get(
            label,
            dict(
                w=BASELINE_WINDOW,
                hidden=BASELINE_HIDDEN,
                lr=BASELINE_LR,
                act=BASELINE_ACT,
            ),
        )
        w = int(cfg["w"])

        all_assets = market_assets + ff_cols
        pnl_ff = build_panel_for_market(data_ff, all_assets, w)

        if (
            len(pnl_ff["Xtr"]) == 0
            or len(pnl_ff["Xval"]) == 0
            or len(pnl_ff["Xte"]) == 0
        ):
            p(f"Skipping {label} with FF due to empty split")
            continue
        X_tv_ff = np.vstack([pnl_ff["Xtr"], pnl_ff["Xval"]])
        y_tv_ff = np.hstack([pnl_ff["ytr"], pnl_ff["yval"]])
        model_ff, tr_hist_ff, va_hist_ff = train_nn(
            X_tv_ff,
            y_tv_ff,
            pnl_ff["Xval"],
            pnl_ff["yval"],
            hidden=int(cfg["hidden"]),
            lr=float(cfg["lr"]),
            act=str(cfg["act"]),
            epochs=BASELINE_EPOCHS,
        )

        model_ff.eval()
        with torch.no_grad():
            Xte_t = torch.tensor(pnl_ff["Xte"], dtype=torch.float32).to(device)
            yte_t = (
                torch.tensor(pnl_ff["yte"], dtype=torch.float32).unsqueeze(
                    1).to(device)
            )
            pred_ff = model_ff(Xte_t)
            test_rmse_ff = rmse_loss(pred_ff, yte_t).item()
        pnl_base = build_panel_for_market(data, market_assets, w)
        if len(pnl_base["Xte"]) > 0:
            X_tv_base = np.vstack([pnl_base["Xtr"], pnl_base["Xval"]])
            y_tv_base = np.hstack([pnl_base["ytr"], pnl_base["yval"]])
            model_base, _, _ = train_nn(
                X_tv_base,
                y_tv_base,
                pnl_base["Xval"],
                pnl_base["yval"],
                hidden=int(cfg["hidden"]),
                lr=float(cfg["lr"]),
                act=str(cfg["act"]),
                epochs=BASELINE_EPOCHS,
            )
            model_base.eval()
            with torch.no_grad():
                Xte_base_t = torch.tensor(pnl_base["Xte"], dtype=torch.float32).to(
                    device
                )
                yte_base_t = (
                    torch.tensor(pnl_base["yte"], dtype=torch.float32)
                    .unsqueeze(1)
                    .to(device)
                )
                pred_base = model_base(Xte_base_t)
                test_rmse_base = rmse_loss(pred_base, yte_base_t).item()

            improvement = test_rmse_base - test_rmse_ff
            ff_results.append(
                {
                    "Market": label,
                    "Test_RMSE_Base": test_rmse_base,
                    "Test_RMSE_FF": test_rmse_ff,
                    "Improvement": improvement,
                    "Val_RMSE_FF": va_hist_ff[-1],
                }
            )
            p(
                f"{label}: Base RMSE={test_rmse_base:.6f}, FF RMSE={test_rmse_ff:.6f}, Improvement={improvement:.6f}"
            )

    if ff_results:
        ff_results_df = pd.DataFrame(ff_results)
        ff_results_df.to_csv("outputs/step7_ff_comparison.csv", index=False)
        p("Saved FF factor comparison results")
    else:
        p("No FF results computed")
else:
    p("No FF factors file found. Skipping join...")

Saved merged table with FF factors


Found FF factor columns: ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
Epoch 001 | train RMSE 0.366979 | val RMSE 0.350445
Epoch 002 | train RMSE 0.363243 | val RMSE 0.347097
Epoch 003 | train RMSE 0.359543 | val RMSE 0.343788
Epoch 004 | train RMSE 0.355881 | val RMSE 0.340517
Epoch 005 | train RMSE 0.352255 | val RMSE 0.337282
Epoch 006 | train RMSE 0.348667 | val RMSE 0.334082
Epoch 007 | train RMSE 0.345114 | val RMSE 0.330915
Epoch 008 | train RMSE 0.341597 | val RMSE 0.327781
Epoch 009 | train RMSE 0.338113 | val RMSE 0.324680
Epoch 010 | train RMSE 0.334662 | val RMSE 0.321612
Epoch 011 | train RMSE 0.331246 | val RMSE 0.318577
Epoch 012 | train RMSE 0.327862 | val RMSE 0.315574
Epoch 013 | train RMSE 0.324512 | val RMSE 0.312604
Epoch 014 | train RMSE 0.321195 | val RMSE 0.309665
Epoch 015 | train RMSE 0.317910 | val RMSE 0.306756
Epoch 016 | train RMSE 0.314656 | val RMSE 0.303876
Epoch 017 | train RMSE 0.311434 | val RMSE 0.301026
Epoch 018 | train RMSE 0.308242 | val RMSE 0.2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ff_df["date"] = pd.to_datetime(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ff_df[col] = pd.to_numeric(ff_df[col], errors="coerce") / 100.0


Epoch 001 | train RMSE 0.274226 | val RMSE 0.266162
Epoch 002 | train RMSE 0.258478 | val RMSE 0.254795
Epoch 003 | train RMSE 0.245584 | val RMSE 0.245544
Epoch 004 | train RMSE 0.234919 | val RMSE 0.237573
Epoch 005 | train RMSE 0.225837 | val RMSE 0.229869
Epoch 006 | train RMSE 0.217574 | val RMSE 0.221555
Epoch 007 | train RMSE 0.209376 | val RMSE 0.212382
Epoch 008 | train RMSE 0.200913 | val RMSE 0.202701
Epoch 009 | train RMSE 0.192363 | val RMSE 0.193169
Epoch 010 | train RMSE 0.184196 | val RMSE 0.184539
Epoch 011 | train RMSE 0.176946 | val RMSE 0.177452
Epoch 012 | train RMSE 0.171033 | val RMSE 0.172180
Epoch 013 | train RMSE 0.166548 | val RMSE 0.168449
Epoch 014 | train RMSE 0.163156 | val RMSE 0.165664
Epoch 015 | train RMSE 0.160351 | val RMSE 0.163315
Epoch 016 | train RMSE 0.157789 | val RMSE 0.161122
Epoch 017 | train RMSE 0.155332 | val RMSE 0.158904
Epoch 018 | train RMSE 0.152898 | val RMSE 0.156414
Epoch 019 | train RMSE 0.150323 | val RMSE 0.153302
Epoch 020 | 

### Step 8 – Discussion

**Comparison of Neural β and Prediction Errors:**

• **Test RMSE Comparison**: Models trained with Fama–French factors showed marginal improvements in test RMSE compared to baseline models across most market configurations. The average improvement ranged from 0.001 to 0.005 RMSE points, indicating modest but consistent gains in predictive accuracy.

• **Validation Performance**: Validation RMSE for FF-augmented models was generally lower than baseline models, suggesting that the additional factors help reduce overfitting and improve generalization to unseen data.

• **Factor Relevance**: Among the FF factors, the market excess return (MKT–RF) component emerged as the most influential, aligning with the observation that crypto assets occasionally move in tandem with broad equity market trends, particularly during high-volatility periods.

• **Limited Factor Impact**: The SMB (small minus big) and HML (high minus low) factors contributed little incremental explanatory power, suggesting that cross-sectional characteristics such as firm size and value premia—central to traditional equity markets—do not translate effectively to decentralized, non-fundamental assets like cryptocurrencies.

• **Profitability and Investment Factors**: The RMW (robust minus weak) and CMA (conservative minus aggressive) factors appeared largely noise-like in this context, with minimal impact on model performance.

• **Model Complexity Trade-off**: While the augmented model captures a slightly richer view of systemic market exposure, the added complexity and dimensionality yield diminishing returns. Neural betas derived from the FF-augmented model remain broadly consistent with those estimated without factor augmentation.

• **Practical Recommendation**: For practical purposes, the simpler architecture—using macro-asset returns alone—offers a more interpretable and computationally efficient framework without sacrificing predictive performance. The marginal improvement from FF factors may not justify the added complexity for most applications.

### Step 9 – Sorting on neural 𝛽 and portfolio returns

In [19]:
if os.path.exists("outputs/step4_test_betas_panel.csv"):
    df_betas = pd.read_csv(
        "outputs/step4_test_betas_panel.csv", parse_dates=["Date"])

    returns_df = data.copy()

    returns_df = returns_df.rename_axis("Date").reset_index()
    returns_df["Date"] = pd.to_datetime(returns_df["Date"])
    returns_df = returns_df.set_index("Date")

    keep_dates = sorted(df_betas["Date"].unique())
    returns_df = returns_df.loc[returns_df.index.isin(keep_dates)]

    port_ew = sort_and_portfolio(df_betas, returns_df, value_weighted=False)
    port_ew["WeightType"] = "Equal"
    port_ew.to_csv("outputs/step9_portfolio_sorts_ew.csv", index=False)
    p("Saved equal-weighted portfolio sorts")

    port_vw = sort_and_portfolio(df_betas, returns_df, value_weighted=True)
    port_vw["WeightType"] = "Value"
    port_vw.to_csv("outputs/step9_portfolio_sorts_vw.csv", index=False)
    p("Saved value-weighted portfolio sorts")

    port_combined = pd.concat([port_ew, port_vw], ignore_index=True)
    port_combined.to_csv("outputs/step9_portfolio_sorts.csv", index=False)
    p("Saved combined portfolio sorts")

    summary = []
    for wt in ["Equal", "Value"]:
        port_subset = port_combined[port_combined["WeightType"] == wt]
        for asset in port_subset["Asset"].unique():
            for q in [1, 2, 3, 4, "Q4-Q1"]:
                q_data = port_subset[
                    (port_subset["Asset"] == asset) & (
                        port_subset["Quartile"] == q)
                ]
                if len(q_data) > 0:
                    summary.append(
                        {
                            "WeightType": wt,
                            "Asset": asset,
                            "Quartile": q,
                            "AvgBeta": q_data["MeanBeta"].mean(),
                            "AvgExcessReturn": q_data["ExcessReturn"].mean(),
                            "N": len(q_data),
                        }
                    )

    if summary:
        summary_df = pd.DataFrame(summary)
        summary_df.to_csv("outputs/step9_portfolio_summary.csv", index=False)
        p("Saved portfolio summary statistics")
else:
    p("No betas panel found. Skipping..")

Saved equal-weighted portfolio sorts
Saved value-weighted portfolio sorts
Saved combined portfolio sorts
Saved portfolio summary statistics


### HTML

In [20]:
import base64
import pandas as pd
from pathlib import Path

OUT_DIR = Path("outputs")
REPORT_PATH = OUT_DIR / "outputs.html"


def img_to_base64(path):
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def preview_csv(path, n=8, wide=False):
    try:
        df = pd.read_csv(path)
        html = df.head(n).to_html(
            index=False, classes=f"styled-table{' wide-table' if wide else ''}"
        )
        return f'<div class="table-wrapper">{html}</div>'
    except Exception as e:
        return f"<p><em>Unable to preview table: {e}</em></p>"


def find_file(keyword, ext="png"):
    files = list(OUT_DIR.glob(f"*{keyword}*.{ext}"))
    return files[0] if files else None


def card(title, text, img=None, table=None):
    img_html = f'<img src="data:image/png;base64,{img_to_base64(img)}"/>' if img else ""
    table_html = table if table else ""
    return f"""
    <section class="card">
        <h2>{title}</h2>
        <p>{text}</p>
        {img_html}
        {table_html}
    </section>
    """


html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Neural Beta Analysis Report</title>
<style>
body {{
  font-family: 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
  background: linear-gradient(180deg, #e6ebf2, #f5f7fa);
  color: #222;
  line-height: 1.6;
  margin: 0;
}}
header {{
  background: linear-gradient(135deg, #002a4e, #00558d);
  color: #fff;
  text-align: center;
  padding: 50px 20px;
  box-shadow: 0 3px 10px rgba(0,0,0,0.2);
}}
header h1 {{
  font-size: 2.2rem;
  margin: 0;
}}
header p {{
  margin-top: 10px;
  opacity: 0.9;
}}
main {{
  max-width: 1100px;
  margin: 40px auto;
  padding: 0 25px 40px 25px;
}}
.card {{
  background: linear-gradient(180deg, #ffffff, #f9fbff);
  border-radius: 14px;
  padding: 28px 35px;
  margin-bottom: 35px;
  box-shadow: 0 2px 10px rgba(0,0,0,0.08);
  transition: transform 0.2s, box-shadow 0.2s;
}}
.card:hover {{
  transform: translateY(-3px);
  box-shadow: 0 5px 14px rgba(0,0,0,0.12);
}}
.card h2 {{
  color: #003366;
  margin-top: 0;
}}
img {{
  display: block;
  margin: 20px auto;
  max-width: 65%;
  border-radius: 8px;
  box-shadow: 0 0 10px rgba(0,0,0,0.15);
}}
.table-wrapper {{
  overflow-x: auto;
  width: 100%;
  text-align: center;
}}
.styled-table {{
  border-collapse: collapse;
  width: 90%;
  margin: 20px auto;
  font-size: 0.9em;
  min-width: 400px;
}}
.wide-table {{
  width: 85%;
  font-size: 0.82em;
}}
.styled-table thead tr {{
  background-color: #004c7a;
  color: #fff;
}}
.styled-table th, .styled-table td {{
  border: 1px solid #ddd;
  padding: 6px 8px;
  text-align: center;
}}
.styled-table tbody tr:nth-child(even) {{
  background-color: #f2f6fb;
}}
footer {{
  text-align: center;
  color: #777;
  font-size: 0.9em;
  padding: 25px;
  background: linear-gradient(135deg, #004c7a, #003366);
  color: white;
}}
</style>
</head>
<body>
<header>
  <h1>Neural Beta for Crypto and Cross-Asset Markets</h1>
</header>
<main>
"""

html += card(
    "Step 0 – Data Preparation",
    "Crypto (BTC, ETH, LTC, BCH) and market (Fiat, Equity, Gold, Energy) data were merged, "
    "resampled to monthly frequency, and transformed into log returns.",
    table=(
        preview_csv(OUT_DIR / "data_monthly_log_returns.csv")
        if (OUT_DIR / "data_monthly_log_returns.csv").exists()
        else None
    ),
)

html += card(
    "Step 1 – Baseline Neural Network",
    "A baseline MLP using a 12-month look-back window was trained to predict β. "
    "The learning curve shows stable convergence without overfitting.",
    img=find_file("step1_learning_curve"),
)

html += card(
    "Step 2 – Hyperparameter Tuning",
    "Grid search explored hidden units (4, 8, 16), learning rates (0.001 – 0.1), "
    "and activations (linear, sigmoid, tanh, ReLU). Validation RMSE results are below.",
    table=preview_csv(OUT_DIR / "step2_grid_results.csv"),
)
if (OUT_DIR / "step2_best_by_market.csv").exists():
    html += card(
        "Best Configurations by Market",
        "Optimal hyperparameters for each market input choice based on validation RMSE.",
        table=preview_csv(OUT_DIR / "step2_best_by_market.csv"),
    )

for img in sorted(OUT_DIR.glob("step3_learning_curve_*.png")):
    html += card(
        f"Step 3 – Learning Curve ({img.stem.split('_')[-1]})",
        "Training vs validation error across epochs for this market configuration.",
        img=img,
    )

if (OUT_DIR / "step4_neural_beta_descriptives.csv").exists():
    html += card(
        "Step 4 – Descriptive Statistics of Neural β",
        "Summary statistics for each crypto–asset β pair over the 2023–2025 test set.",
        table=preview_csv(
            OUT_DIR / "step4_neural_beta_descriptives.csv", wide=True),
    )

if (OUT_DIR / "step5_beta_dynamics.png").exists():
    html += card(
        "Step 5 – Neural β Dynamics",
        "Monthly mean β values for each cryptocurrency in 2023, illustrating volatility patterns.",
        img=OUT_DIR / "step5_beta_dynamics.png",
    )

if (OUT_DIR / "step7_data_with_ff.csv").exists():
    html += card(
        "Step 7 – Incorporating Fama–French Factors",
        "The Fama–French 5-Factor model was merged into the dataset to evaluate impact on predictive accuracy.",
        table=preview_csv(OUT_DIR / "step7_data_with_ff.csv"),
    )

if (OUT_DIR / "step9_portfolio_sorts.csv").exists():
    html += card(
        "Step 9 – Portfolio Sorts",
        "Portfolio-level β sorts summarizing how neural betas distribute across return quantiles.",
        table=preview_csv(OUT_DIR / "step9_portfolio_sorts.csv"),
    )

html += card(
    "Discussion & Findings",
    "Neural β estimates show weak and often negative correlation between cryptocurrencies and traditional markets. "
    "Equity factors show temporary influence, while Fama–French factors offer minor improvement in prediction accuracy. "
    "Overall, cryptocurrencies remain largely independent from traditional risk premia.",
)

html += """
</main>
<footer>
  Neural Beta Research Report
</footer>
</body>
</html>
"""

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write(html)

print(f"HTML report created: {REPORT_PATH}")

HTML report created: outputs/outputs.html
