# SF - TRAINING

In [1]:
!pip install arch ta pandas numpy torch hmmlearn scipy yfinance


Collecting arch
  Downloading arch-8.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading arch-8.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (981 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.3/981.3 kB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=294

## preprocessing.py

In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from arch import arch_model
from ta.trend import SMAIndicator, EMAIndicator
from ta.volatility import BollingerBands
from ta.momentum import RSIIndicator

# Color codes for logging
RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
ENDC = '\033[0m'

def fetch_data(tickers=['AAPL', 'GOOGL'], start="2007-01-01", end="2024-12-31"):
    """Fetch OHLCV data from yfinance, preserving DatetimeIndex."""
    data = yf.download(tickers, start=start, end=end, auto_adjust=True)
    ticker_dfs = {}
    for ticker in tickers:
        ticker_data = data.xs(ticker, axis=1, level=1) if len(tickers) > 1 else data
        ticker_data.columns = [col.lower() for col in ticker_data.columns]
        ticker_data.index = pd.to_datetime(ticker_data.index)
        ticker_dfs[ticker] = ticker_data
    print(GREEN + f"Fetched data for {tickers}" + ENDC)
    return ticker_dfs

def clean_data(df, ticker):
    """Clean data, preserving DatetimeIndex and minimizing row loss."""
    print(BLUE + f"Step 1: Cleaning data for {ticker}" + ENDC)
    if df is None or df.empty:
        print(RED + f"No data for {ticker}" + ENDC)
        return None

    print(f"Initial shape: {df.shape}, index type: {type(df.index)}")
    print(f"Date range: {df.index.min()} to {df.index.max()}")
    print(f"NaN counts:\n{df.isna().sum()}")

    df = df.copy()
    df = df.dropna(subset=['open', 'high', 'low', 'close', 'volume'])
    print(f"After dropna OHLCV shape: {df.shape}")

    df['return'] = df['close'].pct_change().fillna(0)
    df['volatility'] = df['return'].rolling(window=21).std() * np.sqrt(252)
    df['momentum'] = df['return'].rolling(window=21).mean()

    mean_ret, std_ret = df['return'].mean(), df['return'].std()
    df = df[abs(df['return'] - mean_ret) <= 5 * std_ret]
    print(f"After outlier removal shape: {df.shape}")

    df = df.drop_duplicates()
    print(f"After drop duplicates shape: {df.shape}")

    print(BLUE + f"Cleaned data shape: {df.shape}" + ENDC)
    print(f"NaN counts after cleaning:\n{df.isna().sum()}")
    return df

def compute_features(df):
    """Compute technical indicators and features."""
    if df is None or df.empty:
        return None

    df_features = df.copy()
    close_series = df_features['close']

    df_features['SMA_10'] = SMAIndicator(close_series, window=10).sma_indicator()
    df_features['EMA_10'] = EMAIndicator(close_series, window=10).ema_indicator()
    df_features['RSI_14'] = RSIIndicator(close_series, window=14).rsi()
    bb = BollingerBands(close_series, window=10, window_dev=2)
    df_features['BB_High'] = bb.bollinger_hband()
    df_features['BB_Low'] = bb.bollinger_lband()
    df_features['Log_Returns'] = np.log(close_series / close_series.shift(1))
    df_features['Volatility_10'] = df_features['Log_Returns'].rolling(window=10).std() * np.sqrt(252)

    def hurst_exponent(series, lag=20, min_std=1e-6):
        if len(series) < lag:
            return np.nan
        series = series.dropna()
        if len(series) < 10:
            return np.nan
        lags = range(2, min(lag, len(series)))
        rs = []
        for lag in lags:
            lagged_diff = series.diff(lag).dropna()
            if len(lagged_diff) < 5:
                continue
            rs_range = lagged_diff.max() - lagged_diff.min()
            rs_std = max(lagged_diff.std(), min_std)
            rs.append(np.log(rs_range / rs_std) / np.log(lag) if rs_range > 0 else np.nan)
        return np.nanmean(rs) if rs else np.nan

    df_features['Hurst'] = df_features['Log_Returns'].rolling(window=30).apply(hurst_exponent, raw=False)

    print(BLUE + f"Features computed shape (before dropna): {df_features.shape}" + ENDC)
    print(f"NaN counts before dropna:\n{df_features.isna().sum()}")

    critical_columns = ['Log_Returns', 'Volatility_10', 'RSI_14', 'momentum', 'volume']
    df_features = df_features.loc[df_features[critical_columns].notna().all(axis=1)]
    print(f"Features computed shape (after dropna critical): {df_features.shape}")

    for col in ['SMA_10', 'EMA_10', 'BB_High', 'BB_Low', 'Hurst', 'volatility']:
        if col in df_features.columns:
            df_features[col] = df_features[col].interpolate().ffill().bfill()

    print(f"NaN counts after imputation:\n{df_features.isna().sum()}")
    return df_features

def normalize_data(df, columns=['open', 'high', 'low', 'close', 'volume', 'SMA_10', 'EMA_10', 'RSI_14', 'BB_High', 'BB_Low', 'Volatility_10']):
    """Normalize specified columns."""
    if df is None or df.empty:
        return None, None

    df_normalized = df.copy()
    scalers = {}

    for col in columns:
        if col in df_normalized.columns:
            scaler = MinMaxScaler()
            scaled_values = scaler.fit_transform(df_normalized[[col]].values.reshape(-1, 1)).flatten()
            df_normalized[col] = scaled_values
            scalers[col] = scaler
        else:
            print(YELLOW + f"Warning: Column {col} not found in DataFrame" + ENDC)

    print(BLUE + f"Normalized data shape: {df_normalized.shape}" + ENDC)
    return df_normalized, scalers

def fit_garch_model(returns, p=1, q=1, window=200):
    """Fit a rolling GARCH model."""
    try:
        returns = returns.dropna()
        if len(returns) < 50:
            print(RED + "Insufficient data for GARCH model" + ENDC)
            return pd.Series(np.nan, index=returns.index)

        scale_factor = 100
        scaled_returns = returns * scale_factor
        volatility = []
        failures = 0
        for i in range(len(returns)):
            if i < window:
                volatility.append(np.nan)
                continue
            window_returns = scaled_returns.iloc[max(0, i-window):i]
            if len(window_returns) < 50:
                volatility.append(np.nan)
                continue
            try:
                model = arch_model(window_returns, vol='Garch', p=p, q=q, dist='Normal', mean='Zero', rescale=False)
                garch_fit = model.fit(disp='off', options={'maxiter': 1000})
                forecast = garch_fit.forecast(horizon=1)
                volatility.append(np.sqrt(forecast.variance.values[-1, :])[0] / scale_factor)
            except Exception as e:
                failures += 1
                volatility.append(np.nan)
        volatility_series = pd.Series(volatility, index=returns.index)
        print(f"GARCH volatility forecast (last): {volatility_series.iloc[-1]:.4f}")
        print(f"GARCH NaN count: {volatility_series.isna().sum()}")
        print(f"GARCH failures: {failures}")
        return volatility_series
    except Exception as e:
        print(RED + f"GARCH model fitting failed entirely: {e}" + ENDC)
        return pd.Series(np.nan, index=returns.index)

def process_asset_data(data_dict, ticker, augment=False):
    """Process data for a single ticker."""
    df = data_dict.get(ticker)
    if df is None or len(df) < 100:
        print(RED + f"Insufficient data for {ticker}" + ENDC)
        return None, None

    print(GREEN + f"Processing ticker: {ticker}" + ENDC)
    print(f"Initial data shape: {df.shape}, date range: {df.index.min()} to {df.index.max()}")
    for s, e in [('2008-09-01', '2009-03-15'), ('2020-02-15', '2020-03-31')]:
        mask = (df.index >= pd.to_datetime(s)) & (df.index <= pd.to_datetime(e))
        print(f"{s} to {e}: {mask.sum()} rows")

    df_cleaned = clean_data(df, ticker)
    if df_cleaned is None:
        return None, None

    df_features = compute_features(df_cleaned)
    if df_features is None:
        return None, None

    returns = df_features['Log_Returns'].dropna()
    garch_vol = fit_garch_model(returns)
    df_features['GARCH_Vol'] = garch_vol
    if df_features['GARCH_Vol'].isna().all():
        print(YELLOW + f"GARCH failed for {ticker}; using rolling std as fallback" + ENDC)
        df_features['GARCH_Vol'] = df_features['Log_Returns'].rolling(window=21).std() * np.sqrt(252)
    df_features['GARCH_Vol'] = df_features['GARCH_Vol'].interpolate().ffill().bfill()
    print(BLUE + "trying on fixes" + ENDC)
    df_features['GARCH_Vol'] = np.sqrt(df_features['GARCH_Vol']) #this works
    #df['GARCH_Vol'] = df['GARCH_Vol'] / np.sqrt(252)

    df_normalized, scalers = normalize_data(df_features)
    if df_normalized is None:
        return None, None

    autoencoder_columns = ['open', 'high', 'low', 'close', 'volume', 'SMA_10', 'EMA_10', 'RSI_14', 'BB_High', 'BB_Low', 'Volatility_10']
    for col in autoencoder_columns:
        if col in df_normalized.columns:
            min_val, max_val = df_normalized[col].min(), df_normalized[col].max()
            if not (0 <= min_val <= max_val <= 1):
                print(YELLOW + f"Warning: Column {col} not scaled properly (min={min_val:.4f}, max={max_val:.4f})" + ENDC)

    for s, e in [('2008-09-01', '2009-03-15'), ('2020-02-15', '2020-03-31')]:
        mask = (df_normalized.index >= pd.to_datetime(s)) & (df_normalized.index <= pd.to_datetime(e))
        print(f"{s} to {e} after processing: {mask.sum()} rows")
        print(f"Feature ranges in {s} to {e}:")
        for col in ['Log_Returns', 'Volatility_10', 'GARCH_Vol', 'momentum', 'RSI_14', 'volume', 'Hurst']:
            if col in df_normalized.columns:
                min_val = df_normalized[mask][col].min()
                max_val = df_normalized[mask][col].max()
                mean_val = df_normalized[mask][col].mean()
                print(f"  {col}: min={min_val:.4f}, max={max_val:.4f}, mean={mean_val:.4f}")

    print(GREEN + f"Final processed data for {ticker} shape: {df_normalized.shape}" + ENDC)
    return df_normalized, scalers

def process_multiple_assets(data_dict, tickers, augment=False):
    """Process multiple tickers."""
    results = {}
    for ticker in tickers:
        df_processed, scalers = process_asset_data(data_dict, ticker, augment=augment)
        results[ticker] = {'data': df_processed, 'scalers': scalers}
        print(GREEN + f"Processed data for ticker {ticker}" + ENDC)
    return results

def run(tickers, start_date, end_date, augment, otpt_show, save=True, outdir="output_data"):
    import os

    if save:
        os.makedirs(outdir, exist_ok=True)

    data_dict = fetch_data(tickers, start_date, end_date)
    results = process_multiple_assets(data_dict, tickers, augment=augment)

    if otpt_show:
        for ticker, result in results.items():
            df_processed = result['data']
            if df_processed is not None:
                print(f"Processed data shape for {ticker}: {df_processed.shape}")
                print(f"Date range: {df_processed.index.min()} to {df_processed.index.max()}")
                print(f"GARCH Volatility (last): {df_processed['GARCH_Vol'].iloc[-1]:.4f}")

    if save:
        for ticker, result in results.items():
            df_processed = result['data']
            if df_processed is not None:
                filename = os.path.join(outdir, f"{ticker}_processed.csv")
                df_processed.to_csv(filename, index=True)
                print(GREEN + f"Saved processed data for {ticker} to {filename}" + ENDC)

    return results





## helpernets.py

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class attention_pool(nn.Module):
    def __init__(self, embed_dim: int):
        super().__init__()
        self.Q = nn.Parameter(torch.randn(embed_dim))   # [d]
        self.K = nn.Linear(embed_dim, embed_dim)
        self.V = nn.Linear(embed_dim, embed_dim)

    def forward(self, h_inputs: torch.Tensor):
        """
        h_inputs: [batch, N, d]
        Returns:
            h_out: [batch, d]
            alphas: [batch, N]
        """
        keys = self.K(h_inputs)      # [batch, N, d]
        values = self.V(h_inputs)    # [batch, N, d]

        # Compute attention scores
        # Q: [d] → broadcast to [batch, d]
        eij = torch.matmul(keys, self.Q)      # [batch, N, d] · [d] -> [batch, N]

        alphas = torch.softmax(eij, dim=1)    # softmax over assets per batch

        h_out = torch.sum(values * alphas.unsqueeze(-1), dim=1)  # [batch, d]
        print(GREEN + "h_out from attention pool is: ", h_out)

        return h_out, alphas

def ensure_tensor(x, device, dtype=torch.float32):
    if torch.is_tensor(x):
        return x.to(device, dtype=dtype)
    else:
        return torch.tensor(x, dtype=dtype, device=device)


## asset agent

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, Normal
#from helpernets import ensure_tensor

#Dont forget:
# - CONCAT MEMORY ONTO NON SEQ DATA either in train.py or here


class Asset_Seq_Encoder(nn.Module):

    def __init__(
            self,
            seq_input_dim : int,
            recurrent_layers : int = 1,
            network : str = "lstm",
            recurrent_neurons : int = 64,

    ):
        super().__init__()
        #Experiment with LSTM, 1D CNN, GRU
        if(network == "lstm"):
            self.network = nn.LSTM(seq_input_dim, recurrent_neurons, recurrent_layers, batch_first = True)
        elif(network == "gru"):
            self.network = nn.GRU(seq_input_dim, recurrent_neurons, recurrent_layers, batch_first = True)
        else:
            print("Encoder choice for sequence data doesn't match.")

    def forward(self, seq_data : torch.Tensor):

        _, (h_n, _) = self.network(seq_data) #h_n = (L,B,N_r) or (no of recc layers, batch size, no of recc neurons)
        return h_n[-1] #[-1] means we are taking the hidden state value of the last recurrent neuron. (L,B) #by default this is (1,B)


class AssetPolicyNet(nn.Module):

    def __init__(
            self,
            seq_input_dim: int,
            non_seq_input_dim : int,
            hidden_dim_seq : int=64,
            hidden_dim : int = 128,
            num_discrete: int = 3, #BUY, SELL, HOLD
            memory_dim : int=1, #dimensions of memory update output
            num_signal : int = 2, #signal from asset to domain
            min_std : float = 1e-3, #Minimum standard deviation for the Normal distribution of the memory update.
            ):
        super().__init__()
        self.encoder = Asset_Seq_Encoder(seq_input_dim, hidden_dim_seq) #returns (L,B)
        self.shared_net = nn.Linear(hidden_dim_seq + non_seq_input_dim, hidden_dim)
        self.actor_bhs = nn.Linear(hidden_dim, num_discrete) #actor head 1: BUY HOLD SELL

        #fraction of trades made
        self.trade_frac_mean = nn.Linear(hidden_dim, 1)
        self.trade_frac_std = nn.Linear(hidden_dim, 1)

        self.asset_to_domain_mean = nn.Linear(hidden_dim, num_signal) #actor head 2: signal to domain agent
        self.asset_to_domain_std = nn.Linear(hidden_dim, num_signal)
        #if num_signals are 2, means we want the policy to determine 2 values. This means we create 2 gaussian
        #distributions - one for each of the output variables (therefore 2 means and 2 std devs), and pick one value
        #from each distribution

        self.mem_update_mean = nn.Linear(hidden_dim, memory_dim) #actor head 3: memory update value
        self.mem_update_std = nn.Linear(hidden_dim, memory_dim)

        self.critic = nn.Linear(hidden_dim, 1) #critic outputs the estimated value
        self.min_std = min_std


    def forward(self, seq_data, non_seq_data):

        #seq_data is [B,T,F] and non_seq_data is [B, 2 values (alloc and mem)]

        seq_encoding = self.encoder(seq_data) #returns as (L,B)
        #seq_encoding = seq_encoding.transpose(0,1) #convert that (L,B) to (B,L)
        net_x = torch.cat([seq_encoding, non_seq_data], dim=1) #join along the dimension L or encoding cols. so batch is same and cols
        #of encoding and B put beside each other. => net becomes (B, L+2)
        net_enc_hidden = torch.tanh(self.shared_net(net_x)) #hidden_dim neurons (by default 128)

        #BUY HOLD SELL (categorical policy):
        logits = self.actor_bhs(net_enc_hidden) #(B,3) 3 for B,H,S
        bhs = Categorical(logits = logits)

        #ASSET TO DOMAIN SIGNAL
        ast_to_dom_mean = self.asset_to_domain_mean(net_enc_hidden) #gives 2 (num_signal) => (B,2)
        ast_do_dom_logstd = self.asset_to_domain_std(net_enc_hidden).clamp(min=torch.log(torch.tensor(self.min_std))) #gives 2 logstds
        ast_to_dom_std = ast_do_dom_logstd.exp() #calculate logstd to ensure that values are positive
        ast_to_dom = Normal(ast_to_dom_mean, ast_to_dom_std) #outputs a Gaussian distribution
        #every time the actor processes data, it calculates a new policy - a new distribution and then we sample the action
        #from that distribution, encouraging exploration and exploitation based on a distribution rather than strict epsilon
        #greedy percentages

        #MEMORY UPDATE SIGNAL
        mem_mean = self.mem_update_mean(net_enc_hidden) #gives 2 means => (B,2)
        mem_logstd = self.mem_update_std(net_enc_hidden).clamp(min=torch.log(torch.tensor(self.min_std))) #gives 2 log std
        mem_std = mem_logstd.exp() #calculate logstd to ensure that values are positive
        mem_update = Normal(mem_mean, mem_std)

        # TRADE FRACTION HEAD (continuous between 0 and 1)
        frac_mean = torch.sigmoid(self.trade_frac_mean(net_enc_hidden))  # ensures [0,1]
        frac_logstd = self.trade_frac_std(net_enc_hidden).clamp(min=torch.log(torch.tensor(self.min_std)))
        frac_std = frac_logstd.exp()
        trade_frac = Normal(frac_mean, frac_std)


        value = self.critic(net_enc_hidden).squeeze(-1)

        return bhs, ast_to_dom, mem_update, trade_frac, value


class AssetAgent(nn.Module):


    def __init__(
            self,
            seq_input_dim,
            non_seq_input_dim,
            hidden_dim_seq = 64,
            hidden_dim = 128,
            num_discrete = 3,
            memory_dim = 1,
            num_signal = 2, #signal sent from asset to domain
            min_std = 1e-3,
            lr = 3e-4,
            clip_eps = 0.2,
            c1 = 0.5, #value coefficient (multiplied with L^{value})
            c2 = 0.01, #entropy coefficient (multiplied with entropy of action distributions)
            device = "cpu"
    ):
        super().__init__()
        self.device = device
        self.policynet = AssetPolicyNet(
            seq_input_dim=seq_input_dim,
            non_seq_input_dim=non_seq_input_dim,
            hidden_dim_seq=hidden_dim_seq,
            hidden_dim=hidden_dim,
            num_discrete=num_discrete,
            memory_dim=memory_dim,
            num_signal=num_signal,
            min_std=min_std
        ).to(device)

        self.optimizer = optim.Adam(self.policynet.parameters(), lr=lr)

        self.clip_eps = clip_eps
        self.c1 = c1
        self.c2 = c2

    def act(self, seq_data, non_seq_data):

        #CONCAT MEMORY ONTO NON SEQ DATA - EITHER HERE OR IN TRAINING LOOP AND SEND

        #seq_data is [1,T,F] and non_seq_data is [1, 2 values (alloc and mem)]

        seq_data = seq_data.to(self.device)
        non_seq_data = non_seq_data.to(self.device)
        bhs_dist, ast_to_dom_dist, mem_update_dist, trade_frac_dist, value = self.policynet(seq_data, non_seq_data)

        #1. SAMPLING FROM DISTRIBUTIONS
        bhs = bhs_dist.sample()
        ast_to_dom = ast_to_dom_dist.sample()
        mem_update = mem_update_dist.sample()
        trade_frac = trade_frac_dist.sample().clamp(0, 1)

        #2. CALCULATING LOG PROBS
        bhs_logprob = bhs_dist.log_prob(bhs) #need log probabilities for policy gradient calculation

        ast_to_dom_logprobs = ast_to_dom_dist.log_prob(ast_to_dom).sum(-1)
        #continuous product of probs = continuous sum of log probs
        mem_update_logprobs = mem_update_dist.log_prob(mem_update).sum(-1)
        trade_frac_logprob = trade_frac_dist.log_prob(trade_frac).sum(-1)

        total_logprob = bhs_logprob + ast_to_dom_logprobs + mem_update_logprobs + trade_frac_logprob

        return (bhs, ast_to_dom, mem_update, trade_frac), total_logprob, value

    def evaluate(self, seq_data, non_seq_data, a1,a2,a3,a4):
        """Recompute logprobs + entropy + value for PPO update."""

        #CONCAT MEMORY ONTO NON SEQ DATA - EITHER HERE OR IN TRAINING LOOP AND SEND

        seq_data = seq_data.to(self.device)
        non_seq_data = non_seq_data.to(self.device)

        bhs_dist, atod_dist, mem_dist, trade_frac_dist, value = self.policynet(seq_data, non_seq_data)

        bhs_p = bhs_dist.log_prob(a1)
        atod_p = atod_dist.log_prob(a2).sum(-1)
        mem_upd_p = mem_dist.log_prob(a3).sum(-1)
        trade_frac_p = trade_frac_dist.log_prob(a4).sum(-1)

        logprob = bhs_p + atod_p + mem_upd_p + trade_frac_p
        entropy = (bhs_dist.entropy() + atod_dist.entropy().sum(-1) + mem_dist.entropy().sum(-1)).mean()
        return logprob, entropy, value

    def update_policy(self, rollouts):

        """
        rollouts should be a dict with:
        - seq_data
        - non_seq_data
        - actions = (a1, a2, a3)
        - old_logprobs
        - returns
        - advantages

        this is basically all the data that we stored for the number of episodes during which we
        had freezed policy updation. Now we review our calculations.
        """


        seq_data = rollouts["seq_data"].to(self.device)
        non_seq_data = rollouts["non_seq_data"].to(self.device)
        bhs = rollouts["bhs"]
        ast_to_dom = rollouts["ast_to_dom"]
        mem_update = rollouts["mem_update"]
        trade_frac = rollouts["trade_frac"]
        old_logprobs = rollouts["old_logprobs"].to(self.device)
        returns = rollouts["rewards"].to(self.device)
        advantages = rollouts["values"].to(self.device)
        #returns and advantages calculated in train.py file in training loop

        logprobs, entropy, values = self.evaluate(seq_data, non_seq_data, bhs, ast_to_dom, mem_update, trade_frac)
        #multiple rows (mini batches) can be evaluated by pytorch just as well as a single row.

        ratios = torch.exp(logprobs - old_logprobs)

        #if r_t > 1 => new policy assigns higher prob to that action than old policy
        #if r_1 < 1 => assigns lower prob

        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.clip_eps, 1 + self.clip_eps) * advantages
        actor_loss = -torch.min(surr1, surr2).mean()

        value_loss = (returns - values).pow(2).mean()
        #critic training -> critic learns to estimate return value so we train it using regression (MSE)

        loss = actor_loss + self.c1 * value_loss - self.c2* entropy
        #entropy increases => loss decreases (supports exploration => higher entropy means more exploration)
        #since actor and critic share the same network we backpropagate it with the same loss (thus add it).

        #magic statements
        self.optimizer.zero_grad()
        loss = loss.mean() #taking mean across the entire batch
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policynet.parameters(), max_norm=0.5)
        self.optimizer.step()

        return {
            "loss": loss.item(),
            "actor_loss": actor_loss.item(),
            "value_loss": value_loss.item(),
            "entropy": entropy.item()
        }

class AssetRolloutBuffer:
    def __init__(self):
        self.seq_data = []
        self.non_seq_data = []
        self.actions = []
        self.logprobs = []
        self.values = []
        self.rewards = []
        self.dones = []

    def clear(self):
        self.__init__()


## domain agent

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, Normal
#from helpernets import attention_pool

class DomainPolicyNet(nn.Module):

    def __init__(
            self,
            num_assets : int,
            h_asset_dim : int,
            master_signal_dim : int,
            memory_dim : int = 1,
            num_signal : int = 2, #signal sent from domain to master
            min_std : float = 1e-3,
            hidden_dim : int = 128,
    ):
        super().__init__()
        self.attentionpool = attention_pool(h_asset_dim)

        self.shared_net = nn.Linear(h_asset_dim + memory_dim + master_signal_dim, hidden_dim)
        self.actor_port_alloc = nn.Linear(hidden_dim, num_assets + 1) #+1 for "cash" token for unallocated cash

        self.domain_to_master_mean = nn.Linear(hidden_dim, num_signal)
        self.domain_to_master_std = nn.Linear(hidden_dim, num_signal)


        self.mem_update_mean = nn.Linear(hidden_dim, memory_dim) #actor head 3: memory update value
        self.mem_update_std = nn.Linear(hidden_dim, memory_dim)

        self.critic = nn.Linear(hidden_dim, 1) #outputs value function
        self.min_std = min_std

    def forward(self, h_assets, master_signal, mem):

        print(MAGENTA + "master signal: ", master_signal, ENDC)

        print(MAGENTA + "h_assets: " , h_assets , ENDC)
        h_sector, alphas = self.attentionpool(h_assets) #(batch, D) => h_sector
        print(YELLOW + "attention pool done (domain agent)" + ENDC)
        # if mem.dim() == 1:
        #     mem = mem.unsqueeze(0)
        # if mem.size(0) != h_assets.size(0):
        #     mem = mem.expand(h_assets.size(0), -1)

        if isinstance(mem, float):
            #convert to tensor with batch as first dimension.
            mem = torch.tensor([mem], dtype=torch.float32)
            mem = mem.unsqueeze(0) #because batch dimension will always be 1


        print(BLUE, "h_sector", h_sector, ENDC)
        print(BLUE, "master_signal", master_signal, ENDC)
        print(BLUE, "mem", mem, ENDC)
        net_x = torch.cat([h_sector, master_signal, mem], dim=1) #all the dims other than index 1 should be same.
        h = F.relu(self.shared_net(net_x))

        #ASSET ALLOCATION (USE DIRICHLET DISTRIBUTION):
        logits = self.actor_port_alloc(h)
        # logits -> concentration parameters for Dirichlet
        raw_alpha = self.actor_port_alloc(h)                   # (batch, num_assets+1)
        alpha = F.softplus(raw_alpha) + 1e-3                   # ensure positivity
        alloc_distn = torch.distributions.Dirichlet(alpha)

        #MEMORY UPDATE SIGNAL
        mem_mean = self.mem_update_mean(h) #gives 2 means
        mem_logstd = self.mem_update_std(h).clamp(min=torch.log(torch.tensor(self.min_std))) #gives 2 log std
        mem_std = mem_logstd.exp() #calculate logstd to ensure that values are positive
        mem_update = Normal(mem_mean, mem_std)

        #DOMAIN TO MASTER SIGNAL
        dtom_mean = self.domain_to_master_mean(h)
        dtom_logstd = self.domain_to_master_std(h).clamp(min=torch.log(torch.tensor(self.min_std)))
        dtom_std = dtom_logstd.exp()
        dtom = Normal(dtom_mean, dtom_std)

        value = self.critic(h).squeeze(-1)

        return alloc_distn, mem_update, dtom, value

class DomainAgent(nn.Module):

    def __init__(
            self,
            num_assets : int,
            h_asset_dim : int,
            master_signal_dim : int,
            memory_dim : int = 1,
            num_signal : int = 2,
            min_std : float = 1e-3,
            hidden_dim : int = 128,
            lr = 3e-4,
            clip_eps = 0.2,
            c1 = 0.5, #value coefficient (multiplied with L^{value})
            c2 = 0.01, #entropy coefficient (multiplied with entropy of action distributions)
            device = "cpu"
    ):

        super().__init__()
        self.device = device
        self.policynet = DomainPolicyNet(
            num_assets = num_assets,
            h_asset_dim = h_asset_dim,
            master_signal_dim = master_signal_dim,
            memory_dim = memory_dim,
            num_signal = num_signal,
            min_std = min_std,
            hidden_dim = hidden_dim
        ).to(device)

        self.optimizer = optim.Adam(self.policynet.parameters(), lr=lr)

        self.clip_eps = clip_eps
        self.c1 = c1
        self.c2 = c2

        #Initialize memory variables:
        self.memory_dim = memory_dim
        self.memory = torch.zeros(1, memory_dim, device=device)  # (batch=1, memory_dim)


    def act(self, h_assets, master_signal, mem):

        h_assets = h_assets.to(self.device) #(batch, num_assets, dim)
        master_signal = master_signal.to(self.device)
        alloc_distn, mem_update_dist, dtom_dist, value = self.policynet(h_assets, master_signal, mem)

        #1. SAMPLING FROM DISTRIBUTIONS
        dtom = dtom_dist.sample()
        mem_update = mem_update_dist.sample()
        allocations = alloc_distn.rsample()                    # (batch, num_assets+1)


        #2. CALCULATING LOG PROBS
        alloc_log_prob = alloc_distn.log_prob(allocations)
        entropy = alloc_distn.entropy()
        dtom_logprob = dtom_dist.log_prob(dtom).sum(-1)
        mem_update_logprob = mem_update_dist.log_prob(mem_update).sum(-1)

        total_logprob = alloc_log_prob + dtom_logprob + mem_update_logprob

        return (allocations, dtom, mem_update), total_logprob, value, alloc_distn

    def evaluate(self, h_assets, domain_memory, master_signal,alloc, dtom, mem_update):

        #recompute values

        h_assets = h_assets.to(self.device)
        master_signal = master_signal.to(self.device)
        alloc_distn, mem_update_dist, dtom_dist, value = self.policynet(h_assets, master_signal, domain_memory)

        alloc_p = alloc_distn.log_prob(alloc)
        dtom_p = dtom_dist.log_prob(dtom).sum(-1)
        mem_upd_p = mem_update_dist.log_prob(mem_update).sum(-1)

        logprob = alloc_p + dtom_p + mem_upd_p

        alloc_entropy = alloc_distn.entropy()                # (batch,)
        dtom_entropy = dtom_dist.entropy().sum(-1)           # (batch,)
        mem_entropy = mem_update_dist.entropy().sum(-1)

        entropy = alloc_entropy + dtom_entropy + mem_entropy
        return logprob, entropy, value

    def update_policy(self, rollouts):

        """
            rollouts should be a dict with:
            - h_assets
            - domain memory
            - master agent allocation
            - actions = (a1, a2, a3)
            - old_logprobs
            - returns
            - advantages

            this is basically all the data that we stored for the number of episodes during which we
            had freezed policy updation. Now we review our calculations.
            """

        h_assets = rollouts["h_assets"].to(self.device)
        domain_mem = rollouts["domain_memory"].to(self.device)
        master_alloc = rollouts["master_alloc"].to(self.device)
        alloc = rollouts["allocations"].to(self.device)
        dtom = rollouts["dtom"].to(self.device)
        mem = rollouts["mem_update"].to(self.device)
        old_logprobs = rollouts["old_logprobs"].to(self.device)
        returns = rollouts["returns"].to(self.device)
        advantages = rollouts["advantages"].to(self.device)
        #returns and advantages calculated in train.py file in training loop

        logprobs, entropy, values = self.evaluate(h_assets, domain_mem, master_alloc, alloc, dtom, mem)
        ratios = torch.exp(logprobs - old_logprobs) #old logprobs are summed up so that new logprobs and old logprobs both are batch first and have same batch value

        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.clip_eps, 1 + self.clip_eps) * advantages
        actor_loss = -torch.min(surr1, surr2).mean()

        value_loss = (returns - values).pow(2).mean() #returns and values are also batch first, deliberately made returns batch wise in buffer addition using unsqueeze(0)
        loss = actor_loss + self.c1 * value_loss - self.c2* entropy

        #magic statements
        self.optimizer.zero_grad()
        loss = loss.mean() #taking mean across the entire batch
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policynet.parameters(), max_norm=0.5)
        self.optimizer.step()

        return {
            "loss": loss.item(),
            "actor_loss": actor_loss.mean().item(),
            "value_loss": value_loss.mean().item(),
            "entropy": entropy.mean().item()
        }

## masteragent

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, Normal
#from helpernets import attention_pool
#MASTER AGENT WILL USE AN INSTANCE OF THE SAME CLASS AS DOMAIN ATTENTION POOL FOR POOLING

class MasterPolicyNet(nn.Module):

    def __init__(
           self,
           num_domains : int,
           h_domain_dim : int,
           memory_dim : int = 1,
           min_std : float = 1e-3,
           hidden_dim : int = 128,

    ):

        super().__init__()
        self.attentionpool = attention_pool(h_domain_dim)

        self.shared_net = nn.Linear(h_domain_dim + memory_dim, hidden_dim)
        self.actor_port_alloc = nn.Linear(hidden_dim, num_domains + 1) #+1 for "cash"

        self.mem_update_mean = nn.Linear(hidden_dim, memory_dim) #actor head 3: memory update value
        self.mem_update_std = nn.Linear(hidden_dim, memory_dim)

        self.critic = nn.Linear(hidden_dim, 1) #outputs value function
        self.min_std = min_std

    def forward(self, h_domains, mem):

        if isinstance(mem, float):
            #convert to tensor with batch as first dimension.
            mem = torch.tensor([mem], dtype=torch.float32)
            mem = mem.unsqueeze(0) #because batch dimension will always be 1

        h_master, alphas = self.attentionpool(h_domains) #(batch, D) => h_master
        net_x = torch.cat([h_master, mem], dim=1)
        h = F.relu(self.shared_net(net_x))

        #DOMAIN ALLOCATION (USE DIRICHLET DISTRIBUTION):
        logits = self.actor_port_alloc(h)
        # logits -> concentration parameters for Dirichlet
        raw_alpha = self.actor_port_alloc(h)                   # (batch, num_assets+1)
        alpha = F.softplus(raw_alpha) + 1e-3                   # ensure positivity
        alloc_distn = torch.distributions.Dirichlet(alpha)

        #MEMORY UPDATE SIGNAL
        mem_mean = self.mem_update_mean(h) #gives 2 means
        mem_logstd = self.mem_update_std(h).clamp(min=torch.log(torch.tensor(self.min_std))) #gives 2 log std
        mem_std = mem_logstd.exp() #calculate logstd to ensure that values are positive
        mem_update = Normal(mem_mean, mem_std)

        value = self.critic(h).squeeze(-1)

        return alloc_distn, mem_update, value

class MasterAgent(nn.Module):

    def __init__(
            self,
            num_domains : int,
            h_domain_dim : int,
            memory_dim : int = 1,
            min_std : float = 1e-3,
            hidden_dim : int = 128,
            lr = 3e-4,
            clip_eps = 0.2,
            c1 = 0.5, #value coefficient (multiplied with L^{value})
            c2 = 0.01, #entropy coefficient (multiplied with entropy of action distributions)
            device = "cpu"

    ):
        super().__init__()
        self.device = device
        self.policynet = MasterPolicyNet(
            num_domains = num_domains,
            h_domain_dim= h_domain_dim,
            memory_dim=memory_dim,
            min_std=min_std,
            hidden_dim=hidden_dim
        ).to(device)

        self.optimizer = optim.Adam(self.policynet.parameters(), lr=lr)

        self.clip_eps = clip_eps
        self.c1 = c1
        self.c2 = c2

        #Initialize memory variables:
        self.memory_dim = memory_dim
        self.memory = torch.zeros(1, memory_dim, device=device)  # (batch=1, memory_dim)

    def act(self, h_domains, mem):

        h_domains = h_domains.to(self.device)
        alloc_distn, mem_update_dist, value = self.policynet(h_domains,mem)

        #1. SAMPLING FROM DISTRIBUTIONS
        mem_update = mem_update_dist.sample()
        allocations = alloc_distn.rsample()

        #2. CALCULATING LOG PROBS
        alloc_log_prob = alloc_distn.log_prob(allocations)
        entropy = alloc_distn.entropy()
        mem_update_logprob = mem_update_dist.log_prob(mem_update).sum(-1)

        total_logprob = alloc_log_prob + mem_update_logprob

        return (allocations, mem_update), total_logprob, value, alloc_distn

    def evaluate(self, h_domains, master_memory, alloc, mem):

        h_domains = h_domains.to(self.device)

        alloc_distn, mem_update_dist, value = self.policynet(h_domains,master_memory)

        alloc_p = alloc_distn.log_prob(alloc)
        mem_upd_p = mem_update_dist.log_prob(mem).sum(-1)

        logprob = alloc_p + mem_upd_p

        alloc_entropy = alloc_distn.entropy()                # (batch,)
        mem_entropy = mem_update_dist.entropy().sum(-1)

        entropy = alloc_entropy + mem_entropy
        return logprob, entropy, value

    def update_policy(self, rollouts):

        """
            rollouts should be a dict with:
            - h_domains
            - master memory
            - actions = (a1, a2)
            - old_logprobs
            - returns
            - advantages

            this is basically all the data that we stored for the number of episodes during which we
            had freezed policy updation. Now we review our calculations.
            """

        h_domains = rollouts["h_domains"].to(self.device)
        master_mem = rollouts["master_memory"].to(self.device)
        allocations = rollouts["allocations"].to(self.device)
        mem_update = rollouts["mem_update"].to(self.device)
        old_logprobs = rollouts["old_logprobs"].to(self.device)
        returns = rollouts["returns"].to(self.device)
        advantages = rollouts["advantages"].to(self.device)
        #returns and advantages calculated in train.py file in training loop

        logprobs, entropy, values = self.evaluate(h_domains, master_mem, allocations, mem_update)
        ratios = torch.exp(logprobs - old_logprobs)

        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.clip_eps, 1 + self.clip_eps) * advantages
        actor_loss = -torch.min(surr1, surr2).mean()

        value_loss = (returns - values).pow(2).mean()
        loss = actor_loss + self.c1 * value_loss - self.c2* entropy

        #magic statements
        self.optimizer.zero_grad()
        loss = loss.mean() #taking mean across the entire batch
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policynet.parameters(), max_norm=0.5)

        self.optimizer.step()

        return {
            "loss": loss.item(),
            "actor_loss": actor_loss.mean().item(),
            "value_loss": value_loss.mean().item(),
            "entropy": entropy.mean().item()
        }

# training

In [31]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, Normal
from torch.utils.data import Dataset, DataLoader, random_split

import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from arch import arch_model
from ta.trend import SMAIndicator, EMAIndicator
from ta.volatility import BollingerBands
from ta.momentum import RSIIndicator

# Import agents and preprocessing
# sys.path.append('src/agents')
# from asset_level_agent import AssetAgent, AssetRolloutBuffer
# from domain_level_agent import DomainAgent
# from master_agent import MasterAgent
# from helpernets import *
# from preprocessing import run  # import only run()

RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
ENDC = '\033[0m'
BRIGHT_MAGENTA = '\033[95m'
CYAN = '\033[96m'
WHITE = '\033[97m'
STD_BLUE = '\033[34m'
MAGENTA = '\033[35m'

#----------------------------------------------------------------------------------------
#VARIABLES

start = "2007-01-01"
end = "2024-12-31"
augment = False
otpt_show = True
batch_size = 20
split_ratio = 0.8
num_epochs = 10
total_portfolio_value = 100000

all_tickers = ['AAPL', 'GOOGL', 'MSFT', 'JPM', 'BAC', 'GS']
domain_wise_tickers = {
    "Tech": ['AAPL', 'GOOGL', 'MSFT'],
    "Finance": ['JPM', 'BAC', 'GS'],
}

domain_indices = {}
asset_indices = {}
d = 0

#ASSET INDICES MAPPING:
for domain, assets in domain_wise_tickers.items():
    a = 0
    domain_indices[domain] = d
    for asset in assets:
        asset_indices[asset] = a
        a+=1
    d +=1

num_domains= len(domain_wise_tickers.keys())

#----------------------------------------------------------------------------------------
#LOADING DATA

outdir = "output_data"
os.makedirs(outdir, exist_ok=True)

results = {}
missing_tickers = []

for ticker in all_tickers:
    filename = os.path.join(outdir, f"{ticker}_processed.csv")
    if os.path.exists(filename):
        print(GREEN + f"Loading preprocessed data for {ticker} from {filename}" + ENDC)
        df = pd.read_csv(filename, index_col=0, parse_dates=True)
        results[ticker] = {"data": df, "scalers": None}  # scalers not saved to CSV
    else:
        print(YELLOW + f"No preprocessed file for {ticker}, will run preprocessing" + ENDC)
        missing_tickers.append(ticker)

if missing_tickers:
    print(BLUE + f"Running preprocessing for missing tickers: {missing_tickers}" + ENDC)
    new_results = run(missing_tickers, start, end, augment, otpt_show)
    results.update(new_results)

seq_data = results
print(GREEN + "Final dataset ready!" + ENDC)
print(seq_data.keys())

#----------------------------------------------------------------------------------------
#PREPARING DATA FOR TRAINING AND TESTING

num_seq_cols = len(seq_data["AAPL"]["data"].keys())
num_seq_rows = len(seq_data["AAPL"]["data"])
split_idx = int(num_seq_rows * split_ratio)

def make_windows(seq_data, window_size = 60):
    windows = {}
    for asset, info in seq_data.items():

        df = info["data"]
        data = df.values #numpy array
        asset_windows = []
        for i in range(len(data) - window_size):
            seq = data[i : i+window_size]
            asset_windows.append(seq)

        windows[asset] = np.array(asset_windows)
    return windows, len(windows)

def split(windows, split_ratio=0.8):
    train_data = {}
    test_data = {}
    for asset, seqs in windows.items():
        n = len(seqs)
        split_idx = int(n*split_ratio)
        train_data[asset] = seqs[:split_idx]
        test_data[asset]  = seqs[split_idx:]
    return train_data, test_data


window_size = 60
all_windows, num_windows = make_windows(seq_data, window_size)
train_data, test_data = split(all_windows, split_ratio=0.8)
num_episodes = len(train_data["AAPL"].shape)
print(BLUE, "Train data shape: " , train_data["AAPL"].shape, ENDC) #(no_of_windows, window_timesteps, columns)
print(GREEN + "no of windows, window timesteps, columns" + ENDC)
print(BLUE, "Test data shape: ", test_data["AAPL"].shape, ENDC)

#----------------------------------------------------------------------------------------
#DEFINING ARCHITECTURES

num_assets = 3
"""number of assets per domain
Currently, we assume same number of assets for each domain - later to be changed to where the
inputs for pooling layer dimensions obtained "dynamically"

num_signal = 2 by default (for both asset to domain, domain to master)
master sends 1 allocation and domain also does."""


AssetAG = AssetAgent(seq_input_dim=num_seq_cols, non_seq_input_dim=2)
DomainAG = DomainAgent(num_assets, h_asset_dim=2, master_signal_dim=1)
MasterAG = MasterAgent(num_domains=2, h_domain_dim=2)

AssetBuffer = AssetRolloutBuffer()


# After creating agents: AssetAG, DomainAG, MasterAG
device_asset = AssetAG.device
device_domain = DomainAG.device
device_master = MasterAG.device

# memory dims (your agents initialize memory_dim attribute)
asset_memory_dim = 1
domain_memory_dim = DomainAG.memory_dim if hasattr(DomainAG, "memory_dim") else 1
master_memory_dim = MasterAG.memory_dim if hasattr(MasterAG, "memory_dim") else 1

# master memory as tensor (batch dim 1)
master_mem = torch.zeros(1, master_memory_dim, device=device_master, dtype=torch.float32)

# domain memories: list of tensors shaped (1, memory_dim)
domain_mem = [
    torch.zeros(1, domain_memory_dim, device=device_domain, dtype=torch.float32)
    for _ in range(num_domains)
]

# asset memories: list (per domain) of list (per asset) of tensors (1, memory_dim)
asset_mems = []
for domain, assets in domain_wise_tickers.items():
    am = [torch.zeros(1, asset_memory_dim, device=device_asset, dtype=torch.float32) for _ in assets]
    asset_mems.append(am)

#ALLOCATIONS ARE IN THE FORM OF RATIOS: TRUE ALLOCATIONS = ALLOCATIONS * PORTFOLIO VALUE

# domain_allocs: keep as list of 1-element tensors (shape (1,))
# domain_allocs = [
#     torch.tensor([1 / num_domains], dtype=torch.float32, device=device_domain)
#     for _ in range(num_domains)
# ]
domain_allocs = []
dtemp = [torch.tensor(1 / num_domains) for _ in range(num_domains+1)]
domain_allocs.append(dtemp)
#a tensor

print(BLUE, "domain allocs: ", domain_allocs, ENDC)

# asset_allocs: per-domain list of per-asset 1-element tensors
asset_allocs = []
for domain_idx, (domain, assets) in enumerate(domain_wise_tickers.items()):
    per_domain = [
        torch.tensor((1 / num_domains) / len(assets), dtype=torch.float32, device=device_asset)
        for _ in assets
    ]
    asset_allocs.append(per_domain)
#an array of tensors

print(BLUE, "asset allocs initialized: ", asset_allocs, ENDC)

# current holdings (keep as python numbers or tensors as you prefer)
domain_cur_holdings = [0 for _ in range(num_domains)]
asset_cur_holdings = [[0 for _ in assets] for assets in domain_wise_tickers.values()]


asset_mem_param = 0.7
domain_mem_param = 0.7
master_mem_param = 0.7

#----------------------------------------------------------------------------------------
#RETURN ESTIMATIONS:


def compute_returns(rewards, gamma=0.99):
    returns = []
    G = 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    return torch.tensor(returns, dtype=torch.float32)

#----------------------------------------------------------------------------------------
#TRAINING

def train():
    global domain_allocs, asset_allocs, domain_mem, master_mem
    for epoch in range(num_epochs):

        #batches- PPO UPDATE OCCURS HERE
        print(GREEN + f"starting epoch {epoch}..." + ENDC)
        for start in range(0, num_windows, batch_size): #generating batches dynamically.

            end = start + batch_size
            batch_idxs = range(start, min(end, num_windows))

            batch_rewards = []
            # asset_buffer = {
            #     "seq_data": [],       # list of tensors: [seq_len, features]
            #     "non_seq_data": [],   # list of tensors: [non_seq_features]
            #     "actions": [],        # list of tuples: (a1, a2, a3, a4)
            #     "old_logprobs": [],   # list of tensors (scalar)
            #     "values": [],         # list of tensors (scalar)
            #     "rewards": []         # list of floats
            # }
            asset_buffer = {}


            # domain_buffer = {
            #     "h_assets" : [],
            #     "domain_memory" : [],
            #     "master_alloc" : [],
            #     "actions" : [],
            #     "old_logprobs" : [],
            #     "returns" : [],
            #     "advantages" : []
            # }

            domain_buffer = {}

            # master_buffer = {
            #     "h_domains" : [],
            #     "master_memory" : [],
            #     "actions" : [],
            #     "old_logprobs" : [],
            #     "returns" : [],
            #     "advantages" : []
            # }

            master_buffer = {}

            for window_idx in batch_idxs:

                asset_to_domain_signals = []  # [num_domains][num_assets][ast_to_dom_dim]
                domain_to_master_signals = []  # [num_domains,batch, dtom_dim]
                master_returns = 0

                for domain, assets in domain_wise_tickers.items():
                    domain_idx = domain_indices[domain]
                    asset_to_domain_sig_domain = []
                    r_t_domain = 0.0

                    for asset in assets:
                        asset_idx = asset_indices[asset]

                        seq_tensor = torch.tensor(train_data[asset][window_idx], dtype=torch.float32) #(window timesteps, columns)
                        #non_seq_tensor = torch.tensor([asset_allocs[domain_idx][asset_idx], asset_mems[domain_idx][asset_idx]], dtype=torch.float32)

                        # print(BLUE + "asset allocs: " , asset_allocs[domain_idx][asset_idx] , ENDC)
                        # print(BLUE + "asset mems: " , asset_mems[domain_idx][asset_idx] , ENDC)

                        alloc_val = asset_allocs[domain_idx][asset_idx]  #  this is not batch first when printed.
                        mem_val = asset_mems[domain_idx][asset_idx]      # this is in fact batch first.

                        if alloc_val.ndim == 1:
                            alloc_val = alloc_val.unsqueeze(-1)  # (batch, 1)

                        elif alloc_val.ndim == 0:
                            alloc_val = alloc_val.unsqueeze(0) .unsqueeze(0) # (batch, 1)
                        else:
                            pass

                        # print(BLUE + "val allocs: " , alloc_val , ENDC)
                        # print(BLUE + "val mems: " , mem_val , ENDC)

                        non_seq_tensor = torch.cat([alloc_val, mem_val], dim=-1)  # (batch, 2)

                        #1D tensor :- [asset_allocation value, asset memory value]

                        seq_in = seq_tensor.unsqueeze(0).to(AssetAG.device)  # [1, T, F]
                        #in PPO we are processing data in batches BUT we have to pass each batch instance one at a time, but since
                        #pytorch expects data as (batch, ...,...) we need to make faux batches.

                        non_seq_in = non_seq_tensor.to(AssetAG.device) #non_seq_tensor.unsqueeze(0).to(AssetAG.device) #(1, 1D tensor)

                        # print(RED, "seq_in shape: ", seq_in.shape, " non_seq_in shape: ", non_seq_in.shape, ENDC)


                        actions, total_logprob, value = AssetAG.act(seq_in, non_seq_in) #put in the form of batch first

                        bhs = actions[0].detach()
                        ast_to_dom = actions[1].detach() #detach completely removes the new tensor from the current computational graph
                        mem_update = actions[2].detach()
                        trade_frac = actions[3].detach() #not adding squeeze(0) so the first dimension is still batch.

                        with torch.no_grad():


                            # reward calc
                            p_t = train_data[asset][window_idx+1][0][0] #close price of first timestep of next window
                            if window_idx < num_windows - 1:
                                p_t_plus_1 = train_data[asset][window_idx][0][0]
                                frac = p_t_plus_1 / p_t
                                w_asset = asset_allocs[domain_idx][asset_idx]
                                # print(YELLOW + f"Asset: {asset}, p_t: {p_t}, p_t+1: {p_t_plus_1}, frac: {frac}, w_asset: {w_asset}" + ENDC)
                                r_t = w_asset * (frac - 1)
                                r_t_domain += r_t

                            asset_to_domain_sig_domain.append(ast_to_dom) #(assets, batch, ...) because the first dimension of every ast_to_dom
                    #is batch and we are appending various ast_to_doms in the list above.
                            asset_mems[domain_idx][asset_idx] += asset_mem_param * mem_update

                        #I JUST NEED TO STORE IN ROLLOUTS IN THE SAME FORMAT AS I HAD PASSED INTO ACT() BECAUSE THE UPDATE POLICY ONLY
                        #RECREATES THE SCENARIO

                        if(len(asset_buffer)==0):
                            # print(CYAN + "First value in rollouts being stored" + ENDC)
                            asset_buffer["seq_data"] = seq_in.clone()
                            asset_buffer["non_seq_data"] = non_seq_in.clone()
                            asset_buffer["bhs"] = bhs.clone()
                            asset_buffer["ast_to_dom"] = ast_to_dom.clone()
                            asset_buffer["mem_update"] = mem_update.clone()
                            asset_buffer["trade_frac"] = trade_frac.clone()
                            asset_buffer["old_logprobs"] = total_logprob.clone()
                            asset_buffer["values"] = value.clone()
                            # print(CYAN + "r_t: ", r_t.unsqueeze(0), ENDC)
                            asset_buffer["rewards"] = r_t.clone().unsqueeze(0) #need to put this in a batch first format maybe
                            # print(CYAN + "First asset done" + ENDC)
                        else:
                            print(RED, "asset buffer before: ", asset_buffer["seq_data"].shape, ENDC)
                            asset_buffer["seq_data"] = torch.cat([asset_buffer["seq_data"], seq_in.clone()], dim=0) #seq_tensor has batch first (batch, window timesteps, columns)
                            # print(CYAN + "seq_in shape stored: ", seq_in, ENDC)
                            print(RED, "asset buffer after: ", asset_buffer["seq_data"].shape, ENDC)
                            asset_buffer["non_seq_data"] = torch.cat([asset_buffer["non_seq_data"], non_seq_in.clone()], dim=0)
                            asset_buffer["bhs"] = torch.cat([asset_buffer["bhs"], bhs.clone()], dim=0)
                            asset_buffer["ast_to_dom"] = torch.cat([asset_buffer["ast_to_dom"], ast_to_dom.clone()], dim=0)
                            asset_buffer["mem_update"] = torch.cat([asset_buffer["mem_update"], mem_update.clone()], dim=0)
                            asset_buffer["trade_frac"] = torch.cat([asset_buffer["trade_frac"], trade_frac.clone()], dim=0)
                            asset_buffer["old_logprobs"] = torch.cat([asset_buffer["old_logprobs"], total_logprob.clone()], dim=0)
                            asset_buffer["values"] = torch.cat([asset_buffer["values"], value.clone()], dim=0)
                            # print(CYAN + "r_t: ", r_t.unsqueeze(0), ENDC)
                            asset_buffer["rewards"] = torch.cat([asset_buffer["rewards"], r_t.clone().unsqueeze(0)], dim=0) #need to put this in a batch first format maybe
                            # print(CYAN + "Asset done" + ENDC)

                    # if isinstance(asset_to_domain_sig_domain, list):
                    #     if isinstance(asset_to_domain_sig_domain[0], torch.Tensor):
                    #         asset_to_domain_sig_domain = torch.stack(asset_to_domain_sig_domain)
                    #         #joins all the elements of the array along dimension 0 (stack) => batch stack
                    #     else:
                    #         asset_to_domain_sig_domain = torch.tensor(asset_to_domain_sig_domain, dtype=torch.float32)

                    #i am skipping this block because i want to keep everything batch first to ensure uniformity
                    with torch.no_grad():
                        asset_to_domain_sig_domain = (
                            torch.stack(asset_to_domain_sig_domain)    # (num_assets, batch, dim)
                            .permute(1, 0, 2)                          # → (batch, num_assets, dim)
                            .to(DomainAG.device)
                        )

                        # if not asset_to_domain_sig_domain:
                        #     # Make a dummy tensor if empty to prevent crash
                        #     asset_to_domain_sig_domain = torch.zeros(
                        #         (1, len(domain_wise_tickers[domain]), DomainAG.h_asset_dim),
                        #         device=DomainAG.device
                        #     )
                        # else:
                        #     asset_to_domain_sig_domain = (
                        #         torch.stack(asset_to_domain_sig_domain)
                        #         .permute(1, 0, 2)
                        #         .to(DomainAG.device)
                        #     )

                    # print(CYAN, "asset to domain sig domain stacked" , asset_to_domain_sig_domain, ENDC)
                    alloc_val = domain_allocs[0][domain_idx]  #  this is not batch first when printed.
                    # print(GREEN, "alloc val: ", alloc_val, ENDC)

                    if alloc_val.ndim == 1:
                        alloc_val = alloc_val.unsqueeze(0)  # (batch, 1)

                    elif alloc_val.ndim == 0:
                        alloc_val = alloc_val.unsqueeze(0).unsqueeze(0) # (batch, 1) => adding 2 dims to match h_assets
                    elif alloc_val.ndim == 3:
                        alloc_val = alloc_val.squeeze(0)  #remove extra dim if already present
                    else:
                            pass
                    # print(GREEN, "alloc val: ", alloc_val, ENDC)
                    # print(BLUE + "h_assets: " , asset_to_domain_sig_domain , ENDC)
                    actions, total_logprob, value, alloc_distn = DomainAG.act(asset_to_domain_sig_domain, alloc_val, domain_mem[domain_idx])

                    with torch.no_grad():
                        allocations = actions[0].detach() #Keeping batch dimensions
                        print(GREEN , "all allocations (domain output)" , allocations, ENDC)
                        dtom = actions[1].detach()
                        mem_update = actions[2].detach()

                        asset_allocs[domain_idx] = allocations.squeeze(0)  #remove batch dim
                        domain_mem[domain_idx] += domain_mem_param * mem_update

                        if(len(domain_buffer)==0):
                            # print(CYAN, "First value in domain rollouts being stored: h_assets ", asset_to_domain_sig_domain, ENDC)
                            domain_buffer["h_assets"] = asset_to_domain_sig_domain.detach()
                            domain_buffer["domain_memory"] = domain_mem[domain_idx].detach()
                            domain_buffer["master_alloc"] = alloc_val.detach()
                            domain_buffer["allocations"] = allocations.detach()
                            domain_buffer["dtom"] = dtom.detach()
                            domain_buffer["mem_update"] = mem_update.detach()
                            domain_buffer["returns"] = r_t_domain.detach().unsqueeze(0)
                            domain_buffer["old_logprobs"] = total_logprob.detach()
                            domain_buffer["advantages"] = value.detach()
                        else:
                            # print(CYAN, "Next value in domain rollouts being stored: h_assets ", asset_to_domain_sig_domain, ENDC)
                            domain_buffer["h_assets"] = torch.cat([domain_buffer["h_assets"], asset_to_domain_sig_domain.detach()], dim=0)
                            # print(CYAN + "domain h_assets shape stored: ", asset_to_domain_sig_domain, ENDC)
                            domain_buffer["domain_memory"] = torch.cat([domain_buffer["domain_memory"], domain_mem[domain_idx].detach()], dim=0)
                            domain_buffer["master_alloc"] = torch.cat([domain_buffer["master_alloc"], alloc_val.detach()], dim=0)
                            domain_buffer["allocations"] = torch.cat([domain_buffer["allocations"], allocations.detach()], dim=0)
                            domain_buffer["dtom"] = torch.cat([domain_buffer["dtom"], dtom.detach()], dim=0)
                            domain_buffer["mem_update"] = torch.cat([domain_buffer["mem_update"], mem_update.detach()], dim=0)
                            domain_buffer["returns"] = torch.cat([domain_buffer["returns"], torch.tensor(r_t_domain).detach().unsqueeze(0)], dim=0)
                            domain_buffer["old_logprobs"] = torch.cat([domain_buffer["old_logprobs"], total_logprob.detach()], dim=0)
                            domain_buffer["advantages"] = torch.cat([domain_buffer["advantages"], value.detach()], dim=0)


                        domain_to_master_signals.append(dtom)
                        master_returns += domain_allocs[0][domain_idx] * r_t_domain
                        print(MAGENTA + "domain done" + ENDC)

                with torch.no_grad():
                    # master
                    domain_to_master_signals = (
                            torch.stack(domain_to_master_signals)    # [num_domains,batch, dtom_dim]
                            .permute(1, 0, 2)                          # → (batch, num_domains, dtom_dim)
                            .to(MasterAG.device)
                    )

                master_actions, total_logprob, value, alloc_distn = MasterAG.act(domain_to_master_signals, master_mem)

                with torch.no_grad():
                    allocations = master_actions[0].detach()  #keeping batch first => that's why not applying squeeze(0)
                    mem_update = master_actions[1].detach()
                    domain_allocs = allocations #keep as it is                     #DOUBT HERE ******
                    print(GREEN , "all allocations (master output)" , allocations, ENDC)
                    master_mem += master_mem_param * mem_update

                    if(len(master_buffer)==0):
                        # print(MAGENTA, "First value in master rollouts being stored: h_assets ", asset_to_domain_sig_domain, ENDC)
                        master_buffer["h_domains"] = domain_to_master_signals.detach()
                        master_buffer["master_memory"] = master_mem.detach()
                        master_buffer["allocations"] = allocations.detach()
                        master_buffer["mem_update"] = mem_update.detach()
                        master_buffer["returns"] = master_returns.detach().unsqueeze(0)
                        master_buffer["old_logprobs"] = total_logprob.detach()
                        master_buffer["advantages"] = value.detach()
                    else:
                        # print(MAGENTA, "Next value in master rollouts being stored: h_assets ", domain_to_master_signals, ENDC)
                        master_buffer["h_domains"] = torch.cat([master_buffer["h_domains"], domain_to_master_signals.detach()], dim=0)
                        master_buffer["master_memory"] = torch.cat([master_buffer["master_memory"], master_mem.detach()], dim=0)
                        master_buffer["allocations"] = torch.cat([master_buffer["allocations"], allocations.detach()], dim=0)
                        master_buffer["mem_update"] = torch.cat([master_buffer["mem_update"], mem_update.detach()], dim=0)
                        master_buffer["returns"] = torch.cat([master_buffer["returns"], torch.tensor(master_returns).detach().unsqueeze(0)], dim=0)
                        master_buffer["old_logprobs"] = torch.cat([master_buffer["old_logprobs"], total_logprob.detach()], dim=0)
                        master_buffer["advantages"] = torch.cat([master_buffer["advantages"], value.detach()], dim=0)

                print(WHITE + f"one asset-domain-master cycle (window: {window_idx}) done -----------------------------------------------------" + ENDC)


            print(BRIGHT_MAGENTA + "upadating all agents" + ENDC)
            AssetAG.update_policy(asset_buffer)
            print(MAGENTA + "asset agent updated" + ENDC)
            DomainAG.update_policy(domain_buffer)
            print(MAGENTA + "domain agent updated" + ENDC)
            MasterAG.update_policy(master_buffer)
            print(BRIGHT_MAGENTA + "updates done!" + ENDC)


[92mLoading preprocessed data for AAPL from output_data/AAPL_processed.csv[0m
[92mLoading preprocessed data for GOOGL from output_data/GOOGL_processed.csv[0m
[92mLoading preprocessed data for MSFT from output_data/MSFT_processed.csv[0m
[92mLoading preprocessed data for JPM from output_data/JPM_processed.csv[0m
[92mLoading preprocessed data for BAC from output_data/BAC_processed.csv[0m
[92mLoading preprocessed data for GS from output_data/GS_processed.csv[0m
[92mFinal dataset ready![0m
dict_keys(['AAPL', 'GOOGL', 'MSFT', 'JPM', 'BAC', 'GS'])
[94m Train data shape:  (3549, 60, 17) [0m
[92mno of windows, window timesteps, columns[0m
[94m Test data shape:  (888, 60, 17) [0m
[94m domain allocs:  [[tensor(0.5000), tensor(0.5000), tensor(0.5000)]] [0m
[94m asset allocs initialized:  [[tensor(0.1667), tensor(0.1667), tensor(0.1667)], [tensor(0.1667), tensor(0.1667), tensor(0.1667)]] [0m


In [8]:
seq_data["AAPL"]["data"].keys()

Index(['close', 'high', 'low', 'open', 'volume', 'return', 'volatility',
       'momentum', 'SMA_10', 'EMA_10', 'RSI_14', 'BB_High', 'BB_Low',
       'Log_Returns', 'Volatility_10', 'Hurst', 'GARCH_Vol'],
      dtype='object')

In [32]:
train()

[92mstarting epoch 0...[0m
[91m asset buffer before:  torch.Size([1, 60, 17]) [0m
[91m asset buffer after:  torch.Size([2, 60, 17]) [0m
[91m asset buffer before:  torch.Size([2, 60, 17]) [0m
[91m asset buffer after:  torch.Size([3, 60, 17]) [0m
[35mmaster signal:  tensor([[0.5000]]) [0m
[35mh_assets:  tensor([[[-0.9197,  0.6746],
         [-0.3599,  1.2104],
         [-1.5007,  1.4691]]]) [0m
[92mh_out from attention pool is:  tensor([[0.5220, 1.6478]], grad_fn=<SumBackward1>)
[93mattention pool done (domain agent)[0m
[94m h_sector tensor([[0.5220, 1.6478]], grad_fn=<SumBackward1>) [0m
[94m master_signal tensor([[0.5000]]) [0m
[94m mem tensor([[0.]]) [0m
[92m all allocations (domain output) tensor([[0.3839, 0.1371, 0.3135, 0.1656]]) [0m
[35mdomain done[0m
[91m asset buffer before:  torch.Size([3, 60, 17]) [0m
[91m asset buffer after:  torch.Size([4, 60, 17]) [0m
[91m asset buffer before:  torch.Size([4, 60, 17]) [0m
[91m asset buffer after:  torch.Size

  domain_buffer["returns"] = torch.cat([domain_buffer["returns"], torch.tensor(r_t_domain).detach().unsqueeze(0)], dim=0)
  master_buffer["returns"] = torch.cat([master_buffer["returns"], torch.tensor(master_returns).detach().unsqueeze(0)], dim=0)


[91m asset buffer before:  torch.Size([7, 60, 17]) [0m
[91m asset buffer after:  torch.Size([8, 60, 17]) [0m
[91m asset buffer before:  torch.Size([8, 60, 17]) [0m
[91m asset buffer after:  torch.Size([9, 60, 17]) [0m
[35mmaster signal:  tensor([[0.3030]]) [0m
[35mh_assets:  tensor([[[ 0.0252,  0.3754],
         [-0.3694,  1.0179],
         [-0.2295,  0.2936]]]) [0m
[92mh_out from attention pool is:  tensor([[0.6650, 1.0546]], grad_fn=<SumBackward1>)
[93mattention pool done (domain agent)[0m
[94m h_sector tensor([[0.6650, 1.0546]], grad_fn=<SumBackward1>) [0m
[94m master_signal tensor([[0.3030]]) [0m
[94m mem tensor([[0.4211]]) [0m
[92m all allocations (domain output) tensor([[0.2044, 0.0154, 0.0884, 0.6918]]) [0m
[35mdomain done[0m
[91m asset buffer before:  torch.Size([9, 60, 17]) [0m
[91m asset buffer after:  torch.Size([10, 60, 17]) [0m
[91m asset buffer before:  torch.Size([10, 60, 17]) [0m
[91m asset buffer after:  torch.Size([11, 60, 17]) [0m
[9