In [1]:
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import xgboost as xgb
import random
from sklearn.multioutput import MultiOutputRegressor
from collections import deque
from copy import deepcopy


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Copying your exact evaluation function ---
# We will use this to score our agent at the end of each training epoch.
def calculate_competition_score(y_true_df: pd.DataFrame, y_pred_signals: np.ndarray) -> float:
    ''' Calculates the competition score based on true values and predicted signals. '''
    solution = y_true_df.copy() # Use .copy() to avoid SettingWithCopyWarning
    solution['position'] = y_pred_signals
    solution['strategy_returns'] = (
        solution['risk_free_rate'] * (1 - solution['position']) +
        solution['position'] * solution['forward_returns']
    )
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_geo_mean = (1 + strategy_excess_returns).prod() ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()
    
    # Handle zero std dev (e.g., if agent learns to always output 0)
    if strategy_std < 1e-8: return 0.0 
    
    trading_days_per_yr = 252
    sharpe = strategy_geo_mean / strategy_std * np.sqrt(trading_days_per_yr)
    market_std = solution['forward_returns'].std()
    market_volatility = market_std * np.sqrt(trading_days_per_yr) * 100
    strategy_volatility = strategy_std * np.sqrt(trading_days_per_yr) * 100
    
    # Handle zero market vol
    if market_volatility < 1e-8: 
        excess_vol = 0.0
    else:
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
        
    vol_penalty = 1 + excess_vol
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(solution)) - 1
    return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap**2) / 100
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    
    # print(f"Strategy Vol: {strategy_volatility:.2f}%, Market Vol: {market_volatility:.2f}%, Sharpe: {sharpe:.4f}, Adj Sharpe: {adjusted_sharpe:.4f}")
    return adjusted_sharpe

# class RlTradingEnv:
#     """
#     An RL environment built from your data pipeline.
    
#     It provides a per-step reward based on the 'target' column.
#     It uses your 'calculate_competition_score' for final evaluation.
#     """
#     """
#     An RL environment built from your data pipeline.
#     (With nan_fallback_counter)
#     """
#     def __init__(self, features, targets, scorer_info, transaction_cost=0.0001):
#         self.features = features.to_numpy()
#         self.targets = targets.to_numpy().flatten()
#         self.scorer_info_df = scorer_info.to_pandas() 
        
#         self.transaction_cost = transaction_cost
#         self.reward_scale = 100.0  # Let's keep the scaling for when it works
        
#         self.n_steps = len(self.features)
#         self.n_features = self.features.shape[1]
#         self.action_space_dim = 1
#         self.max_action = 2.0
        
#         self.current_step = 0
#         self.last_leverage = 0.0
        
#         # --- ADD COUNTER ---
#         self.nan_fallback_counter = 0

#     def reset(self):
#         """Resets the environment and returns the first state."""
#         self.current_step = 0
#         self.last_leverage = 0.0
        
#         # --- RESET COUNTER ---
#         self.nan_fallback_counter = 0
#         return self.features[self.current_step]

#     def step(self, action_leverage):
#         """
#         Takes an action, calculates reward, and returns the next state.
#         (With NaN/inf failsafe and counter)
#         """
#         if self.current_step >= self.n_steps - 2:
#             return self.features[self.current_step], 0.0, True

#         leverage = np.clip(action_leverage, 0.0, self.max_action)[0]
#         target_excess_return = self.targets[self.current_step]
        
#         cost = self.transaction_cost * abs(leverage - self.last_leverage)
#         reward = (leverage * target_excess_return) - cost
        
#         if np.isnan(reward) or np.isinf(reward):
#             reward = 0.0 
#             # --- INCREMENT COUNTER ---
#             self.nan_fallback_counter += 1

#         self.last_leverage = leverage
#         self.current_step += 1
#         next_state = self.features[self.current_step]
#         done = (self.current_step == self.n_steps - 2)
        
#         return next_state, (reward * self.reward_scale), done


#     def run_evaluation(self, policy_actor):
#         """
#         Runs a full backtest with a given policy (actor)
#         and returns the final adjusted Sharpe score.
#         """
#         state = self.reset()
#         signals = []
#         # --- FIX IS HERE ---
#         # If we have a torch actor, get its device (e.g., 'cuda:0')
#         actor_device = None
#         if isinstance(policy_actor, Actor):
#             actor_device = next(policy_actor.parameters()).device
#         # --- END FIX ---
        
#         for t in range(self.n_steps - 1):
#             if isinstance(policy_actor, Actor): # It's our MLP/TD3 actor
#                 # --- FIX IS HERE ---
#                 # Create tensor and move it to the actor's device
#                 state_tensor = torch.FloatTensor(state).unsqueeze(0).to(actor_device)
#                 # --- END FIX ---
#                 #state_tensor = torch.FloatTensor(state).unsqueeze(0)
#                 action = policy_actor(state_tensor).cpu().data.numpy().flatten()
#             else: # It's our XGBoost agent
#                 action = [policy_actor.select_action(state, exploration_rate=0.0)]
                
#             signals.append(np.clip(action, 0.0, self.max_action)[0])
            
#             # We don't need reward here, just the next state
#             if t < self.n_steps - 2:
#                 state = self.features[t + 1]
        
#         # Score the *entire* run
#         # We need to trim the scorer_info_df to match the length of signals
#         scorer_df_trimmed = self.scorer_info_df.iloc[:len(signals)]
#         return calculate_competition_score(scorer_df_trimmed, np.array(signals))

class RlTradingEnv:
    """
    An RL environment with a HYBRID reward:
    (1 - lambda) * DSR_reward + (lambda) * Return_reward
    """
    def __init__(self, features, targets, scorer_info, transaction_cost=0.0001):
        self.features = features.to_numpy()
        self.targets = targets.to_numpy().flatten()
        self.scorer_info_df = scorer_info.to_pandas() 
        
        self.transaction_cost = transaction_cost
        
        self.n_steps = len(self.features)
        self.n_features = self.features.shape[1]
        self.action_space_dim = 1
        self.max_action = 2.0
        
        # --- REWARD HYPERPARAMETERS ---
        # 0.0 = all DSR, 1.0 = all Return
        self.reward_lambda = 1
        self.return_reward_scale = 100.0 # Scale for the "fast" return reward
        
        # --- DSR STATE VARIABLES ---
        self.dsr_eta = 0.01  # Learning rate for the EMAs
        self.dsr_a = 0.0     # EMA of returns
        self.dsr_b = 0.0     # EMA of squared returns
        
        self.current_step = 0
        self.last_leverage = 0.0
        self.nan_fallback_counter = 0

    def reset(self):
        """Resets the environment and returns the first state."""
        self.current_step = 0
        self.last_leverage = 0.0
        self.nan_fallback_counter = 0
        
        self.reward_lambda = self.reward_lambda/1.02
        
        # --- RESET DSR STATE ---
        self.dsr_a = 0.0
        self.dsr_b = 1e-4 # Small value to avoid div by zero
        
        return self.features[self.current_step]

    def _calculate_dsr(self, R_p):
        """
        Calculates the Differential Sharpe Ratio for a given portfolio return.
        This is the "slow" risk-adjusted reward.
        """
        # 1. Update the EMAs
        self.dsr_a = self.dsr_a + self.dsr_eta * (R_p - self.dsr_a)
        self.dsr_b = self.dsr_b + self.dsr_eta * (R_p**2 - self.dsr_b)
        
        std_dev = np.sqrt(self.dsr_b - self.dsr_a**2)
        
        if std_dev < 1e-8:
            return 0.0

        # 3. Calculate the DSR reward
        dsr_reward = ( (R_p - self.dsr_a) * self.dsr_b - 0.5 * (R_p**2 - self.dsr_b) * self.dsr_a ) \
                     / ( (self.dsr_b - self.dsr_a**2)**1.5 + 1e-6)
        
        return dsr_reward

    def step(self, action_leverage):
        """
        Takes an action, calculates reward, and returns the next state.
        (With HYBRID reward)
        """
        if self.current_step >= self.n_steps - 2:
            return self.features[self.current_step], 0.0, True

        leverage = np.clip(action_leverage, 0.0, self.max_action)[0]
        target_excess_return = self.targets[self.current_step]
        
        # --- 1. Calculate Portfolio Return (R_p) ---
        cost = self.transaction_cost * abs(leverage - self.last_leverage)
        portfolio_return = (leverage * target_excess_return) - cost
        
        # --- 2. Handle NaNs (Failsafe) ---
        if np.isnan(portfolio_return) or np.isinf(portfolio_return):
            portfolio_return = 0.0 
            self.nan_fallback_counter += 1

        # --- 3. Calculate the two reward components ---
        
        # "Fast" Reward: Simple scaled return
        # This encourages the agent to make *any* profitable trade
        fast_reward = portfolio_return * self.return_reward_scale
        
        # "Slow" Reward: DSR
        # This encourages the agent to make *risk-adjusted* trades
        slow_reward = self._calculate_dsr(portfolio_return)
        
        # Handle NaN in DSR calculation (though unlikely if R_p is clean)
        if np.isnan(slow_reward) or np.isinf(slow_reward):
            slow_reward = 0.0

        # --- 4. Combine rewards ---
        final_reward = (1.0 - self.reward_lambda) * slow_reward + self.reward_lambda * fast_reward

        if leverage < 1e-3:
            final_reward -= 0.1  # Penalize doing nothing a bit
        
        # 5. Update state
        self.last_leverage = leverage
        self.current_step += 1
        next_state = self.features[self.current_step]
        done = (self.current_step == self.n_steps - 2)
        
        return next_state, final_reward, done

    # ... (run_evaluation is unchanged) ...
    def run_evaluation(self, policy_actor):
        """
        Runs a full backtest with a given policy (actor)
        and returns the final adjusted Sharpe score.
        """
        state = self.reset()
        signals = []
        # --- FIX IS HERE ---
        # If we have a torch actor, get its device (e.g., 'cuda:0')
        actor_device = None
        if isinstance(policy_actor, Actor):
            actor_device = next(policy_actor.parameters()).device
        # --- END FIX ---
        
        for t in range(self.n_steps - 1):
            if isinstance(policy_actor, Actor): # It's our MLP/TD3 actor
                # --- FIX IS HERE ---
                # Create tensor and move it to the actor's device
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(actor_device)
                # --- END FIX ---
                #state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = policy_actor(state_tensor).cpu().data.numpy().flatten()
            else: # It's our XGBoost agent
                action = [policy_actor.select_action(state, exploration_rate=0.0)]
                
            signals.append(np.clip(action, 0.0, self.max_action)[0])
            
            # We don't need reward here, just the next state
            if t < self.n_steps - 2:
                state = self.features[t + 1]
        
        # Score the *entire* run
        # We need to trim the scorer_info_df to match the length of signals
        scorer_df_trimmed = self.scorer_info_df.iloc[:len(signals)]
        return calculate_competition_score(scorer_df_trimmed, np.array(signals))

Using device: cuda


In [2]:
# --- Replay Buffer (for off-policy learning) ---
class ReplayBuffer:
    """A simple replay buffer, as requested for 'random subset' training."""
    def __init__(self, max_size=1_000_000):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        return (np.array(states), np.array(actions), 
                np.array(rewards).reshape(-1, 1), 
                np.array(next_states), 
                np.array(dones).reshape(-1, 1))

    def __len__(self):
        return len(self.buffer)

# --- Data Loading (Adapted from your notebook) ---
# We need to copy the feature generation functions from your notebook
# I'll just copy generate_features_7 as it's the last one you used.

def generate_features_7 (df: pl.DataFrame) -> pl.DataFrame:
  """Generates new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
  """
  new_features = pl.DataFrame({
      # --- 20 Pairwise Interactions ---
      'feat_M1_x_V1': df['M1'] * df['V1'],
      'feat_P1_add_E1': df['P1'] + df['E1'],
      'feat_S1_sub_I1': df['S1'] - df['I1'],
      'feat_M10_div_V10': df['M10'] / (df['V10'] + 1e-6),
      'feat_P10_x_E10': df['P10'] * df['E10'],
      'feat_M2_x_S3': df['M2'] * df['S3'],
      'feat_V2_div_P2': df['V2'] / (df['P2'] + 1e-6),
      'feat_E4_sub_I3': df['E4'] - df['I3'],
      'feat_S7_add_M12': df['S7'] + df['M12'],
      'feat_I5_x_V11': df['I5'] * df['V11'],
      'feat_P5_div_S8': df['P5'] / (df['S8'] + 1e-6),
      'feat_E12_x_I9': df['E12'] * df['I9'],
      'feat_M1_div_S1': df['M1'] / (df['S1'] + 1e-6),
      'feat_V1_add_P1': df['V1'] + df['P1'],
      'feat_E1_sub_I1': df['E1'] - df['I1'],
      'feat_M2_div_V2': df['M2'] / (df['V2'] + 1e-6),
      'feat_P2_x_S3': df['P2'] * df['S3'],
      'feat_E4_add_M10': df['E4'] + df['M10'],
      'feat_I3_sub_V10': df['I3'] - df['V10'],
      'feat_S7_x_P10': df['S7'] * df['P10'],
      # --- 10 Rolling Window Features ---
      'feat_V2_roll_mean_5': df['V2'].rolling_mean(window_size=5),
      'feat_V1_roll_std_5': df['V1'].rolling_std(window_size=5),
      'feat_M1_roll_mean_20': df['M1'].rolling_mean(window_size=20),
      'feat_M3_roll_std_20': df['M3'].rolling_std(window_size=20),
      'feat_P1_roll_max_10': df['P1'].rolling_max(window_size=10),
      'feat_P1_roll_min_10': df['P1'].rolling_min(window_size=10),
      'feat_E5_roll_mean_50': df['E5'].rolling_mean(window_size=50),
      'feat_S1_roll_std_50': df['S1'].rolling_std(window_size=50),
      'feat_I1_roll_mean_10': df['I1'].rolling_mean(window_size=10),
      'feat_V10_roll_std_10': df['V10'].rolling_std(window_size=10),
      # --- 10 Complex Interactions (3+ elements) ---
      'feat_M1_V1_div_P1': (df['M1'] * df['V1']) / (df['P1'] + 1e-6),
      'feat_E1_S1_add_I1': df['E1'] + df['S1'] - df['I1'],
      'feat_M2_P2_sub_V2': df['M2'] + df['P2'] - df['V2'],
      'feat_S7_div_E4_I3': df['S7'] / (df['E4'] + df['I3'] + 1e-6),
      'feat_P5_x_M10_x_V10': df['P5'] * df['M10'] * df['V10'],
      'feat_roll_diff_M1_5_20': df['M1'].rolling_mean(window_size=5) - df['M1'].rolling_mean(window_size=20),
      'feat_roll_diff_V1_5_20': df['V1'].rolling_mean(window_size=5) - df['V1'].rolling_mean(window_size=20),
      'feat_M_S_P_combo': (df['M12'] - df['M1']) / (df['S1'] + df['P1'] + 1e-6),
      'feat_V_E_I_combo': (df['V11'] + df['V2']) * (df['E1'] - df['I1']),
      'feat_ratio_of_ratios': (df['M1']/(df['V1']+1e-6)) / (df['P1']/(df['S1']+1e-6)),
      # --- 10 New Features ---
      'feat_M1_x_V1_x_P1': df['M1'] * df['V1'] * df['P1'],
      'feat_E1_div_S1': df['E1'] / (df['S1'] + 1e-6),
      'feat_I1_sub_V1': df['I1'] - df['V1'],
      'feat_M10_add_V10': df['M10'] + df['V10'],
      'feat_P10_div_E10': df['P10'] / (df['E10'] + 1e-6),
      'feat_M2_add_S3': df['M2'] + df['S3'],
      'feat_V2_x_P2': df['V2'] * df['P2'],
      'feat_E4_add_I3': df['E4'] + df['I3'],
      'feat_S7_div_M12': df['S7'] / (df['M12'] + 1e-6),
      'feat_I5_div_V11': df['I5'] / (df['V11'] + 1e-6),
      #'feat_M1_log_P1': np.log(df['M1'] + 1e-6) / np.log(df['P1'] + 1e-6),
      # --- SAFER LOGIC HERE ---
      #'feat_M1_log_P1': pl.when( (df['M1'] > 0) & (df['P1'] > 0) & (df['P1'] != 1) ).then( df['M1'].log() / df['P1'].log() ).otherwise(0),
      # --- END SAFER LOGIC ---
  })
  # Fill any nulls created by rolling windows
  return new_features.with_columns(pl.all().forward_fill())



def load_and_prep_data(train_path, spy_path, slice_start=2000, test_days=252):
    """
    Loads and processes data just like your notebook.
    (Robust version to handle NaNs and infs)
    """
    
    # 1. Load base data and slice
    full_train_df = pl.read_csv(train_path)
    df_raw = full_train_df.slice(slice_start)
    
    # 2. Basic cleaning from notebook
    # Use fill_null(0.0) instead of 'mean' to avoid propagating bad data
    df = df_raw.with_columns(pl.selectors.numeric().fill_null(0.0))
    df = df.with_columns(pl.col("date_id").cast(pl.Int64))

    # 3. Add weekday feature
    spy_df = pl.read_csv(spy_path)
    weekday_df = spy_df.with_columns(
        pl.col("Date").str.to_date().dt.weekday().alias("weekday")
    ).select(["date_id", "weekday"])
    
    df_with_weekday = df.join(weekday_df, on="date_id", how="left")
    
    # 4. Prep for feature generation
    base_df = df_with_weekday.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    
    if 'E7' in base_df.columns:
        base_df = base_df.drop('E7')
        
    base_df = base_df.with_columns(pl.all().forward_fill().backward_fill())

    # 5. Generate and combine features (using corrected function)
    new_features_df = generate_features_7(base_df)
    processed_df = pl.concat([base_df, new_features_df], how="horizontal")
    
    # 6. Finalize X, y, and scorer_info
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_feature_names = new_features_df.columns
    ALL_FEATURES = base_features + new_feature_names
    
    X = processed_df.select(ALL_FEATURES)
    y = processed_df.select("target")
    scorer_info_df = processed_df.select(["forward_returns", "risk_free_rate"])
    
    # --- FINAL ROBUST CLEANING ---
    Xy = pl.concat([X, y, scorer_info_df], how="horizontal")
    Xy = Xy.with_columns(pl.all().replace([np.inf, -np.inf], None))
    original_rows = Xy.height
    Xy = Xy.drop_nulls()
    cleaned_rows = Xy.height
    # --- END FINAL CLEANING ---
    
    print(f"Data ready. Original rows: {original_rows}, Cleaned rows: {cleaned_rows}")
    
    # --- NEW: SPLIT DATA ---
    if cleaned_rows <= test_days:
        raise ValueError(f"Not enough data. Cleaned rows ({cleaned_rows}) must be greater than test_days ({test_days}).")
        
    Xy_train = Xy.slice(0, -test_days)
    Xy_test = Xy.slice(-test_days)
    
    print(f"Splitting data: {Xy_train.height} train rows, {Xy_test.height} test rows.")
    
    # Deconstruct Train Set
    X_train = Xy_train.select(ALL_FEATURES)
    y_train = Xy_train.select("target")
    scorer_info_train = Xy_train.select(["forward_returns", "risk_free_rate"])
    
    # Deconstruct Test Set
    X_test = Xy_test.select(ALL_FEATURES)
    y_test = Xy_test.select("target")
    scorer_info_test = Xy_test.select(["forward_returns", "risk_free_rate"])
    
    return X_train, y_train, scorer_info_train, X_test, y_test, scorer_info_test
    
    return X, y, scorer_info_df

In [3]:
# --- Define Actor and Critic MLP Networks ---

class Actor(nn.Module):
    """
    Actor Network (Policy) - Maps State -> Action
    (With proper final layer initialization)
    """
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dim, 256)
        self.layer_2 = nn.Linear(256, 256)
        self.layer_3 = nn.Linear(256, action_dim)
        self.max_action = max_action
        
        # --- ADD INITIALIZATION ---
        # Initialize final layer weights to be small
        # This makes the tanh output close to 0, which scales to leverage 1.0
        # This prevents the policy from starting at 0.0 leverage.
        init_w = 3e-3
        torch.nn.init.uniform_(self.layer_3.weight, -init_w, init_w)
        torch.nn.init.uniform_(self.layer_3.bias, -init_w, init_w)
        # --- END INITIALIZATION ---
    
    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        x_tanh = torch.tanh(self.layer_3(x))
        x_scaled = (x_tanh + 1) * 0.5 
        return x_scaled * self.max_action

class Critic(nn.Module):
    """
    Critic Network (Q-Value) - Maps (State, Action) -> Q-Value
    """
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # Q1 network
        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)
        
        # Q2 network
        self.l4 = nn.Linear(state_dim + action_dim, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)
        
    def forward(self, x, u):
        xu = torch.cat([x, u], 1)
        
        q1 = F.relu(self.l1(xu))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        
        q2 = F.relu(self.l4(xu))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2
    
    def Q1(self, x, u):
        xu = torch.cat([x, u], 1)
        q1 = F.relu(self.l1(xu))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1

# --- TD3 Agent ---

class TD3Agent:
    def __init__(self, state_dim, action_dim, max_action, device, gamma=0.99, 
                 tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        
        # --- MODIFY THIS BLOCK ---
        self.device = device
        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
        self.actor_target = deepcopy(self.actor).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        
        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = deepcopy(self.critic).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)
        # --- END MODIFY ---

        self.max_action = max_action
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def select_action(self, state):
        # --- MODIFY THIS LINE ---
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # --- END MODIFY ---
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=100):
        self.total_it += 1
        
        # 1. Sample from buffer
        state, action, reward, next_state, done = replay_buffer.sample(batch_size)

        # --- MODIFY THIS BLOCK ---
        # Move all tensors to the GPU
        state = torch.FloatTensor(state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        done = torch.FloatTensor(done).to(self.device)
        # --- END MODIFY ---
        
        # state = torch.FloatTensor(state)
        # action = torch.FloatTensor(action)
        # reward = torch.FloatTensor(reward)
        # next_state = torch.FloatTensor(next_state)
        # done = torch.FloatTensor(done)

        # 2. Get target action from target actor, add noise
        with torch.no_grad():
            noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(0, self.max_action)

            # 3. Compute target Q-value (Clipped Double-Q)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # 4. Update Critic networks
        current_Q1, current_Q2 = self.critic(state, action)
        
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # 5. Delayed Policy (Actor) update
        if self.total_it % self.policy_freq == 0:
            
            # Compute actor loss
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
            
            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # 6. Soft update target networks
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

In [4]:
class XGBoost_FQI_Agent:
    """
    Fitted Q-Iteration (FQI) agent using XGBoost.
    This requires a DISCRETE action space.
    """
    def __init__(self, state_dim, gamma=0.99):
        # 1. DISCRETIZE the action space
        self.action_space = np.linspace(0.0, 2.0, num=10)  # 21 discrete actions from 0.0 to 2.0
        self.n_actions = len(self.action_space)
        print(f"XGBoost agent initialized with discrete actions: {self.action_space}")
        
        self.state_dim = state_dim
        self.gamma = gamma
        
        # We use a MultiOutputRegressor to fit one XGB model per action
        self.model = MultiOutputRegressor(
            xgb.XGBRegressor(
                objective='reg:squarederror', n_estimators=100,
                learning_rate=0.05, max_depth=5, subsample=0.8, 
                colsample_bytree=0.8, n_jobs=-1, random_state=42,
                # --- THIS IS THE ONLY CHANGE NEEDED ---
                device='cuda' 
            )
        )
        # Fit with dummy data to initialize
        self.model.fit(np.random.rand(1, state_dim), np.random.rand(1, self.n_actions))

    def select_action(self, state, exploration_rate=0.1):
        """Selects the best action (epsilon-greedy)."""
        if np.random.rand() < exploration_rate:
            return np.random.choice(self.action_space)
        
        q_values = self.model.predict(state.reshape(1, -1))[0]
        best_action_idx = np.argmax(q_values)
        return self.action_space[best_action_idx]

    def train_fqi(self, replay_buffer, iterations=3):
        """
        Trains the XGBoost model using Fitted Q-Iteration.
        This is an OFFLINE, BATCH process.
        """
        print("\n--- Starting XGBoost FQI Training ---")
        
        if len(replay_buffer) < 1000:
             print("Buffer not full enough. Need at least 1000 samples.")
             return
        
        # Use the *entire* buffer, as FQI is a batch algorithm
        states, actions, rewards, next_states, dones = replay_buffer.sample(len(replay_buffer))
        
        for k in range(iterations):
            print(f"FQI Iteration {k+1}/{iterations}...")
            
            # 1. Calculate the target Q-value (Bellman update)
            # Q_target(s, a) = r + gamma * max_a'(Q_k(s', a'))
            
            # Predict Q(s', a') for all next_states
            next_q_values = self.model.predict(next_states)
            
            # Find max_a'(Q_k(s', a'))
            max_next_q = np.max(next_q_values, axis=1)
            
            # The target: r + gamma * max_Q (or just r if done)
            target_q = rewards.flatten() + (1.0 - dones.flatten()) * self.gamma * max_next_q
            
            # 2. Create the training set for the *new* XGBoost model
            # We want to train Q_k+1(s, a) -> target_q
            X_train = states
            
            # Start with the model's current predictions as the base
            y_train = self.model.predict(X_train)
            
            # Update y_train at the index of the action *actually taken*
            for i in range(len(states)):
                action_taken = actions[i][0]
                # Find the index of the action in our discrete space
                action_idx = np.where(self.action_space == action_taken)[0]
                
                if len(action_idx) > 0:
                    action_idx = action_idx[0]
                    y_train[i, action_idx] = target_q[i]

            # 4. Train a new XGBoost model on these (s, a) -> target_q pairs
            print("Fitting new XGBoost model...")
            self.model.fit(X_train, y_train)
            
        print("--- FQI Training Complete ---")

In [5]:
# --- Main execution script ---


# --- 1. Load Data and Init Env ---
print("Loading and preparing data...")
TRAIN_DATA_PATH = "./kaggle/train.csv"
SPY_DATA_PATH = "./kaggle/spy-historical.csv"

# --- MODIFIED: Load split data ---
X_train, y_train, scorer_info_train, X_test, y_test, scorer_info_test = \
    load_and_prep_data(TRAIN_DATA_PATH, SPY_DATA_PATH, slice_start=2000, test_days=252)

# --- MODIFIED: Create two environments ---
train_env = RlTradingEnv(X_train, y_train, scorer_info_train, transaction_cost=0.000001)
test_env = RlTradingEnv(X_test, y_test, scorer_info_test, transaction_cost=0.000001)

# Use train_env dimensions for the agent
state_dim = train_env.n_features
action_dim = train_env.action_space_dim
max_action = train_env.max_action

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")


# --- 2. Train MLP (TD3) Agent ---
print("\n" + "="*50)
print("Training MLP (TD3) Agent")
print("="*50)

agent_td3 = TD3Agent(state_dim, action_dim, max_action, device=device)
buffer_td3 = ReplayBuffer(max_size=500_000)

N_EPISODES = 0
BATCH_SIZE = 64
START_TRAINING = 1000 
EVAL_EVERY = 1

best_eval_score = -np.inf # Track the best test score

for ep in range(N_EPISODES):
    # --- MODIFIED: Train on train_env ---
    state = train_env.reset()
    done = False
    ep_reward_sum = 0
    
    while not done:
        if len(buffer_td3) < START_TRAINING:
            action = np.random.uniform(0, max_action, size=(action_dim,))
        else:
            action = (
                agent_td3.select_action(state)
                + np.random.normal(0, max_action * 0.1, size=action_dim)
            ).clip(0, max_action)
        
        # --- MODIFIED: Step the train_env ---
        next_state, reward, done = train_env.step(action)
        buffer_td3.add(state, action, reward, next_state, done)
        
        ep_reward_sum += reward
        state = next_state
        
        if len(buffer_td3) > START_TRAINING:
            agent_td3.train(buffer_td3, BATCH_SIZE)
    
    print(f"Episode {ep+1}/{N_EPISODES} | Train Reward: {ep_reward_sum:.2f}")
    #print(f"    Train Fallbacks: {train_env.nan_fallback_counter} times")
    
    # --- MODIFIED: Evaluate on test_env ---
    if (ep + 1) % EVAL_EVERY == 0:
        eval_score = test_env.run_evaluation(agent_td3.actor)
        print(f"--- TEST EVAL | Episode {ep+1} | Adjusted Sharpe: {eval_score:.4f} ---")
        
        if eval_score > best_eval_score:
            best_eval_score = eval_score
            # You would save your model here, e.g.:
            # torch.save(agent_td3.actor.state_dict(), 'best_actor.pth')
            print(f"    New best test score! Model saved.")

print(f"TD3 Training Complete. Best Test Sharpe: {best_eval_score:.4f}")


# # --- 3. Train XGBoost (FQI) Agent ---
# print("\n" + "="*50)
# print("Training XGBoost (FQI) Agent")
# print("="*50)

# agent_fqi = XGBoost_FQI_Agent(state_dim, gamma=0.99)
# buffer_fqi = ReplayBuffer(max_size=500_000) 

# print("Populating buffer for FQI using train_env...")
# # --- MODIFIED: Use train_env to populate buffer ---
# state = train_env.reset()
# done = False
# while not done:
#     action = np.random.choice(agent_fqi.action_space)
#     next_state, reward, done = train_env.step(np.array([action]))
#     buffer_fqi.add(state, np.array([action]), reward, next_state, done)
#     state = next_state

# # Run the offline FQI training (trains on train_env data)
# agent_fqi.train_fqi(buffer_fqi, iterations=3)

# # --- MODIFIED: Evaluate on test_env ---
# fqi_eval_score = test_env.run_evaluation(agent_fqi)
# print(f"--- FQI FINAL TEST EVAL | Adjusted Sharpe: {fqi_eval_score:.4f} ---")

Loading and preparing data...
Data ready. Original rows: 6990, Cleaned rows: 6941
Splitting data: 6689 train rows, 252 test rows.
Using device: cuda

Training MLP (TD3) Agent
TD3 Training Complete. Best Test Sharpe: -inf


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import copy # Use copy.deepcopy for target networks

# --- 1. Define the SAC Actor (Policy) Network ---
# This is different from the TD3 actor. It outputs a distribution.

class ActorSAC(nn.Module):
    """
    SAC Actor Network (Policy) - Maps State -> Action Distribution
    """
    def __init__(self, state_dim, action_dim, max_action):
        super(ActorSAC, self).__init__()
        self.layer_1 = nn.Linear(state_dim, 368)
        self.layer_2 = nn.Linear(368, 368)
        self.mean = nn.Linear(368, action_dim)      # Outputs mean of distribution
        self.log_std = nn.Linear(368, action_dim) # Outputs log std dev
        
        self.max_action = max_action
        
        # Initialize final layers for stable starting policy
        # (similar to what we did for TD3)
        init_w = 1e-2
        torch.nn.init.uniform_(self.mean.weight, -init_w, init_w)
        torch.nn.init.uniform_(self.mean.bias, -init_w, init_w)
        torch.nn.init.uniform_(self.log_std.weight, -init_w, init_w)
        torch.nn.init.uniform_(self.log_std.bias, -init_w, init_w)

    def forward(self, state, deterministic=False, with_logprob=True):
        x = F.relu(self.layer_1(state))
        x = F.relu(self.layer_2(x))
        
        mean = self.mean(x)
        
        # Clamp log_std for stability
        log_std = self.log_std(x)
        log_std = torch.clamp(log_std, min=-20, max=2)
        std = log_std.exp()
        
        # Create the distribution
        dist = Normal(mean, std)
        
        if deterministic:
            # Used for evaluation: take the mean
            u = mean
        else:
            # Used for training: sample w/ reparameterization trick
            u = dist.rsample()

        # --- This is the key SAC action-squashing logic ---
        # 1. Get action and log-probability
        # Action is squashed to [-1, 1] by tanh
        action_tanh = torch.tanh(u)
        
        if with_logprob:
            # Calculate log-prob of the *squashed* action
            # This is the complex part: log_prob = log_prob_normal - log(1 - tanh(u)^2)
            log_prob = dist.log_prob(u)
            log_prob -= torch.log(1 - action_tanh.pow(2) + 1e-6)
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            log_prob = None
            
        # 2. Rescale action from [-1, 1] to [0, max_action]
        # (tanh(u) + 1) / 2 maps to [0, 1]
        action_scaled = (action_tanh + 1) * 0.5 * self.max_action
        
        return action_scaled, log_prob

    def select_action(self, state, device):
        """Helper function to select action for stepping the env"""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # Pass deterministic=False to explore during collection
        action, _ = self.forward(state, deterministic=False, with_logprob=False)
        return action.cpu().data.numpy().flatten()


# --- 2. Define the SAC Agent ---
# This class contains the full SAC logic.
# It re-uses the 'Critic' class defined for TD3.

class SACAgent:
    def __init__(self, state_dim, action_dim, max_action, device,
                 gamma=0.99, tau=0.005, alpha=0.2, lr=3e-4, weight_decay=1e-8):
        
        self.device = device
        self.gamma = gamma
        self.tau = tau

        # --- ADD THIS LINE ---
        self.max_action = max_action
        # --- END OF FIX ---

        # --- Actor ---
        self.actor = ActorSAC(state_dim, action_dim, max_action).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr, weight_decay=weight_decay)
        
        # --- Critic (re-uses TD3's Critic class) ---
        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr, weight_decay=weight_decay)

        # --- Alpha (Entropy Coefficient) ---
        # We make it a learnable parameter
        self.target_entropy = -float(action_dim)*4
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)
        self.alpha = self.log_alpha.exp().item() # for use in loss calcs
        
    def select_action(self, state):
        return self.actor.select_action(state, self.device)

    def train(self, replay_buffer, batch_size=100):
        # 1. Sample from buffer and move to device
        state, action, reward, next_state, done = replay_buffer.sample(batch_size)
        
        state = torch.FloatTensor(state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        done = torch.FloatTensor(done).to(self.device)

        # --- 2. Update Critic (Q-Networks) ---
        with torch.no_grad():
            # Get next action and log-prob from *current* policy
            next_action_scaled, next_log_prob = self.actor(next_state)
            
            # Get Q-values from *target* critic
            target_Q1, target_Q2 = self.critic_target(next_state, next_action_scaled)
            target_Q = torch.min(target_Q1, target_Q2)
            
            # Bellman backup: r + gamma * (1-d) * (Q_target - alpha * log_prob)
            target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * next_log_prob)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)
        
        # MSE loss for both critics
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # --- 3. Update Actor (Policy) ---
        # Sample new action and log-prob from current policy
        action_scaled, log_prob = self.actor(state)
        
        # Get Q-values from *current* critic
        Q1, Q2 = self.critic(state, action_scaled)
        Q = torch.min(Q1, Q2)
        
        # Actor loss: (alpha * log_prob - Q)
        actor_loss = (self.alpha * log_prob - Q).mean()
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # --- 4. Update Alpha (Entropy Coefficient) ---
        alpha_loss = -(self.log_alpha.exp() * (log_prob + self.target_entropy).detach()).mean()
        
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        
        # Update alpha value
        self.alpha = self.log_alpha.exp().item()

        # --- 5. Soft Update Target Critic ---
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def run_evaluation(self, env):
        """
        Runs a full backtest with the SAC agent in deterministic mode.
        (This is a helper, you can also use env.run_evaluation)
        """
        state = env.reset()
        signals = []
        
        for t in range(env.n_steps - 1):
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            # Use deterministic=True for evaluation
            action, _ = self.actor(state_tensor, deterministic=True, with_logprob=False)
            action = action.cpu().data.numpy().flatten()
                
            signals.append(np.clip(action, 0.0, self.max_action)[0])
            
            if t < env.n_steps - 2:
                state = env.features[t + 1]
        
        scorer_df_trimmed = env.scorer_info_df.iloc[:len(signals)]
        return calculate_competition_score(scorer_df_trimmed, np.array(signals))
            


In [7]:

# --- 3. Main SAC Training and Evaluation Loop ---
    
# --- 1. Load Data and Init Env (Re-using existing setup) ---
print("Loading and preparing data for SAC test...")
# (Assuming these are defined in your notebook)
# TRAIN_DATA_PATH = "./kaggle/train.csv"
# SPY_DATA_PATH = "./kaggle/spy-historical.csv"

X_train, y_train, scorer_info_train, X_test, y_test, scorer_info_test = \
    load_and_prep_data(TRAIN_DATA_PATH, SPY_DATA_PATH, slice_start=2000, test_days=252)

train_env_sac = RlTradingEnv(X_train, y_train, scorer_info_train, transaction_cost=0.00001)
test_env_sac = RlTradingEnv(X_test, y_test, scorer_info_test, transaction_cost=0.00001)

state_dim = train_env_sac.n_features
action_dim = train_env_sac.action_space_dim
max_action = train_env_sac.max_action

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device} for SAC")


# --- 2. Train SAC Agent ---
print("\n" + "="*50)
print("Training SAC (Soft Actor-Critic) Agent")
print("="*50)

agent_sac = SACAgent(state_dim, action_dim, max_action, device=device)
buffer_sac = ReplayBuffer(max_size=500_000)

N_EPISODES = 0
BATCH_SIZE = 128
START_TRAINING = 1000 
EVAL_EVERY = 1

best_eval_score_sac = -np.inf 

for ep in range(N_EPISODES):
    state = train_env_sac.reset()
    done = False
    ep_reward_sum = 0
    
    while not done:
        if len(buffer_sac) < START_TRAINING:
            action = np.random.uniform(0, max_action, size=(action_dim,))
        else:
            action = agent_sac.select_action(state)
        
        next_state, reward, done = train_env_sac.step(action)
        buffer_sac.add(state, action, reward, next_state, done)
        
        ep_reward_sum += reward
        state = next_state
        
        if len(buffer_sac) > START_TRAINING:
            agent_sac.train(buffer_sac, BATCH_SIZE)
    
    print(f"SAC Episode {ep+1}/{N_EPISODES} | Train Reward: {ep_reward_sum:.2f}")
    #print(f"    Train Fallbacks: {train_env_sac.nan_fallback_counter} times | Current Alpha: {agent_sac.alpha:.4f}")
    
    # Evaluate on test_env
    if (ep + 1) % EVAL_EVERY == 0:
        # Use the agent's internal eval function which uses deterministic=True
        eval_score = agent_sac.run_evaluation(test_env_sac)
        print(f"--- SAC TEST EVAL | Episode {ep+1} | Adjusted Sharpe: {eval_score:.4f} ---")
        
        if eval_score > best_eval_score_sac:
            best_eval_score_sac = eval_score
            # torch.save(agent_sac.actor.state_dict(), 'best_actor_sac.pth')
            print(f"    New best SAC test score! Model saved.")

print(f"SAC Training Complete. Best Test Sharpe: {best_eval_score_sac:.4f}")

Loading and preparing data for SAC test...
Data ready. Original rows: 6990, Cleaned rows: 6941
Splitting data: 6689 train rows, 252 test rows.
Using device: cuda for SAC

Training SAC (Soft Actor-Critic) Agent
SAC Training Complete. Best Test Sharpe: -inf


In [8]:
from sklearn.model_selection import TimeSeriesSplit # Add this to your imports

def load_and_prep_data(train_path, spy_path, slice_start=2000):
    """
    Loads and processes the *entire* dataset for use in cross-validation.
    (Corrected for InvalidOperationError)
    """
    
    # 1. Load base data and slice
    full_train_df = pl.read_csv(train_path)
    df_raw = full_train_df.slice(slice_start)
    
    # 2. Basic cleaning
    df = df_raw.with_columns(
        # --- FIX 1 ---
        pl.selectors.float().replace([np.inf, -np.inf], None) # Was pl.all()
    ).with_columns(
        pl.selectors.numeric().fill_null(0.0) 
    )
    df = df.with_columns(pl.col("date_id").cast(pl.Int64))

    # 3. Add weekday feature (as before)
    spy_df = pl.read_csv(spy_path)
    weekday_df = spy_df.with_columns(
        pl.col("Date").str.to_date().dt.weekday().alias("weekday")
    ).select(["date_id", "weekday"])
    df_with_weekday = df.join(weekday_df, on="date_id", how="left").fill_null(0.0)
    
    # 4. Prep for feature generation (as before)
    base_df = df_with_weekday.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    if 'E7' in base_df.columns:
        base_df = base_df.drop('E7')
        
    # --- PRE-CLEAN BASE_DF ---
    base_df = base_df.with_columns(
        # --- FIX 2 ---
        pl.selectors.float().replace([np.inf, -np.inf], None) # Was pl.all()
    ).with_columns(
        pl.all().fill_null(0.0).forward_fill().backward_fill()
    )

    # 5. Generate and combine features (as before)
    new_features_df = generate_features_7(base_df) 
    processed_df = pl.concat([base_df, new_features_df], how="horizontal")
    
    # 6. Finalize X, y, and scorer_info
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_feature_names = new_features_df.columns
    ALL_FEATURES = base_features + new_feature_names
    
    Xy = pl.concat([processed_df.select(ALL_FEATURES), 
                    processed_df.select("target"), 
                    processed_df.select(["forward_returns", "risk_free_rate"])], 
                   how="horizontal")
    
    # --- FINAL ROBUST CLEANING ---
    Xy = Xy.with_columns(
        # --- FIX 3 ---
        pl.selectors.float().replace([np.inf, -np.inf], None) # Was pl.all()
    )
    original_rows = Xy.height
    Xy = Xy.drop_nulls()
    cleaned_rows = Xy.height
    
    X = Xy.select(ALL_FEATURES)
    y = Xy.select("target")
    scorer_info_df = Xy.select(["forward_returns", "risk_free_rate"])
    
    print(f"Data ready. Total cleaned rows: {cleaned_rows} (from {original_rows})")
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")
    
    return X, y, scorer_info_df

# --- 3. Main SAC Cross-Validation Training and Evaluation Loop ---


# --- 1. Load FULL Data (using new function) ---
print("Loading and preparing full dataset for CV...")
TRAIN_DATA_PATH = "./kaggle/train.csv"
SPY_DATA_PATH = "./kaggle/spy-historical.csv" # Make sure this path is correct

X_full, y_full, scorer_info_full = \
    load_and_prep_data(TRAIN_DATA_PATH, SPY_DATA_PATH, slice_start=2000)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device} for SAC")

# --- 2. Setup CV Parameters ---
N_SPLITS = 20  # Number of folds for cross-validation
N_EPISODES_PER_FOLD = 30 # As requested

BATCH_SIZE = 256
START_TRAINING = 20000 # Steps before training starts (per fold)

tscv = TimeSeriesSplit(n_splits=N_SPLITS, max_train_size=2000)

# This will store the scores for each episode from each fold
# Shape will be (N_SPLITS, N_EPISODES_PER_FOLD)
all_fold_scores = [] 

print("\n" + "="*50)
print(f"Starting SAC Cross-Validation with {N_SPLITS} Folds")
print("="*50)

fold_num = 0
for train_index, test_index in tscv.split(X_full):
    fold_num += 1
    print(f"\n--- Starting Fold {fold_num}/{N_SPLITS} ---")
    print(f"Train indices: {len(train_index)}, Test indices: {len(test_index)}")

    # --- 3. Create data and environments for THIS fold ---
    X_train, X_test = X_full[train_index], X_full[test_index]
    y_train, y_test = y_full[train_index], y_full[test_index]
    scorer_train, scorer_test = scorer_info_full[train_index], scorer_info_full[test_index]
    
    train_env_fold = RlTradingEnv(X_train, y_train, scorer_train, transaction_cost=0)
    test_env_fold = RlTradingEnv(X_test, y_test, scorer_test, transaction_cost=0)

    # --- 4. CRITICAL: Re-initialize Agent and Buffer for each fold ---
    # This prevents data leakage from previous folds
    state_dim = train_env_fold.n_features
    action_dim = train_env_fold.action_space_dim
    max_action = train_env_fold.max_action
    
    agent_sac_fold = SACAgent(state_dim, action_dim, max_action, device=device, gamma = 0.95)
    buffer_sac_fold = ReplayBuffer(max_size=500_000)
    
    fold_episode_scores = [] # Stores scores for this fold's 15 episodes
    
    # --- 5. Inner Loop: Train for 15 episodes ---
    for ep in range(N_EPISODES_PER_FOLD):
        state = train_env_fold.reset()
        done = False
        ep_reward_sum = 0
        
        while not done:
            if len(buffer_sac_fold) < START_TRAINING:
                action = np.random.uniform(0, max_action, size=(action_dim,))
            else:
                action = agent_sac_fold.select_action(state)
            
            next_state, reward, done = train_env_fold.step(action)
            buffer_sac_fold.add(state, action, reward, next_state, done)
            
            ep_reward_sum += reward
            state = next_state
            
            if len(buffer_sac_fold) > START_TRAINING:
                agent_sac_fold.train(buffer_sac_fold, BATCH_SIZE)
        
        # --- 6. Evaluate and Record Score AFTER EACH EPISODE ---
        eval_score = agent_sac_fold.run_evaluation(test_env_fold)
        fold_episode_scores.append(eval_score)
        
        print(f"  Fold {fold_num}, Ep {ep+1}/{N_EPISODES_PER_FOLD} | Train Reward: {ep_reward_sum:.2f} | Test Sharpe: {eval_score:.4f}")
    
    # This fold is done, store its 15 episode scores
    all_fold_scores.append(fold_episode_scores)
    print(f"--- Fold {fold_num} Complete. Final Test Sharpe: {eval_score:.4f} ---")


# --- 7. Final Results ---
print("\n" + "="*50)
print("Cross-Validation Complete - Aggregated Results")
print("="*50)

scores_array = np.array(all_fold_scores)

# Calculate mean and std dev across folds for each episode
mean_scores_per_episode = np.mean(scores_array, axis=0)
std_scores_per_episode = np.std(scores_array, axis=0)

print("Mean Test Sharpe Score (Adjusted) per Episode (averaged across all folds):")
for i in range(N_EPISODES_PER_FOLD):
    print(f"  Episode {i+1}: {mean_scores_per_episode[i]:.4f} +/- {std_scores_per_episode[i]:.4f}")

print("\nFinal Model Performance (Episode 15):")
print(f"  Mean: {mean_scores_per_episode[-1]:.4f}")
print(f"  Std Dev: {std_scores_per_episode[-1]:.4f}")

Loading and preparing full dataset for CV...
Data ready. Total cleaned rows: 6941 (from 6990)
Features shape: (6941, 144), Target shape: (6941, 1)
Using device: cuda for SAC

Starting SAC Cross-Validation with 20 Folds

--- Starting Fold 1/20 ---
Train indices: 341, Test indices: 330
  Fold 1, Ep 1/30 | Train Reward: 24.91 | Test Sharpe: -0.0410
  Fold 1, Ep 2/30 | Train Reward: 9.49 | Test Sharpe: -0.0410
  Fold 1, Ep 3/30 | Train Reward: 9.31 | Test Sharpe: -0.0410
  Fold 1, Ep 4/30 | Train Reward: 22.54 | Test Sharpe: -0.0410
  Fold 1, Ep 5/30 | Train Reward: 15.54 | Test Sharpe: -0.0410
  Fold 1, Ep 6/30 | Train Reward: 5.23 | Test Sharpe: -0.0410
  Fold 1, Ep 7/30 | Train Reward: 4.73 | Test Sharpe: -0.0410
  Fold 1, Ep 8/30 | Train Reward: 10.80 | Test Sharpe: -0.0410
  Fold 1, Ep 9/30 | Train Reward: 33.42 | Test Sharpe: -0.0410
  Fold 1, Ep 10/30 | Train Reward: 5.12 | Test Sharpe: -0.0410
  Fold 1, Ep 11/30 | Train Reward: 20.61 | Test Sharpe: -0.0410
  Fold 1, Ep 12/30 | Trai

In [13]:
import numpy as np
import pandas as pd
import polars as pl
import torch
# ... other imports

class RlTradingEnv:
    """
    An RL environment with a MEMORYLESS Quadratic Utility reward:
    reward = (lambda_R * R_p) - (lambda_V * R_p^2)
    
    This implements the "Contextual Bandit" approach.
    """
    def __init__(self, features, targets, scorer_info, transaction_cost=0):
        self.features = features.to_numpy()
        self.targets = targets.to_numpy().flatten()
        self.scorer_info_df = scorer_info.to_pandas() 
        
        self.transaction_cost = transaction_cost
        
        self.n_steps = len(self.features)
        self.n_features = self.features.shape[1]
        self.action_space_dim = 1
        self.max_action = 2.0
        
        # --- NEW UTILITY REWARD HYPERPARAMETERS ---
        # Tune these to find the desired risk/return trade-off
        # This is the "Return" component (your old reward_scale)
        self.lambda_return = 100.0  
        # This is the "Risk Aversion" component
        # It's the penalty for volatility. Start high and tune down.
        self.lambda_volatility = 0.0 
        
        self.current_step = 0
        self.last_leverage = 0.0
        self.nan_fallback_counter = 0

    def reset(self):
        """Resets the environment and returns the first state."""
        self.current_step = 0
        self.last_leverage = 0.0
        self.nan_fallback_counter = 0
        return self.features[self.current_step]

    # No _calculate_dsr method is needed.
    
    def step(self, action_leverage):
        """
        Takes an action, calculates memoryless utility reward,
        and returns the next state.
        """
        if self.current_step >= self.n_steps - 2:
            return self.features[self.current_step], 0.0, True

        leverage = np.clip(action_leverage, 0.0, self.max_action)[0]
        target_excess_return = self.targets[self.current_step]
        
        # --- 1. Calculate Portfolio Return (R_p) ---
        cost = self.transaction_cost * abs(leverage - self.last_leverage)
        portfolio_return = (leverage * target_excess_return) - cost
        
        # --- 2. Handle NaNs (Failsafe) ---
        if np.isnan(portfolio_return) or np.isinf(portfolio_return):
            portfolio_return = 0.0 
            self.nan_fallback_counter += 1

        # --- 3. Calculate Memoryless Utility Reward ---
        
        # Return-seeking component (scaled)
        return_component = self.lambda_return * portfolio_return
        
        # Risk-aversion component (penalizes squared return)
        # The (portfolio_return**2) term is always positive,
        # so this is a pure volatility penalty.
        risk_component = self.lambda_volatility * (portfolio_return**2)

        zero_penalty = 0.0
        if leverage < 1e-3:
            zero_penalty = np.log(leverage + 1e-6) * 0.1  # Small penalty for zero leverage
        return_component += zero_penalty
        
        # Final reward is the trade-off
        final_reward = return_component - risk_component
        
        # 4. Update state
        self.last_leverage = leverage
        self.current_step += 1
        next_state = self.features[self.current_step]
        done = (self.current_step == self.n_steps - 2)
        
        return next_state, final_reward, done

    # ... (run_evaluation is unchanged) ...
    def run_evaluation(self, policy_actor):
        """
        Runs a full backtest with a given policy (actor)
        and returns the final adjusted Sharpe score.
        """
        state = self.reset()
        signals = []
        # --- FIX IS HERE ---
        # If we have a torch actor, get its device (e.g., 'cuda:0')
        actor_device = None
        if isinstance(policy_actor, Actor):
            actor_device = next(policy_actor.parameters()).device
        # --- END FIX ---
        
        for t in range(self.n_steps - 1):
            if isinstance(policy_actor, Actor): # It's our MLP/TD3 actor
                # --- FIX IS HERE ---
                # Create tensor and move it to the actor's device
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(actor_device)
                # --- END FIX ---
                #state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = policy_actor(state_tensor).cpu().data.numpy().flatten()
            else: # It's our XGBoost agent
                action = [policy_actor.select_action(state, exploration_rate=0.0)]
                
            signals.append(np.clip(action, 0.0, self.max_action)[0])
            
            # We don't need reward here, just the next state
            if t < self.n_steps - 2:
                state = self.features[t + 1]
        
        # Score the *entire* run
        # We need to trim the scorer_info_df to match the length of signals
        scorer_df_trimmed = self.scorer_info_df.iloc[:len(signals)]
        return calculate_competition_score(scorer_df_trimmed, np.array(signals))