# XGBoost FQI Agent with Time-Series Cross-Validation

This notebook implements a Fitted Q-Iteration (FQI) agent using XGBoost and evaluates it using rigorous time-series cross-validation.

## 1. Imports and Setup

In [22]:
import numpy as np
import pandas as pd
import polars as pl
import torch
import xgboost as xgb
import random
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit
from collections import deque

# Set up the device for XGBoost
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


## 2. Competition Evaluation Function

This is the official scoring metric.

In [23]:
def calculate_competition_score(y_true_df: pd.DataFrame, y_pred_signals: np.ndarray) -> float:
    ''' Calculates the competition score based on true values and predicted signals. '''
    solution = y_true_df.copy() # Use .copy() to avoid SettingWithCopyWarning
    solution['position'] = y_pred_signals
    solution['strategy_returns'] = (
        solution['risk_free_rate'] * (1 - solution['position']) +
        solution['position'] * solution['forward_returns']
    )
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_geo_mean = (1 + strategy_excess_returns).prod() ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()
    
    # Handle zero std dev (e.g., if agent learns to always output 0)
    if strategy_std < 1e-10: return 0.0 
    
    trading_days_per_yr = 252
    sharpe = strategy_geo_mean / strategy_std * np.sqrt(trading_days_per_yr)
    market_std = solution['forward_returns'].std()
    market_volatility = market_std * np.sqrt(trading_days_per_yr) * 100
    strategy_volatility = strategy_std * np.sqrt(trading_days_per_yr) * 100
    
    # Handle zero market vol
    if market_volatility < 1e-8: 
        excess_vol = 0.0
    else:
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
        
    vol_penalty = 1 + excess_vol
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(solution)) - 1
    return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap**2) / 100
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    
    return adjusted_sharpe

## 3. Reinforcement Learning Environment

This environment provides a simple, step-by-step reward based on portfolio returns for training the agent.

In [24]:
class RlTradingEnv:
    """
    A simple RL environment for trading.
    It provides a per-step reward based on the 'target' column.
    It uses 'calculate_competition_score' for final evaluation.
    """
    def __init__(self, features, targets, scorer_info, transaction_cost=0.0001):
        self.features = features.to_numpy()
        self.targets = targets.to_numpy().flatten()
        self.scorer_info_df = scorer_info.to_pandas() 
        
        self.transaction_cost = transaction_cost
        self.reward_scale = 100.0
        
        self.n_steps = len(self.features)
        self.n_features = self.features.shape[1]
        self.action_space_dim = 1
        self.max_action = 2.0
        
        self.current_step = 0
        self.last_leverage = 0.0
        self.nan_fallback_counter = 0

    def reset(self):
        """Resets the environment and returns the first state."""
        self.current_step = 0
        self.last_leverage = 0.0
        self.nan_fallback_counter = 0
        return self.features[self.current_step]

    def step(self, action_leverage):
        """
        Takes an action, calculates reward, and returns the next state.
        """
        if self.current_step >= self.n_steps - 2:
            return self.features[self.current_step], 0.0, True

        leverage = np.clip(action_leverage, 0.0, self.max_action)[0]
        target_excess_return = self.targets[self.current_step]
        
        cost = self.transaction_cost * abs(leverage - self.last_leverage)
        reward = (leverage * target_excess_return) - cost
        
        if np.isnan(reward) or np.isinf(reward):
            reward = 0.0 
            self.nan_fallback_counter += 1

        self.last_leverage = leverage
        self.current_step += 1
        next_state = self.features[self.current_step]
        done = (self.current_step == self.n_steps - 2)
        
        return next_state, (reward * self.reward_scale), done

    def run_evaluation(self, policy_agent):
        """
        Runs a full backtest with a given policy (agent)
        and returns the final adjusted Sharpe score.
        
        This can accept both PyTorch and XGBoost agents.
        """
        state = self.reset()
        signals = []
        
        for t in range(self.n_steps - 1):
            # Check if it's a PyTorch agent (like SAC or TD3)
            # Note: We aren't using this now, but it keeps the env flexible.
            if hasattr(policy_agent, 'actor') and isinstance(policy_agent.actor, torch.nn.Module):
                actor_device = next(policy_agent.actor.parameters()).device
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(actor_device)
                if hasattr(policy_agent, 'select_action_deterministic'):
                    action = policy_agent.select_action_deterministic(state_tensor)
                else: # Fallback for TD3-style agent
                    action = policy_agent.actor(state_tensor).cpu().data.numpy().flatten()
            
            # Check if it's our XGBoost FQI agent
            elif isinstance(policy_agent, XGBoost_FQI_Agent):
                action = [policy_agent.select_action(state, exploration_rate=0.0)]
            
            else:
                raise TypeError("Unknown agent type passed to run_evaluation")
                
            signals.append(np.clip(action, 0.0, self.max_action)[0])
            
            # We don't need reward here, just the next state
            if t < self.n_steps - 2:
                state = self.features[t + 1]
        
        # Score the *entire* run
        scorer_df_trimmed = self.scorer_info_df.iloc[:len(signals)]
        return calculate_competition_score(scorer_df_trimmed, np.array(signals))

## 4. Replay Buffer

A simple deque-based replay buffer to store (s, a, r, s', d) transitions.

In [25]:
class ReplayBuffer:
    """A simple replay buffer for off-policy learning."""
    def __init__(self, max_size=1_000_000):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        return (np.array(states), np.array(actions), 
                np.array(rewards).reshape(-1, 1), 
                np.array(next_states), 
                np.array(dones).reshape(-1, 1))

    def __len__(self):
        return len(self.buffer)

## 5. Data Loading and Feature Engineering

This section contains the feature generation logic and the main data loading function. `load_full_data_for_cv` is designed to load the *entire* dataset, which we can then split using `TimeSeriesSplit`.

In [26]:
def generate_features_7 (df: pl.DataFrame) -> pl.DataFrame:
  """Generates new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
  """
  new_features = pl.DataFrame({
      # --- 20 Pairwise Interactions ---
      'feat_M1_x_V1': df['M1'] * df['V1'],
      'feat_P1_add_E1': df['P1'] + df['E1'],
      'feat_S1_sub_I1': df['S1'] - df['I1'],
      'feat_M10_div_V10': df['M10'] / (df['V10'] + 1e-6),
      'feat_P10_x_E10': df['P10'] * df['E10'],
      'feat_M2_x_S3': df['M2'] * df['S3'],
      'feat_V2_div_P2': df['V2'] / (df['P2'] + 1e-6),
      'feat_E4_sub_I3': df['E4'] - df['I3'],
      'feat_S7_add_M12': df['S7'] + df['M12'],
      'feat_I5_x_V11': df['I5'] * df['V11'],
      'feat_P5_div_S8': df['P5'] / (df['S8'] + 1e-6),
      'feat_E12_x_I9': df['E12'] * df['I9'],
      'feat_M1_div_S1': df['M1'] / (df['S1'] + 1e-6),
      'feat_V1_add_P1': df['V1'] + df['P1'],
      'feat_E1_sub_I1': df['E1'] - df['I1'],
      'feat_M2_div_V2': df['M2'] / (df['V2'] + 1e-6),
      'feat_P2_x_S3': df['P2'] * df['S3'],
      'feat_E4_add_M10': df['E4'] + df['M10'],
      'feat_I3_sub_V10': df['I3'] - df['V10'],
      'feat_S7_x_P10': df['S7'] * df['P10'],
      # --- 10 Rolling Window Features ---
      'feat_V2_roll_mean_5': df['V2'].rolling_mean(window_size=5),
      'feat_V1_roll_std_5': df['V1'].rolling_std(window_size=5),
      'feat_M1_roll_mean_20': df['M1'].rolling_mean(window_size=20),
      'feat_M3_roll_std_20': df['M3'].rolling_std(window_size=20),
      'feat_P1_roll_max_10': df['P1'].rolling_max(window_size=10),
      'feat_P1_roll_min_10': df['P1'].rolling_min(window_size=10),
      'feat_E5_roll_mean_50': df['E5'].rolling_mean(window_size=50),
      'feat_S1_roll_std_50': df['S1'].rolling_std(window_size=50),
      'feat_I1_roll_mean_10': df['I1'].rolling_mean(window_size=10),
      'feat_V10_roll_std_10': df['V10'].rolling_std(window_size=10),
      # --- 10 Complex Interactions (3+ elements) ---
      'feat_M1_V1_div_P1': (df['M1'] * df['V1']) / (df['P1'] + 1e-6),
      'feat_E1_S1_add_I1': df['E1'] + df['S1'] - df['I1'],
      'feat_M2_P2_sub_V2': df['M2'] + df['P2'] - df['V2'],
      'feat_S7_div_E4_I3': df['S7'] / (df['E4'] + df['I3'] + 1e-6),
      'feat_P5_x_M10_x_V10': df['P5'] * df['M10'] * df['V10'],
      'feat_roll_diff_M1_5_20': df['M1'].rolling_mean(window_size=5) - df['M1'].rolling_mean(window_size=20),
      'feat_roll_diff_V1_5_20': df['V1'].rolling_mean(window_size=5) - df['V1'].rolling_mean(window_size=20),
      'feat_M_S_P_combo': (df['M12'] - df['M1']) / (df['S1'] + df['P1'] + 1e-6),
      'feat_V_E_I_combo': (df['V11'] + df['V2']) * (df['E1'] - df['I1']),
      'feat_ratio_of_ratios': (df['M1']/(df['V1']+1e-6)) / (df['P1']/(df['S1']+1e-6)),
      # --- 10 New Features ---
      'feat_M1_x_V1_x_P1': df['M1'] * df['V1'] * df['P1'],
      'feat_E1_div_S1': df['E1'] / (df['S1'] + 1e-6),
      'feat_I1_sub_V1': df['I1'] - df['V1'],
      'feat_M10_add_V10': df['M10'] + df['V10'],
      'feat_P10_div_E10': df['P10'] / (df['E10'] + 1e-6),
      'feat_M2_add_S3': df['M2'] + df['S3'],
      'feat_V2_x_P2': df['V2'] * df['P2'],
      'feat_E4_add_I3': df['E4'] + df['I3'],
      'feat_S7_div_M12': df['S7'] / (df['M12'] + 1e-6),
      'feat_I5_div_V11': df['I5'] / (df['V11'] + 1e-6),
  })
  # Fill any nulls created by rolling windows
  return new_features.with_columns(pl.all().forward_fill())


def load_full_data_for_cv(train_path, spy_path, slice_start=2000):
    """
    Loads and processes the *entire* dataset for use in cross-validation.
    This is the robust version from your SAC CV loop.
    """
    
    # 1. Load base data and slice
    full_train_df = pl.read_csv(train_path)
    df_raw = full_train_df.slice(slice_start)
    
    # 2. Basic cleaning (robust version)
    df = df_raw.with_columns(
        pl.selectors.float().replace([np.inf, -np.inf], None) 
    ).with_columns(
        pl.selectors.numeric().fill_null(0.0) 
    )
    df = df.with_columns(pl.col("date_id").cast(pl.Int64))

    # 3. Add weekday feature
    spy_df = pl.read_csv(spy_path)
    weekday_df = spy_df.with_columns(
        pl.col("Date").str.to_date().dt.weekday().alias("weekday")
    ).select(["date_id", "weekday"])
    df_with_weekday = df.join(weekday_df, on="date_id", how="left").fill_null(0.0)
    
    # 4. Prep for feature generation
    base_df = df_with_weekday.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    if 'E7' in base_df.columns:
        base_df = base_df.drop('E7')
        
    # Pre-clean base_df
    base_df = base_df.with_columns(
        pl.selectors.float().replace([np.inf, -np.inf], None)
    ).with_columns(
        pl.all().fill_null(0.0).forward_fill().backward_fill()
    )

    # 5. Generate and combine features
    new_features_df = generate_features_7(base_df) 
    processed_df = pl.concat([base_df, new_features_df], how="horizontal")
    
    # 6. Finalize X, y, and scorer_info
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_feature_names = new_features_df.columns
    ALL_FEATURES = base_features + new_feature_names
    
    Xy = pl.concat([processed_df.select(ALL_FEATURES), 
                    processed_df.select("target"), 
                    processed_df.select(["forward_returns", "risk_free_rate"])], 
                   how="horizontal")
    
    # Final robust cleaning
    Xy = Xy.with_columns(
        pl.selectors.float().replace([np.inf, -np.inf], None)
    )
    original_rows = Xy.height
    Xy = Xy.drop_nulls()
    cleaned_rows = Xy.height
    
    X = Xy.select(ALL_FEATURES)
    y = Xy.select("target")
    scorer_info_df = Xy.select(["forward_returns", "risk_free_rate"])
    
    print(f"Data ready. Total cleaned rows: {cleaned_rows} (from {original_rows})")
    print(f"Features shape: {X.shape}, Target shape: {y.shape}")
    
    return X, y, scorer_info_df

## 6. XGBoost FQI Agent

This is the Fitted Q-Iteration (FQI) agent using XGBoost. It learns a Q-function (value of state-action pairs) offline.

In [27]:
class XGBoost_FQI_Agent:
    """
    Fitted Q-Iteration (FQI) agent using XGBoost.
    This requires a DISCRETE action space.
    """
    def __init__(self, state_dim, gamma=0.95, n_actions=11):
        # 1. DISCRETIZE the action space
        self.action_space = np.linspace(0.0, 2.0, num=n_actions)
        self.n_actions = len(self.action_space)
        print(f"XGBoost agent initialized with {self.n_actions} discrete actions: {np.round(self.action_space, 2)}")
        
        self.state_dim = state_dim
        self.gamma = gamma
        
        # We use a MultiOutputRegressor to fit one XGB model per action
        self.model = MultiOutputRegressor(
            xgb.XGBRegressor(
                objective='reg:squarederror', n_estimators=20,
                learning_rate=0.05, max_depth=5, subsample=0.8, 
                colsample_bytree=0.8, n_jobs=-1, random_state=42,
                device=device 
            )
        )
        # Fit with dummy data to initialize
        self.model.fit(np.random.rand(1, state_dim), np.random.rand(1, self.n_actions))

    def select_action(self, state, exploration_rate=0.1):
        """Selects the best action (epsilon-greedy)."""
        if np.random.rand() < exploration_rate:
            return np.random.choice(self.action_space)
        
        q_values = self.model.predict(state.reshape(1, -1))[0]
        best_action_idx = np.argmax(q_values)
        return self.action_space[best_action_idx]

    def train_fqi(self, replay_buffer, iterations=3, batch_size=500_000):
        """
        Trains the XGBoost model using Fitted Q-Iteration.
        This is an OFFLINE, BATCH process.
        It samples from the buffer to keep training time reasonable.
        """
        print(f"\n--- Starting XGBoost FQI Training ({iterations} iterations) ---")
        
        if len(replay_buffer) < 1000:
             print("Buffer not full enough. Need at least 1000 samples. Skipping training.")
             return
        
        # Sample a large batch from the buffer
        # Use min() to avoid error if buffer is smaller than batch_size
        sample_size = min(len(replay_buffer), batch_size)
        print(f"Sampling {sample_size} transitions from buffer...")
        states, actions, rewards, next_states, dones = replay_buffer.sample(sample_size)
        
        for k in range(iterations):
            print(f"FQI Iteration {k+1}/{iterations}...")
            
            # 1. Calculate the target Q-value (Bellman update)
            # Q_target(s, a) = r + gamma * max_a'(Q_k(s', a'))
            
            # Predict Q(s', a') for all next_states
            next_q_values = self.model.predict(next_states)
            
            # Find max_a'(Q_k(s', a'))
            max_next_q = np.max(next_q_values, axis=1)
            
            # The target: r + gamma * max_Q (or just r if done)
            target_q = rewards.flatten() + (1.0 - dones.flatten()) * self.gamma * max_next_q
            
            # 2. Create the training set for the *new* XGBoost model
            # We want to train Q_k+1(s, a) -> target_q
            X_train = states
            
            # Start with the model's current predictions as the base
            y_train = self.model.predict(X_train)
            
            # Update y_train at the index of the action *actually taken*
            for i in range(len(states)):
                action_taken = actions[i][0]
                # Find the *closest* index of the action in our discrete space
                action_idx = (np.abs(self.action_space - action_taken)).argmin()
                y_train[i, action_idx] = target_q[i]

            # 4. Train a new XGBoost model on these (s, a) -> target_q pairs
            print("Fitting new XGBoost model...")
            self.model.fit(X_train, y_train)
            
        print("--- FQI Training Complete ---")

## 7. Main Cross-Validation Execution

This is the main script to run the cross-validation for the `XGBoost_FQI_Agent`.

In [29]:
# --- 1. Define Paths and CV Parameters ---
TRAIN_DATA_PATH = "./kaggle/train.csv"
SPY_DATA_PATH = "./kaggle/spy-historical.csv"

N_SPLITS = 20              # Number of folds for cross-validation
FQI_ITERATIONS = 3         # Number of FQI iterations per fold
FQI_BATCH_SIZE = 1000_000   # Max samples to use for training FQI
TRANSACTION_COST = 0 

# --- 2. Load Full Dataset ---
print("Loading and preparing full dataset for CV...")
X_full, y_full, scorer_info_full = \
    load_full_data_for_cv(TRAIN_DATA_PATH, SPY_DATA_PATH, slice_start=2000)

# --- 3. Setup CV Loop ---
tscv = TimeSeriesSplit(n_splits=N_SPLITS)
all_fold_scores = []

print("\n" + "="*50)
print(f"Starting XGBoost FQI Cross-Validation with {N_SPLITS} Folds")
print("="*50)

fold_num = 0
for train_index, test_index in tscv.split(X_full):
    fold_num += 1
    print(f"\n--- Starting Fold {fold_num}/{N_SPLITS} ---")
    print(f"Train indices: {len(train_index)}, Test indices: {len(test_index)}")

    # --- 4. Create data and environments for THIS fold ---
    X_train, X_test = X_full[train_index], X_full[test_index]
    y_train, y_test = y_full[train_index], y_full[test_index]
    scorer_train, scorer_test = scorer_info_full[train_index], scorer_info_full[test_index]
    
    # Use 0 transaction cost for data collection to get cleaner Q-values
    train_env_fold = RlTradingEnv(X_train, y_train, scorer_train, transaction_cost=0)
    # Use real transaction cost for evaluation
    test_env_fold = RlTradingEnv(X_test, y_test, scorer_test, transaction_cost=TRANSACTION_COST)

    # --- 5. CRITICAL: Re-initialize Agent and Buffer for each fold ---
    state_dim = train_env_fold.n_features
    agent_fqi_fold = XGBoost_FQI_Agent(state_dim, gamma=0.95, n_actions=5)
    buffer_fqi_fold = ReplayBuffer(max_size=2_000_000) # Large buffer
    
    # --- 6. Populate Buffer (Offline Data Collection) ---
    # We use a simple random policy to explore the state-action space
    print(f"Populating replay buffer with {train_env_fold.n_steps} transitions...")
    state = train_env_fold.reset()
    done = False
    while not done:
        action = np.random.choice(agent_fqi_fold.action_space)
        next_state, reward, done = train_env_fold.step(np.array([action]))
        buffer_fqi_fold.add(state, np.array([action]), reward, next_state, done)
        state = next_state
        
    # --- 7. Train Agent (Offline FQI) ---
    agent_fqi_fold.train_fqi(
        buffer_fqi_fold, 
        iterations=FQI_ITERATIONS, 
        batch_size=FQI_BATCH_SIZE
    )
    
    # --- 8. Evaluate Agent on Test Fold ---
    print("Evaluating agent on test fold...")
    # We use run_evaluation with exploration_rate=0.0 (deterministic policy)
    eval_score = test_env_fold.run_evaluation(agent_fqi_fold)
    all_fold_scores.append(eval_score)
    print(f"--- Fold {fold_num} Complete. Test Sharpe: {eval_score:.4f} ---")

# --- 9. Final Results ---
print("\n" + "="*50)
print("Cross-Validation Complete - Aggregated Results")
print("="*50)

scores_array = np.array(all_fold_scores)

print(f"Scores per fold: {np.round(scores_array, 4)}")
print("\n--- Final Model Performance ---")
print(f"Mean Adjusted Sharpe: {np.mean(scores_array):.4f}")
print(f"Std Dev of Sharpe:  {np.std(scores_array):.4f}")

Loading and preparing full dataset for CV...
Data ready. Total cleaned rows: 6941 (from 6990)
Features shape: (6941, 144), Target shape: (6941, 1)

Starting XGBoost FQI Cross-Validation with 20 Folds

--- Starting Fold 1/20 ---
Train indices: 341, Test indices: 330
XGBoost agent initialized with 5 discrete actions: [0.  0.5 1.  1.5 2. ]
Populating replay buffer with 341 transitions...

--- Starting XGBoost FQI Training (3 iterations) ---
Buffer not full enough. Need at least 1000 samples. Skipping training.
Evaluating agent on test fold...
--- Fold 1 Complete. Test Sharpe: 0.0645 ---

--- Starting Fold 2/20 ---
Train indices: 671, Test indices: 330
XGBoost agent initialized with 5 discrete actions: [0.  0.5 1.  1.5 2. ]
Populating replay buffer with 671 transitions...

--- Starting XGBoost FQI Training (3 iterations) ---
Buffer not full enough. Need at least 1000 samples. Skipping training.
Evaluating agent on test fold...
--- Fold 2 Complete. Test Sharpe: 0.0000 ---

--- Starting Fold