In [56]:
import polars as pl
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

# The training data path should be updated to your actual training file.
TRAIN_DATA_PATH = "./kaggle/train.csv"
SPY_DATA_PATH = "./kaggle/spy-historical.csv"

def generate_features_7 (df: pl.DataFrame) -> pl.DataFrame:
  """Generates new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
  """
  new_features = pl.DataFrame({
      # --- 20 Pairwise Interactions ---
      'feat_M1_x_V1': df['M1'] * df['V1'],
      'feat_P1_add_E1': df['P1'] + df['E1'],
      'feat_S1_sub_I1': df['S1'] - df['I1'],
      'feat_M10_div_V10': df['M10'] / (df['V10'] + 1e-6),
      'feat_P10_x_E10': df['P10'] * df['E10'],
      'feat_M2_x_S3': df['M2'] * df['S3'],
      'feat_V2_div_P2': df['V2'] / (df['P2'] + 1e-6),
      'feat_E4_sub_I3': df['E4'] - df['I3'],
      'feat_S7_add_M12': df['S7'] + df['M12'],
      'feat_I5_x_V11': df['I5'] * df['V11'],
      'feat_P5_div_S8': df['P5'] / (df['S8'] + 1e-6),
      'feat_E12_x_I9': df['E12'] * df['I9'],
      'feat_M1_div_S1': df['M1'] / (df['S1'] + 1e-6),
      'feat_V1_add_P1': df['V1'] + df['P1'],
      'feat_E1_sub_I1': df['E1'] - df['I1'],
      'feat_M2_div_V2': df['M2'] / (df['V2'] + 1e-6),
      'feat_P2_x_S3': df['P2'] * df['S3'],
      'feat_E4_add_M10': df['E4'] + df['M10'],
      'feat_I3_sub_V10': df['I3'] - df['V10'],
      'feat_S7_x_P10': df['S7'] * df['P10'],
      # --- 10 Rolling Window Features ---
      'feat_V2_roll_mean_5': df['V2'].rolling_mean(window_size=5),
      'feat_V1_roll_std_5': df['V1'].rolling_std(window_size=5),
      'feat_M1_roll_mean_20': df['M1'].rolling_mean(window_size=20),
      'feat_M3_roll_std_20': df['M3'].rolling_std(window_size=20),
      'feat_P1_roll_max_10': df['P1'].rolling_max(window_size=10),
      'feat_P1_roll_min_10': df['P1'].rolling_min(window_size=10),
      'feat_E5_roll_mean_50': df['E5'].rolling_mean(window_size=50),
      'feat_S1_roll_std_50': df['S1'].rolling_std(window_size=50),
      'feat_I1_roll_mean_10': df['I1'].rolling_mean(window_size=10),
      'feat_V10_roll_std_10': df['V10'].rolling_std(window_size=10),
      # --- 10 Complex Interactions (3+ elements) ---
      'feat_M1_V1_div_P1': (df['M1'] * df['V1']) / (df['P1'] + 1e-6),
      'feat_E1_S1_add_I1': df['E1'] + df['S1'] - df['I1'],
      'feat_M2_P2_sub_V2': df['M2'] + df['P2'] - df['V2'],
      'feat_S7_div_E4_I3': df['S7'] / (df['E4'] + df['I3'] + 1e-6),
      'feat_P5_x_M10_x_V10': df['P5'] * df['M10'] * df['V10'],
      'feat_roll_diff_M1_5_20': df['M1'].rolling_mean(window_size=5) - df['M1'].rolling_mean(window_size=20),
      'feat_roll_diff_V1_5_20': df['V1'].rolling_mean(window_size=5) - df['V1'].rolling_mean(window_size=20),
      'feat_M_S_P_combo': (df['M12'] - df['M1']) / (df['S1'] + df['P1'] + 1e-6),
      'feat_V_E_I_combo': (df['V11'] + df['V2']) * (df['E1'] - df['I1']),
      'feat_ratio_of_ratios': (df['M1']/(df['V1']+1e-6)) / (df['P1']/(df['S1']+1e-6)),
      # --- 10 New Features ---
      'feat_M1_x_V1_x_P1': df['M1'] * df['V1'] * df['P1'],
      'feat_E1_div_S1': df['E1'] / (df['S1'] + 1e-6),
      'feat_I1_sub_V1': df['I1'] - df['V1'],
      'feat_M10_add_V10': df['M10'] + df['V10'],
      'feat_P10_div_E10': df['P10'] / (df['E10'] + 1e-6),
      'feat_M2_add_S3': df['M2'] + df['S3'],
      'feat_V2_x_P2': df['V2'] * df['P2'],
      'feat_E4_add_I3': df['E4'] + df['I3'],
      'feat_S7_div_M12': df['S7'] / (df['M12'] + 1e-6),
      'feat_I5_div_V11': df['I5'] / (df['V11'] + 1e-6),
      #'feat_M1_log_P1': np.log(df['M1'] + 1e-6) / np.log(df['P1'] + 1e-6),
      # --- SAFER LOGIC HERE ---
      #'feat_M1_log_P1': pl.when( (df['M1'] > 0) & (df['P1'] > 0) & (df['P1'] != 1) ).then( df['M1'].log() / df['P1'].log() ).otherwise(0),
      # --- END SAFER LOGIC ---
  })
  # Fill any nulls created by rolling windows
  return new_features.with_columns(pl.all().forward_fill())



In [None]:
#solve: daily reranking

def solve(df: pl.DataFrame) -> float:
    """
    Runs a time-series cross-validation process using a "top-k leader"
    ensemble strategy with DAILY re-ranking.
    
    1. Trains N models at the start of each fold.
    2. Iterates DAY-BY-DAY through the test set.
    3. Each day, it finds the top k leaders based on the most recent 
       (e.g., 30-day) history.
    4. It ensembles the positions from *only* those leaders for that day.
    5. Appends the daily performance of ALL N strategies to the master 
       history to be used for the next day's ranking.
    """
    
    # --- Ensemble Configuration ---
    N_STRATEGIES = 50           # We'll create 10 strategies
    SUBSAMPLE_RATIO = 0.05       # Each strategy sees 70% of the training data
    FEATURE_GENERATOR = generate_features_7 # All strategies use this
    
    K_LEADERS = 3               # How many leaders to follow (e.g., 3)
    ROLLING_WINDOW_DAYS = 50    # Lookback period to find leaders
    MIN_HISTORY_DAYS = 25       # Min days needed to start picking leaders
    TRADING_DAYS_PER_YR = 252

    nsplits = 20

    # --- Helper functions (unchanged) ---
    
    def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
        ''' Calculates the competition score based on true values and predicted signals. '''
        solution = y_true_df.to_pandas()
        solution['position'] = y_pred_signals
        solution['strategy_returns'] = (
            solution['risk_free_rate'] * (1 - solution['position']) +
            solution['position'] * solution['forward_returns']
        )
        strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
        strategy_geo_mean = (1 + strategy_excess_returns).prod() ** (1 / len(solution)) - 1
        strategy_std = solution['strategy_returns'].std()
        if strategy_std == 0: return 0.0
        
        sharpe = strategy_geo_mean / strategy_std * np.sqrt(TRADING_DAYS_PER_YR)
        market_std = solution['forward_returns'].std()
        market_volatility = market_std * np.sqrt(TRADING_DAYS_PER_YR) * 100
        strategy_volatility = strategy_std * np.sqrt(TRADING_DAYS_PER_YR) * 100
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
        vol_penalty = 1 + excess_vol
        market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
        market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(solution)) - 1
        return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * TRADING_DAYS_PER_YR)
        return_penalty = 1 + (return_gap**2) / 100
        adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
        print(f"  Fold {i+1} (Daily): Strat Vol: {strategy_volatility:.2f}%, Mkt Vol: {market_volatility:.2f}%, Ret_penalty: {(return_penalty-1):.4f}, Sharpe: {sharpe:.4f}, Adj Sharpe: {adjusted_sharpe:.4f}")
        return adjusted_sharpe

    def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
        ''' Converts raw model predictions into trading signals in the range [0, 2]. '''
        signals = predictions * multiplier + 0.8
        #signals = -np.abs(predictions-0.001) * 1000 + 3
        return np.clip(signals, 0.0, 2.0)

    def calculate_strategy_returns_pl(y_true_df_1_day: pl.DataFrame, y_pred_signal_1_day: np.ndarray) -> pl.Series:
        """ Calculates strategy returns for a *single day*. """
        signals_series = pl.Series("position", y_pred_signal_1_day)
        df_with_pos = y_true_df_1_day.with_columns(signals_series)
        strategy_returns = (
            df_with_pos['risk_free_rate'] * (1 - df_with_pos['position']) +
            df_with_pos['position'] * df_with_pos['forward_returns']
        )
        return strategy_returns.alias("strategy_returns")

    def calculate_sharpe_for_leaderboard(hist_window: pl.DataFrame, strategy_col: str) -> float:
        """ Calculates the geometric Sharpe ratio for a single strategy from the history. """
        if hist_window.height == 0:
            return 0.0
        
        strategy_returns = hist_window[strategy_col]
        risk_free_rate = hist_window['risk_free_rate']

        market_excess_returns = (hist_window['forward_returns'] - hist_window['risk_free_rate']).to_pandas()
        
        strategy_excess_returns = (strategy_returns - risk_free_rate).to_numpy()
        
        if not np.all(np.isfinite(strategy_excess_returns)):
            print(f"Bad data for {strategy_col} in leaderboard calculation.")
            return -np.inf

        log_returns = np.log1p(strategy_excess_returns)
        strategy_geo_mean = np.exp(np.mean(log_returns)) - 1

        strategy_std = strategy_returns.std()
        
        if strategy_std is None or strategy_std == 0 or not np.isfinite(strategy_std):
            print(f"Bad std for {strategy_col} in leaderboard calculation.")
            return 0.0
            
        sharpe = strategy_geo_mean / strategy_std * np.sqrt(TRADING_DAYS_PER_YR)

        market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(market_excess_returns)) - 1
        return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * TRADING_DAYS_PER_YR)
        return_penalty = 1 + (return_gap**2) / 100

        sharpe = sharpe/return_penalty
        
        return sharpe if np.isfinite(sharpe) else -np.inf

    # --- Data Preparation (Efficient) ---
    print(f"Initial DataFrame shape: {df.shape}")
    base_df = df.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    
    if 'E7' in base_df.columns:
        base_df = base_df.drop('E7')
        
    base_df = base_df.with_columns(pl.all().forward_fill())
    print(f"Base DataFrame shape after cleaning: {base_df.shape}")

    # --- Generate ALL features ONCE ---
    print(f"Generating features using {FEATURE_GENERATOR.__name__}...")
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_features_df = FEATURE_GENERATOR(base_df)
    
    X_all_features = pl.concat([base_df.select(base_features), new_features_df], how="horizontal")
    X_all_features = base_df.select(base_features)
    print(f"Full feature set shape: {X_all_features.shape}")

    TARGET_COL = "target"
    y = base_df.select(TARGET_COL)
    scorer_info_df = base_df.select(["date_id", "forward_returns", "risk_free_rate"])

    # --- Time-Series Cross-Validation ---
    print(f"Starting DAILY re-ranking CV with {N_STRATEGIES} strategies, following top {K_LEADERS} leaders.")
    
    tscv = TimeSeriesSplit(n_splits=nsplits)
    
    cv_scores = []
    
    # Master history of OOS performance for all N strategies
    # This will be updated DAY-BY-DAY
    history_df = pl.DataFrame()
    
    # Store final ensembled signals and truths for overall score
    overall_final_signals = []
    overall_y_true = []
    
    rng = np.random.default_rng(seed=12)

    for i, (train_index, test_index) in enumerate(tscv.split(base_df)):
        print(f"--- Starting Fold {i+1}/{nsplits} (Days {test_index[0]} to {test_index[-1]}) ---")
        
        # Get the full train/test slices for this fold
        X_train_fold_full = X_all_features[train_index]
        y_train_fold_full = y[train_index]
        
        X_test_fold = X_all_features[test_index]
        y_test_info_fold = scorer_info_df[test_index] # All "truth" data for this fold

        # --- 1. Train N Models (Once per fold) ---
        # print("  Training N models...")
        N_signals_fold = [] # Will store signals for all N models
        n_samples = X_train_fold_full.height
        subset_size = int(n_samples * SUBSAMPLE_RATIO)
        
        for j in range(N_STRATEGIES):
            subset_fold_indices = rng.choice(n_samples, subset_size, replace=False)
            X_train_j = X_train_fold_full[subset_fold_indices]
            y_train_j = y_train_fold_full[subset_fold_indices]

            model = xgb.XGBRegressor(
                objective='reg:absoluteerror', n_estimators=5, device='cuda',
                learning_rate=0.05, max_depth=4, subsample=0.8, colsample_bytree=0.8,
                n_jobs=-1, random_state=42 + j 
            )
            
            model.fit(X_train_j, y_train_j, verbose=False)

            # Predict on the *entire* test fold and store signals
            predictions = model.predict(X_test_fold) 
            signals = convert_to_signal(predictions)
            N_signals_fold.append(signals)
            
        # Stack into a [days, strategies] numpy array
        N_signals_fold = np.stack(N_signals_fold, axis=1) 
        
        # --- 2. Iterate Day-By-Day through the test fold ---
        print(f"  Iterating daily and re-ranking leaders...")
        
        # Store the *final* ensembled signals for this fold
        fold_ensembled_signals = [] 

        for day_idx in range(len(test_index)):
            
            # --- 2a. Find Top k Leaders (for today) ---
            # Look at history *up to this point*
            hist_window = history_df.tail(ROLLING_WINDOW_DAYS)
            
            if hist_window.height < MIN_HISTORY_DAYS:
                # Not enough history, average all strategies
                leaderboard = None
                top_k_indices = list(range(N_STRATEGIES))
                if day_idx % 20 == 0: # Print status periodically
                   print(f"    Day {day_idx}: Not enough history, averaging all {N_STRATEGIES}.")
            else:
                # We have history, find the leaders
                leaderboard = []
                for j in range(N_STRATEGIES):
                    sharpe = calculate_sharpe_for_leaderboard(hist_window, f'returns_{j}')
                    leaderboard.append((sharpe, j))
                
                leaderboard.sort(key=lambda x: x[0], reverse=True)
                top_k_indices = [j for sharpe, j in leaderboard[:K_LEADERS]]
                if day_idx % 20 == 0: # Print status periodically
                    print(f"    Day {day_idx}: Top {K_LEADERS} leaders: {top_k_indices}, Sharpes: {[f'{s:.3f}' for s, j in leaderboard[:K_LEADERS]]}")

            # --- 2b. Ensemble Position (for today) ---
            # Get the pre-calculated signals for *all N* strategies *for today*
            today_N_signals = N_signals_fold[day_idx, :]
            
            # Get signals from *only* the leaders
            signals_from_leaders = today_N_signals[top_k_indices]
            
            # Average them to get the final position
            final_signal_today = np.mean(signals_from_leaders[0:2])

            # default to const strategy if no strategy is doing good enough.

            if leaderboard and leaderboard[0][0] < -0.2:
                final_signal_today = 0.8

            fold_ensembled_signals.append(final_signal_today)

            # --- 2c. Update Master History (for tomorrow's ranking) ---
            # Get today's "truth" data (1-row DataFrame)
            today_test_info = y_test_info_fold[day_idx]
            
            # Calculate the returns for *all N* individual strategies for today
            today_returns_N = []
            for j in range(N_STRATEGIES):
                signal_j_today = np.array([today_N_signals[j]])
                return_j_series = calculate_strategy_returns_pl(today_test_info, signal_j_today)
                today_returns_N.append(return_j_series[0]) # Get the single float value
            
            # Build the new history row
            new_history_row_data = {
                "date_id": today_test_info["date_id"][0],
                "forward_returns": today_test_info["forward_returns"][0],
                "risk_free_rate": today_test_info["risk_free_rate"][0],
                **{f"returns_{j}": ret for j, ret in enumerate(today_returns_N)}
            }
            new_history_row_df = pl.DataFrame(new_history_row_data)

            # Append to the master history_df
            history_df = pl.concat([history_df, new_history_row_df])

        # --- 3. Score Fold (after iterating all days) ---
        print("  Fold iteration complete. Scoring fold...")
        fold_final_signals_arr = np.array(fold_ensembled_signals)
        score = calculate_competition_score(y_test_info_fold, fold_final_signals_arr)
        cv_scores.append(score)
        
        # Store for overall score calculation
        overall_final_signals.append(fold_final_signals_arr)
        overall_y_true.append(y_test_info_fold)

    # --- 4. Final Evaluation ---
    mean_score = np.mean(cv_scores)
    print(f"\n--- CV Finished ---")
    print(f"Mean Ensembled CV Score (Daily Re-ranking): {mean_score:.4f}, std: {np.std(cv_scores):.4f}")
    
    overall_y_true_df = pl.concat(overall_y_true)
    overall_final_signals_arr = np.concatenate(overall_final_signals)
    
    print("\n--- Overall OOS Performance (Daily Re-ranking) ---")
    # This score is the most important one
    overall_score = calculate_competition_score(overall_y_true_df, overall_final_signals_arr)
    print(f" Overall Ensembled Score: {overall_score:.4f}")
    
    return overall_score

In [48]:
# todo: add constant strategy and other basic strategies (ma5/20, rsi, ...) to pool
# todo:  
# todo: weighted position

import polars as pl
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

# ... (Previous imports and generate_features_7 function remain unchanged) ...

def solve(df: pl.DataFrame) -> float:
    """
    Runs a time-series cross-validation process using a "top-k leader"
    ensemble strategy with DAILY re-ranking.
    
    MODIFICATION: Uses iterative data pruning (semi-deterministic).
    After training a strategy, data points that were 'well-predicted' 
    (low error) are removed from the training set for the next strategy.
    """
    
    # --- Ensemble Configuration ---
    N_STRATEGIES = 50           
    SUBSAMPLE_RATIO = 0.05       # Increased slightly to ensure coverage as we shrink data
    FEATURE_GENERATOR = generate_features_7 
    
    K_LEADERS = 2               
    ROLLING_WINDOW_DAYS = 50    
    MIN_HISTORY_DAYS = 15       
    TRADING_DAYS_PER_YR = 252

    # Percentage of "easiest" data points to remove after each strategy
    # i.e., remove the 15% of rows with the lowest absolute error
    PRUNE_PERCENTILE = 15       

    nsplits = 20

    # --- Helper functions (same as before) ---
    def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
        ''' Calculates the competition score based on true values and predicted signals. '''
        solution = y_true_df.to_pandas()
        solution['position'] = y_pred_signals
        solution['strategy_returns'] = (
            solution['risk_free_rate'] * (1 - solution['position']) +
            solution['position'] * solution['forward_returns']
        )
        strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
        strategy_geo_mean = (1 + strategy_excess_returns).prod() ** (1 / len(solution)) - 1
        strategy_std = solution['strategy_returns'].std()
        if strategy_std == 0: return 0.0
        
        sharpe = strategy_geo_mean / strategy_std * np.sqrt(TRADING_DAYS_PER_YR)
        market_std = solution['forward_returns'].std()
        market_volatility = market_std * np.sqrt(TRADING_DAYS_PER_YR) * 100
        strategy_volatility = strategy_std * np.sqrt(TRADING_DAYS_PER_YR) * 100
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
        vol_penalty = 1 + excess_vol
        market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
        market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(solution)) - 1
        return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * TRADING_DAYS_PER_YR)
        return_penalty = 1 + (return_gap**2) / 100
        adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
        print(f"  Fold {i+1} (Daily): Strat Vol: {strategy_volatility:.2f}%, Mkt Vol: {market_volatility:.2f}%, Ret_penalty: {(return_penalty-1):.4f}, Sharpe: {sharpe:.4f}, Adj Sharpe: {adjusted_sharpe:.4f}")
        return adjusted_sharpe

    def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
        signals = predictions * multiplier + 0.9
        #signals = -np.abs(predictions-0.001) * 1000 + 3
        return np.clip(signals, 0.0, 2.0)

    def calculate_strategy_returns_pl(y_true_df_1_day: pl.DataFrame, y_pred_signal_1_day: np.ndarray) -> pl.Series:
        signals_series = pl.Series("position", y_pred_signal_1_day)
        df_with_pos = y_true_df_1_day.with_columns(signals_series)
        strategy_returns = (
            df_with_pos['risk_free_rate'] * (1 - df_with_pos['position']) +
            df_with_pos['position'] * df_with_pos['forward_returns']
        )
        return strategy_returns.alias("strategy_returns")

    def calculate_sharpe_for_leaderboard(hist_window: pl.DataFrame, strategy_col: str) -> float:
        if hist_window.height == 0: return 0.0
        strategy_returns = hist_window[strategy_col]
        risk_free_rate = hist_window['risk_free_rate']
        strategy_excess_returns = (strategy_returns - risk_free_rate).to_numpy()
        if not np.all(np.isfinite(strategy_excess_returns)): return -np.inf
        log_returns = np.log1p(strategy_excess_returns)
        strategy_geo_mean = np.exp(np.mean(log_returns)) - 1
        strategy_std = strategy_returns.std()
        if strategy_std is None or strategy_std == 0 or not np.isfinite(strategy_std): return 0.0
        sharpe = strategy_geo_mean / strategy_std * np.sqrt(TRADING_DAYS_PER_YR)
        return sharpe if np.isfinite(sharpe) else -np.inf

    # --- Data Preparation ---
    print(f"Initial DataFrame shape: {df.shape}")
    base_df = df.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    if 'E7' in base_df.columns: base_df = base_df.drop('E7')
    base_df = base_df.with_columns(pl.all().forward_fill())
    
    print(f"Generating features using {FEATURE_GENERATOR.__name__}...")
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_features_df = FEATURE_GENERATOR(base_df)
    X_all_features = pl.concat([base_df.select(base_features), new_features_df], how="horizontal")
    print(f"Full feature set shape: {X_all_features.shape}")

    TARGET_COL = "target"
    y = base_df.select(TARGET_COL)
    scorer_info_df = base_df.select(["date_id", "forward_returns", "risk_free_rate"])

    print(f"Starting DAILY re-ranking CV with {N_STRATEGIES} strategies, following top {K_LEADERS} leaders.")
    
    tscv = TimeSeriesSplit(n_splits=nsplits)
    cv_scores = []
    history_df = pl.DataFrame()
    overall_final_signals = []
    overall_y_true = []
    
    rng = np.random.default_rng(seed=12)

    for i, (train_index, test_index) in enumerate(tscv.split(base_df)):
        print(f"--- Starting Fold {i+1}/{nsplits} (Days {test_index[0]} to {test_index[-1]}) ---\n")
        
        # Slices for test
        X_test_fold = X_all_features[test_index]
        y_test_info_fold = scorer_info_df[test_index]

        # --- 1. Semi-Deterministic Iterative Training ---
        N_signals_fold = [] 
        
        # Initialize the pool of available training indices for this fold
        # (We convert to a numpy array to make filtering easier)
        available_train_indices = np.array(train_index)
        
        for j in range(N_STRATEGIES):
            current_pool_size = len(available_train_indices)
            
            # Safety check: if we pruned too much, reset to full random sample
            if current_pool_size < 200: 
                # Fallback: Just sample from original train_index
                subset_indices = rng.choice(train_index, int(len(train_index)*SUBSAMPLE_RATIO), replace=False)
            else:
                # Subsample from the REMAINING filtered pool
                sample_size = int(current_pool_size * SUBSAMPLE_RATIO)
                subset_indices = rng.choice(available_train_indices, sample_size, replace=False)

            X_train_j = X_all_features[subset_indices]
            y_train_j = y[subset_indices]

            model = xgb.XGBRegressor(
                objective='reg:absoluteerror', n_estimators=10, device='cuda',
                learning_rate=0.05, max_depth=4, subsample=0.8, colsample_bytree=0.8,
                n_jobs=-1, random_state=42 + j 
            )
            
            model.fit(X_train_j, y_train_j, verbose=False)

            # Store predictions for the test fold
            predictions_test = model.predict(X_test_fold) 
            signals = convert_to_signal(predictions_test)
            N_signals_fold.append(signals)

            # --- DATA PRUNING LOGIC ---
            # 1. Predict on the *entire* currently available training pool
            X_eval_pool = X_all_features[available_train_indices]
            y_eval_pool = y[available_train_indices].to_numpy().flatten()
            
            preds_pool = model.predict(X_eval_pool)
            
            # 2. Calculate Absolute Error
            # Low error = "Close to Market" / Model solved these days
            errors = np.abs(preds_pool - y_eval_pool)
            
            # 3. Determine threshold (e.g., the bottom 15% errors are considered "solved")
            error_threshold = np.percentile(errors, PRUNE_PERCENTILE)
            
            # 4. Filter: Keep only the rows where the error is HIGH (Model struggled)
            # This forces the next model to learn the "hard" cases.
            keep_mask = errors > error_threshold
            available_train_indices = available_train_indices[keep_mask]
            
            # Optional: Debug print to see pool shrinking
            # if j % 3 == 0:
            #     print(f"    Strat {j}: Pool size {current_pool_size} -> {len(available_train_indices)} (removed solved dates)")

        N_signals_fold = np.stack(N_signals_fold, axis=1) 
        
        # --- 2. Iterate Day-By-Day (Unchanged) ---
        # print(f"  Iterating daily and re-ranking leaders...")
        fold_ensembled_signals = [] 

        for day_idx in range(len(test_index)):
            hist_window = history_df.tail(ROLLING_WINDOW_DAYS)
            
            if hist_window.height < MIN_HISTORY_DAYS:
                leaderboard = []
                top_k_indices = list(range(N_STRATEGIES))
            else:
                leaderboard = []
                for j in range(N_STRATEGIES):
                    sharpe = calculate_sharpe_for_leaderboard(hist_window, f'returns_{j}')
                    leaderboard.append((sharpe, j))
                
                leaderboard.sort(key=lambda x: x[0], reverse=True)
                top_k_indices = [j for sharpe, j in leaderboard[:K_LEADERS]]

            today_N_signals = N_signals_fold[day_idx, :]
            signals_from_leaders = today_N_signals[top_k_indices]
            final_signal_today = np.mean(signals_from_leaders)

            if leaderboard and day_idx % 60 == 0: # Print status periodically
                    print(f"    Day {day_idx}: Top {K_LEADERS} leaders: {top_k_indices}, Sharpes: {[f'{s:.3f}' for s, j in leaderboard[:K_LEADERS]]}")

            # --- 2b. Ensemble Position (for today) ---

            if leaderboard and leaderboard[0][0] < -1:
                final_signal_today = 0.5

            fold_ensembled_signals.append(final_signal_today)

            today_test_info = y_test_info_fold[day_idx]
            today_returns_N = []
            for j in range(N_STRATEGIES):
                signal_j_today = np.array([today_N_signals[j]])
                return_j_series = calculate_strategy_returns_pl(today_test_info, signal_j_today)
                today_returns_N.append(return_j_series[0]) 
            
            new_history_row_data = {
                "date_id": today_test_info["date_id"][0],
                "forward_returns": today_test_info["forward_returns"][0],
                "risk_free_rate": today_test_info["risk_free_rate"][0],
                **{f"returns_{j}": ret for j, ret in enumerate(today_returns_N)}
            }
            history_df = pl.concat([history_df, pl.DataFrame(new_history_row_data)])

        # --- 3. Score Fold ---
        fold_final_signals_arr = np.array(fold_ensembled_signals)
        score = calculate_competition_score(y_test_info_fold, fold_final_signals_arr)
        cv_scores.append(score)
        
        overall_final_signals.append(fold_final_signals_arr)
        overall_y_true.append(y_test_info_fold)

    # --- 4. Final Evaluation ---
    mean_score = np.mean(cv_scores)
    print(f"\n--- CV Finished ---")
    print(f"Mean Ensembled CV Score (Daily Re-ranking): {mean_score:.4f}, std: {np.std(cv_scores):.4f}")
    
    overall_y_true_df = pl.concat(overall_y_true)
    overall_final_signals_arr = np.concatenate(overall_final_signals)
    
    print("\n--- Overall OOS Performance (Daily Re-ranking) ---")
    overall_score = calculate_competition_score(overall_y_true_df, overall_final_signals_arr)
    print(f" Overall Ensembled Score: {overall_score:.4f}")
    
    return overall_score

In [59]:

def evaluate(excessarg: int) -> float:
    """
    Main evaluation function for FunSearch. It loads the data
    and runs the solver which performs cross-validation.
    """
    full_train_df = pl.read_csv(TRAIN_DATA_PATH)
    # Use a slice of data for faster evaluation runs during development
    df_raw = full_train_df.slice(4000)
    print(df_raw.shape)

    #fill nulls in df with mean
    df = df_raw.with_columns(
        # Select all numeric columns for the operation
        pl.selectors.numeric()
          # Step 1: Attempt to fill with the rolling mean of each respective column
          .fill_null(
              pl.selectors.numeric().rolling_mean(window_size=5, min_periods=1)
          )
          # Step 2: Fall back to the global column mean for any remaining nulls
          #.fill_null(strategy='mean')
    )
    df = df.with_columns(
    pl.col("date_id").cast(pl.Int64)
    )
    
    weekday_df = add_weekday_column(SPY_DATA_PATH)
    print("\n--- Joining weekday feature onto sliced data ---")
    # Join the weekday information onto the sliced training data.
    # A 'left' join ensures we keep all rows from the original `df`.
    df_with_features = df.join(weekday_df, on="date_id", how="left")
    # print("DataFrame after join:")
    # print(df_with_features.shape)
    return solve(df_with_features)
  
def add_weekday_column(input_csv_path: str) -> pl.DataFrame:
    """
    Reads a CSV file, adds a 'weekday' column based on the 'Date' column,
    and saves the result to a new CSV file.

    Args:
        input_csv_path (str): The path to the source CSV file.
        output_csv_path (str): The path where the output CSV will be saved.
    """
    # Read the CSV file into a Polars DataFrame
    df = pl.read_csv(input_csv_path)

    # Add a new column named 'weekday'
    # 1. Select the 'Date' column.
    # 2. Convert the string representation to a proper date type.
    # 3. Use the .dt.weekday() function to get the day of the week (Monday=1, Sunday=7).
    # 4. Alias the new expression to 'weekday'.
    df_with_weekday = df.with_columns(
        pl.col("Date").str.to_date().dt.weekday().alias("weekday")
    )

    # Print the transformed DataFrame to the console to show the result
    returned_df = df_with_weekday.select(["date_id", "weekday"])
    return returned_df

import warnings
warnings.filterwarnings('ignore', message='.*XGBoost is not compiled with CUDA support.*')

evaluate(0)

(4990, 98)

--- Joining weekday feature onto sliced data ---
Initial DataFrame shape: (4990, 99)
Base DataFrame shape after cleaning: (4990, 98)
Generating features using generate_features_7...
Full feature set shape: (4990, 94)
Starting DAILY re-ranking CV with 50 strategies, following top 3 leaders.
--- Starting Fold 1/20 (Days 250 to 486) ---


  pl.selectors.numeric().rolling_mean(window_size=5, min_periods=1)


  Iterating daily and re-ranking leaders...
    Day 0: Not enough history, averaging all 50.
    Day 20: Not enough history, averaging all 50.
    Day 40: Top 3 leaders: [41, 16, 29], Sharpes: ['2.942', '2.584', '2.406']
    Day 60: Top 3 leaders: [29, 41, 16], Sharpes: ['4.181', '3.527', '2.868']
    Day 80: Top 3 leaders: [8, 1, 33], Sharpes: ['0.635', '0.407', '0.305']
    Day 100: Top 3 leaders: [46, 8, 49], Sharpes: ['2.498', '2.007', '1.436']
    Day 120: Top 3 leaders: [8, 46, 1], Sharpes: ['3.709', '2.465', '2.141']
    Day 140: Top 3 leaders: [40, 38, 18], Sharpes: ['4.613', '4.373', '4.353']
    Day 160: Top 3 leaders: [10, 46, 41], Sharpes: ['3.240', '2.307', '2.236']
    Day 180: Top 3 leaders: [10, 25, 24], Sharpes: ['3.131', '0.470', '0.078']
    Day 200: Top 3 leaders: [32, 10, 25], Sharpes: ['2.319', '1.197', '0.887']
    Day 220: Top 3 leaders: [10, 46, 48], Sharpes: ['1.055', '0.809', '0.673']
  Fold iteration complete. Scoring fold...
  Fold 1 (Daily): Strat Vol: 7.0

np.float64(0.4457566789306708)