In [9]:
import polars as pl
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

# The training data path should be updated to your actual training file.
TRAIN_DATA_PATH = "./kaggle/train.csv"
SPY_DATA_PATH = "./kaggle/spy-historical.csv"



In [10]:
def generate_features_1(df: pl.DataFrame) -> pl.DataFrame:
    """Generates 40 new features from the base data.
      This function is the target of the evolutionary algorithm.
    
      Available Feature Categories:
      - D* (Dummy/Binary features): 9 columns (D1-D9)
      - E* (Macro Economic features): 20 columns (E1-E20)
      - I* (Interest Rate features): 9 columns (I1-I9)
      - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
      - P* (Price/Valuation features): 13 columns (P1-P13)
      - S* (Sentiment features): 12 columns (S1-S12)
      - V* (Volatility features): 13 columns (V1-V13)
    """
    new_features = pl.DataFrame({
        # --- New Interaction Features ---
        'feat_M1_x_V10_x_P1': df['M1'] * df['V10'] * df['P1'],
        'feat_P1_add_E10_div_I5': (df['P1'] + df['E10']) / (df['I5'] + 1e-6),
        'feat_S3_div_I5_x_M12': df['S3'] / (df['I5'] + 1e-6) * df['M12'],
        'feat_M12_x_V2_x_P2': df['M12'] * df['V2'] * df['P2'],
        'feat_P2_sqrt_E4_x_S8': np.sqrt(df['P2'] * df['E4']) * df['S8'],
        'feat_M2_x_S8_x_V2': df['M2'] * df['S8'] * df['V2'],
        'feat_V10_cbrt_P5_x_E12': np.cbrt(df['V10'] * df['P5']) * df['E12'],
        'feat_E12_x_I9_x_M1': df['E12'] * df['I9'] * df['M1'],
        'feat_M1_log_V1_x_S7': np.log(df['M1'] + 1) * np.log(df['V1'] + 1) * df['S7'],
        'feat_S7_abs_P1_x_I5': np.abs(df['S7'] - df['P1']) * df['I5'],
    })
    # Fill any nulls created by rolling windows
    return new_features.with_columns(pl.all().forward_fill().backward_fill())



def generate_features_2(df: pl.DataFrame) -> pl.DataFrame:
  """Generates 10 new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
    - MOM* (Momentum features): 0 columns
  """
  cols = ['M1', 'V1', 'P1', 'E1', 'S1', 'I1', 'M10', 'V10', 'P10', 'E10']
  # Generate interaction features with exponential moving averages
  new_features = pl.DataFrame({
      #'feat_P1_add_E1': df[cols[2]] + df[cols[3]],
      #'feat_S1_sub_I1': df[cols[4]] - df[cols[5]],
      #'feat_M10_div_V10': (df[cols[6]] / (df[cols[7]] + 1e-6)),
      #'feat_P10_x_E10': df[cols[8]] * df[cols[9]],
      # Exponential moving averages instead of simple rolling means
      'feat_V1_ema_5': df[cols[1]].rolling_mean(window_size=5, min_periods=1, center=True),
      'feat_V1_ewm_std_5': df[cols[1]].rolling_std(window_size=5, min_periods=1, center=True),
      'feat_M1_ema_20': df[cols[0]].rolling_mean(window_size=20, min_periods=1, center=True),
      'feat_M1_ewm_std_20': df[cols[0]].rolling_std(window_size=20, min_periods=1, center=True),
      # Relative strength index
      'feat_RSI_14': (df[cols[2]].rolling_max(14) - df[cols[2]].rolling_min(14)) / (df[cols[2]].rolling_max(14) - df[cols[2]].rolling_min(14) + 1e-6),
  })
  return new_features.with_columns(pl.all().fill_null(0))

def generate_features_3(df: pl.DataFrame) -> pl.DataFrame:
  """Generates 10 new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
    - MOM* (Momentum features): 0 columns
  """
  cols = ['M1', 'V1', 'P1', 'E1', 'S1', 'I1', 'M10', 'V10', 'P10', 'E10']
  # Generate interaction features
  new_features = pl.DataFrame({
      # Pairwise interactions between different categories
      'feat_M1_x_V1': df[cols[0]] * df[cols[1]],
      'feat_P1_add_E1': df[cols[2]] + df[cols[3]],
      'feat_S1_sub_I1': df[cols[4]] - df[cols[5]],
      'feat_M10_div_V10': (df[cols[6]] / (df[cols[7]] + 1e-6)),
      'feat_P10_x_E10': df[cols[8]] * df[cols[9]],
      # Rolling window features on volatile columns
      'feat_V1_roll_mean_5': df[cols[1]].rolling_mean(window_size=5),
      'feat_V1_roll_std_5': df[cols[1]].rolling_std(window_size=5),
      'feat_M1_roll_mean_20': df[cols[0]].rolling_mean(window_size=20),
      'feat_M1_roll_std_20': df[cols[0]].rolling_std(window_size=20),
      # A simple ratio
      'feat_P1_div_M1': (df[cols[2]] / (df[cols[0]] + 1e-6)),
      # New feature: Exponential moving average
      'feat_M1_exp_mean_5': df[cols[0]].rolling_mean(window_size=5, min_periods=1, center=True),
  })
  # Fill any nulls created by rolling windows
  return new_features.with_columns(pl.all().forward_fill().backward_fill())

def generate_features_4(df: pl.DataFrame) -> pl.DataFrame:
  """Generates 40 new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
  """
  """Improved version of `generate_features_v0` with different features."""
  new_features = pl.DataFrame({
      # --- 20 Pairwise Interactions ---
      #'feat_M1_x_V2': df['M1'] * df['V2'],
      #'feat_P1_add_E2': df['P1'] + df['E2'],
      'feat_S1_sub_I2': df['S1'] - df['I2'],
      'feat_M10_div_V11': df['M10'] / (df['V11'] + 1e-6),
      'feat_P10_x_E11': df['P10'] * df['E11'],
      'feat_M2_x_S4': df['M2'] * df['S4'],
      'feat_V2_div_P3': df['V2'] / (df['P3'] + 1e-6),
      'feat_E4_sub_I4': df['E4'] - df['I4'],
      'feat_S7_add_M11': df['S7'] + df['M11'],
      'feat_I5_x_V12': df['I5'] * df['V12'],
      'feat_P5_div_S9': df['P5'] / (df['S9'] + 1e-6),
      'feat_E12_x_I8': df['E12'] * df['I8'],
      'feat_M1_div_S2': df['M1'] / (df['S2'] + 1e-6),
      'feat_V1_add_P2': df['V1'] + df['P2'],
      'feat_E1_sub_I2': df['E1'] - df['I2'],
      'feat_M2_div_V3': df['M2'] / (df['V3'] + 1e-6),
      'feat_P2_x_S4': df['P2'] * df['S4'],
      'feat_E4_add_M11': df['E4'] + df['M11'],
      'feat_I3_sub_V11': df['I3'] - df['V11'],
      'feat_S7_x_P11': df['S7'] * df['P11'],
      # --- 10 Rolling Window Features ---
      'feat_V2_roll_mean_5': df['V2'].rolling_mean(window_size=5),
      'feat_V2_roll_std_5': df['V2'].rolling_std(window_size=5),
      'feat_M2_roll_mean_20': df['M2'].rolling_mean(window_size=20),
      'feat_M2_roll_std_20': df['M2'].rolling_std(window_size=20),
      'feat_P2_roll_max_10': df['P2'].rolling_max(window_size=10),
      'feat_P2_roll_min_10': df['P2'].rolling_min(window_size=10),
      'feat_E2_roll_mean_50': df['E2'].rolling_mean(window_size=50),
      'feat_S2_roll_std_50': df['S2'].rolling_std(window_size=50),
      'feat_I2_roll_mean_10': df['I2'].rolling_mean(window_size=10),
      'feat_V11_roll_std_10': df['V11'].rolling_std(window_size=10),
      # --- 10 Complex Interactions (3+ elements) ---
      'feat_M1_V2_div_P2': (df['M1'] * df['V2']) / (df['P2'] + 1e-6),
      'feat_E2_S2_add_I2': df['E2'] + df['S2'] - df['I2'],
      'feat_M2_P3_sub_V3': df['M2'] + df['P3'] - df['V3'],
      'feat_S8_div_E4_I4': df['S8'] / (df['E4'] + df['I4'] + 1e-6),
      'feat_P6_x_M11_x_V11': df['P6'] * df['M11'] * df['V11'],
      'feat_roll_diff_M2_5_20': df['M2'].rolling_mean(window_size=5) - df['M2'].rolling_mean(window_size=20),
      'feat_roll_diff_V2_5_20': df['V2'].rolling_mean(window_size=5) - df['V2'].rolling_mean(window_size=20),
      'feat_M_S_P_combo_v2': (df['M11'] - df['M2']) / (df['S2'] + df['P2'] + 1e-6),
      'feat_V_E_I_combo_v2': (df['V12'] + df['V3']) * (df['E2'] - df['I2']),
      'feat_ratio_of_ratios_v2': (df['M2']/(df['V2']+1e-6)) / (df['P2']/(df['S2']+1e-6)),
  })
  # Fill any nulls created by rolling windows
  return new_features.with_columns(pl.all().forward_fill().backward_fill())

def generate_features_5(df: pl.DataFrame) -> pl.DataFrame:
    """Generates 40 new features from the base data.
      This function is the target of the evolutionary algorithm.
    
      Available Feature Categories:
      - D* (Dummy/Binary features): 9 columns (D1-D9)
      - E* (Macro Economic features): 20 columns (E1-E20)
      - I* (Interest Rate features): 9 columns (I1-I9)
      - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
      - P* (Price/Valuation features): 13 columns (P1-P13)
      - S* (Sentiment features): 12 columns (S1-S12)
      - V* (Volatility features): 13 columns (V1-V13)
    """
    """Improved version of `generate_features_v2` with more complex interactions."""
    new_features = pl.DataFrame({
        'feat_M1_add_V1': df['M1'] + df['V1'],
        'feat_P1_sub_E1': df['P1'] - df['E1'],
        'feat_S1_mul_I1': df['S1'] * df['I1'],
        #'feat_M10_div_V10': df['M10'] / (df['V10'] + 1e-6),
        'feat_P10_mul_E10': df['P10'] * df['E10'],
        'feat_M2_mul_S3': df['M2'] * df['S3'],
        'feat_V2_div_P2': df['V2'] / (df['P2'] + 1e-6),
        'feat_E4_sub_I3': df['E4'] - df['I3'],
        'feat_S7_add_M12': df['S7'] + df['M12'],
        'feat_I5_mul_V11': df['I5'] * df['V11'],
        # 'feat_V1_roll_mean_5': df['V1'].rolling_mean(window_size=5),
        # 'feat_V1_roll_std_5': df['V1'].rolling_std(window_size=5),
        # 'feat_M1_roll_mean_20': df['M1'].rolling_mean(window_size=20),
        # 'feat_M1_roll_std_20': df['M1'].rolling_std(window_size=20),
        'feat_P1_roll_max_10': df['P1'].rolling_max(window_size=10),
        'feat_P1_roll_min_10': df['P1'].rolling_min(window_size=10),
        'feat_E1_roll_mean_50': df['E1'].rolling_mean(window_size=50),
        'feat_S1_roll_std_50': df['S1'].rolling_std(window_size=50),
        'feat_I1_roll_mean_10': df['I1'].rolling_mean(window_size=10),
        'feat_V10_roll_std_10': df['V10'].rolling_std(window_size=10),
        'feat_M1_V1_div_P1': (df['M1'] * df['V1']) / (df['P1'] + 1e-6),
        'feat_E1_S1_add_I1': df['E1'] + df['S1'] - df['I1'],
        'feat_M2_P2_sub_V2': df['M2'] + df['P2'] - df['V2'],
        'feat_S7_div_E4_I3': df['S7'] / (df['E4'] + df['I3'] + 1e-6),
        'feat_P5_x_M10_x_V10': df['P5'] * df['M10'] * df['V10'],
        'feat_roll_diff_M1_5_20': df['M1'].rolling_mean(window_size=5) - df['M1'].rolling_mean(window_size=20),
        'feat_roll_diff_V1_5_20': df['V1'].rolling_mean(window_size=5) - df['V1'].rolling_mean(window_size=20),
        'feat_M_S_P_combo': (df['M12'] - df['M1']) / (df['S1'] + df['P1'] + 1e-6),
        'feat_V_E_I_combo': (df['V11'] + df['V2']) * (df['E1'] - df['I1']),
        'feat_ratio_of_ratios': (df['M1']/(df['V1']+1e-6)) / (df['P1']/(df['S1']+1e-6)),
        'feat_M1_roll_skew_5': df['M1'].rolling_skew(window_size=5),
        'feat_V1_roll_kurt_5': df['V1'].rolling_kurtosis(window_size=5),
        'feat_E1_roll_skew_20': df['E1'].rolling_skew(window_size=20),
        'feat_S1_roll_kurt_20': df['S1'].rolling_kurtosis(window_size=20),
        'feat_I1_roll_skew_10': df['I1'].rolling_skew(window_size=10),
        'feat_P1_roll_kurt_10': df['P1'].rolling_kurtosis(window_size=10),
        'feat_M1_V1_E1_S1': df['M1'] * df['V1'] * df['E1'] * df['S1'],
        'feat_P1_S1_I1_V1': df['P1'] * df['S1'] * df['I1'] * df['V1'],
        'feat_M10_V10_P10_E10': df['M10'] * df['V10'] * df['P10'] * df['E10'],
        'feat_E1_S1_V1_I1': df['E1'] * df['S1'] * df['V1'] * df['I1'],
        'feat_complex_interaction': df['M1'] * (df['V1'] - df['E1']) / (df['P1'] + df['I1'] + 1e-6),
    })
    # Fill any nulls created by rolling windows
    return new_features.with_columns(pl.all().forward_fill().backward_fill())


def generate_features_6 (df: pl.DataFrame) -> pl.DataFrame:
  """Generates 40 new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
  """
  new_features = pl.DataFrame({
      # --- 10 New Pairwise Interactions ---
      'feat_M1_x_V2': df['M1'] * df['V2'],
      'feat_P1_add_E2': df['P1'] + df['E2'],
      'feat_S2_sub_I2': df['S2'] - df['I2'],
      'feat_M3_div_V3': df['M3'] / (df['V3'] + 1e-6),
      'feat_P2_x_E3': df['P2'] * df['E3'],
      'feat_M4_x_S4': df['M4'] * df['S4'],
      'feat_V4_div_P5': df['V4'] / (df['P5'] + 1e-6),
      'feat_E5_sub_I4': df['E5'] - df['I4'],
      'feat_S8_add_M13': df['S8'] + df['M13'],
      'feat_I7_x_V12': df['I7'] * df['V12'],
  })
  return new_features.with_columns(pl.all().fill_null(0))

def generate_features_7 (df: pl.DataFrame) -> pl.DataFrame:
  """Generates new features from the base data.
    This function is the target of the evolutionary algorithm.
  
    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)
  """
  new_features = pl.DataFrame({
      # --- 20 Pairwise Interactions ---
      'feat_M1_x_V1': df['M1'] * df['V1'],
      'feat_P1_add_E1': df['P1'] + df['E1'],
      'feat_S1_sub_I1': df['S1'] - df['I1'],
      'feat_M10_div_V10': df['M10'] / (df['V10'] + 1e-6),
      'feat_P10_x_E10': df['P10'] * df['E10'],
      'feat_M2_x_S3': df['M2'] * df['S3'],
      'feat_V2_div_P2': df['V2'] / (df['P2'] + 1e-6),
      'feat_E4_sub_I3': df['E4'] - df['I3'],
      'feat_S7_add_M12': df['S7'] + df['M12'],
      'feat_I5_x_V11': df['I5'] * df['V11'],
      'feat_P5_div_S8': df['P5'] / (df['S8'] + 1e-6),
      'feat_E12_x_I9': df['E12'] * df['I9'],
      'feat_M1_div_S1': df['M1'] / (df['S1'] + 1e-6),
      'feat_V1_add_P1': df['V1'] + df['P1'],
      'feat_E1_sub_I1': df['E1'] - df['I1'],
      'feat_M2_div_V2': df['M2'] / (df['V2'] + 1e-6),
      'feat_P2_x_S3': df['P2'] * df['S3'],
      'feat_E4_add_M10': df['E4'] + df['M10'],
      'feat_I3_sub_V10': df['I3'] - df['V10'],
      'feat_S7_x_P10': df['S7'] * df['P10'],
      # --- 10 Rolling Window Features ---
      'feat_V2_roll_mean_5': df['V2'].rolling_mean(window_size=5),
      'feat_V1_roll_std_5': df['V1'].rolling_std(window_size=5),
      'feat_M1_roll_mean_20': df['M1'].rolling_mean(window_size=20),
      'feat_M3_roll_std_20': df['M3'].rolling_std(window_size=20),
      'feat_P1_roll_max_10': df['P1'].rolling_max(window_size=10),
      'feat_P1_roll_min_10': df['P1'].rolling_min(window_size=10),
      'feat_E5_roll_mean_50': df['E5'].rolling_mean(window_size=50),
      'feat_S1_roll_std_50': df['S1'].rolling_std(window_size=50),
      'feat_I1_roll_mean_10': df['I1'].rolling_mean(window_size=10),
      'feat_V10_roll_std_10': df['V10'].rolling_std(window_size=10),
      # --- 10 Complex Interactions (3+ elements) ---
      'feat_M1_V1_div_P1': (df['M1'] * df['V1']) / (df['P1'] + 1e-6),
      'feat_E1_S1_add_I1': df['E1'] + df['S1'] - df['I1'],
      'feat_M2_P2_sub_V2': df['M2'] + df['P2'] - df['V2'],
      'feat_S7_div_E4_I3': df['S7'] / (df['E4'] + df['I3'] + 1e-6),
      'feat_P5_x_M10_x_V10': df['P5'] * df['M10'] * df['V10'],
      'feat_roll_diff_M1_5_20': df['M1'].rolling_mean(window_size=5) - df['M1'].rolling_mean(window_size=20),
      'feat_roll_diff_V1_5_20': df['V1'].rolling_mean(window_size=5) - df['V1'].rolling_mean(window_size=20),
      'feat_M_S_P_combo': (df['M12'] - df['M1']) / (df['S1'] + df['P1'] + 1e-6),
      'feat_V_E_I_combo': (df['V11'] + df['V2']) * (df['E1'] - df['I1']),
      'feat_ratio_of_ratios': (df['M1']/(df['V1']+1e-6)) / (df['P1']/(df['S1']+1e-6)),
      # --- 10 New Features ---
      'feat_M1_x_V1_x_P1': df['M1'] * df['V1'] * df['P1'],
      'feat_E1_div_S1': df['E1'] / (df['S1'] + 1e-6),
      'feat_I1_sub_V1': df['I1'] - df['V1'],
      'feat_M10_add_V10': df['M10'] + df['V10'],
      'feat_P10_div_E10': df['P10'] / (df['E10'] + 1e-6),
      'feat_M2_add_S3': df['M2'] + df['S3'],
      'feat_V2_x_P2': df['V2'] * df['P2'],
      'feat_E4_add_I3': df['E4'] + df['I3'],
      'feat_S7_div_M12': df['S7'] / (df['M12'] + 1e-6),
      'feat_I5_div_V11': df['I5'] / (df['V11'] + 1e-6),
      'feat_M1_log_P1': np.log(df['M1'] + 1e-6) / np.log(df['P1'] + 1e-6),
  })
  # Fill any nulls created by rolling windows
  return new_features.with_columns(pl.all().forward_fill())
  return new_features.with_columns(pl.all().forward_fill().backward_fill())




In [17]:

def evaluate(excessarg: int) -> float:
    """
    Main evaluation function for FunSearch. It loads the data
    and runs the solver which performs cross-validation.
    """
    full_train_df = pl.read_csv(TRAIN_DATA_PATH)
    # Use a slice of data for faster evaluation runs during development
    df_raw = full_train_df.slice(2000)
    print(df_raw.shape)

    #fill nulls in df with mean
    df = df_raw.with_columns(
        # Select all numeric columns for the operation
        pl.selectors.numeric()
          # Step 1: Attempt to fill with the rolling mean of each respective column
          .fill_null(
              pl.selectors.numeric().rolling_mean(window_size=5, min_periods=1)
          )
          # Step 2: Fall back to the global column mean for any remaining nulls
          #.fill_null(strategy='mean')
    )
    df = df.with_columns(
    pl.col("date_id").cast(pl.Int64)
    )
    
    weekday_df = add_weekday_column(SPY_DATA_PATH)
    print("\n--- Joining weekday feature onto sliced data ---")
    # Join the weekday information onto the sliced training data.
    # A 'left' join ensures we keep all rows from the original `df`.
    df_with_features = df.join(weekday_df, on="date_id", how="left")
    # print("DataFrame after join:")
    # print(df_with_features.shape)
    return solve(df_with_features)
  
def add_weekday_column(input_csv_path: str) -> pl.DataFrame:
    """
    Reads a CSV file, adds a 'weekday' column based on the 'Date' column,
    and saves the result to a new CSV file.

    Args:
        input_csv_path (str): The path to the source CSV file.
        output_csv_path (str): The path where the output CSV will be saved.
    """
    # Read the CSV file into a Polars DataFrame
    df = pl.read_csv(input_csv_path)

    # Add a new column named 'weekday'
    # 1. Select the 'Date' column.
    # 2. Convert the string representation to a proper date type.
    # 3. Use the .dt.weekday() function to get the day of the week (Monday=1, Sunday=7).
    # 4. Alias the new expression to 'weekday'.
    df_with_weekday = df.with_columns(
        pl.col("Date").str.to_date().dt.weekday().alias("weekday")
    )

    # Print the transformed DataFrame to the console to show the result
    returned_df = df_with_weekday.select(["date_id", "weekday"])
    return returned_df

# def solve(df: pl.DataFrame) -> float:
#     """
#     A placeholder for the user's actual solving/modeling function.
#     This dummy function just prints the DataFrame it receives to show
#     that the join operation was successful.
#     """
#     print("--- DataFrame passed to solve() ---")
#     print(df.shape)
#     # In a real scenario, this would return a score based on a model's performance
#     # If the dataframe passed here is empty, a CV function would fail.
#     if df.height == 0:
#         raise ValueError("DataFrame passed to solve() is empty.")
#     return 1.0

def solve(df: pl.DataFrame) -> float:
    """
    Runs a full time-series cross-validation process to evaluate
    the features generated by the `generate_features` function.
    This version pre-generates features to avoid a CPU bottleneck.
    """
    # --- Helper functions (unchanged) ---
    def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
        ''' Calculates the competition score based on true values and predicted signals. '''
        solution = y_true_df.to_pandas()
        solution['position'] = y_pred_signals
        solution['strategy_returns'] = (
            solution['risk_free_rate'] * (1 - solution['position']) +
            solution['position'] * solution['forward_returns']
        )
        strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
        strategy_geo_mean = (1 + strategy_excess_returns).prod() ** (1 / len(solution)) - 1
        strategy_std = solution['strategy_returns'].std()
        if strategy_std == 0: return 0.0
        trading_days_per_yr = 252
        sharpe = strategy_geo_mean / strategy_std * np.sqrt(trading_days_per_yr)
        market_std = solution['forward_returns'].std()
        market_volatility = market_std * np.sqrt(trading_days_per_yr) * 100
        strategy_volatility = strategy_std * np.sqrt(trading_days_per_yr) * 100
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
        vol_penalty = 1 + excess_vol
        market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
        market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(solution)) - 1
        return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
        return_penalty = 1 + (return_gap**2) / 100
        adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
        print(f"Strategy Volatility: {strategy_volatility:.2f}%, Market Volatility: {market_volatility:.2f}%, Sharpe: {sharpe:.4f}, Adjusted Sharpe: {adjusted_sharpe:.4f}")
        return adjusted_sharpe

    def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
        ''' Converts raw model predictions into trading signals in the range [0, 2]. '''
        signals = predictions * multiplier + 1
        return np.clip(signals, 0.0, 2.0)

    # --- Feature Generation and Data Prep (Moved Outside the Loop) ---
    #print("Preparing data and generating features once...")
    print(f"Initial DataFrame shape: {df.shape}")
    # 1. Prepare base data
    base_df = df.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    print(f"Base DataFrame shape before cleaning: {base_df.shape}")
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    #base_df.write_csv("debug_base_df_before_cleaning.csv")
    base_df = base_df.drop('E7')
    print(f"Base DataFrame shape during cleaning: {base_df.shape}")
    
    df = df.with_columns(
    pl.col("date_id").cast(pl.Int64)
    )
    base_df = base_df.with_columns(pl.all().forward_fill())
    #base_df = base_df.with_columns(pl.all().forward_fill()).drop_nulls()
    print(f"Base DataFrame shape after cleaning: {base_df.shape}")


    # 2. Generate new features using the evolved function
    # new_features_df_1 = generate_features_1(base_df)
    # new_features_df_2 = generate_features_2(base_df)
    # new_features_df_3 = generate_features_3(base_df)
    # new_features_df_4 = generate_features_4(base_df)
    # new_features_df_5 = generate_features_5(base_df)
    # new_features_df_6 = generate_features_6(base_df)
    # newf = [new_features_df_1, new_features_df_2, new_features_df_3, new_features_df_4,new_features_df_5, new_features_df_6]
    #newf = [new_features_df_1, new_features_df_2, new_features_df_6]
    #newf = [new_features_df_1, new_features_df_5]
    #new_features_df = pl.concat(newf, how="horizontal")
    new_features_df = generate_features_7(base_df)

    # 3. Combine base data with new features
    processed_df = pl.concat([base_df, new_features_df], how="horizontal")
    #processed_df = base_df

    print(f"Processed DataFrame shape: {processed_df.shape}")

    # 4. Set up data for modeling
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_feature_names = new_features_df.columns
    ALL_FEATURES = base_features #+ new_feature_names
    #ALL_FEATURES = ['']
    TARGET_COL = "target"

    X = processed_df.select(ALL_FEATURES)
    y = processed_df.select(TARGET_COL)
    scorer_info_df = processed_df.select(["forward_returns", "risk_free_rate"])

    # --- Time-Series Cross-Validation (Loop is now much lighter) ---
    # print("Starting cross-validation loop...")
    nsplits = 28
    tscv = TimeSeriesSplit(n_splits=nsplits)
    cv_scores = []
    entire_signal = []
    entire_ytest = scorer_info_df.clear()

    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        #print(f"  Starting Fold {i+1}/{nsplits}...")
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        y_test_info = scorer_info_df[test_index]

        model = xgb.XGBRegressor(
            objective='reg:absoluteerror', n_estimators=50, device='cuda',
            learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8,
            n_jobs=-1, random_state=42
        )
        
        # GPU is now the primary worker here
        model.fit(X_train, y_train, verbose=False)

        predictions = model.predict(X_test)
        signals = convert_to_signal(predictions)
        score = calculate_competition_score(y_test_info, signals)
        cv_scores.append(score)
        entire_signal.extend(signals)
        entire_ytest.extend(y_test_info)
        #print(f"  Fold {i+1}/{nsplits} Score: {score:.4f}")

    mean_score = np.mean(cv_scores)
    print(f"\nMean CV Score: {mean_score:.4f}, std: {np.std(cv_scores):.4f}")
    overall_score = calculate_competition_score(entire_ytest, entire_signal)
    print(f" Overall Score Calculation: {overall_score:.4f}")
    return overall_score

if __name__ == "__main__":
    # Example call to evaluate function
    final_score = evaluate(0)
    print(f"Final Evaluation Score: {final_score:.4f}")

(6990, 98)

--- Joining weekday feature onto sliced data ---
Initial DataFrame shape: (6990, 99)
Base DataFrame shape before cleaning: (6990, 99)
Base DataFrame shape during cleaning: (6990, 98)
Base DataFrame shape after cleaning: (6990, 98)
Processed DataFrame shape: (6990, 149)
Strategy Volatility: 19.45%, Market Volatility: 19.44%, Sharpe: 1.3630, Adjusted Sharpe: 1.3630
Strategy Volatility: 35.29%, Market Volatility: 20.40%, Sharpe: -0.3061, Adjusted Sharpe: -0.1350
Strategy Volatility: 34.80%, Market Volatility: 23.32%, Sharpe: -0.7309, Adjusted Sharpe: -0.5657


  pl.selectors.numeric().rolling_mean(window_size=5, min_periods=1)
  lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs),


Strategy Volatility: 30.12%, Market Volatility: 22.91%, Sharpe: -0.4288, Adjusted Sharpe: -0.3845
Strategy Volatility: 20.44%, Market Volatility: 21.65%, Sharpe: -0.7497, Adjusted Sharpe: -0.0773
Strategy Volatility: 11.43%, Market Volatility: 11.69%, Sharpe: 2.1750, Adjusted Sharpe: 2.1750
Strategy Volatility: 11.60%, Market Volatility: 10.17%, Sharpe: 1.3299, Adjusted Sharpe: 1.3299




Strategy Volatility: 13.60%, Market Volatility: 10.90%, Sharpe: -0.0071, Adjusted Sharpe: -0.0068
Strategy Volatility: 10.70%, Market Volatility: 9.96%, Sharpe: 2.1841, Adjusted Sharpe: 2.1841
Strategy Volatility: 31.55%, Market Volatility: 19.67%, Sharpe: -0.4006, Adjusted Sharpe: -0.2853




Strategy Volatility: 45.74%, Market Volatility: 34.60%, Sharpe: 0.2060, Adjusted Sharpe: 0.1836
Strategy Volatility: 23.34%, Market Volatility: 17.09%, Sharpe: 1.0957, Adjusted Sharpe: 0.9403
Strategy Volatility: 22.04%, Market Volatility: 16.29%, Sharpe: 0.8843, Adjusted Sharpe: 0.7578
Strategy Volatility: 32.15%, Market Volatility: 21.95%, Sharpe: 0.0437, Adjusted Sharpe: 0.0248




Strategy Volatility: 14.33%, Market Volatility: 12.77%, Sharpe: 0.8061, Adjusted Sharpe: 0.7426
Strategy Volatility: 11.71%, Market Volatility: 11.51%, Sharpe: 1.3440, Adjusted Sharpe: 1.2002
Strategy Volatility: 12.81%, Market Volatility: 11.50%, Sharpe: 1.1076, Adjusted Sharpe: 1.0801




Strategy Volatility: 26.62%, Market Volatility: 16.45%, Sharpe: -0.3171, Adjusted Sharpe: -0.2236
Strategy Volatility: 11.49%, Market Volatility: 11.13%, Sharpe: 2.2836, Adjusted Sharpe: 2.2836
Strategy Volatility: 7.30%, Market Volatility: 6.77%, Sharpe: 3.1199, Adjusted Sharpe: 3.1199




Strategy Volatility: 29.13%, Market Volatility: 16.90%, Sharpe: -0.8030, Adjusted Sharpe: -0.2282
Strategy Volatility: 16.23%, Market Volatility: 12.76%, Sharpe: 2.2992, Adjusted Sharpe: 2.1455
Strategy Volatility: 40.22%, Market Volatility: 25.72%, Sharpe: 0.8723, Adjusted Sharpe: 0.6396




Strategy Volatility: 17.50%, Market Volatility: 12.12%, Sharpe: 2.7953, Adjusted Sharpe: 2.2470
Strategy Volatility: 41.90%, Market Volatility: 23.28%, Sharpe: -0.8573, Adjusted Sharpe: -0.1735
Strategy Volatility: 15.66%, Market Volatility: 15.32%, Sharpe: 0.7199, Adjusted Sharpe: 0.7199




Strategy Volatility: 12.90%, Market Volatility: 12.52%, Sharpe: 1.8093, Adjusted Sharpe: 1.7799
Strategy Volatility: 25.07%, Market Volatility: 16.19%, Sharpe: 0.1947, Adjusted Sharpe: 0.1234

Mean CV Score: 0.8200, std: 1.0065
Strategy Volatility: 24.70%, Market Volatility: 17.65%, Sharpe: 0.3647, Adjusted Sharpe: 0.3042
 Overall Score Calculation: 0.3042
Final Evaluation Score: 0.3042




In [25]:
import lightgbm as lgb

def solve(df: pl.DataFrame) -> float:
    """
    Runs a full time-series cross-validation process to evaluate
    the features generated by the `generate_features` function.
    This version uses a LightGBM model.
    """
    # --- Helper functions (unchanged) ---
    def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
        ''' Calculates the competition score based on true values and predicted signals. '''
        solution = y_true_df.to_pandas()
        solution['position'] = y_pred_signals
        solution['strategy_returns'] = (
            solution['risk_free_rate'] * (1 - solution['position']) +
            solution['position'] * solution['forward_returns']
        )
        strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
        strategy_geo_mean = (1 + strategy_excess_returns).prod() ** (1 / len(solution)) - 1
        strategy_std = solution['strategy_returns'].std()
        if strategy_std == 0: return 0.0
        trading_days_per_yr = 252
        sharpe = strategy_geo_mean / strategy_std * np.sqrt(trading_days_per_yr)
        market_std = solution['forward_returns'].std()
        market_volatility = market_std * np.sqrt(trading_days_per_yr) * 100
        strategy_volatility = strategy_std * np.sqrt(trading_days_per_yr) * 100
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
        vol_penalty = 1 + excess_vol
        market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
        market_geo_mean = (1 + market_excess_returns).prod() ** (1 / len(solution)) - 1
        return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
        return_penalty = 1 + (return_gap**2) / 100
        adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
        print(f"Strategy Volatility: {strategy_volatility:.2f}%, Market Volatility: {market_volatility:.2f}%, Sharpe: {sharpe:.4f}, Adjusted Sharpe: {adjusted_sharpe:.4f}")
        return adjusted_sharpe

    def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
        ''' Converts raw model predictions into trading signals in the range [0, 2]. '''
        signals = predictions * multiplier + 1
        return np.clip(signals, 0.0, 2.0)

    # --- Feature Generation and Data Prep ---
    base_df = df.rename({'market_forward_excess_returns': 'target'})
    feature_cols = [col for col in base_df.columns if col != 'date_id']
    base_df = base_df.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    base_df = base_df.with_columns(pl.all().forward_fill()).drop_nulls()

    # Generate and combine features
    new_features_df_1 = generate_features_1(base_df)
    new_features_df_2 = generate_features_2(base_df)
    new_features_df_3 = generate_features_3(base_df)
    new_features_df_4 = generate_features_4(base_df)
    new_features_df_5 = generate_features_5(base_df)
    new_features_df_6 = generate_features_6(base_df)
    #newf = [new_features_df_1, new_features_df_2, new_features_df_3, new_features_df_4, new_features_df_5, new_features_df_6]
    newf = [new_features_df_1, new_features_df_2, new_features_df_3, new_features_df_5, new_features_df_6]
    new_features_df = pl.concat(newf, how="horizontal")
    new_features_df = generate_features_7(base_df)
    
    processed_df = pl.concat([base_df, new_features_df], how="horizontal")

    # Setup data for modeling
    base_features = [col for col in base_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
    new_feature_names = new_features_df.columns
    ALL_FEATURES = base_features + new_feature_names
    TARGET_COL = "target"

    X = processed_df.select(ALL_FEATURES)
    y = processed_df.select(TARGET_COL)
    scorer_info_df = processed_df.select(["forward_returns", "risk_free_rate"])

    # --- Time-Series Cross-Validation ---
    nsplits = 10
    tscv = TimeSeriesSplit(n_splits=nsplits)
    cv_scores = []

    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        y_test_info = scorer_info_df[test_index]

        # Define the LightGBM model
        model = lgb.LGBMRegressor(
            objective='regression_l1',  # MAE, an alternative to squared error
            n_estimators=100,
            device='cpu',              # Use 'cpu' if you don't have a compatible GPU
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,             # Equivalent to 'bagging_fraction'
            colsample_bytree=0.8,      # Equivalent to 'feature_fraction'
            n_jobs=-1,
            random_state=42,
            verbose=-1                 # Suppress verbose output
        )
        
        # Train the model
        # Convert to NumPy arrays before fitting and predicting
        model.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        predictions = model.predict(X_test.to_numpy())
        signals = convert_to_signal(predictions)
        score = calculate_competition_score(y_test_info, signals)
        cv_scores.append(score)

    mean_score = np.mean(cv_scores)
    print(f"\nMean CV Score: {mean_score:.4f}, std: {np.std(cv_scores):.4f}")
    return mean_score

if __name__ == "__main__":
    # Example call to evaluate function
    final_score = evaluate(0)
    print(f"Final Evaluation Score: {final_score:.4f}")

(6990, 98)

--- Joining weekday feature onto sliced data ---
Strategy Volatility: 28.62%, Market Volatility: 15.94%, Sharpe: -0.0878, Adjusted Sharpe: -0.0479
Strategy Volatility: 21.21%, Market Volatility: 12.73%, Sharpe: 1.0726, Adjusted Sharpe: 0.7316
Strategy Volatility: 51.03%, Market Volatility: 26.93%, Sharpe: 0.7052, Adjusted Sharpe: 0.4160
Strategy Volatility: 20.83%, Market Volatility: 16.07%, Sharpe: 2.2220, Adjusted Sharpe: 2.0275
Strategy Volatility: 20.37%, Market Volatility: 12.49%, Sharpe: 1.1389, Adjusted Sharpe: 0.7959


  lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs),
  lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs),
  'feat_V1_ema_5': df[cols[1]].rolling_mean(window_size=5, min_periods=1, center=True),
  'feat_V1_ewm_std_5': df[cols[1]].rolling_std(window_size=5, min_periods=1, center=True),
  'feat_M1_ema_20': df[cols[0]].rolling_mean(window_size=20, min_periods=1, center=True),
  'feat_M1_ewm_std_20': df[cols[0]].rolling_std(window_size=20, min_periods=1, center=True),
  'feat_M1_exp_mean_5': df[cols[0]].rolling_mean(window_size=5, min_periods=1, center=True),


Strategy Volatility: 44.18%, Market Volatility: 24.63%, Sharpe: -1.2821, Adjusted Sharpe: -0.1086
Strategy Volatility: 16.30%, Market Volatility: 17.38%, Sharpe: 0.6788, Adjusted Sharpe: 0.2637
Strategy Volatility: 11.09%, Market Volatility: 11.43%, Sharpe: 2.3504, Adjusted Sharpe: 2.3504
Strategy Volatility: 18.35%, Market Volatility: 12.54%, Sharpe: 1.5822, Adjusted Sharpe: 1.2518
Strategy Volatility: 31.36%, Market Volatility: 17.63%, Sharpe: 0.0348, Adjusted Sharpe: 0.0188

Mean CV Score: 0.7699, std: 0.8185
Final Evaluation Score: 0.7699


