In [1]:


import os
from pathlib import Path
import datetime

from tqdm import tqdm
from dataclasses import dataclass, asdict

import polars as pl 
import numpy as np
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler

import kaggle_evaluation.default_inference_server

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file that Kaggle mounted under the input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Expose only the CUDA device with index 1 to this process.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

# ============ PATHS ============
DATA_PATH: Path = Path('./kaggle')

# ============ RETURNS TO SIGNAL CONFIGS ============
# Lower bound of the daily signal.
MIN_SIGNAL: float = 0.0
# Upper bound of the daily signal.
MAX_SIGNAL: float = 2.0
# Factor applied to the predicted market forward excess returns to obtain the signal.
SIGNAL_MULTIPLIER: float = 400.0

# ============ MODEL CONFIGS ============
# Number of cross-validation folds used when fitting the model.
CV: int = 10
# ElasticNet mixing parameter.
L1_RATIO: float = 0.5
# Grid of penalty strengths: 100 log-spaced values from 1e-4 to 1e2.
ALPHAS: np.ndarray = np.logspace(start=-4, stop=2, num=100)
# Maximum number of solver iterations.
MAX_ITER: int = 1_000_000

In [2]:


# ================================
# Hull Tactical Market Prediction
# Optimized Baseline (Paraphrased)
# ================================

import os
import numpy as np
import pandas as pd
import polars as pl
from warnings import filterwarnings
from scipy.optimize import minimize, Bounds
from gc import collect

import kaggle_evaluation.default_inference_server

# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides numerical warnings (overflow,
# invalid value); consider narrowing it to the specific categories that are noisy.
filterwarnings("ignore")

# -------------------------------
# Global constants
# -------------------------------
# Allowed range for a daily position. adjusted_sharpe raises UserVisibleError
# when a prediction falls outside [MIN_POSITION, MAX_POSITION].
MIN_POSITION: int = 0
MAX_POSITION: int = 2


# -------------------------------
# Custom evaluation metric
# -------------------------------
class UserVisibleError(Exception):
    """Raised when a submitted prediction is not acceptable to the metric."""


def adjusted_sharpe(solution: pd.DataFrame, submission: pd.DataFrame) -> float:
    """
    Compute a Sharpe-like score with penalties for excess volatility and poor returns.

    Parameters
    ----------
    solution : pd.DataFrame
        Must contain the columns 'forward_returns' and 'risk_free_rate'.
        The frame is copied, so the caller's object is never modified.
    submission : pd.DataFrame
        Must contain a 'prediction' column. It is aligned to ``solution`` by index.

    Returns
    -------
    float
        Annualised Sharpe ratio of the strategy divided by the volatility penalty
        and the return-gap penalty, capped at 1_000_000.

    Raises
    ------
    UserVisibleError
        If any prediction is missing (NaN) or lies outside
        [MIN_POSITION, MAX_POSITION].
    ZeroDivisionError
        If the strategy returns have zero standard deviation.
    """
    annual_days = 252

    solution = solution.copy()
    solution['position'] = submission['prediction']

    # Index alignment turns unmatched rows into NaN, and max()/min() skip NaN,
    # so missing predictions would otherwise pass the bound checks and silently
    # produce a NaN score.
    if solution['position'].isna().any():
        raise UserVisibleError("Prediction is missing (NaN) for at least one row")
    if solution['position'].max() > MAX_POSITION:
        raise UserVisibleError(f"Prediction above max limit {MAX_POSITION}")
    if solution['position'].min() < MIN_POSITION:
        raise UserVisibleError(f"Prediction below min limit {MIN_POSITION}")

    # Strategy returns: the uninvested fraction earns the risk-free rate,
    # the invested fraction earns the market forward return.
    solution['strategy_returns'] = (
        solution['risk_free_rate'] * (1 - solution['position']) +
        solution['position'] * solution['forward_returns']
    )

    # Geometric mean of the per-period excess return of the strategy.
    excess = solution['strategy_returns'] - solution['risk_free_rate']
    cum_excess = (1 + excess).prod()
    mean_excess = cum_excess ** (1 / len(solution)) - 1
    # Note: the dispersion is taken on the raw strategy returns, not on the excess returns.
    std_excess = solution['strategy_returns'].std()

    if std_excess == 0:
        raise ZeroDivisionError(
            "Strategy returns have zero standard deviation; Sharpe ratio is undefined"
        )
    sharpe = mean_excess / std_excess * np.sqrt(annual_days)
    strat_vol = float(std_excess * np.sqrt(annual_days) * 100)

    # Market comparison (same statistics for a constant position of 1).
    market_excess = solution['forward_returns'] - solution['risk_free_rate']
    market_cum = (1 + market_excess).prod()
    market_mean = market_cum ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()
    market_vol = float(market_std * np.sqrt(annual_days) * 100)

    # Penalties: volatility above 1.2x the market, and any shortfall versus the
    # market's mean excess return (quadratic in the annualised percentage gap).
    excess_vol_penalty = (
        1 + max(0, strat_vol / market_vol - 1.2) if market_vol > 0 else 1
    )
    return_gap = max(0, (market_mean - mean_excess) * 100 * annual_days)
    return_penalty = 1 + (return_gap ** 2) / 100

    score = sharpe / (excess_vol_penalty * return_penalty)
    return float(min(score, 1_000_000))


# -------------------------------
# Load training data
# -------------------------------
# Read the training set from disk, using the date_id column as the row index.
train = pd.read_csv("./kaggle/train.csv", index_col="date_id")

# -------------------------------
# Optimization objective
# -------------------------------
def objective(x, window: int = 180):
    """Negative adjusted Sharpe of positions ``x`` over the last ``window`` rows of ``train``.

    Intended as a minimisation objective: a lower return value means a better score.

    Parameters
    ----------
    x : array-like of float
        One position per row of the evaluation window. Values are clipped to
        [MIN_POSITION, MAX_POSITION] before scoring.
    window : int, default 180
        Number of trailing rows of ``train`` to score against. Must be positive.

    Returns
    -------
    float
        ``-adjusted_sharpe(...)`` for the clipped positions.
    """
    # No defensive copy here: adjusted_sharpe copies its input before mutating it.
    recent = train.iloc[-window:]
    # Use the shared bounds rather than repeating the literals 0 and 2;
    # np.asarray lets plain sequences through as well as ndarrays.
    positions = np.clip(np.asarray(x), MIN_POSITION, MAX_POSITION)
    submission = pd.DataFrame({'prediction': positions}, index=recent.index)
    return -adjusted_sharpe(recent, submission)


# Initial guess + optimization
# x0 = np.full(180, 0.05)
# res = minimize(objective, x0, method="Powell", bounds=Bounds(0, 2), tol=1e-8)
# print(res)

# optimal_preds = res.x


# -------------------------------
# Prediction function for Kaggle server
# -------------------------------
# Number of predictions served so far; also the index of the next value to return.
counter = 0

def predict(batch: pl.DataFrame) -> float:
    """Return the next pre-computed position for the inference server.

    The content of ``batch`` is not used: values are served in order from the
    module-level sequence ``optimal_preds``, one per call, and ``counter`` is
    advanced each time.

    Parameters
    ----------
    batch : pl.DataFrame
        Data supplied by the caller for the current step (ignored).

    Returns
    -------
    float
        ``optimal_preds[counter]`` as a NumPy float64.

    Raises
    ------
    NameError
        If ``optimal_preds`` has not been defined. In this notebook the code
        that assigns it is commented out.
    IndexError
        If more predictions are requested than ``optimal_preds`` contains.
    """
    global counter

    # Look the sequence up explicitly so that a missing definition produces an
    # actionable message instead of a bare NameError from inside the server.
    preds = globals().get("optimal_preds")
    if preds is None:
        raise NameError(
            "optimal_preds is not defined; run the optimisation step that assigns it "
            "before serving predictions"
        )
    if counter >= len(preds):
        raise IndexError(
            f"predict() was called {counter + 1} times but only "
            f"{len(preds)} pre-computed predictions are available"
        )

    value = np.float64(preds[counter])
    print(f"[{counter}] Prediction: {value:.8f}")
    counter += 1
    return value



# -------------------------------
# Run Kaggle evaluation server
# -------------------------------
# server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
#     server.serve()
# else:
#     server.run_local_gateway(("./kaggle/",))



In [1]:
import polars as pl
import numpy as np
import xgboost as xgb
import itertools
import os




# --- 1. Metric Implementation (No changes here) ---
def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
    """Volatility- and return-penalised Sharpe ratio of a position series.

    Parameters
    ----------
    y_true_df : pl.DataFrame
        Frame with the columns 'forward_returns' and 'risk_free_rate'.
    y_pred_signals : np.ndarray
        Position per row; stored as the 'position' column of the working frame.

    Returns
    -------
    float
        Annualised Sharpe divided by the volatility and return-gap penalties,
        or 0.0 when the strategy returns have zero standard deviation.
    """
    trading_days_per_yr = 252
    annualisation = np.sqrt(trading_days_per_yr)

    frame = y_true_df.to_pandas()
    frame['position'] = y_pred_signals
    risk_free = frame['risk_free_rate']
    market = frame['forward_returns']
    # Uninvested capital earns the risk-free rate, invested capital the market return.
    frame['strategy_returns'] = (
        risk_free * (1 - frame['position']) + frame['position'] * market
    )
    strategy = frame['strategy_returns']
    n_rows = len(frame)

    def geometric_mean_excess(returns):
        # Per-period geometric mean of the return in excess of the risk-free rate.
        return (1 + (returns - risk_free)).prod() ** (1 / n_rows) - 1

    strategy_geo_mean = geometric_mean_excess(strategy)
    strategy_std = strategy.std()
    if strategy_std == 0:
        return 0.0

    sharpe = strategy_geo_mean / strategy_std * annualisation

    # Penalise strategy volatility above 1.2x the market's.
    market_volatility = market.std() * annualisation * 100
    strategy_volatility = strategy_std * annualisation * 100
    if market_volatility > 0:
        vol_penalty = 1 + max(0, strategy_volatility / market_volatility - 1.2)
    else:
        vol_penalty = 1

    # Penalise any shortfall against the market's mean excess return.
    market_geo_mean = geometric_mean_excess(market)
    return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap ** 2) / 100

    return sharpe / (vol_penalty * return_penalty)


# --- 2. Feature Engineering (MODIFIED) ---
def create_and_save_interaction_features(df: pl.DataFrame, batch_size: int = 20, output_dir="features") -> list[str]:
    """
    Generates pairwise interaction features in batches to conserve memory and saves them to disk.

    Every column of ``df`` except 'date_id', 'forward_returns', 'risk_free_rate'
    and 'target' is a base feature. Base features are processed in chunks of
    ``batch_size``. For each chunk, the sum, difference and product are computed
    for every pair inside the chunk and for every pair made of a chunk feature
    and a feature from an earlier chunk, so each unordered pair is produced
    exactly once overall. Each non-empty chunk is written to
    ``{output_dir}/interactions_batch_{k}.parquet``.

    Parameters
    ----------
    df : pl.DataFrame
        Source frame holding the base feature columns.
    batch_size : int, default 20
        Number of base features per chunk.
    output_dir : str, default "features"
        Directory that receives the parquet files; created if missing.

    Returns
    -------
    list[str]
        Paths of the parquet files written, in batch order.

    Notes
    -----
    All expressions of a chunk are evaluated in a single ``df.select`` call
    instead of one ``with_columns`` call per pair, which avoids rebuilding a
    growing frame thousands of times. Unlike the incremental approach, a
    generated name that occurs twice within one chunk raises an error rather
    than being silently overwritten.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

    excluded = {"date_id", "forward_returns", "risk_free_rate", "target"}
    base_features = [col for col in df.columns if col not in excluded]
    print(f"Starting batched feature generation for {len(base_features)} base features...")

    def _pair_expressions(f1: str, f2: str) -> list:
        # The three interaction terms for one ordered pair of columns.
        left, right = pl.col(f1), pl.col(f2)
        return [
            (left + right).alias(f'{f1}_add_{f2}'),
            (left - right).alias(f'{f1}_sub_{f2}'),
            (left * right).alias(f'{f1}_mult_{f2}'),
        ]

    file_paths = []

    # Iterate through the features in chunks
    for batch_no, start in enumerate(range(0, len(base_features), batch_size)):
        batch_features = base_features[start:start + batch_size]
        previous_features = base_features[:start]

        # Pairs within the chunk first, then chunk x all earlier features;
        # this keeps the column order of the output files stable.
        pairs = itertools.chain(
            itertools.combinations(batch_features, 2),
            itertools.product(batch_features, previous_features),
        )
        expressions = [expr for f1, f2 in pairs for expr in _pair_expressions(f1, f2)]
        if not expressions:
            # e.g. a first chunk that contains a single feature
            continue

        batch_interaction_df = df.select(expressions)
        file_path = f"{output_dir}/interactions_batch_{batch_no}.parquet"
        batch_interaction_df.write_parquet(file_path)
        file_paths.append(file_path)
        print(f"  ... Saved batch {batch_no} with {batch_interaction_df.width} features to {file_path}")

    return file_paths

# --- Main Script ---
# Pipeline: load train.csv -> cast/forward-fill -> generate pairwise interaction
# features on disk -> rank all features by XGBoost importance on the training
# portion -> save the top-N feature columns plus target/return columns to parquet.

# 1. Load and do initial prep
full_train_df = pl.read_csv("./kaggle/train.csv")
full_train_df = full_train_df.slice(1000) # Drop the first 1000 rows; everything after that is kept
full_train_df = full_train_df.rename({'market_forward_excess_returns': 'target'})

# Explicitly cast all columns except date_id to Float64 to ensure they are all numeric.
# strict=False: values that cannot be converted become null instead of raising.
feature_cols = [col for col in full_train_df.columns if col != 'date_id']
full_train_df = full_train_df.with_columns(
    pl.col(feature_cols).cast(pl.Float64, strict=False)
)

# Handle nulls in the base data first.
# Forward fill only propagates earlier values, so nulls at the start of a column
# remain (the drop_nulls() call that would remove such rows is disabled).
# NOTE(review): pl.all() also forward-fills 'target', 'forward_returns' and
# 'risk_free_rate' — confirm that is intended.
base_df = full_train_df.with_columns(pl.all().forward_fill())#.drop_nulls()
print(f"Base DataFrame shape after cleaning: {base_df.shape}")

# 2. Generate and save interaction features in batches
interaction_files = create_and_save_interaction_features(base_df, batch_size=20)

# 3. Load all features for the selection process
print("\nLoading all original and generated features for selection...")
interaction_dfs = [pl.read_parquet(f) for f in interaction_files]
# Combine original data with all generated feature batches horizontally
processed_df = pl.concat([base_df] + interaction_dfs, how="horizontal")

# 4. Chronological split: the last VALIDATION_SIZE rows are held out of feature selection
VALIDATION_SIZE = 180
train_df = processed_df.head(-VALIDATION_SIZE)
# We don't need the validation set for feature selection, only training data
ALL_FEATURES = [col for col in train_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
TARGET_COL = "target"
X_train_all = train_df.select(ALL_FEATURES)
y_train = train_df.select(TARGET_COL)

print(f"\nGenerated a total of {len(ALL_FEATURES)} features for selection.")

# 5. Feature Selection using XGBoost Importance
# A single model is fitted on every candidate feature; the N_FEATURES_TO_SELECT
# features with the highest importance are kept.
print("\nStarting feature selection...")
N_FEATURES_TO_SELECT = 150
selector_model = xgb.XGBRegressor(objective='reg:absoluteerror', n_estimators=500, random_state=42, n_jobs=-1, tree_method='hist', device='cuda')
selector_model.fit(X_train_all, y_train, verbose=False)

importances = selector_model.feature_importances_
feature_importance_df = pl.DataFrame({'feature': ALL_FEATURES, 'importance': importances}).sort('importance', descending=True)
selected_features = feature_importance_df.head(N_FEATURES_TO_SELECT).get_column('feature').to_list()

print(f"Selected the top {len(selected_features)} most important features.")
# Taken from processed_df, i.e. ALL rows, including the last VALIDATION_SIZE rows
# that were excluded from the importance fit.
final_training_data = processed_df.select(
    selected_features + ["target", "forward_returns", "risk_free_rate"]
)
# 6. Save the selected feature columns together with the target and return columns
#    (this writes the data itself, not just the list of feature names)
output_filename = "final_training_data_150_features.parquet"
final_training_data.write_parquet(output_filename)

print(f"Successfully saved final training data with {final_training_data.width} columns to '{output_filename}'")


Base DataFrame shape after cleaning: (7990, 98)
Starting batched feature generation for 94 base features...
  ... Saved batch 0 with 570 features to features/interactions_batch_0.parquet
  ... Saved batch 1 with 1770 features to features/interactions_batch_1.parquet
  ... Saved batch 2 with 2970 features to features/interactions_batch_2.parquet
  ... Saved batch 3 with 4170 features to features/interactions_batch_3.parquet
  ... Saved batch 4 with 3633 features to features/interactions_batch_4.parquet

Loading all original and generated features for selection...

Generated a total of 13207 features for selection.

Starting feature selection...
Selected the top 150 most important features.
Successfully saved final training data with 153 columns to 'final_training_data_150_features.parquet'


In [2]:
def generate_features_7 (df: pl.DataFrame) -> pl.DataFrame:
  """Generates new features from the base data.
    This function is the target of the evolutionary algorithm.

    Builds 51 derived columns (20 pairwise interactions, 10 rolling-window
    statistics, 10 multi-column combinations and 11 additional features) and
    returns them as a new frame with the same number of rows as ``df``. The
    input columns themselves are not included in the result.

    Available Feature Categories:
    - D* (Dummy/Binary features): 9 columns (D1-D9)
    - E* (Macro Economic features): 20 columns (E1-E20)
    - I* (Interest Rate features): 9 columns (I1-I9)
    - M* (Market Dynamics/Technical features): 18 columns (M1-M18)
    - P* (Price/Valuation features): 13 columns (P1-P13)
    - S* (Sentiment features): 12 columns (S1-S12)
    - V* (Volatility features): 13 columns (V1-V13)

    Args:
      df: Frame that must contain the E1, E4, E5, E10, E12, I1, I3, I5, I9, M1,
        M2, M3, M10, M12, P1, P2, P5, P10, S1, S3, S7, S8, V1, V2, V10 and V11
        columns read below.

    Returns:
      A pl.DataFrame holding only the generated ``feat_*`` columns, forward
      filled column by column.

    Notes:
      - Divisions add 1e-6 to the denominator. NOTE(review): this only guards
        against a denominator of exactly zero; a denominator close to -1e-6
        still produces very large values.
      - NOTE(review): np.log returns NaN for non-positive arguments, and the
        final forward_fill targets nulls, not NaN — confirm M1 and P1 are
        strictly positive or handle NaN downstream.
  """
  new_features = pl.DataFrame({
      # --- 20 Pairwise Interactions ---
      'feat_M1_x_V1': df['M1'] * df['V1'],
      'feat_P1_add_E1': df['P1'] + df['E1'],
      'feat_S1_sub_I1': df['S1'] - df['I1'],
      'feat_M10_div_V10': df['M10'] / (df['V10'] + 1e-6),
      'feat_P10_x_E10': df['P10'] * df['E10'],
      'feat_M2_x_S3': df['M2'] * df['S3'],
      'feat_V2_div_P2': df['V2'] / (df['P2'] + 1e-6),
      'feat_E4_sub_I3': df['E4'] - df['I3'],
      'feat_S7_add_M12': df['S7'] + df['M12'],
      'feat_I5_x_V11': df['I5'] * df['V11'],
      'feat_P5_div_S8': df['P5'] / (df['S8'] + 1e-6),
      'feat_E12_x_I9': df['E12'] * df['I9'],
      'feat_M1_div_S1': df['M1'] / (df['S1'] + 1e-6),
      'feat_V1_add_P1': df['V1'] + df['P1'],
      'feat_E1_sub_I1': df['E1'] - df['I1'],
      'feat_M2_div_V2': df['M2'] / (df['V2'] + 1e-6),
      'feat_P2_x_S3': df['P2'] * df['S3'],
      'feat_E4_add_M10': df['E4'] + df['M10'],
      'feat_I3_sub_V10': df['I3'] - df['V10'],
      'feat_S7_x_P10': df['S7'] * df['P10'],
      # --- 10 Rolling Window Features ---
      # Trailing windows over the row order of df (assumes rows are in chronological order — TODO confirm).
      'feat_V2_roll_mean_5': df['V2'].rolling_mean(window_size=5),
      'feat_V1_roll_std_5': df['V1'].rolling_std(window_size=5),
      'feat_M1_roll_mean_20': df['M1'].rolling_mean(window_size=20),
      'feat_M3_roll_std_20': df['M3'].rolling_std(window_size=20),
      'feat_P1_roll_max_10': df['P1'].rolling_max(window_size=10),
      'feat_P1_roll_min_10': df['P1'].rolling_min(window_size=10),
      'feat_E5_roll_mean_50': df['E5'].rolling_mean(window_size=50),
      'feat_S1_roll_std_50': df['S1'].rolling_std(window_size=50),
      'feat_I1_roll_mean_10': df['I1'].rolling_mean(window_size=10),
      'feat_V10_roll_std_10': df['V10'].rolling_std(window_size=10),
      # --- 10 Complex Interactions (3+ elements) ---
      'feat_M1_V1_div_P1': (df['M1'] * df['V1']) / (df['P1'] + 1e-6),
      'feat_E1_S1_add_I1': df['E1'] + df['S1'] - df['I1'],
      'feat_M2_P2_sub_V2': df['M2'] + df['P2'] - df['V2'],
      'feat_S7_div_E4_I3': df['S7'] / (df['E4'] + df['I3'] + 1e-6),
      'feat_P5_x_M10_x_V10': df['P5'] * df['M10'] * df['V10'],
      # Short-window mean minus long-window mean of the same column.
      'feat_roll_diff_M1_5_20': df['M1'].rolling_mean(window_size=5) - df['M1'].rolling_mean(window_size=20),
      'feat_roll_diff_V1_5_20': df['V1'].rolling_mean(window_size=5) - df['V1'].rolling_mean(window_size=20),
      'feat_M_S_P_combo': (df['M12'] - df['M1']) / (df['S1'] + df['P1'] + 1e-6),
      'feat_V_E_I_combo': (df['V11'] + df['V2']) * (df['E1'] - df['I1']),
      # Outer division has no epsilon: a zero P1 makes the denominator zero.
      'feat_ratio_of_ratios': (df['M1']/(df['V1']+1e-6)) / (df['P1']/(df['S1']+1e-6)),
      # --- 11 Additional Features ---
      'feat_M1_x_V1_x_P1': df['M1'] * df['V1'] * df['P1'],
      'feat_E1_div_S1': df['E1'] / (df['S1'] + 1e-6),
      'feat_I1_sub_V1': df['I1'] - df['V1'],
      'feat_M10_add_V10': df['M10'] + df['V10'],
      'feat_P10_div_E10': df['P10'] / (df['E10'] + 1e-6),
      'feat_M2_add_S3': df['M2'] + df['S3'],
      'feat_V2_x_P2': df['V2'] * df['P2'],
      'feat_E4_add_I3': df['E4'] + df['I3'],
      'feat_S7_div_M12': df['S7'] / (df['M12'] + 1e-6),
      'feat_I5_div_V11': df['I5'] / (df['V11'] + 1e-6),
      # Ratio of logs; the denominator is zero when P1 + 1e-6 equals 1.
      'feat_M1_log_P1': np.log(df['M1'] + 1e-6) / np.log(df['P1'] + 1e-6),
  })
  # Forward-fill nulls within each generated column. Forward fill needs an earlier
  # non-null value, so nulls at the very start of a column (presumably the warm-up
  # rows of each rolling window — verify against the Polars version in use) stay null.
  return new_features.with_columns(pl.all().forward_fill())

In [6]:
# === Experimentation cell (originally meant for a separate notebook; as written it still needs base_df and generate_features_7 from the cells above) ===

import polars as pl
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

# --- Copy the metric and signal functions from your other notebook ---

def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
    """Score a position series with the penalised Sharpe metric.

    Parameters
    ----------
    y_true_df : pl.DataFrame
        Frame with the columns 'forward_returns' and 'risk_free_rate'.
    y_pred_signals : np.ndarray
        Position per row; stored as the 'position' column of the working frame.

    Returns
    -------
    float
        Annualised Sharpe divided by the volatility and return-gap penalties,
        or 0.0 when the strategy returns have zero standard deviation.
    """
    trading_days_per_yr = 252
    annualisation = np.sqrt(trading_days_per_yr)

    frame = y_true_df.to_pandas()
    frame['position'] = y_pred_signals
    risk_free = frame['risk_free_rate']
    market = frame['forward_returns']
    # Blend of risk-free and market return, weighted by the position.
    frame['strategy_returns'] = (
        risk_free * (1 - frame['position']) + frame['position'] * market
    )
    strategy = frame['strategy_returns']
    n_rows = len(frame)

    def geometric_mean_excess(returns):
        # Per-period geometric mean of the return in excess of the risk-free rate.
        return (1 + (returns - risk_free)).prod() ** (1 / n_rows) - 1

    strategy_geo_mean = geometric_mean_excess(strategy)
    strategy_std = strategy.std()
    if strategy_std == 0:
        return 0.0

    sharpe = strategy_geo_mean / strategy_std * annualisation

    # Volatility penalty applies once the strategy exceeds 1.2x market volatility.
    market_volatility = market.std() * annualisation * 100
    strategy_volatility = strategy_std * annualisation * 100
    if market_volatility > 0:
        vol_penalty = 1 + max(0, strategy_volatility / market_volatility - 1.2)
    else:
        vol_penalty = 1

    # Return penalty grows quadratically with the annualised shortfall versus the market.
    market_geo_mean = geometric_mean_excess(market)
    return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap ** 2) / 100

    return sharpe / (vol_penalty * return_penalty)

def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0,
                      min_signal: float = 0.0, max_signal: float = 2.0) -> np.ndarray:
    """Map predicted returns to position signals.

    The signal is ``predictions * multiplier + 1`` (so a prediction of zero maps
    to a position of 1), clipped to ``[min_signal, max_signal]``.

    Parameters
    ----------
    predictions : array-like of float
        Predicted returns. Lists and tuples are accepted as well as ndarrays.
    multiplier : float, default 400.0
        Scale applied to the predictions before the offset of 1 is added.
    min_signal, max_signal : float, default 0.0 and 2.0
        Bounds of the returned signal.

    Returns
    -------
    np.ndarray
        Clipped signals with the same shape as ``predictions``. NaN inputs stay NaN.
    """
    # np.asarray keeps an ndarray (and its dtype) untouched but makes plain
    # sequences work, which `list * float` would not.
    signals = np.asarray(predictions) * multiplier + 1
    return np.clip(signals, min_signal, max_signal)

# --- Main Cross-Validation Script ---
# Expanding-window time-series cross-validation of an XGBoost regressor on the
# pre-selected features, scored per fold with calculate_competition_score.
# NOTE(review): the saved features were ranked on all rows except the last 180,
# so most CV test folds overlap the data used for feature selection; fold scores
# are likely optimistic.

print("Loading pre-processed training data for cross-validation...")
training_df = pl.read_parquet("./final_training_data_150_features.parquet")
# Relies on base_df and generate_features_7 defined in earlier cells.
new_features_df = generate_features_7(base_df)

# 3. Combine base data with new features
# NOTE(review): processed_df / train_df are built here but never used below —
# FEATURES, X, y and scorer_info_df are all taken from training_df, so the
# generated features do not enter the model (X is printed as 150 columns).
# Confirm whether FEATURES was meant to come from processed_df.
processed_df = pl.concat([training_df, new_features_df], how="horizontal")
train_df = processed_df

FEATURES = [col for col in training_df.columns if col not in ["target", "forward_returns", "risk_free_rate"]]
TARGET_COL = "target"

X = training_df.select(FEATURES)
y = training_df.select(TARGET_COL)
# Columns the scoring function needs for each test fold.
scorer_info_df = training_df.select(["forward_returns", "risk_free_rate"])
print(f"Final feature set shape: {X.shape}")
print("\n" + "="*50)
nsplits = 40
print(f"Starting {nsplits}-Fold Time Series Cross-Validation xgboost")

# TimeSeriesSplit yields chronologically ordered folds: each test block follows
# the rows it is trained on.
tscv = TimeSeriesSplit(n_splits=nsplits)
cv_scores = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):
    # Row selection by integer index arrays.
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    y_test_info = scorer_info_df[test_index]
    
    # Use your final model parameters for an accurate score estimate
    model = xgb.XGBRegressor(
        objective='reg:absoluteerror', n_estimators=30, device='cuda',
        learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8,
        n_jobs=-1, random_state=42
    )
    model.fit(X_train, y_train, verbose=False)
    
    # Predicted target -> position signal -> competition score on the held-out block.
    predictions = model.predict(X_test)
    signals = convert_to_signal(predictions)
    score = calculate_competition_score(y_test_info, signals)
    cv_scores.append(score)
    print(f"  Fold {i+1}/{nsplits} Score: {score:.4f}")

# Mean and standard deviation across folds.
print(f"\nMean CV Score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print("="*50)

Loading pre-processed training data for cross-validation...
Final feature set shape: (7990, 150)

Starting 40-Fold Time Series Cross-Validation xgboost
  Fold 1/40 Score: 1.7969
  Fold 2/40 Score: 2.4444


  lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs),


  Fold 3/40 Score: 0.7756
  Fold 4/40 Score: 1.1288
  Fold 5/40 Score: 0.7408
  Fold 6/40 Score: 1.7309
  Fold 7/40 Score: 0.6708
  Fold 8/40 Score: 0.1927
  Fold 9/40 Score: -0.1879
  Fold 10/40 Score: -0.5499
  Fold 11/40 Score: 0.0730
  Fold 12/40 Score: 2.2683
  Fold 13/40 Score: 0.7544
  Fold 14/40 Score: 0.7824
  Fold 15/40 Score: 0.9627
  Fold 16/40 Score: 0.6598
  Fold 17/40 Score: 0.2653
  Fold 18/40 Score: 0.0446
  Fold 19/40 Score: -0.4093
  Fold 20/40 Score: 2.5169
  Fold 21/40 Score: 0.0093
  Fold 22/40 Score: 0.0431
  Fold 23/40 Score: 1.0502
  Fold 24/40 Score: 1.9792
  Fold 25/40 Score: 1.3466
  Fold 26/40 Score: 2.0052
  Fold 27/40 Score: -0.1972
  Fold 28/40 Score: 1.9224
  Fold 29/40 Score: 1.9095
  Fold 30/40 Score: 2.5819
  Fold 31/40 Score: 0.5177
  Fold 32/40 Score: 0.1207
  Fold 33/40 Score: 0.4032
  Fold 34/40 Score: 1.8381
  Fold 35/40 Score: 1.5766
  Fold 36/40 Score: -0.2658
  Fold 37/40 Score: -0.2125
  Fold 38/40 Score: 1.9496
  Fold 39/40 Score: 1.9097
  