# Import + config + load data

In [1]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
from lightgbm import LGBMClassifier  # regressor no longer needed

# ------------------------------------------------
# Config flag: which features to use
# ------------------------------------------------
USE_ONLY_ENG_FEATS = False  # True -> X = X_eng, False -> X = raw + eng

# ------------------------------------------------
# Paths
# ------------------------------------------------
DATA_PATH = Path.home() / "Documents/kaggle/hull_tactical/data"

# ------------------------------------------------
# Load data with Polars
# ------------------------------------------------
train = pl.read_csv(DATA_PATH / "train.csv")
test  = pl.read_csv(DATA_PATH / "test.csv")  # only used for potential future alignment

# Cast non-date columns to float
train = train.with_columns(
    pl.all().exclude("date_id").cast(pl.Float64, strict=False)
)


# Data prep + feature eng + X/y

In [2]:
# ------------------------------------------------
# Simple mean imputation
# ------------------------------------------------
train_filled = train.with_columns([
    pl.when(pl.col(c).is_null())
      .then(pl.col(c).mean())
      .otherwise(pl.col(c))
      .alias(c)
    for c in train.columns if c != "date_id"
])

# ------------------------------------------------
# Base raw feature set (common train/test columns)
# ------------------------------------------------
train_cols = set(train_filled.columns)
test_cols  = set(test.columns)

common_cols = train_cols & test_cols
BASE_FEATURE_COLS = sorted(common_cols - {"is_scored"})

# ------------------------------------------------
# Sort by time
# ------------------------------------------------
train_sorted = train_filled.sort("date_id")

# Target
y = train_sorted["market_forward_excess_returns"].to_numpy()

# Global return std for Kelly-ish stuff (used by alloc_kelly)
GLOBAL_RET_STD = float(np.std(y) + 1e-12)

# ======================================================================
#   FEATURE ENGINEERING (from your preprocess_data)
# ======================================================================
def preprocess_features(df: pl.DataFrame) -> pd.DataFrame:
    """
    Feature engineering adapted from your preprocess_data:
    - day_of_cycle from date_id
    - lags, rolling mean/std for key features
    - EMAs
    - interaction ratios and differences
    - quadratic terms
    - drops target and non-feature columns
    """

    # --- Time-based feature: day_of_cycle ---
    if "date_id" in df.columns:
        df = df.with_columns(
            (pl.col("date_id") % 5).alias("day_of_cycle")  # proxy for day-of-week / cycle
        )

    # Rolling / lag setup
    ROLLING_WINDOWS = [5, 10]
    BASE_FEATURES = ["M1", "E1", "V1", "S1", "T1", "P1", "D1"]

    expressions = []

    # Lags
    LAG_WINDOWS = [1, 5]
    for lag in LAG_WINDOWS:
        for col in BASE_FEATURES:
            if col in df.columns:
                expressions.append(
                    pl.col(col).shift(lag).alias(f"{col}_lag_{lag}")
                )

    # Rolling mean/std
    for window in ROLLING_WINDOWS:
        for col in BASE_FEATURES:
            if col in df.columns:
                expressions.append(
                    pl.col(col)
                      .rolling_mean(window_size=window, min_samples=1)
                      .alias(f"{col}_roll_mean_{window}")
                )
                expressions.append(
                    pl.col(col)
                      .rolling_std(window_size=window, min_samples=1)
                      .alias(f"{col}_roll_std_{window}")
                )

    if expressions:
        df = df.with_columns(expressions)

    # Convert to pandas for EMA & interaction features
    pdf = df.to_pandas()

    # EMAs
    EMA_WINDOWS = [10, 30]
    for window in EMA_WINDOWS:
        for col in BASE_FEATURES:
            if col in pdf.columns:
                pdf[f"{col}_ema_{window}"] = pdf[col].ewm(span=window, adjust=False).mean()

    # Interaction features (ratios, differences)
    FEATURE_PAIRS = [
        ("M1", "M2"),
        ("E1", "E2"),
        ("V1", "V2"),
        ("S1", "S2"),
        ("T1", "T2"),
        ("P1", "P2"),
        ("D1", "D2"),
    ]

    for col1, col2 in FEATURE_PAIRS:
        if col1 in pdf.columns and col2 in pdf.columns:
            denom = pdf[col2].replace(0, 1e-6) + 1e-6
            pdf[f"{col1}_div_{col2}"] = pdf[col1] / denom
            pdf[f"{col1}_minus_{col2}"] = pdf[col1] - pdf[col2]

    # Quadratic terms
    for col in BASE_FEATURES:
        if col in pdf.columns:
            pdf[f"{col}_sq"] = pdf[col] ** 2

    # Drop target / meta columns from features
    EXCLUDE_FINAL_COLS = [
        "date_id",
        "forward_returns",
        "risk_free_rate",
        "market_forward_excess_returns",
        "is_scored",
        "lagged_forward_returns",
        "lagged_risk_free_rate",
        "lagged_market_forward_excess_returns",
    ]

    final_cols = [c for c in pdf.columns if c not in EXCLUDE_FINAL_COLS]
    return pdf[final_cols]

# ------------------------------------------------
# Build X using RAW + ENGINEERED features (controlled by flag)
# ------------------------------------------------

# 1) Raw features (your original baseline X)
X_raw = train_sorted.select(BASE_FEATURE_COLS).to_pandas()
X_raw = X_raw.fillna(X_raw.mean())

# 2) Engineered features (from preprocess_features)
X_eng = preprocess_features(train_sorted)
X_eng = X_eng.fillna(X_eng.mean())

if USE_ONLY_ENG_FEATS:
    X = X_eng
else:
    # 3) Align indices and concatenate raw + engineered
    X_combined = pd.concat(
        [X_raw.reset_index(drop=True),
         X_eng.reset_index(drop=True)],
        axis=1
    )
    # Remove duplicate column names (keep first occurrence)
    X_combined = X_combined.loc[:, ~X_combined.columns.duplicated()]
    X = X_combined

N = len(y)
print("USE_ONLY_ENG_FEATS:", USE_ONLY_ENG_FEATS)
print("X_raw shape:", X_raw.shape)
print("X_eng shape:", X_eng.shape)
print("X shape:", X.shape, "| y shape:", y.shape)


USE_ONLY_ENG_FEATS: False
X_raw shape: (9021, 95)
X_eng shape: (9021, 161)
X shape: (9021, 162) | y shape: (9021,)


# Folds

In [3]:
# ------------------------------------------------
# Late-expanding folds on the tail of the series
# ------------------------------------------------
def make_late_expanding_folds(N, n_folds=5, tail_frac=0.5, min_train_frac_within_tail=0.2):
    """
    Use only the last `tail_frac` of history.
    Within that tail, do expanding folds.

    Example:
      tail_frac=0.5  -> only last half of 9021 days
      n_folds=5      -> 5 expanding train/val splits in that region
    """
    idx = np.arange(N)
    tail_start = int(N * (1.0 - tail_frac))
    tail_idx = idx[tail_start:]
    T = len(tail_idx)

    fractions = np.linspace(min_train_frac_within_tail, 1.0, n_folds + 1)
    bounds = (fractions * T).astype(int)

    folds = []
    for i in range(n_folds):
        train_end = bounds[i]
        val_start = bounds[i]
        val_end   = bounds[i + 1]
        if val_end > val_start and train_end > 0:
            tr = tail_idx[:train_end]
            va = tail_idx[val_start:val_end]
            folds.append((tr, va))
    return folds

# Build late-expanding folds and assign to fold_indices
folds_late_expanding = make_late_expanding_folds(
    N,
    n_folds=5,
    tail_frac=0.5,                 # only use last 50% of the series
    min_train_frac_within_tail=0.2 # first train uses 20% of that tail
)

fold_indices = folds_late_expanding

print("Number of folds (late-expanding):", len(fold_indices))
for i, (tr, va) in enumerate(fold_indices, 1):
    print(
        f"Fold {i}: "
        f"train [{tr[0]}..{tr[-1]}] (len={len(tr)}), "
        f"val [{va[0]}..{va[-1]}] (len={len(va)})"
    )


Number of folds (late-expanding): 5
Fold 1: train [4510..5411] (len=902), val [5412..6132] (len=721)
Fold 2: train [4510..6132] (len=1623), val [6133..6854] (len=722)
Fold 3: train [4510..6854] (len=2345), val [6855..7576] (len=722)
Fold 4: train [4510..7576] (len=3067), val [7577..8298] (len=722)
Fold 5: train [4510..8298] (len=3789), val [8299..9020] (len=722)


In [4]:
# import numpy as np
# import pandas as pd
# import polars as pl
# from pathlib import Path
# from lightgbm import LGBMClassifier  # regressor no longer needed

# # ------------------------------------------------
# # Config flag: which features to use
# # ------------------------------------------------
# USE_ONLY_ENG_FEATS = True   # True -> X = X_eng, False -> X = raw + eng

# # ------------------------------------------------
# # Paths
# # ------------------------------------------------
# DATA_PATH = Path.home() / "Documents/kaggle/hull_tactical/data"

# # ------------------------------------------------
# # Load data with Polars
# # ------------------------------------------------
# train = pl.read_csv(DATA_PATH / "train.csv")
# test  = pl.read_csv(DATA_PATH / "test.csv")  # only used for potential future alignment

# # Cast non-date columns to float
# train = train.with_columns(
#     pl.all().exclude("date_id").cast(pl.Float64, strict=False)
# )

# # ------------------------------------------------
# # Simple mean imputation
# # ------------------------------------------------
# train_filled = train.with_columns([
#     pl.when(pl.col(c).is_null())
#       .then(pl.col(c).mean())
#       .otherwise(pl.col(c))
#       .alias(c)
#     for c in train.columns if c != "date_id"
# ])

# # ------------------------------------------------
# # Base raw feature set (common train/test columns)
# # ------------------------------------------------
# train_cols = set(train_filled.columns)
# test_cols  = set(test.columns)

# common_cols = train_cols & test_cols
# BASE_FEATURE_COLS = sorted(common_cols - {"is_scored"})

# # ------------------------------------------------
# # Sort by time
# # ------------------------------------------------
# train_sorted = train_filled.sort("date_id")

# # Target
# y = train_sorted["market_forward_excess_returns"].to_numpy()

# # Global return std for Kelly-ish stuff (used by alloc_kelly)
# GLOBAL_RET_STD = float(np.std(y) + 1e-12)

# # ======================================================================
# #   FEATURE ENGINEERING (from your preprocess_data)
# # ======================================================================
# def preprocess_features(df: pl.DataFrame) -> pd.DataFrame:
#     """
#     Feature engineering adapted from your preprocess_data:
#     - day_of_cycle from date_id
#     - lags, rolling mean/std for key features
#     - EMAs
#     - interaction ratios and differences
#     - quadratic terms
#     - drops target and non-feature columns
#     """

#     # --- Time-based feature: day_of_cycle ---
#     if "date_id" in df.columns:
#         df = df.with_columns(
#             (pl.col("date_id") % 5).alias("day_of_cycle")  # proxy for day-of-week / cycle
#         )

#     # Rolling / lag setup
#     ROLLING_WINDOWS = [5, 10]
#     BASE_FEATURES = ["M1", "E1", "V1", "S1", "T1", "P1", "D1"]

#     expressions = []

#     # Lags
#     LAG_WINDOWS = [1, 5]
#     for lag in LAG_WINDOWS:
#         for col in BASE_FEATURES:
#             if col in df.columns:
#                 expressions.append(
#                     pl.col(col).shift(lag).alias(f"{col}_lag_{lag}")
#                 )

#     # Rolling mean/std
#     for window in ROLLING_WINDOWS:
#         for col in BASE_FEATURES:
#             if col in df.columns:
#                 expressions.append(
#                     pl.col(col)
#                       .rolling_mean(window_size=window, min_samples=1)
#                       .alias(f"{col}_roll_mean_{window}")
#                 )
#                 expressions.append(
#                     pl.col(col)
#                       .rolling_std(window_size=window, min_samples=1)
#                       .alias(f"{col}_roll_std_{window}")
#                 )

#     if expressions:
#         df = df.with_columns(expressions)

#     # Convert to pandas for EMA & interaction features
#     pdf = df.to_pandas()

#     # EMAs
#     EMA_WINDOWS = [10, 30]
#     for window in EMA_WINDOWS:
#         for col in BASE_FEATURES:
#             if col in pdf.columns:
#                 pdf[f"{col}_ema_{window}"] = pdf[col].ewm(span=window, adjust=False).mean()

#     # Interaction features (ratios, differences)
#     FEATURE_PAIRS = [
#         ("M1", "M2"),
#         ("E1", "E2"),
#         ("V1", "V2"),
#         ("S1", "S2"),
#         ("T1", "T2"),
#         ("P1", "P2"),
#         ("D1", "D2"),
#     ]

#     for col1, col2 in FEATURE_PAIRS:
#         if col1 in pdf.columns and col2 in pdf.columns:
#             denom = pdf[col2].replace(0, 1e-6) + 1e-6
#             pdf[f"{col1}_div_{col2}"] = pdf[col1] / denom
#             pdf[f"{col1}_minus_{col2}"] = pdf[col1] - pdf[col2]

#     # Quadratic terms
#     for col in BASE_FEATURES:
#         if col in pdf.columns:
#             pdf[f"{col}_sq"] = pdf[col] ** 2

#     # Drop target / meta columns from features
#     EXCLUDE_FINAL_COLS = [
#         "date_id",
#         "forward_returns",
#         "risk_free_rate",
#         "market_forward_excess_returns",
#         "is_scored",
#         "lagged_forward_returns",
#         "lagged_risk_free_rate",
#         "lagged_market_forward_excess_returns",
#     ]

#     final_cols = [c for c in pdf.columns if c not in EXCLUDE_FINAL_COLS]
#     return pdf[final_cols]

# # ------------------------------------------------
# # Build X using RAW + ENGINEERED features (controlled by flag)
# # ------------------------------------------------

# # 1) Raw features (your original baseline X)
# X_raw = train_sorted.select(BASE_FEATURE_COLS).to_pandas()
# X_raw = X_raw.fillna(X_raw.mean())

# # 2) Engineered features (from preprocess_features)
# X_eng = preprocess_features(train_sorted)
# X_eng = X_eng.fillna(X_eng.mean())

# if USE_ONLY_ENG_FEATS:
#     X = X_eng
# else:
#     # 3) Align indices and concatenate raw + engineered
#     X_combined = pd.concat(
#         [X_raw.reset_index(drop=True),
#          X_eng.reset_index(drop=True)],
#         axis=1
#     )
#     # Remove duplicate column names (keep first occurrence)
#     X_combined = X_combined.loc[:, ~X_combined.columns.duplicated()]
#     X = X_combined

# N = len(y)
# print("USE_ONLY_ENG_FEATS:", USE_ONLY_ENG_FEATS)
# print("X_raw shape:", X_raw.shape)
# print("X_eng shape:", X_eng.shape)
# print("X shape:", X.shape, "| y shape:", y.shape)

# # ------------------------------------------------
# # Walk-forward splits
# # ------------------------------------------------
# fractions = [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
# bounds = [int(f * N) for f in fractions]

# fold_indices = []
# for i in range(5):
#     train_end = bounds[i]
#     val_start = bounds[i]
#     val_end   = bounds[i+1]
#     fold_indices.append((np.arange(0, train_end), np.arange(val_start, val_end)))


# Allocation Strategies

In [5]:
# ======================================================================
#   NUMERICALLY SAFE SOFTPLUS
# ======================================================================
def softplus_safe(x):
    # Avoid overflow in exp(x)
    x = np.asarray(x)
    return np.where(x > 30, x, np.log1p(np.exp(x)))


# ======================================================================
#   BASE ALLOCATION STRATEGIES  
#   (the ones you've already tested)
# ======================================================================

# 1) ✔ Tanh scaling (default, stable)
def alloc_tanh(pred):
    EDGE_SCALE = 50.0
    a = 1.0 + np.tanh(pred * EDGE_SCALE)
    return np.clip(a, 0.0, 2.0)

# 2) ✔ Linear scaling (simple, can be volatile)
def alloc_linear(pred):
    SCALE = 300.0
    a = 1.0 + pred * SCALE
    return np.clip(a, 0.0, 2.0)

# 3) ✔ Piecewise (momentum-style)
def alloc_piecewise(pred):
    a = np.where(pred > 0, 1.25, 0.75)
    return np.clip(a, 0.0, 2.0)

# 4) ✔ Asymmetric tanh (more long than short)
def alloc_asymmetric(pred):
    SCALE_UP = 60.0
    SCALE_DOWN = 30.0
    pos = 1.0 + np.tanh(pred * SCALE_UP)
    neg = 1.0 + np.tanh(pred * SCALE_DOWN)
    a = np.where(pred >= 0, pos, neg)
    return np.clip(a, 0.0, 2.0)

# 5) ✔ Softplus (smooth convex response)
def alloc_softplus(pred):
    SCALE = 70.0
    x = pred * SCALE
    sp = softplus_safe(x)
    a = 1.0 + (sp - 0.5) / 20.0
    return np.clip(a, 0.0, 2.0)

# 6) ✔ Clipped sign (very simple and robust)
def alloc_sign(pred):
    a = 1.0 + 0.5 * np.sign(pred)
    return np.clip(a, 0.0, 2.0)


# ======================================================================
#   ADVANCED ALLOCATION STRATEGIES
# ======================================================================

# 7) Volatility-scaled softplus
def alloc_softplus_vol(pred):
    WINDOW = 20
    SCALE = 70.0

    # True short-circuit safe softplus
    def softplus_safe_inner(z):
        z = np.asarray(z)
        out = np.empty_like(z)
        mask = z > 30
        out[mask] = z[mask]
        out[~mask] = np.log1p(np.exp(z[~mask]))
        return out

    s = pd.Series(pred)
    vol = s.rolling(WINDOW, min_periods=1).std().to_numpy()

    # Safe volatility
    safe_vol = np.where((vol <= 1e-12) | (~np.isfinite(vol)), 1e-6, vol)

    # Normalize
    norm = pred / safe_vol
    norm = np.where(~np.isfinite(norm), 0.0, norm)

    # Softplus input
    x = norm * SCALE
    x = np.where(~np.isfinite(x), 0.0, x)

    # Apply fully safe softplus
    sp = softplus_safe_inner(x)

    # Final allocation
    a = 1.0 + (sp - 0.5) / 20.0
    a = np.where(~np.isfinite(a), 1.0, a)

    return np.clip(a, 0.0, 2.0)


# 8) Thresholded tanh (ignore small/noisy signals)
def alloc_tanh_threshold(pred):
    THRESH = 0.0002  # tune this
    pred_thr = np.where(np.abs(pred) < THRESH, 0.0, pred)
    EDGE_SCALE = 50.0
    a = 1.0 + np.tanh(pred_thr * EDGE_SCALE)
    return np.clip(a, 0.0, 2.0)

# 9) Kelly-inspired allocation
def alloc_kelly(pred):
    sigma2 = GLOBAL_RET_STD**2
    EDGE_SCALE = 1.0
    f_raw = (pred * EDGE_SCALE) / sigma2
    f_clipped = np.clip(f_raw, -1.0, 1.0)
    a = 1.0 + f_clipped
    a = np.where(~np.isfinite(a), 1.0, a)
    return np.clip(a, 0.0, 2.0)

# 10) Regime-switching: low-vol uses aggressive softplus, high-vol shrinks bets
def alloc_regime_softplus(pred):
    WINDOW = 20
    s = pd.Series(pred)
    vol = s.rolling(WINDOW, min_periods=1).std().to_numpy()
    median_vol = np.nanmedian(vol) if np.any(np.isfinite(vol)) else 0.0

    high_vol = vol > median_vol

    # Low-vol: regular softplus
    a_low = alloc_softplus(pred)

    # High-vol: compressed tanh around 1.0 (less leverage)
    SCALE_HV = 20.0
    a_high = 1.0 + 0.5 * np.tanh(pred * SCALE_HV)

    a = np.where(high_vol, a_high, a_low)
    a = np.where(~np.isfinite(a), 1.0, a)
    return np.clip(a, 0.0, 2.0)

# 11) Softplus scaled by rolling Sharpe of the signal
def alloc_softplus_sharpe(pred):
    WINDOW = 60
    ALPHA = 0.5

    s = pd.Series(pred)

    roll_mean = s.rolling(WINDOW, min_periods=5).mean().to_numpy()
    roll_std  = s.rolling(WINDOW, min_periods=5).std().to_numpy()

    safe_std = np.where((roll_std <= 1e-12) | (~np.isfinite(roll_std)), 1e-6, roll_std)

    roll_sharpe = roll_mean / safe_std
    roll_sharpe = np.where(~np.isfinite(roll_sharpe), 0.0, roll_sharpe)

    # Factor in [0,1]
    factor = 0.5 + 0.5 * np.tanh(ALPHA * roll_sharpe)
    factor = np.where(~np.isfinite(factor), 0.5, factor)

    base = alloc_softplus(pred)
    a = 1.0 + (base - 1.0) * factor
    a = np.where(~np.isfinite(a), 1.0, a)

    return np.clip(a, 0.0, 2.0)


In [6]:
# # ======================================================================
# #   Sharpe-like competition evaluation
# # ======================================================================
# def sharpe_like_strategy(y_true, allocation):
#     strat = allocation * y_true
#     return float(np.mean(strat) / (np.std(strat) + 1e-9))


# def pretty_print_scores(name, scores):
#     print(f"{name} per fold:")
#     for i, s in enumerate(scores, 1):
#         print(f"  Fold {i}: {s:.6f}")
#     print(f"{name} avg: {np.mean(scores):.6f}\n")


# # ======================================================================
# #   SELECT STRATEGY HERE (ONE ONLY)
# #   (keeping your known last-return scores in comments)
# # ======================================================================
# # allocation_fn = alloc_softplus        # last return 0.009228 __ lgbRegressor 0.010550 __ lgbClassifier 0.016441 __ lgbClassifier + engFeats 0.016018 __ lgbClassifier + allFeats 0.014958 __ lgbRegressor + allFeats 0.010536 __ lgbRegressor + engFeats 0.010548 __ lgbRegressor + allFeats diff 0.010535 __ lgbRegressor + engFeats diff 0.010561
# # allocation_fn = alloc_piecewise       # last return 0.002353 __ lgbRegressor 0.013183 __ lgbClassifier 0.014958 __ lgbClassifier + engFeats 0.012609 __ lgbClassifier + allFeats 0.013328 __ lgbRegressor + allFeats 0.016140 __ lgbRegressor + engFeats 0.013942 __ lgbRegressor + allFeats diff 0.013353 __ lgbRegressor + engFeats diff 0.016874
# # allocation_fn = alloc_tanh            # last return -0.008881 __ lgbRegressor 0.015537 __ lgbClassifier 0.023084 __ lgbClassifier + engFeats 0.015506 __ lgbClassifier + allFeats 0.017850 __ lgbRegressor + allFeats 0.014679 __ lgbRegressor + engFeats 0.014429 __ lgbRegressor + allFeats diff 0.016001 __ lgbRegressor + engFeats diff 0.016848
# # allocation_fn = alloc_linear          # last return -0.017213 __ lgbRegressor 0.022014 __ lgbClassifier 0.021528 __ lgbClassifier + engFeats 0.015748 __ lgbClassifier + allFeats 0.016321 __ lgbRegressor + allFeats 0.020992 __ lgbRegressor + engFeats 0.015480 __ lgbRegressor + allFeats diff 0.024258 __ lgbRegressor + engFeats diff 0.024873
# allocation_fn = alloc_asymmetric      # last return -0.005558 __ lgbRegressor 0.015647 __ lgbClassifier 0.023039 __ lgbClassifier + engFeats 0.015523 __ lgbClassifier + allFeats 0.018372 __ lgbRegressor + allFeats 0.014747 __ lgbRegressor + engFeats 0.014558 __ lgbRegressor + allFeats diff 0.015411 __ lgbRegressor + engFeats diff 0.016220
# # allocation_fn = alloc_sign            # last return -0.005170 __ lgbRegressor 0.014692 __ lgbClassifier 0.018195 __ lgbClassifier + engFeats 0.014055 __ lgbClassifier + allFeats 0.015280 __ lgbRegressor + allFeats 0.020447 __ lgbRegressor + engFeats 0.016303 __ lgbRegressor + allFeats diff 0.015221 __ lgbRegressor + engFeats diff 0.021624

# # Advanced ones (you’ve tested some of these)
# # allocation_fn = alloc_softplus_vol    # last return -0.000803 __ lgbRegressor 0.015839 __ lgbClassifier 0.018269 __ lgbClassifier + engFeats 0.012075 __ lgbClassifier + allFeats 0.013491 __ lgbRegressor + allFeats 0.018407 __ lgbRegressor + engFeats 0.015977 __ lgbRegressor + allFeats diff 0.015086 __ lgbRegressor + engFeats diff 0.017941
# # allocation_fn = alloc_tanh_threshold  # last return -0.008884 __ lgbRegressor 0.015549 __ lgbClassifier 0.023084 __ lgbClassifier + engFeats 0.015506 __ lgbClassifier + allFeats 0.017850 __ lgbRegressor + allFeats 0.014672 __ lgbRegressor + engFeats 0.014426 __ lgbRegressor + allFeats diff 0.016013 __ lgbRegressor + engFeats diff 0.016846
# # allocation_fn = alloc_kelly           # last return -0.016253 __ lgbRegressor 0.016557 __ lgbClassifier 0.021611 __ lgbClassifier + engFeats 0.015409 __ lgbClassifier + allFeats 0.017153 __ lgbRegressor + allFeats 0.025583 __ lgbRegressor + engFeats 0.016114 __ lgbRegressor + allFeats diff 0.017716 __ lgbRegressor + engFeats diff 0.025736
# # allocation_fn = alloc_regime_softplus # last return 0.005810 __ lgbRegressor 0.011300 __ lgbClassifier 0.019707 __ lgbClassifier + engFeats 0.017977 __ lgbClassifier + allFeats 0.017953 __ lgbRegressor + allFeats 0.011064 __ lgbRegressor + engFeats 0.011259 __ lgbRegressor + allFeats diff 0.011379 __ lgbRegressor + engFeats diff 0.011370
# # allocation_fn = alloc_softplus_sharpe # last return 0.009759 __ lgbRegressor 0.010465 __ lgbClassifier 0.015754 __ lgbClassifier + engFeats 0.016337 __ lgbClassifier + allFeats 0.015771 __ lgbRegressor + allFeats 0.010471 __ lgbRegressor + engFeats 0.010486 __ lgbRegressor + allFeats diff 0.010435 __ lgbRegressor + engFeats diff 0.010449

In [7]:
# # ------------------------------------------------
# # Baseline: last-return signal → allocation → strategy Sharpe
# # ------------------------------------------------
# scores_last = []

# for train_idx, val_idx in fold_indices:
#     y_val = y[val_idx]

#     # Last return baseline signal
#     s_last = y[val_idx - 1]
#     a_last = allocation_fn(s_last)
#     scores_last.append(sharpe_like_strategy(y_val, a_last))

# pretty_print_scores("Baseline (last return) Allocation Sharpe-like", scores_last)


In [8]:
# # ------------------------------------------------
# # LightGBM MODEL: regression or classification
# # ------------------------------------------------

# # Choose your signal model
# # MODEL_TYPE = "regression"
# MODEL_TYPE = "classification"

# lgb_rmse_scores = []
# lgb_sharpe_scores = []

# from lightgbm import LGBMRegressor, LGBMClassifier

# for fold, (train_idx, val_idx) in enumerate(fold_indices, 1):
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]

#     # --------------------------------------------
#     # Model selection
#     # --------------------------------------------
#     if MODEL_TYPE == "regression":
#         model = LGBMRegressor(
#             n_estimators=1000,
#             learning_rate=0.03,
#             subsample=0.8,
#             colsample_bytree=0.8,
#             objective="regression",
#             random_state=42,
#         )
#     elif MODEL_TYPE == "classification":
#         # Convert to up/down
#         y_train_clf = (y_train > 0).astype(int)
#         y_val_clf   = (y_val > 0).astype(int)

#         model = LGBMClassifier(
#             n_estimators=1000,
#             learning_rate=0.03,
#             subsample=0.8,
#             colsample_bytree=0.8,
#             objective="binary",
#             random_state=42,
#         )
#     else:
#         raise ValueError("Unknown model type.")

#     # --------------------------------------------
#     # Fit model
#     # --------------------------------------------
#     if MODEL_TYPE == "regression":
#         model.fit(X_train, y_train)
#         y_pred_raw = model.predict(X_val)

#         # Signal is raw return prediction
#         signal = y_pred_raw

#         # RMSE only meaningful for regression
#         rmse = float(np.sqrt(((y_val - y_pred_raw) ** 2).mean()))
#         lgb_rmse_scores.append(rmse)

#     else:  # classification
#         model.fit(X_train, y_train_clf)

#         # Probability P(up)
#         p_up = model.predict_proba(X_val)[:,1]

#         # Convert to signal
#         signal = p_up - 0.5  # centered around zero

#         # Regression RMSE not meaningful in classification
#         lgb_rmse_scores.append(np.nan)

#     # --------------------------------------------
#     # Allocation
#     # --------------------------------------------
#     alloc_model = allocation_fn(signal)

#     sharpe_model = sharpe_like_strategy(y_val, alloc_model)
#     lgb_sharpe_scores.append(sharpe_model)

#     print(f"Fold {fold} LGBM-{MODEL_TYPE} | Allocation Sharpe-like: {sharpe_model:.6f}")

# print()
# pretty_print_scores("LightGBM RMSE", lgb_rmse_scores)
# pretty_print_scores("LightGBM Allocation Sharpe-like", lgb_sharpe_scores)


In [9]:
# ======================================================================
#   Sharpe-like competition evaluation
# ======================================================================
def sharpe_like_strategy(y_true, allocation):
    strat = allocation * y_true
    return float(np.mean(strat) / (np.std(strat) + 1e-9))


def pretty_print_scores(name, scores):
    scores = [float(s) for s in scores]
    print(f"{name} per fold:")
    for i, s in enumerate(scores, 1):
        print(f"  Fold {i}: {s:.6f}")
    print(f"{name} avg: {np.mean(scores):.6f}\n")


# ======================================================================
#   ALL ALLOCATION STRATEGIES IN ONE PLACE
# ======================================================================
allocation_strategies = {
    "softplus":        alloc_softplus,
    "piecewise":       alloc_piecewise,
    "tanh":            alloc_tanh,
    "linear":          alloc_linear,
    "asymmetric":      alloc_asymmetric,
    "sign":            alloc_sign,
    "softplus_vol":    alloc_softplus_vol,
    "tanh_threshold":  alloc_tanh_threshold,
    "kelly":           alloc_kelly,
    "regime_softplus": alloc_regime_softplus,
    "softplus_sharpe": alloc_softplus_sharpe,
}

# ======================================================================
#   LightGBM REGRESSOR: E[return] → signal → allocation → Sharpe
# ======================================================================
from lightgbm import LGBMRegressor

print("=== LGBMRegressor Allocation Sharpe-like per strategy ===\n")

# store per-strategy Sharpe across folds
lgb_sharpe_scores = {name: [] for name in allocation_strategies.keys()}

for fold, (train_idx, val_idx) in enumerate(fold_indices, 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = LGBMRegressor(
        objective='regression',
        metric='rmse',
        n_estimators=5000,             
        learning_rate=0.008,           
        max_depth=-1,                  
        num_leaves=511,                
        subsample=0.8,                
        colsample_bytree=0.8,         
        random_state=42,
        reg_lambda=3.0,                
        min_child_samples=15,          
        boosting_type='gbdt'
    )

    # Fit directly on continuous returns
    model.fit(X_train, y_train)

    # Predicted expected return as signal
    y_pred = model.predict(X_val)

    # Option: center at 0 (keeps symmetry long/short)
    signal = y_pred  # or (y_pred - y_pred.mean()) if you prefer

    # Evaluate ALL allocation strategies on this same signal
    print(f"Fold {fold}:")

    for name, fn in allocation_strategies.items():
        alloc_model = fn(signal)
        sharpe_model = sharpe_like_strategy(y_val, alloc_model)
        lgb_sharpe_scores[name].append(sharpe_model)

        print(f"  {name:16s} Sharpe-like: {sharpe_model:.6f}")
    print()

# Summary per strategy
print("\n=== Summary: LGBMRegressor Sharpe-like per strategy ===\n")
for name, scores in lgb_sharpe_scores.items():
    pretty_print_scores(f"LGBMRegressor + {name}", scores)

=== LGBMRegressor Allocation Sharpe-like per strategy ===

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31449
[LightGBM] [Info] Number of data points in the train set: 902, number of used features: 145
[LightGBM] [Info] Start training from score -0.000169
Fold 1:
  softplus         Sharpe-like: 0.038462
  piecewise        Sharpe-like: 0.040963
  tanh             Sharpe-like: 0.036206
  linear           Sharpe-like: 0.019954
  asymmetric       Sharpe-like: 0.038301
  sign             Sharpe-like: 0.041222
  softplus_vol     Sharpe-like: 0.034072
  tanh_threshold   Sharpe-like: 0.036136
  kelly            Sharpe-like: 0.035959
  regime_softplus  Sharpe-like: 0.039270
  softplus_sharpe  Sharpe-like: 0.038497

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000988 seconds.
You can set `force_col_wise=true` 