# HULL TACTICAL MARKET PREDICTION - SUBMISSION NOTEBOOK

This notebook adds feature engineering (lag and rolling-window features) to the problem and trains a LightGBM model (the best-performing model from step 2) on the
preprocessed dataset.

In [15]:
import os
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Load data
TRAIN_PATH = "../data/train.csv"
TEST_PATH  = "../data/test.csv"

# Both CSVs are read with default pandas settings; the paths above are relative.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print(f"Training Shape: {train_df.shape}\nTesting Shape: {test_df.shape}")


Training Shape: (9021, 98)
Testing Shape: (10, 99)


Creating new features

In [16]:
# Feature parameters
LAGS = [1, 3, 5]        # default shift distances (in rows) for lag features
ROLL_WINDOWS = [5, 10]  # default window lengths (in rows) for rolling mean/std


def create_features(df, base_cols, lags=None, roll_windows=None, backfill=True):
    """Return a copy of ``df`` extended with lag and rolling-window features.

    For every column in ``base_cols`` this adds:

    * ``{col}_lag_{k}``        - the column shifted down by ``k`` rows, for each lag;
    * ``{col}_roll_mean_{w}``  - rolling mean over the last ``w`` rows;
    * ``{col}_roll_std_{w}``   - rolling (sample) standard deviation over ``w`` rows.

    All new columns are built first and attached with a single ``pd.concat``,
    instead of being inserted one by one, which avoids the DataFrame
    fragmentation that pandas reports as ``PerformanceWarning``.

    Missing values in the WHOLE frame (original columns included) are then
    forward-filled, optionally backward-filled, and finally replaced by 0.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows in chronological order (shifts and windows are row based).
        The input frame is not modified.
    base_cols : iterable of str
        Columns to derive features from.
    lags : iterable of int, optional
        Shift distances; defaults to the module-level ``LAGS``.
    roll_windows : iterable of int, optional
        Window lengths; defaults to the module-level ``ROLL_WINDOWS``.
    backfill : bool, default True
        Keep the historical ``ffill().bfill().fillna(0)`` behaviour. Backward
        filling copies values from LATER rows into earlier ones (look-ahead);
        pass ``False`` to use ``ffill().fillna(0)`` only.

    Returns
    -------
    pd.DataFrame
        Original columns followed by the lag features, then the rolling
        features. A pre-existing column whose name collides with a generated
        feature is replaced by the generated one (appended at the end).
    """
    lags = LAGS if lags is None else lags
    roll_windows = ROLL_WINDOWS if roll_windows is None else roll_windows

    # name -> Series; a dict keeps insertion order and collapses duplicate names.
    generated = {}

    # ----- LAGS -----
    for col in base_cols:
        for lag in lags:
            generated[f"{col}_lag_{lag}"] = df[col].shift(lag)

    # ----- ROLLING FEATURES -----
    for col in base_cols:
        for w in roll_windows:
            window = df[col].rolling(w)
            generated[f"{col}_roll_mean_{w}"] = window.mean()
            generated[f"{col}_roll_std_{w}"] = window.std()

    # Drop columns that would otherwise end up duplicated, then attach
    # everything in one concat (no per-column insertion, no fragmentation).
    kept = df.drop(columns=[c for c in df.columns if c in generated])
    named = [series.rename(name) for name, series in generated.items()]
    out = pd.concat([kept, *named], axis=1)

    # Simple fills
    out = out.ffill()
    if backfill:
        # NOTE: look-ahead - leading NaNs are filled from future rows.
        out = out.bfill()
    return out.fillna(0)

In [17]:
# ============================================================
# Final selected features (quota features from EDA)
# ============================================================

# Base columns that receive lag / rolling features. The order is kept exactly
# as selected; lines are only wrapped by column-name prefix for readability.
FEATURE_COLS = [
    'E19', 'E3', 'E2', 'E4', 'E13',
    'S2', 'S5', 'S6',
    'V3', 'V13', 'V7', 'V5',
    'I6', 'I2',
    'M4', 'M3', 'M12',
    'P11', 'P5', 'P10',
]

# Regression target column.
TARGET = "market_forward_excess_returns"

# Shared chronological 5-fold splitter used by the local evaluation cell.
tscv = TimeSeriesSplit(n_splits=5)

Defining evaluation metrics

In [18]:
# Local copy of the Kaggle scoring function (volatility-adjusted Sharpe ratio).

# Allowed range for a submitted position (fraction of capital in the market).
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2

class ParticipantVisibleError(Exception):
    """Raised when a submission is malformed (non-numeric, missing or out-of-range)."""
    pass

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name=None) -> float:
    """Compute the penalised Sharpe ratio of a position series.

    Parameters
    ----------
    solution : pd.DataFrame
        Must contain ``forward_returns`` and ``risk_free_rate``. Not modified.
    submission : pd.DataFrame
        Must contain a numeric ``prediction`` column with one position per
        solution row, matched on the index, each in
        ``[MIN_INVESTMENT, MAX_INVESTMENT]``.
    row_id_column_name : optional
        Accepted for signature compatibility; not used.

    Returns
    -------
    float
        ``sharpe / (vol_penalty * return_penalty)`` capped at 1,000,000, or
        ``0.0`` when the strategy returns have zero standard deviation.

    Raises
    ------
    ParticipantVisibleError
        If predictions are non-numeric, missing for any solution row, or
        outside the allowed investment range.
    """
    if not pd.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    solution = solution.copy()
    solution['position'] = submission['prediction']

    # pandas aligns on the index: rows without a matching prediction become NaN
    # and would then be silently skipped by .prod()/.std(), yielding a score for
    # a subset of the rows. Fail loudly instead.
    if solution['position'].isna().any():
        raise ParticipantVisibleError(
            'Missing predictions: every solution row needs a non-null prediction '
            '(check that solution and submission share the same index)'
        )

    # Enforce the investment bounds declared above.
    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(
            f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}'
        )
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(
            f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}'
        )

    # Blend of risk-free asset and market according to the position.
    strategy_returns = (
        solution['risk_free_rate'] * (1 - solution['position']) +
        solution['position'] * solution['forward_returns']
    )

    # Geometric mean of the strategy's excess return per period.
    strategy_excess_returns = strategy_returns - solution['risk_free_rate']
    strat_cum = (1 + strategy_excess_returns).prod()
    strat_mean = strat_cum ** (1/len(solution)) - 1
    strategy_std = strategy_returns.std()

    trading_days = 252
    if strategy_std == 0:
        # Constant strategy returns: Sharpe is undefined, report a neutral score.
        return 0.0

    sharpe = strat_mean / strategy_std * np.sqrt(trading_days)
    strategy_vol = float(strategy_std * np.sqrt(trading_days) * 100)

    # Same statistics for a 100% market position (the benchmark).
    market_excess = solution['forward_returns'] - solution['risk_free_rate']
    market_cum = (1 + market_excess).prod()
    market_mean = market_cum ** (1/len(solution)) - 1
    market_std = solution['forward_returns'].std()
    market_vol = float(market_std * np.sqrt(trading_days) * 100)

    # Penalise volatility above 120% of the market's; a zero-volatility market
    # gives no reference level, so no volatility penalty is applied (and the
    # division by zero is avoided).
    excess_vol = max(0, strategy_vol/market_vol - 1.2) if market_vol > 0 else 0
    vol_penalty = 1 + excess_vol

    # Penalise under-performing the market (annualised gap in percent, squared).
    return_gap = max(0, (market_mean - strat_mean) * 100 * trading_days)
    return_penalty = 1 + (return_gap**2)/100

    adjusted = sharpe / (vol_penalty * return_penalty)
    return float(min(adjusted, 1_000_000))

In [19]:
# Prediction → Allocation + Local Kaggle metric
def returns_to_allocation(pred):
    """Convert a predicted return into a market allocation between 0 and 2.

    Non-positive predictions map to 0.0, predictions of 0.01 or more map to
    2.0, and anything in between is scaled linearly.
    """
    at_floor = pred <= 0
    at_cap = pred >= 0.01

    # Neither bound reached: interpolate linearly between 0 and 2.
    if not (at_floor or at_cap):
        return 2.0 * (pred / 0.01)

    return 0.0 if at_floor else 2.0

def kaggle_local_score(model, X_valid, df_valid):
    """Score a fitted model on a validation fold with the local Kaggle metric.

    Predictions are clipped to [-0.05, 0.05], converted to allocations with
    ``returns_to_allocation`` and evaluated by ``score`` against the
    ``forward_returns`` / ``risk_free_rate`` columns of ``df_valid``.
    """
    clipped_preds = np.clip(model.predict(X_valid), -0.05, 0.05)
    positions = np.array(list(map(returns_to_allocation, clipped_preds)))

    # Rebuild both frames from raw values so they share a fresh 0..n-1 index.
    truth = pd.DataFrame({
        "forward_returns": df_valid["forward_returns"].values,
        "risk_free_rate": df_valid["risk_free_rate"].values
    })
    allocations = pd.DataFrame({"prediction": positions})

    return score(truth, allocations)

Model Hyperparameter Search

In [None]:
# LightGBM Hyperparameter Search (FAST GRID)

def evaluate_lgbm_params(params, n_splits=5):
    """Cross-validate one LightGBM configuration with the local Kaggle metric.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMRegressor`` (``random_state=42``
        and ``verbose=-1`` are added here).
    n_splits : int, default 5
        Number of chronological folds for ``TimeSeriesSplit``.

    Returns
    -------
    float
        Mean of the per-fold ``kaggle_local_score`` values.

    Notes
    -----
    The model is trained on every column of the engineered frame except the
    target, ``forward_returns`` and ``risk_free_rate`` - i.e. all remaining raw
    columns plus the generated features, not only ``FEATURE_COLS``.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    non_feature_cols = (TARGET, 'forward_returns', 'risk_free_rate')
    fold_scores = []
    fe_feature_cols = None

    for tr_idx, va_idx in splitter.split(train_df):

        df_tr = train_df.iloc[tr_idx].copy()
        df_va = train_df.iloc[va_idx].copy()

        # Feature engineering on the training rows only.
        df_tr_fe = create_features(df_tr, FEATURE_COLS)

        # Validation features: engineer on train + valid so the validation rows
        # get their lag/rolling history from the training rows, then keep only
        # the validation part.
        df_combined = pd.concat([df_tr, df_va], ignore_index=True)
        df_va_fe = create_features(df_combined, FEATURE_COLS).iloc[len(df_tr):]

        # The engineered feature list is determined once, on the first fold.
        if fe_feature_cols is None:
            fe_feature_cols = [
                c for c in df_tr_fe.columns
                if c not in non_feature_cols
            ]

        model = lgb.LGBMRegressor(**params, random_state=42, verbose=-1)
        model.fit(df_tr_fe[fe_feature_cols], df_tr_fe[TARGET])

        # df_va (un-engineered) supplies forward_returns / risk_free_rate.
        # Named fold_score so the module-level `score` function is not shadowed.
        fold_score = kaggle_local_score(model, df_va_fe[fe_feature_cols], df_va)
        fold_scores.append(fold_score)

    return float(np.mean(fold_scores))


# Candidate LightGBM configurations evaluated by the search loop below.
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only takes
# effect when `subsample_freq` (bagging_freq) is > 0; none of these configs set
# it, so the differing `subsample` values are presumably no-ops - confirm.
lgbm_param_grid = [

    # --- Around the previous best found config ---
    {"n_estimators": 800, "learning_rate": 0.02, "num_leaves": 120, "subsample": 0.7, "colsample_bytree": 0.7,
     "max_depth": -1, "min_child_samples": 20, "reg_alpha": 0.0, "reg_lambda": 0.0},

    {"n_estimators": 1000, "learning_rate": 0.015, "num_leaves": 150, "subsample": 0.8, "colsample_bytree": 0.8,
     "max_depth": -1, "min_child_samples": 30, "reg_alpha": 0.0, "reg_lambda": 0.0},

    {"n_estimators": 600, "learning_rate": 0.03, "num_leaves": 100, "subsample": 0.6, "colsample_bytree": 0.6,
     "max_depth": 10, "min_child_samples": 10, "reg_alpha": 0.0, "reg_lambda": 0.0},

    # --- More robust / regularized versions ---
    {"n_estimators": 900, "learning_rate": 0.012, "num_leaves": 80, "subsample": 0.9, "colsample_bytree": 0.9,
     "max_depth": 8, "min_child_samples": 40, "reg_alpha": 0.1, "reg_lambda": 0.1},

    {"n_estimators": 1200, "learning_rate": 0.01, "num_leaves": 160, "subsample": 1.0, "colsample_bytree": 1.0,
     "max_depth": -1, "min_child_samples": 50, "reg_alpha": 0.3, "reg_lambda": 0.5},
]

# -inf (rather than an arbitrary -999) guarantees any real score beats the start value.
best_lgbm_score = float("-inf")
best_lgbm_params = None

for params in lgbm_param_grid:
    print("Testing LGBM params:", params)
    score_mean = evaluate_lgbm_params(params)
    print("→ Score:", score_mean)

    # Strict '>' keeps the first configuration on ties; NaN scores never win.
    if score_mean > best_lgbm_score:
        best_lgbm_score = score_mean
        best_lgbm_params = params

# Fail with a clear message instead of a TypeError from `**None` below.
if best_lgbm_params is None:
    raise RuntimeError(
        "Hyperparameter search produced no valid score "
        "(empty grid or every configuration scored NaN)."
    )

print("\nBEST LGBM SCORE:", best_lgbm_score)
print("BEST LGBM PARAMS:", best_lgbm_params)


# Define model (verbose=-1 for consistency with the other estimators in this notebook)
lgbm_model = lgb.LGBMRegressor(**best_lgbm_params, random_state=42, verbose=-1)


Testing LGBM params: {'n_estimators': 800, 'learning_rate': 0.02, 'num_leaves': 120, 'subsample': 0.7, 'colsample_bytree': 0.7, 'max_depth': -1, 'min_child_samples': 20, 'reg_alpha': 0.0, 'reg_lambda': 0.0}
→ Score: 0.35538956176589326
Testing LGBM params: {'n_estimators': 1000, 'learning_rate': 0.015, 'num_leaves': 150, 'subsample': 0.8, 'colsample_bytree': 0.8, 'max_depth': -1, 'min_child_samples': 30, 'reg_alpha': 0.0, 'reg_lambda': 0.0}
→ Score: 0.3017576083563382
Testing LGBM params: {'n_estimators': 600, 'learning_rate': 0.03, 'num_leaves': 100, 'subsample': 0.6, 'colsample_bytree': 0.6, 'max_depth': 10, 'min_child_samples': 10, 'reg_alpha': 0.0, 'reg_lambda': 0.0}
→ Score: 0.5181689055683574
Testing LGBM params: {'n_estimators': 900, 'learning_rate': 0.012, 'num_leaves': 80, 'subsample': 0.9, 'colsample_bytree': 0.9, 'max_depth': 8, 'min_child_samples': 40, 'reg_alpha': 0.1, 'reg_lambda': 0.1}
→ Score: 0.3088060256235065
Testing LGBM params: {'n_estimators': 1200, 'learning_rate

Local Evaluation of the model

In [21]:
# Local model evaluation using Kaggle metric
# Re-runs the 5-fold chronological CV for the selected model and prints each
# fold's score. Also sets the global FE_FEATURE_COLS used by later cells.

# NOTE(review): `models` is not read anywhere in the visible notebook.
models = {
    "LGBM": lgbm_model
}

scores = {"LGBM": []}
FE_FEATURE_COLS = None  # set on the first fold

for train_idx, valid_idx in tscv.split(train_df):

    # 1) Train & valid split for this fold
    df_train_fold = train_df.iloc[train_idx].copy()
    df_valid_fold = train_df.iloc[valid_idx].copy()

    # 2) Apply FE on train only
    df_train_fe = create_features(df_train_fold, FEATURE_COLS)

    # Apply FE on train+valid so the validation rows get their lag/rolling
    # history from the preceding training rows, then keep only the validation rows
    df_combined = pd.concat([df_train_fold, df_valid_fold], ignore_index=True)
    df_combined_fe = create_features(df_combined, FEATURE_COLS)
    df_valid_fe = df_combined_fe.iloc[len(df_train_fold):].copy()

    # 3) Determine FE feature columns once
    # (every engineered-frame column except the target and the two return columns)
    if FE_FEATURE_COLS is None:
        FE_FEATURE_COLS = [
            c for c in df_train_fe.columns
            if c not in [TARGET, "forward_returns", "risk_free_rate"]
        ]
        print("Number of FE features:", len(FE_FEATURE_COLS))

    # Extract matrices
    Xtr = df_train_fe[FE_FEATURE_COLS]
    ytr = df_train_fe[TARGET]

    Xva = df_valid_fe[FE_FEATURE_COLS]
    yva = df_valid_fe[TARGET]  # NOTE(review): not used below

    df_va_solution = df_valid_fold  # needed for Kaggle metric

    # 4) Evaluate the LightGBM model
    # The same estimator object is refit on every fold.
    model = lgbm_model
    model.fit(Xtr, ytr)

    fold_score = kaggle_local_score(model, Xva, df_va_solution)
    scores["LGBM"].append(fold_score)

    print(f"LGBM fold score = {fold_score}")

# Summary
print("\nMEAN SCORE (LGBM):", np.mean(scores["LGBM"]))

Number of FE features: 235
LGBM fold score = 0.5851272498288512
LGBM fold score = 0.5418125232149132
LGBM fold score = 0.42386553209091005
LGBM fold score = 0.8406712293333722
LGBM fold score = 0.19936799337374

MEAN SCORE (LGBM): 0.5181689055683574


Final Training of the model

In [22]:
# ============================================================
# Train final LightGBM model on full FE dataset
# ============================================================

# Engineer features over the complete training history.
train_fe_full = create_features(train_df, FEATURE_COLS)

# Feature list comes from the cross-validation cell (FE_FEATURE_COLS must already be set).
X_full, y_full = train_fe_full[FE_FEATURE_COLS], train_fe_full[TARGET]

# Fit a fresh estimator with the best configuration found by the search.
final_lgbm_model = lgb.LGBMRegressor(**best_lgbm_params, random_state=42, verbose=-1)
final_lgbm_model.fit(X_full, y_full)

# For later inference
best_model = final_lgbm_model

print("Final LightGBM model trained on full FE dataset.")
print("Training shape:", X_full.shape)

Final LightGBM model trained on full FE dataset.
Training shape: (9021, 235)


Kaggle submission specifics (IGNORE FOR LOCAL RUN)

In [23]:
# Inference API — Kaggle submission (not needed for local version)

# Accumulates every row received by predict(); None until the first call.
history_df = None
# NOTE(review): not read anywhere in the visible notebook - presumably a leftover flag; confirm before removing.
model_loaded = True

def to_pandas(df):
    """Return ``df`` as a pandas DataFrame.

    Polars frames are converted with ``.to_pandas()``; any other input is
    passed to the ``pd.DataFrame`` constructor.
    """
    return df.to_pandas() if isinstance(df, pl.DataFrame) else pd.DataFrame(df)

def predict(test: pl.DataFrame) -> float:
    """Return the allocation for the most recent row of ``test``.

    The incoming rows are appended to the global ``history_df``, features are
    re-engineered over the whole accumulated history, and the last row is fed
    to ``best_model``. The raw prediction is clipped to [-0.05, 0.05] and
    converted with ``returns_to_allocation``.

    NOTE(review): the history starts empty, so on the earliest calls the lag
    and rolling features have no past rows to draw from and are produced by the
    fill step of ``create_features`` - confirm this matches what the model saw
    in training.
    """
    global history_df, best_model

    incoming = to_pandas(test)
    history_df = (
        incoming.copy()
        if history_df is None
        else pd.concat([history_df, incoming], ignore_index=True)
    )

    # Full history is used here; a tail(...) could bound the window size.
    window_fe = create_features(history_df.copy(), FEATURE_COLS)

    # Only the newest row is scored.
    latest_features = window_fe[FE_FEATURE_COLS].iloc[-1:]
    raw_pred = float(best_model.predict(latest_features)[0])
    clipped_pred = float(np.clip(raw_pred, -0.05, 0.05))

    return float(returns_to_allocation(clipped_pred))

# Start server
# NOTE(review): `kaggle_eval` is never imported or defined in the visible notebook,
# so the next line raises NameError (as the recorded cell output shows). Presumably
# it should refer to Kaggle's `kaggle_evaluation.default_inference_server` module -
# confirm and add the matching import to the top import cell.
inference_server = kaggle_eval.DefaultInferenceServer(predict)

# Serve requests during the competition rerun; otherwise run the local gateway
# against the competition input directory.
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        ("/kaggle/input/hull-tactical-market-prediction/",)
    )

NameError: name 'kaggle_eval' is not defined