In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import lightgbm as lgb

In [2]:
# 1. Load the cleaned results and keep only races with exactly 16 rows
df = pd.read_csv("../data/cleaned_race_results_for_lightgbm.csv")

# Rows per race_id; presumably one row per runner, so this is the field size -- TODO confirm.
race_counts = df["race_id"].value_counts()
valid_races = race_counts[race_counts == 16].index

# Take an independent copy: a later cell adds a column to df_16, and writing into a
# filtered slice of `df` raises SettingWithCopyWarning and may not stick.
df_16 = df[df["race_id"].isin(valid_races)].copy()

In [3]:

# 2. Feature columns and target label
# NOTE(review): "top3" (and possibly "speed_mps") look like they could be derived from
# the race outcome; if so they leak the target into the features -- confirm against
# the data-cleaning step before trusting the reported accuracy.
selected_features = ["total_weight",
                     "speed_mps",
                     "avg_speed_mps",
                     "favorite",
                     "bracket_number",
                     "age",
                     "top3",
                     "track_distance",
                     "weather_weather01",
                     "weather_weather02"
                     ]

# LightGBM ranking objectives treat a larger label as more relevant, so the finishing
# position is inverted: position 1 -> 15, position 16 -> 0.
# `assign` builds a new frame instead of writing into a filtered slice, which avoids
# SettingWithCopyWarning and keeps this cell idempotent on re-run.
df_16 = df_16.assign(lgb_label=16 - df_16["finish_position"])



In [4]:
# 3. Split by race
# Splitting on race ids (not rows) keeps all runners of a race on the same side.
unique_races = df_16["race_id"].unique()
train_races, test_races = train_test_split(unique_races, test_size=0.2, random_state=42)

# Build each mask from df_16 itself. A mask taken from the unfiltered `df` has a
# different index, which makes pandas reindex it with a warning (or raise if the
# indexes do not line up).
df_16_train = df_16[df_16["race_id"].isin(train_races)].copy()
df_16_test = df_16[df_16["race_id"].isin(test_races)].copy()



In [5]:
# 4. Prepare inputs
# LightGBM reads `group` as consecutive run lengths over the rows in the order given,
# so every race's rows must be contiguous and ordered like the group sizes.
# groupby() returns sizes in sorted race_id order, so sort the rows the same way.
# The sort is stable (within-race order is kept) and idempotent on re-run.
df_16_train = df_16_train.sort_values("race_id", kind="stable")
df_16_test = df_16_test.sort_values("race_id", kind="stable")

X_train = df_16_train[selected_features]
y_train = df_16_train["lgb_label"]
group_train = df_16_train.groupby("race_id").size().tolist()

# X_test is taken from the sorted df_16_test so that positional predictions can be
# written straight back onto df_16_test.
X_test = df_16_test[selected_features]
y_test = df_16_test["finish_position"]
group_test = df_16_test.groupby("race_id").size().tolist()

# Group sizes must account for every row exactly once.
assert sum(group_train) == len(X_train), "group_train does not cover X_train"
assert sum(group_test) == len(X_test), "group_test does not cover X_test"



In [6]:
# 5. Hand-picked hyper-parameter candidates to try one at a time
# Each tuple is (num_leaves, learning_rate, min_child_samples).
param_grid = [
    dict(num_leaves=leaves, learning_rate=rate, min_child_samples=min_samples)
    for leaves, rate, min_samples in (
        (31, 0.1, 20),
        (63, 0.05, 10),
        (127, 0.01, 5),
    )
]



In [7]:
# 6. Manual tuning loop
def top1_accuracy(scored):
    """Return the fraction of predicted top picks that actually finished first.

    Parameters
    ----------
    scored : pandas.DataFrame
        Must contain ``pred_rank`` and ``finish_position`` columns.

    Returns
    -------
    float
        Mean of ``finish_position == 1`` over the rows with ``pred_rank == 1``;
        NaN when no row has ``pred_rank == 1``.

    Notes
    -----
    Looking at the pick's own finishing position (rather than merging predicted
    and actual winners on race_id) counts every race exactly once: a dead heat
    does not duplicate the race, and a race with no recorded winner counts as a
    miss instead of silently dropping out of the denominator.
    """
    top_picks = scored.loc[scored["pred_rank"] == 1, "finish_position"]
    return top_picks.eq(1).mean()


# NOTE(review): candidates are compared on the test races, so the best accuracy is an
# optimistic estimate. Hold out a separate set of validation races for selection if
# this number is going to be reported.
best_acc = 0
best_params = None

for params in param_grid:
    model = lgb.LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        random_state=42,
        **params
    )
    model.fit(X_train, y_train, group=group_train)

    # Scores come back positionally aligned with X_test, which was sliced from df_16_test.
    df_16_test["score"] = model.predict(X_test)
    # method="first" breaks score ties by row order, so each race has exactly one rank-1 row.
    df_16_test["pred_rank"] = df_16_test.groupby("race_id")["score"].rank(ascending=False, method="first")

    acc = top1_accuracy(df_16_test)

    print(f"Params: {params}, Top-1 Accuracy: {acc:.4f}")

    # Accept the first candidate unconditionally so best_params is never left as None
    # when every candidate scores 0.
    if best_params is None or acc > best_acc:
        best_acc = acc
        best_params = params




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 787
[LightGBM] [Info] Number of data points in the train set: 13040, number of used features: 10
Params: {'num_leaves': 31, 'learning_rate': 0.1, 'min_child_samples': 20}, Top-1 Accuracy: 0.3922
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 787
[LightGBM] [Info] Number of data points in the train set: 13040, number of used features: 10
Params: {'num_leaves': 63, 'learning_rate': 0.05, 'min_child_samples': 10}, Top-1 Accuracy: 0.4265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000204 seco

In [8]:
# Report the best top-1 accuracy kept by the tuning loop
summary_line = f"Top-1 Accuracy: {best_acc:.4f}"
print(summary_line)

Top-1 Accuracy: 0.4265
