In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import lightgbm as lgb

In [88]:
df = pd.read_csv("../BX_data_process_construction/top_features_df.csv")
race_counts = df["Race ID"].value_counts()
valid_races = race_counts[race_counts == 16].index
df_16 = df[df["Race ID"].isin(valid_races)]

In [89]:

# 2. Feature columns and target label
feat_cols = [
    'Favorite_Rank', 'Age_Scale', 'Bracket Number', 'Top3_Rank',
    'Speed (m/s)_Rank', 'Track_Distance', 'Weight_Rank'
]
df_16["lgb_label"] = 16 - df_16["Finish Position"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_16["lgb_label"] = 16 - df_16["Finish Position"]


In [90]:
# 3. Split by race
unique_races = df_16["Race ID"].unique()
train_races, test_races = train_test_split(unique_races, test_size=0.2, random_state=42)
df_16_train = df_16[df["Race ID"].isin(train_races)].copy()
df_16_test = df_16[df["Race ID"].isin(test_races)].copy()



  df_16_train = df_16[df["Race ID"].isin(train_races)].copy()
  df_16_test = df_16[df["Race ID"].isin(test_races)].copy()


In [91]:
# 4. Prepare inputs
X_train = df_16_train[feat_cols]
y_train = df_16_train["lgb_label"]
group_train = df_16_train.groupby("Race ID").size().tolist()

X_test = df_16_test[feat_cols]
y_test = df_16_test["Finish Position"]
group_test = df_16_test.groupby("Race ID").size().tolist()



In [92]:
# 5. Define param grid to try manually
param_grid = [
    {"num_leaves": 31, "learning_rate": 0.1, "min_child_samples": 20},
    {"num_leaves": 63, "learning_rate": 0.05, "min_child_samples": 10},
    {"num_leaves": 127, "learning_rate": 0.01, "min_child_samples": 5},
]



In [93]:
# 6. Manual tuning loop
best_acc = 0
best_params = None

for params in param_grid:
    model = lgb.LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        random_state=42,
        **params
    )
    
    model.fit(X_train, y_train, group=group_train)
    df_16_test["score"] = model.predict(X_test)
    df_16_test["pred_rank"] = df_16_test.groupby("Race ID")["score"].rank(ascending=False, method="first")
    
    pred_top1 = df_16_test[df_16_test["pred_rank"] == 1]
    actual_top1 = df_16_test[df_16_test["Finish Position"] == 1]
    
    merged = pred_top1[["Race ID", "Horse ID"]].merge(
        actual_top1[["Race ID", "Horse ID"]],
        on="Race ID",
        suffixes=("_pred", "_true")
    )
    merged["correct"] = (merged["Horse ID_pred"] == merged["Horse ID_true"]).astype(int)
    acc = merged["correct"].mean()
    
    print(f"Params: {params}, Top-1 Accuracy: {acc:.4f}")
    
    if acc > best_acc:
        best_acc = acc
        best_params = params




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 12944, number of used features: 7
Params: {'num_leaves': 31, 'learning_rate': 0.1, 'min_child_samples': 20}, Top-1 Accuracy: 0.2069
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 12944, number of used features: 7
Params: {'num_leaves': 63, 'learning_rate': 0.05, 'min_child_samples': 10}, Top-1 Accuracy: 0.1724
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And

In [94]:
print(f"Top-1 Accuracy: {best_acc:.4f}")

Top-1 Accuracy: 0.2069
