In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [30]:
df = pd.read_csv("../BX_data_process_construction/top_features_df.csv")
race_counts = df["Race ID"].value_counts()
valid_races = race_counts[race_counts == 16].index
df_16 = df[df["Race ID"].isin(valid_races)]

In [31]:
# 2. Define features
features = [
    'Favorite_Rank', 'Age_Scale', 'Bracket Number', 'Top3_Rank',
    'Speed (m/s)_Rank', 'Track_Distance', 'Weight_Rank'
]



In [32]:
# 3. Create ranking label (lower finish position = better rank → use -position)
df_16["rank_label"] = 16 - df_16["Finish Position"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_16["rank_label"] = 16 - df_16["Finish Position"]


In [33]:
# 4. Split by race to avoid leakage
unique_races = df_16["Race ID"].unique()
train_races, test_races = train_test_split(unique_races, test_size=0.2, random_state=42)

df_16_train = df_16[df_16["Race ID"].isin(train_races)].copy()
df_16_test = df_16[df_16["Race ID"].isin(test_races)].copy()



In [34]:
# 5. Prepare training data
X_train = df_16_train[features]
y_train = df_16_train["rank_label"]
group_train = df_16_train.groupby("Race ID").size().tolist()



In [35]:
# 6. Prepare testing data
X_test = df_16_test[features]
y_test = df_16_test["Finish Position"]  # Keep original for evaluation
group_test = df_16_test.groupby("Race ID").size().tolist()



In [36]:
# 7. Train LightGBM ranker
ranker = lgb.LGBMRanker(objective='lambdarank', metric='ndcg', random_state=42)
ranker.fit(X_train, y_train, group=group_train)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 12944, number of used features: 7


In [37]:
# 8. Predict scores and rank within each race
df_16_test["score"] = ranker.predict(X_test)
df_16_test["pred_rank"] = df_16_test.groupby("Race ID")["score"].rank(ascending=False, method="first")



In [38]:
# 9. Extract top-1 predictions
top1_pred = df_16_test[df_16_test["pred_rank"] == 1]
top1_actual = df_16_test[df_16_test["Finish Position"] == 1]



In [39]:
# 10. Evaluate top-1 accuracy
merged = top1_pred[["Race ID", "Horse ID"]].merge(
    top1_actual[["Race ID", "Horse ID"]],
    on="Race ID",
    suffixes=("_pred", "_true")
)
merged["correct"] = (merged["Horse ID_pred"] == merged["Horse ID_true"]).astype(int)

accuracy = merged["correct"].mean()
print(f"Top-1 prediction accuracy: {accuracy:.4f}")


Top-1 prediction accuracy: 0.2069
