In [80]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
import lightgbm as lgb

In [81]:
# Load and filter for races with exactly 16 horses
df = pd.read_csv("../data/top_features_df.csv")
race_counts = df["Race ID"].value_counts()
valid_races = race_counts[race_counts == 16].index
df_16 = df[df["Race ID"].isin(valid_races)]
#1012 valid races

In [82]:
# Feature columns
feature_cols = [
    'Favorite_Rank',
    'Age_Scale',
    'Bracket Number',
    'Top3_Rank',
    'Speed (m/s)_Rank',
    'Track_Distance',
    'Weight_Rank'
]

In [83]:
# Split races (leakage-free)
unique_races = df_16["Race ID"].unique()
train_races, test_races = train_test_split(unique_races, test_size=0.2, random_state=42)
#train races: 809, test races: 203
df_train = df_16[df_16["Race ID"].isin(train_races)]
df_test = df_16[df_16["Race ID"].isin(test_races)]


In [84]:
# Function to generate pairwise dataset
def generate_pairwise_df(df_grouped):
    rows = []
    for race_id, group in df_grouped.groupby("Race ID"):
        group = group.reset_index(drop=True)
        for i, j in combinations(range(len(group)), 2):
            hi = group.loc[i]
            hj = group.loc[j]

            diff = hi[feature_cols].values - hj[feature_cols].values
            target = int(hi["Finish Position"] < hj["Finish Position"])
            rows.append(np.concatenate([diff, [target], [race_id], [hi["Horse ID"]]]))

            diff_rev = hj[feature_cols].values - hi[feature_cols].values
            target_rev = int(hj["Finish Position"] < hi["Finish Position"])
            rows.append(np.concatenate([diff_rev, [target_rev], [race_id], [hj["Horse ID"]]]))
    
    return pd.DataFrame(rows, columns=[f"{col}_diff" for col in feature_cols] + ["target", "race_id", "horse_id"])



In [85]:
# Generate training and testing pairwise data
pairwise_train_df = generate_pairwise_df(df_train)
pairwise_test_df = generate_pairwise_df(df_test)
print(pairwise_train_df)




        Favorite_Rank_diff  Age_Scale_diff  Bracket Number_diff  \
0                      2.0        0.191480                  5.0   
1                     -2.0       -0.191480                 -5.0   
2                      1.0        0.378455                  1.0   
3                     -1.0       -0.378455                 -1.0   
4                     -5.0       -0.196150                  2.0   
...                    ...             ...                  ...   
194155                -6.0        0.196150                 -3.0   
194156                -2.0        0.191480                 -2.0   
194157                 2.0       -0.191480                  2.0   
194158                -8.0        0.387631                 -5.0   
194159                 8.0       -0.387631                  5.0   

        Top3_Rank_diff  Speed (m/s)_Rank_diff  Track_Distance_diff  \
0                  0.0                    0.0                  0.0   
1                  0.0                    0.0          

In [86]:
# Split features and labels
X_train = pairwise_train_df.drop(columns=["target", "race_id", "horse_id"])
y_train = pairwise_train_df["target"]
X_test = pairwise_test_df.drop(columns=["target", "race_id", "horse_id"])
y_test = pairwise_test_df["target"]
test_race_ids = pairwise_test_df["race_id"].values
test_horse_ids = pairwise_test_df["horse_id"].values



In [87]:
# Train LightGBM with calibration
X_model, X_cali, y_model, y_cali = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
base_model = lgb.LGBMClassifier(objective='binary', is_unbalance=True, random_state=42)
base_model.fit(X_model, y_model)
calibrated_model = CalibratedClassifierCV(base_model, method='isotonic', cv='prefit')
calibrated_model.fit(X_cali, y_cali)



[LightGBM] [Info] Number of positive: 77580, number of negative: 77748
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000776 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 392
[LightGBM] [Info] Number of data points in the train set: 155328, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499459 -> initscore=-0.002163
[LightGBM] [Info] Start training from score -0.002163




In [88]:
# Predict probabilities
proba = calibrated_model.predict_proba(X_test)[:, 1]
result_df = pd.DataFrame({
    "race_id": test_race_ids,
    "horse_id": test_horse_ids,
    "proba": proba
})
print(result_df)



            race_id      horse_id     proba
0      1.990061e+11  1.985102e+09  0.460806
1      1.990061e+11  1.987105e+09  0.551919
2      1.990061e+11  1.985102e+09  0.419495
3      1.990061e+11  1.987106e+09  0.556971
4      1.990061e+11  1.985102e+09  0.533632
...             ...           ...       ...
48715  2.025070e+11  2.020105e+09  0.247934
48716  2.025070e+11  2.020110e+09  0.587246
48717  2.025070e+11  2.021106e+09  0.419495
48718  2.025070e+11  2.020105e+09  0.318834
48719  2.025070e+11  2.021106e+09  0.723101

[48720 rows x 3 columns]


In [89]:
# Softmax per race
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

result_df["softmax_proba"] = result_df.groupby("race_id")["proba"].transform(softmax)
result_df["rank"] = result_df.groupby("race_id")["softmax_proba"].rank(ascending=False, method="first")
top1_df = result_df[result_df["rank"] == 1]
print(result_df)



            race_id      horse_id     proba  softmax_proba   rank
0      1.990061e+11  1.985102e+09  0.460806       0.003923  121.0
1      1.990061e+11  1.987105e+09  0.551919       0.004297  113.0
2      1.990061e+11  1.985102e+09  0.419495       0.003764  142.0
3      1.990061e+11  1.987106e+09  0.556971       0.004319   98.0
4      1.990061e+11  1.985102e+09  0.533632       0.004219  119.0
...             ...           ...       ...            ...    ...
48715  2.025070e+11  2.020105e+09  0.247934       0.003172  219.0
48716  2.025070e+11  2.020110e+09  0.587246       0.004453   99.0
48717  2.025070e+11  2.021106e+09  0.419495       0.003766  148.0
48718  2.025070e+11  2.020105e+09  0.318834       0.003405  192.0
48719  2.025070e+11  2.021106e+09  0.723101       0.005101   51.0

[48720 rows x 5 columns]


In [90]:
# Merge true finish positions
true_labels_df = df_test[["Race ID", "Horse ID", "Finish Position"]].copy()
true_labels_df["target"] = (true_labels_df["Finish Position"] == 1).astype(int)
true_labels_df = true_labels_df.rename(columns={"Race ID": "race_id", "Horse ID": "horse_id"})

# Predicted label = 1 if horse was ranked #1 by model
predicted_labels_df = result_df[["race_id", "horse_id", "rank"]].copy()
predicted_labels_df["predicted"] = (predicted_labels_df["rank"] == 1).astype(int)
print(predicted_labels_df)

# Merge predictions with actual labels
merged_all = true_labels_df.merge(predicted_labels_df[["race_id", "horse_id", "predicted"]], on=["race_id", "horse_id"], how="left")
merged_all["predicted"] = merged_all["predicted"].fillna(0).astype(int)


# Final classification report
y_true = merged_all["target"]
y_pred = merged_all["predicted"]

from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, digits=4))

            race_id      horse_id   rank  predicted
0      1.990061e+11  1.985102e+09  121.0          0
1      1.990061e+11  1.987105e+09  113.0          0
2      1.990061e+11  1.985102e+09  142.0          0
3      1.990061e+11  1.987106e+09   98.0          0
4      1.990061e+11  1.985102e+09  119.0          0
...             ...           ...    ...        ...
48715  2.025070e+11  2.020105e+09  219.0          0
48716  2.025070e+11  2.020110e+09   99.0          0
48717  2.025070e+11  2.021106e+09  148.0          0
48718  2.025070e+11  2.020105e+09  192.0          0
48719  2.025070e+11  2.021106e+09   51.0          0

[48720 rows x 4 columns]
              precision    recall  f1-score   support

           0     0.9384    0.9968    0.9667     45675
           1     0.2759    0.0184    0.0345      3045

    accuracy                         0.9356     48720
   macro avg     0.6071    0.5076    0.5006     48720
weighted avg     0.8970    0.9356    0.9084     48720

