In [92]:
import pandas as pd
import numpy as np
from itertools import combinations
import joblib

# ===========================
# Step 1: Load 15-horse race data
# ===========================
# Read the race entries from the prediction CSV into a DataFrame.
race_csv_path = "../data/Prediction_5_04.csv"
race_df = pd.read_csv(race_csv_path)

# ===========================
# Step 2: Preprocess features
# ===========================
# Build the working frame without modifying race_df: rename the raw columns
# to the names listed in selected_features, then attach the feature columns
# that are filled in here rather than read from the CSV.
prepared_df = race_df.rename(
    columns={"fav": "favorite", "bracket": "bracket_number"}
).assign(
    # Placeholder values (update if you have real ones).
    avg_speed_mps=lambda frame: frame["speed_mps"],
    top3=0,
    track_distance=2000,
    weather_weather01=1,
    weather_weather02=0,
)

# Feature columns used downstream, in this exact order.
selected_features = [
    "total_weight",
    "speed_mps",
    "avg_speed_mps",
    "favorite",
    "bracket_number",
    "age",
    "top3",
    "track_distance",
    "weather_weather01",
    "weather_weather02",
]

# ===========================
# Step 3: Create pairwise differences
# ===========================
# Name of each horse, positionally aligned with the rows of prepared_df.
horse_names = prepared_df["Name"].values

# Extract the feature columns once as a float matrix so the differences are
# computed by NumPy on numeric data, instead of on per-row Series fetched
# with .iloc inside an O(n^2) Python loop.
feature_matrix = prepared_df[selected_features].to_numpy(dtype=float)

# Every unordered pair of row positions (i, j) with i < j. The reshape keeps
# a (0, 2) shape when there are fewer than two horses.
pair_positions = np.array(
    list(combinations(range(len(prepared_df)), 2)), dtype=int
).reshape(-1, 2)

# Each pair produces two consecutive rows: "i vs j" followed by "j vs i".
subject_idx = pair_positions.ravel()
opponent_idx = pair_positions[:, ::-1].ravel()

pairwise_df = pd.DataFrame(
    feature_matrix[subject_idx] - feature_matrix[opponent_idx],
    columns=[f"{f}_diff" for f in selected_features],
)
# Row position (within prepared_df) of the horse each row is computed for;
# stored as an integer from the start rather than round-tripped through the
# feature array.
pairwise_df["horse_index"] = subject_idx

# ===========================
# Step 4: Load trained model
# ===========================
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model_path = "calibrated_model.pkl"
calibrated_model = joblib.load(model_path)

# ===========================
# Step 5: Predict outcomes
# ===========================
# Model input: every pairwise_df column except the horse_index bookkeeping column.
X_input = pairwise_df.drop("horse_index", axis=1)

# Store the second predict_proba column as each row's score.
class_probabilities = calibrated_model.predict_proba(X_input)
pairwise_df["proba"] = class_probabilities[:, 1]

# ===========================
# Step 6: Aggregate and rank
# ===========================
# Sum the per-row probabilities for each horse.
horse_scores = pairwise_df.groupby("horse_index")["proba"].sum().reset_index()

# Softmax over the summed scores. Subtracting the maximum first leaves the
# result mathematically unchanged but keeps exp() from overflowing, and the
# exponentials are computed only once.
exp_scores = np.exp(horse_scores["proba"] - horse_scores["proba"].max())
horse_scores["softmax_proba"] = exp_scores / exp_scores.sum()

# horse_index holds positions into horse_names, so the names are looked up in
# a single vectorised indexing step instead of a per-row apply.
horse_scores["horse_name"] = np.asarray(horse_names)[
    horse_scores["horse_index"].to_numpy()
]

# Final ranking
horse_scores = horse_scores.sort_values("softmax_proba", ascending=False).reset_index(drop=True)
horse_scores.index += 1  # Rank starts from 1

# ===========================
# Step 7: Display results
# ===========================
# Print a heading, then the name and softmax score of each horse; the
# DataFrame index is included in the printed table.
ranking_table = horse_scores[["horse_name", "softmax_proba"]]
print("\n🏇 Predicted Horse Rankings (Top 1 is most likely winner):\n")
print(ranking_table.to_string(index=True))



🏇 Predicted Horse Rankings (Top 1 is most likely winner):

            horse_name  softmax_proba
1             Redentor   4.271938e-01
2      Byzantine Dream   4.217879e-01
3      Shonan la Punta   9.546219e-02
4        Sunrise Earth   3.511858e-02
5       Meiner Emperor   1.291940e-02
6        Justin Palace   4.752782e-03
7       Chevalier Rose   1.748451e-03
8        Blow the Horn   6.432191e-04
9           Warp Speed   2.366271e-04
10            Pradaria   8.705025e-05
11  Hayatenofukunosuke   3.202400e-05
12          Win Erfolg   1.178097e-05
13        Limit Buster   4.333977e-06
14         Jean Kazuma   9.655986e-07
15               Arata   9.533796e-07
