In [10]:
import re

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder



In [17]:
# Load the workbook (final_data.xlsx) into the working frame `df`.
# NOTE(review): absolute, machine-specific path — the notebook only runs
# where this exact file location exists.
data_path = r"C:\Users\User\Documents\horse-racing-arima-kinen-2013\data\final_data.xlsx"
df = pd.read_excel(data_path)

In [18]:
# Course code: the three capital letters that open 'Race Details'
# (e.g. 'HAN', 'TKY'). expand=False returns a Series rather than a
# one-column DataFrame, so the assignment is explicit.
df['Course'] = df['Race Details'].str.extract(r'^([A-Z]{3})', expand=False)

# Race grade: the first 'G<digit>' or 'Listed' token found in 'Race'
# (case-insensitive), upper-cased; rows without such a token become 'UNGRADED'.
df['RaceGrade'] = (
    df['Race']
    .str.extract(r'\b(G\d|Listed)\b', flags=re.IGNORECASE, expand=False)
    .str.upper()
    .fillna('UNGRADED')
)

# Ordinal encoding of the grade label (higher = more prestigious race).
# Labels that are not keys of this dict (e.g. a 'G<digit>' other than G1-G3)
# map to NaN in 'RaceGrade_numeric'.
grade_map = {
    'G1': 3,
    'G2': 2,
    'G3': 1,
    'LISTED': 0.5,
    'UNGRADED': 0
}
df['RaceGrade_numeric'] = df['RaceGrade'].map(grade_map)

# Quick visual check of the derived columns
print(df[['Race Details', 'Course', 'RaceGrade', 'RaceGrade_numeric']].head())


                    Race Details Course RaceGrade  RaceGrade_numeric
0    HAN 11R SANKEI OSAKA HAI G2    HAN        G2                2.0
1           TKY 11R JAPAN CUP G1    TKY        G1                3.0
2    HAN 11R TAKARAZUKA KINEN G1    HAN        G1                3.0
3  KYT 11R TENNO SHO (SPRING) G1    KYT        G1                3.0
4   HAN 11R HANSHIN DAISHOTEN G2    HAN        G2                2.0


In [19]:
# Build a proper datetime from the separate 'Date' (day + month abbreviation)
# and 'Year' columns, then order every horse's races chronologically.
# Strings that do not match '%d %b %Y' become NaT (errors='coerce').
df = (
    df.assign(
        Year=lambda d: d['Year'].astype(str),
        RaceDate=lambda d: pd.to_datetime(
            d['Date'] + ' ' + d['Year'], format='%d %b %Y', errors='coerce'
        ),
    )
    .sort_values(['Horse', 'RaceDate'])
)

# Gap in days between consecutive races of the same horse. The first race of
# each horse has no predecessor (NaN), so it is filled with the overall median gap.
gap_days = df.groupby('Horse')['RaceDate'].diff().dt.days
df['DaysSinceLastRace'] = gap_days.fillna(gap_days.median())

print(df[['Horse', 'RaceDate', 'Race', 'Course', 'RaceGrade', 'DaysSinceLastRace']])

             Horse   RaceDate  \
203   Admire Rakti 2010-11-14   
202   Admire Rakti 2010-12-11   
201   Admire Rakti 2011-01-08   
200   Admire Rakti 2011-03-20   
199   Admire Rakti 2011-04-02   
..             ...        ...   
20   Win Variation 2012-02-12   
19   Win Variation 2012-03-24   
18   Win Variation 2012-04-29   
17   Win Variation 2012-06-24   
16   Win Variation 2013-11-30   

                                                 Race Course RaceGrade  \
203                        14 Nov 2010 KYT 5R ２yo DBT    KYT  UNGRADED   
202                        11 Dec 2010 HAN 4R ２yo MDN    HAN  UNGRADED   
201  8 Jan 2011 KYT 9R FUKUJUSO TOKUBETSU ALW (1 Win)    KYT  UNGRADED   
200              20 Mar 2011 HAN 10R WAKABA STAKES OP    HAN  UNGRADED   
199                 2 Apr 2011 HAN 6R ３yo ALW (1 Win)    HAN  UNGRADED   
..                                                ...    ...       ...   
20                 12 Feb 2012 KYT 11R KYOTO KINEN G2    KYT        G2   
19         

In [20]:
def make_features(df: pd.DataFrame, target_date='2013-12-22') -> pd.DataFrame:
    """Summarise each horse's race history into one feature row.

    Parameters
    ----------
    df : pandas.DataFrame
        Race-level rows containing the columns 'Horse', 'RaceDate', 'Finish',
        'Speed_kph', 'PosChange', 'FinalCornerPos' and 'Runners'.
    target_date : str or datetime-like, default '2013-12-22'
        Date against which 'DaysSinceLastRace' is measured. The default keeps
        the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per horse (sorted by 'Horse') with the columns 'Horse',
        'AvgPlacement_Last3', 'AvgSpeed', 'AvgPosChange', 'AvgFinalCornerPos',
        'Runners_Mean' and 'DaysSinceLastRace' (all numeric columns float).
    """
    ordered = df.sort_values(['Horse', 'RaceDate'])
    by_horse = ordered.groupby('Horse')

    # Built-in group aggregations replace groupby.apply with a Series-returning
    # lambda: that form is slow (Python per group) and triggers the pandas
    # deprecation warning about apply operating on the grouping columns.
    features = pd.DataFrame({
        # Mean finish over the (up to) three most recent races; NaNs are skipped.
        'AvgPlacement_Last3': by_horse.tail(3).groupby('Horse')['Finish'].mean(),
        'AvgSpeed': by_horse['Speed_kph'].mean(),
        'AvgPosChange': by_horse['PosChange'].mean(),
        'AvgFinalCornerPos': by_horse['FinalCornerPos'].mean(),
        'Runners_Mean': by_horse['Runners'].mean(),
        # Days between the horse's latest race and the target date; cast to
        # float so the dtype does not depend on whether any date is missing.
        'DaysSinceLastRace': (
            pd.Timestamp(target_date) - by_horse['RaceDate'].max()
        ).dt.days.astype(float),
    })
    return features.rename_axis('Horse').reset_index()

In [22]:



# Per-horse summary features built from the race table
features = make_features(df)

# -----------------------------------------------------------
# Conditions of the target race (Arima Kinen 2013) — the same for every horse
# -----------------------------------------------------------
arima_context = {
    'Distance': 2500,
    'RaceGrade_numeric': 3,  # G1
    'Course': 'NAK',
    'Track': 'Turf',
    'Runners': 16
}

# Broadcast each context value into a constant column on every horse's row
features = features.assign(**arima_context)

# -----------------------------------------------------------
# Training frame: one row per historical race, labelled 1 when Finish == 1
# -----------------------------------------------------------
df['Win'] = df['Finish'].eq(1).astype(int)

# Attach the per-horse aggregates to every race row of that horse.
# Where a column name exists in both frames, the race-level value from `df`
# keeps the plain name and the copy coming from `features` gets the '_agg'
# suffix — so the plain-named columns used for training are the race-level ones.
# NOTE(review): the aggregates are computed over each horse's whole history,
# so every training row carries statistics derived from its own (and later)
# results — this looks like target leakage; confirm before trusting Win_Prob.
train_df = pd.merge(
    df.copy(),
    features,
    on='Horse',
    how='left',
    suffixes=('', '_agg'),
)

# -----------------------------------------------------------
# Model inputs, split by how they are preprocessed
# -----------------------------------------------------------
# Numeric inputs (median-imputed downstream)
numeric_cols = [
    'AvgPlacement_Last3',
    'AvgSpeed',
    'AvgPosChange',
    'Runners_Mean',
    'DaysSinceLastRace',
    'AvgFinalCornerPos',
    'Distance',
    'RaceGrade_numeric',
]

# Categorical inputs (one-hot encoded downstream)
categorical_cols = ['Course', 'Track']

# Full input list: numeric columns first, then categorical ones
feature_cols = numeric_cols + categorical_cols

# -----------------------------------------------------------
# Preprocessing: impute numeric columns, impute + one-hot categorical columns
# -----------------------------------------------------------
numeric_transformer = SimpleImputer(strategy='median')

# Categories not seen during fit are encoded as all zeros (handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# -----------------------------------------------------------
# Classifier and end-to-end pipeline
# -----------------------------------------------------------
# Fixed random_state keeps the forest reproducible; n_jobs=-1 uses all cores
model = RandomForestClassifier(n_estimators=300, max_depth=8, random_state=42, n_jobs=-1)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# -----------------------------------------------------------
# Fit on every historical race row
# -----------------------------------------------------------
X_train, y_train = train_df[feature_cols], train_df['Win']
pipeline.fit(X_train, y_train)

# -----------------------------------------------------------
# Score the per-horse rows that carry the Arima Kinen 2013 context
# -----------------------------------------------------------
X_arima = features[feature_cols]
win_proba = pipeline.predict_proba(X_arima)
# Second probability column — the Win == 1 class, provided both outcomes
# occur in y_train
features['Win_Prob'] = win_proba[:, 1]

# -----------------------------------------------------------
# Leaderboard: horses ordered from highest to lowest predicted win probability
# -----------------------------------------------------------
leaderboard = (
    features[['Horse', 'Win_Prob']]
    .sort_values('Win_Prob', ascending=False)
    .reset_index(drop=True)
    # Rescale so the probabilities across the listed horses sum to 1
    .assign(Win_Prob_Normalized=lambda board: board['Win_Prob'] / board['Win_Prob'].sum())
)

print("🏇 Predicted Win Probabilities for 2013 Arima Kinen")
print(leaderboard)

  .apply(lambda g: pd.Series({


🏇 Predicted Win Probabilities for 2013 Arima Kinen
               Horse  Win_Prob  Win_Prob_Normalized
0            Orfevre  0.478669             0.165072
1          Gold Ship  0.396472             0.136726
2        Verde Green  0.272905             0.094113
3          Desperado  0.219130             0.075568
4   Tamamo Best Play  0.210241             0.072503
5      Win Variation  0.171024             0.058979
6       Admire Rakti  0.153260             0.052853
7   Love Is Boo Shet  0.142358             0.049093
8        T M Inazuma  0.132260             0.045611
9     Curren Mirotic  0.131389             0.045310
10     Danon Ballade  0.126481             0.043618
11           Lelouch  0.115758             0.039920
12        Lovely Day  0.109085             0.037619
13      Tosen Jordan  0.102708             0.035420
14   Nakayama Knight  0.089130             0.030737
15      To The Glory  0.048887             0.016859
