In [None]:
# install pkgs
%pip install pandas numpy scikit-learn jupyter matplotlib seaborn xgboost lightgbm

In [32]:
# import libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, brier_score_loss

# Load test and train data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# view columns
print("Train data columns:")
print(train.columns.tolist())

print("\nTest data columns:")
print(test.columns.tolist())

# preview
print("Train Data Sample:")
print(train.head())

print("\nTest Data Sample:")
print(test.head())

Train data columns:
['Race_Time', 'Race_ID', 'Course', 'Distance', 'distanceYards', 'Prize', 'Going', 'Horse', 'Trainer', 'Jockey', 'betfairSP', 'Position', 'timeSecs', 'pdsBeaten', 'NMFP', 'Runners', 'Age', 'Speed_PreviousRun', 'Speed_2ndPreviousRun', 'NMFPLTO', 'MarketOdds_PreviousRun', 'MarketOdds_2ndPreviousRun', 'TrainerRating', 'JockeyRating', 'daysSinceLastRun', 'SireRating', 'DamsireRating', 'meanRunners']

Test data columns:
['Race_Time', 'Race_ID', 'Course', 'Distance', 'distanceYards', 'Prize', 'Going', 'Horse', 'Trainer', 'Jockey', 'betfairSP', 'Position', 'timeSecs', 'pdsBeaten', 'NMFP', 'Runners', 'Age', 'Speed_PreviousRun', 'Speed_2ndPreviousRun', 'NMFPLTO', 'MarketOdds_PreviousRun', 'MarketOdds_2ndPreviousRun', 'TrainerRating', 'JockeyRating', 'daysSinceLastRun', 'SireRating', 'DamsireRating', 'meanRunners']
Train Data Sample:
        Race_Time  Race_ID         Course Distance  distanceYards  Prize  \
0  2/1/2024 19:00     1935  Wolverhampton   6f 20y           1340   4

In [33]:

# check data types
# Train data 
print("Train Data Info:")
print(train.info())

# Test data 
print("\nTest Data Info:")
print(test.info())

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52099 entries, 0 to 52098
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Race_Time                  52099 non-null  object 
 1   Race_ID                    52099 non-null  int64  
 2   Course                     52099 non-null  object 
 3   Distance                   52099 non-null  object 
 4   distanceYards              52099 non-null  int64  
 5   Prize                      52099 non-null  int64  
 6   Going                      52099 non-null  object 
 7   Horse                      52099 non-null  object 
 8   Trainer                    52099 non-null  object 
 9   Jockey                     52099 non-null  object 
 10  betfairSP                  52099 non-null  float64
 11  Position                   52099 non-null  int64  
 12  timeSecs                   52099 non-null  float64
 13  pdsBeaten                  52

In [34]:
# defining target position before dropping columns
train['target'] = (train['Position'] == 1).astype(int)
print(train.columns.tolist())

['Race_Time', 'Race_ID', 'Course', 'Distance', 'distanceYards', 'Prize', 'Going', 'Horse', 'Trainer', 'Jockey', 'betfairSP', 'Position', 'timeSecs', 'pdsBeaten', 'NMFP', 'Runners', 'Age', 'Speed_PreviousRun', 'Speed_2ndPreviousRun', 'NMFPLTO', 'MarketOdds_PreviousRun', 'MarketOdds_2ndPreviousRun', 'TrainerRating', 'JockeyRating', 'daysSinceLastRun', 'SireRating', 'DamsireRating', 'meanRunners', 'target']


In [35]:
# drop leakage columns
dropped_columns = ['betfairSP', 'Position', 'timeSecs', 'pdsBeaten', 'NMFP']
train = train.drop(columns=dropped_columns, errors='ignore')
test = test.drop(columns=dropped_columns, errors='ignore')
print(train.columns.tolist())

['Race_Time', 'Race_ID', 'Course', 'Distance', 'distanceYards', 'Prize', 'Going', 'Horse', 'Trainer', 'Jockey', 'Runners', 'Age', 'Speed_PreviousRun', 'Speed_2ndPreviousRun', 'NMFPLTO', 'MarketOdds_PreviousRun', 'MarketOdds_2ndPreviousRun', 'TrainerRating', 'JockeyRating', 'daysSinceLastRun', 'SireRating', 'DamsireRating', 'meanRunners', 'target']


In [36]:
# feature engineering
def race_relative_features(df):
    race_groups = df.groupby('Race_ID')
    df['speed_avg'] = race_groups['Speed_PreviousRun'].transform('mean')
    df['speed_diff_from_avg'] = df['Speed_PreviousRun'] - df['speed_avg']
    return df

train = race_relative_features(train)
test = race_relative_features(test)

# Encode cat variables
train_encoded = pd.get_dummies(train, drop_first=True)
test_encoded = pd.get_dummies(test, drop_first=True)
train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# train/validation split
X = train_encoded.drop(columns=['target'])
y = train_encoded['target']

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [37]:
# Saving identifiers before any encoding or transformation
# this will save the original horse names and race IDs , so I can match predictions back to them later
# i created a separate copy so i donâ€™t accidentally change the original data.
test_ids = test[['Race_ID', 'Horse']].copy()

# Predict probabilities each horse winning
test_encoded['winning_probs'] = model.predict_proba(test_encoded[X_train.columns])[:, 1]

# resetting index and merging with original IDs to combine to one table
# made sure both parts line up properly by ignoring old row numbers.
test_with_ids = pd.concat([
    test_ids.reset_index(drop=True),
    test_encoded[['winning_probs']].reset_index(drop=True)
], axis=1)

# Normalize within each race to add up to 1
test_with_ids['winning_probs_normalized'] = test_with_ids.groupby('Race_ID')['winning_probs'].transform(lambda x: x / x.sum())

# format to 2dp
test_with_ids['winning_probs_rounded'] = test_with_ids['winning_probs_normalized'].round(2)

# adjusting one horse per group
def fix_rounded_sum(group):
    diff = 1 - group['winning_probs_rounded'].sum()
    # Adding the difference to the horse with the highest probability
    idx = group['winning_probs_rounded'].idxmax()
    group.loc[idx, 'winning_probs_rounded'] += diff
    return group

test_with_ids = test_with_ids.groupby('Race_ID', group_keys=False).apply(fix_rounded_sum)

  test_with_ids = test_with_ids.groupby('Race_ID', group_keys=False).apply(fix_rounded_sum)


In [40]:
# Import necessary libraries
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, brier_score_loss

# Cleaning feature names first so that LightGBM won't break cos of special characters or spaces
def clean_feature_names(df):
    import re
    df.columns = [re.sub(r'[^a-zA-Z0-9_]', '_', col) for col in df.columns]
    return df

# Apply to data
X_train_light = clean_feature_names(X_train.select_dtypes(include=['number', 'bool']))
X_val_light = clean_feature_names(X_val.select_dtypes(include=['number', 'bool']))
test_encoded_clean = clean_feature_names(test_encoded)

# Align train and val first
X_train_light, X_val_light = X_train_light.align(X_val_light, join='inner', axis=1, fill_value=0)

# Then align test with train
X_train_light, test_encoded_clean = X_train_light.align(test_encoded_clean, join='inner', axis=1, fill_value=0)

# training LightGBM model
lgb = LGBMClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=5,
    num_leaves=20,
    class_weight='balanced',
    random_state=42
)
lgb.fit(X_train_light, y_train)

# calibrating probabilities with validation set
calibrator = CalibratedClassifierCV(lgb, method='isotonic', cv='prefit')
calibrator.fit(X_val_light, y_val)

# predicting on test 
test_encoded_clean['winning_probs'] = calibrator.predict_proba(test_encoded_clean[X_train_light.columns])[:, 1]

# normalizing the predicted probabilities in each race so they sum to 1
test_encoded_clean['winning_probs'] /= test_encoded_clean.groupby('Race_ID')['winning_probs'].transform('sum')

# merging back the identifiers
submission = pd.DataFrame({
    'Race_ID': test['Race_ID'].values,  
    'Horse': test['Horse'].values,
    'winning_probs': test_encoded_clean['winning_probs'].values
})
submission.columns = ['Race_ID', 'Horse', 'Predicted_Probability']

# saving to csv
submission.to_csv('Predicted_Probabilities.csv', index=False)

# evaluate performance
val_probs = calibrator.predict_proba(X_val_light)[:, 1]
print("Final Model Evaluation:")
print("Log Loss:", log_loss(y_val, val_probs))
print("Brier Score:", brier_score_loss(y_val, val_probs))

# i have a log loss of 0.31 and a brier score of 0.09 
# via this, i can know how confident and correct the model is and i can also
# measure how close predicted probabilities are to actual outcomes.



[LightGBM] [Info] Number of positive: 4278, number of negative: 37401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5208
[LightGBM] [Info] Number of data points in the train set: 41679, number of used features: 908
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Final Model Evaluation:
Log Loss: 0.30791839826924744
Brier Score: 0.08757478033957591


In [41]:
#  checking per-race sums
race_sums = submission.groupby('Race_ID')['Predicted_Probability'].sum().round(5)
print("\n Race probability sums (should all be 1.0):")
print(race_sums.head(10))
print(race_sums.tail(10))


 Race probability sums (should all be 1.0):
Race_ID
58     1.0
59     1.0
60     1.0
61     1.0
62     1.0
63     1.0
64     1.0
169    1.0
170    1.0
171    1.0
Name: Predicted_Probability, dtype: float64
Race_ID
55195    1.0
55196    1.0
55197    1.0
55198    1.0
55199    1.0
55310    1.0
55311    1.0
55312    1.0
55313    1.0
55314    1.0
Name: Predicted_Probability, dtype: float64
