In [13]:
import requests
import os
import sys
import pandas as pd
import numpy as np


from typing import Optional
import glob
from scipy import stats
# Add the parent directory of this notebook to sys.path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)

from project_tools import project_utils, project_class

import datetime
import json
from tqdm.notebook import tqdm
import gc
# import ds_utils
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from scipy.stats import kendalltau, spearmanr
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, accuracy_score

%matplotlib inline

from importlib import reload
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# Or for more precise control
pd.set_option('display.float_format', '{:.5f}'.format)

# load_data 

In [33]:
# train_horse_race_df = pd.read_parquet('../feature_data/train_horse_race_df.parquet')
# val_horse_race_df = pd.read_parquet('../feature_data/val_horse_race_df.parquet')                           
horse_race_df = pd.read_parquet('../feature_data/horse_race_df.parquet')

In [31]:
# load generated featureset
!ls ../feature_data/

basic_cat_ordinal_features.parquet
horse_class_feats.parquet
horse_feats.parquet
horse_running_position_features.parquet
horse_track_running_position_features.parquet
race_course_features.parquet
trace_condition.parquet
train_horse_positions_df.parquet
train_horse_race_df.parquet
train_jockey_positions_df.parquet
train_trainer_positions_df.parquet
val_horse_race_df.parquet


In [32]:
# load features
basic_cat_ordinal_df = pd.read_parquet('../feature_data/basic_cat_ordinal_features.parquet')

In [34]:
print(horse_race_df.shape)
print(basic_cat_ordinal_df.shape)

(29520, 44)
(29520, 6)


In [56]:
# load baseline evaluation result

In [128]:
baseline_mean_results = pd.read_parquet('../evaluation_results/valdf_random_winodd_baseline_mean_results.parquet')

In [85]:
baseline_mean_results

Unnamed: 0,random_probs,winning_odd_preds
Mean Squared Error,25.28416,23.58429
Mean Absolute Error,4.04875,3.87116
Spearman's Rank Correlation,0.02009,0.07598
NDCG,0.83467,0.74502
Winner Match,0.06654,0.11155
Top 3 Set Match,0.00391,0.00196
Top 3 Exact Match,0.0,0.0


# evaluation functions

In [125]:
def evaluate_horse_race_positions(y_true, y_pred_proba, dnf_value=99):
    """
    Evaluate predictions for a single race's finishing positions.
    
    Args:
        y_true: 1D array of true finishing positions
        y_pred_proba: 1D array of predicted probabilities
        dnf_value: Value used to indicate Did Not Finish
        
    Returns:
        Dictionary of evaluation metrics
    """
    # Handle NaN and DNF values in ground truth
    y_true_processed = y_true.copy()
    invalid_mask = np.logical_or(
        np.isnan(y_true),
        y_true == dnf_value
    )
    
    # Get max valid rank (excluding DNF values)
    valid_ranks = y_true[~invalid_mask]
    if len(valid_ranks) > 0:
        max_rank = np.max(valid_ranks)
        # Replace invalid values with max_rank + 1
        y_true_processed[invalid_mask] = max_rank + 1
        
    # Winner match
    y_true_ranksort = np.argsort(y_true_processed)
    y_pred_ranksort = np.argsort(y_pred_proba, axis=0)[::-1]
    
    winner_match = y_true_ranksort[0] == y_pred_ranksort[0]
    
    # Top 3 Set Match - considers [1,3,2] and [2,3,1] as matching
    top3_set_match = set(y_true_ranksort[:3]) == set(y_pred_ranksort[:3])
    
    # Top 3 Exact Match - only considers exact matches like [1,3,2] and [1,3,2]
    top3_exact_match = np.array_equal(y_true_ranksort[:3], y_pred_ranksort[:3])
    
    return {
        'Winner Match': float(winner_match),
        'Top 3 Set Match': float(top3_set_match), 
        'Top 3 Exact Match': float(top3_exact_match)
    }


In [127]:
def evaluate_prediction_sets(eval_dict):
    """
    Evaluate different prediction sets against ground truth for each race and calculate mean metrics
    
    Args:
        eval_dict: Dictionary containing race data with ground truth and different prediction sets
        
    Returns:
        tuple: (eval_result, mean_results_df)
            - eval_result: Dictionary with detailed evaluation metrics for each race
            - mean_results_df: DataFrame comparing mean metrics across prediction types
    """
    # Initialize results dictionary with race_ids as first level keys
    eval_result = {race_id: {} for race_id in eval_dict}

    # Get prediction types from first race data
    first_race_id = next(iter(eval_dict))
    pred_types = [key for key in eval_dict[first_race_id].keys() if key != 'ground_truth']

    # Initialize dictionaries to store mean results
    mean_results = {pred_type: {} for pred_type in pred_types}

    # Loop through each race
    for race_id in eval_dict:
        race_data = eval_dict[race_id]
        ground_truth = race_data['ground_truth']
        
        # Evaluate each prediction type
        for pred_type in pred_types:
            pred_probs = race_data[pred_type]
            
            # Evaluate predictions for this race
            race_eval = evaluate_horse_race_positions(
                ground_truth,
                pred_probs
            )
            
            # Store results for this race under race_id first, then pred_type
            eval_result[race_id][pred_type] = race_eval

    # Calculate mean results for each prediction type
    for pred_type in pred_types:
        # Initialize dict to store means for each metric
        metric_means = {}
        
        # Get metrics from first race to know what metrics exist
        first_race = next(iter(eval_result.values()))
        metrics = first_race[pred_type].keys()
        
        # For each metric, calculate mean across all races
        for metric in metrics:
            total = 0
            num_races = 0
            for race_id in eval_result:
                total += eval_result[race_id][pred_type][metric]
                num_races += 1
            metric_means[metric] = total / num_races
            
        mean_results[pred_type] = metric_means
    
    # Convert mean results to DataFrame for easy comparison
    mean_results_df = pd.DataFrame(mean_results)
    
    return eval_result, mean_results_df

# # Run evaluation
# eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

# # Display mean results comparison
# print("\nMean Evaluation Metrics Comparison:")
# print(mean_results_df)


# getting basic feature set

In [29]:
def train_lightgbm_model(train_df, val_df, label_col, cat_features=None, params=None):
    """
    Train a LightGBM model for binary classification using LGBMClassifier
    
    Args:
        train_df: Training dataframe containing features and label
        val_df: Validation dataframe containing features and label  
        label_col: Name of label column (should contain binary values 0/1)
        cat_features: List of categorical feature names
        params: Dict of LightGBM parameters
        
    Returns:
        Trained model and validation predictions
    """
    # Default parameters if none provided
    if params is None:
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'n_estimators':150,
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,  # Column sampling
            'bagging_fraction': 0.8,  # Row sampling 
            'bagging_freq': 5,
            'verbose': -1,
            'max_depth': -1,
            'min_child_samples': 20,
            'reg_alpha': 0.0,
            'reg_lambda': 0.0,
            'is_unbalance': True  # Handle unbalanced datasets
        }

    # Separate features and labels
    features = [col for col in train_df.columns if col != label_col]
    X_train = train_df[features]
    y_train = train_df[label_col]
    X_val = val_df[features]
    y_val = val_df[label_col]

    # Initialize and train model
    model = lgb.LGBMClassifier(**params)
    
    # Fit model with early stopping
    model.fit(
        X_train, y_train,
        categorical_feature=cat_features if cat_features else 'auto'
    )
    
    # Make validation predictions
    val_preds = model.predict_proba(X_val)[:, 1]  # Get probability of positive class
    val_logloss = log_loss(y_val, val_preds)
    # val_acc = accuracy_score(y_val, val_preds > 0.5)  # Convert probs to binary predictions
    print(f'Validation LogLoss: {val_logloss:.4f}')
    # print(f'Validation Accuracy: {val_acc:.4f}')
    
    return model, val_preds

In [35]:
print(horse_race_df.shape)
print(basic_cat_ordinal_df.shape)

(29520, 44)
(29520, 6)


In [38]:
train_years = ['2014','2015','2016']
val_years = ['2017']
train_idx = horse_race_df[horse_race_df['year'].isin(train_years)].index
val_idx = horse_race_df[horse_race_df['year'].isin(val_years)].index
print(len(train_idx), len(val_idx))

23232 6288


In [67]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators':300,
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': -1,
    'min_child_samples': 20,
    'is_unbalance': True  # Handle unbalanced datasets
}


# first past featureset:
target = 'is_winner'
basic_num_features = ['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
                  'clean_win_odds', 'race_distance', 'clean_position_mavg_3', 'clean_position_mavg_5',
                     'clean_position_mavg_7']


basic_cat_features = basic_cat_ordinal_df.columns.tolist()

df = pd.concat([horse_race_df[basic_num_features], basic_cat_ordinal_df], axis=1)
df[target] = horse_race_df[target]

train_df = df.loc[train_idx]
val_df = df.loc[val_idx]

print(train_df.shape, val_df.shape)


(23232, 15) (6288, 15)


In [42]:
train_df.columns

Index(['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
       'clean_win_odds', 'race_distance', 'clean_position_mavg_3',
       'clean_position_mavg_5', 'clean_position_mavg_7', 'jockey', 'trainer',
       'race_course', 'race_course_track', 'race_class', 'track_condition',
       'is_winner'],
      dtype='object')

# lightgbm with basic featureset

In [68]:
model, val_preds = train_lightgbm_model(train_df, val_df, target,
                                        cat_features=basic_cat_features, params=params)

Validation LogLoss: 0.3241


In [69]:
# evaluation 
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
val_horse_race_df['lgbm_v0_preds'] = val_preds

In [70]:
check_race_id = val_horse_race_df['race_id'].sample(1).values[0]
use_cols = ['race_id', 'horse_id', 'clean_position', 'is_winner', 'lgbm_v0_preds']
val_horse_race_df[val_horse_race_df['race_id']==check_race_id][use_cols]

Unnamed: 0,race_id,horse_id,clean_position,is_winner,lgbm_v0_preds
824,2016-362,S432,1,1,0.19884
825,2016-362,P064,2,0,0.06689
826,2016-362,S367,3,0,0.81773
827,2016-362,V144,4,0,0.2913
828,2016-362,V129,5,0,0.04883
829,2016-362,N265,6,0,0.0329
830,2016-362,S087,7,0,0.33429
831,2016-362,T356,8,0,0.02665
832,2016-362,P105,9,0,0.08548
833,2016-362,S413,10,0,0.0016


In [71]:
eval_dict = {}
ground_truth = {}
lgbm_v0_preds = [] 

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race]['lgbm_v0_preds'] = -1* race_df['clean_win_odds'].values

In [129]:
eval_result, lgbmv0_results_df = evaluate_prediction_sets(eval_dict)

In [130]:
lgbmv0_results_df

Unnamed: 0,lgbm_v0_preds
Winner Match,0.28963
Top 3 Set Match,0.05479
Top 3 Exact Match,0.00978


In [131]:
compare_results = baseline_mean_results.copy()
compare_results['lgbm_v0_preds'] = lgbmv0_results_df['lgbm_v0_preds']
compare_results

Unnamed: 0,random_probs,winning_odd_preds,lgbm_v0_preds
Winner Match,0.07632,0.29354,0.28963
Top 3 Set Match,0.00783,0.05871,0.05479
Top 3 Exact Match,0.00196,0.00978,0.00978


In [126]:

y_true = np.array(
    [4, 1, 3, 2, 5]
)

y_pred_proba = np.array(
    [0.2, 0.6, 0.4, 0.3, 0.1]
)

y_pred_ranksort = np.argsort(y_pred_proba, axis=0)[::-1][0:3]
y_true_ranksort = np.argsort(y_true, axis=0)[0:3]
# y_pred_ranksort = np.argsort(y_pred_ranks, axis=0)[0:3]

print(y_true_ranksort, y_pred_ranksort)
evaluate_horse_race_positions(y_true, y_pred_proba)

[1 3 2] [1 2 3]


{'Winner Match': 1.0, 'Top 3 Set Match': 1.0, 'Top 3 Exact Match': 0.0}

In [124]:
y_pred_ranks = np.argsort(y_pred_proba)
y_pred_ranks

array([4, 0, 3, 2, 1])

In [112]:
np.argsort(y_pred_proba)[::-1]

array([1, 3, 2, 0, 4])

# ad-hoc data analysis

In [6]:
print(val_df.columns.tolist())

['finishing_position', 'horse_number', 'horse_name', 'horse_id', 'jockey', 'trainer', 'actual_weight', 'declared_horse_weight', 'draw', 'length_behind_winner', 'running_position_1', 'running_position_2', 'running_position_3', 'running_position_4', 'finish_time', 'win_odds', 'running_position_5', 'running_position_6', 'race_id', 'clean_actual_weight', 'clean_declared_horse_weight', 'clean_length_behind_winner', 'clean_finish_time', 'clean_win_odds', 'clean_position', 'is_winner', 'is_top3', 'src', 'race_date', 'race_course', 'race_number', 'race_class', 'race_distance', 'track_condition', 'race_name', 'track', 'sectional_time', 'incident_report', 'race_course_track', 'clean_race_date', 'clean_position_mavg_3', 'clean_position_mavg_5', 'clean_position_mavg_7', 'year']


In [None]:
project_utils.analyze_dataframe(val_df)