In [1]:
import requests
import os
import sys
import pandas as pd
import numpy as np


from typing import Optional
import glob
from scipy import stats
# Add the parent directory of this notebook to sys.path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)

from project_tools import project_utils, project_class

import datetime
import json
from tqdm.notebook import tqdm
import gc
# import ds_utils
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from scipy.stats import kendalltau, spearmanr
from lightgbm import LGBMClassifier

from sklearn.metrics import log_loss, accuracy_score

%matplotlib inline

from importlib import reload
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# Or for more precise control
pd.set_option('display.float_format', '{:.5f}'.format)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


# load_data 

In [2]:
# train_horse_race_df = pd.read_parquet('../feature_data/train_horse_race_df.parquet')
# val_horse_race_df = pd.read_parquet('../feature_data/val_horse_race_df.parquet')                           
horse_race_df = pd.read_parquet('../feature_data/horse_race_df.parquet')

In [3]:
# load generated featureset
!ls ../feature_data/

basic_cat_ordinal_features.parquet
horse_class_feats.parquet
horse_feats.parquet
horse_race_df.parquet
horse_running_position_features.parquet
horse_track_running_position_features.parquet
race_course_features.parquet
trace_condition.parquet
train_horse_positions_df.parquet
train_horse_race_df.parquet
train_jockey_positions_df.parquet
train_trainer_positions_df.parquet
val_horse_race_df.parquet


In [4]:
# load features
basic_cat_ordinal_df = pd.read_parquet('../feature_data/basic_cat_ordinal_features.parquet')

In [5]:
print(horse_race_df.shape)
print(basic_cat_ordinal_df.shape)

(29520, 44)
(29520, 6)


In [6]:
# load baseline evaluation result

In [7]:
baseline_mean_results = pd.read_parquet('../evaluation_results/valdf_random_winodd_baseline_mean_results.parquet')

In [8]:
baseline_mean_results

Unnamed: 0,random_probs,winning_odd_preds
Winner Match,0.07632,0.29354
Top 3 Set Match,0.00783,0.05871
Top 3 Exact Match,0.00196,0.00978


# evaluation functions

In [9]:
def evaluate_horse_race_positions(y_true, y_pred_proba, dnf_value=99):
    """
    Evaluate predictions for a single race's finishing positions.
    
    Args:
        y_true: 1D array of true finishing positions
        y_pred_proba: 1D array of predicted probabilities
        dnf_value: Value used to indicate Did Not Finish
        
    Returns:
        Dictionary of evaluation metrics
    """
    # Handle NaN and DNF values in ground truth
    y_true_processed = y_true.copy()
    invalid_mask = np.logical_or(
        np.isnan(y_true),
        y_true == dnf_value
    )
    
    # Get max valid rank (excluding DNF values)
    valid_ranks = y_true[~invalid_mask]
    if len(valid_ranks) > 0:
        max_rank = np.max(valid_ranks)
        # Replace invalid values with max_rank + 1
        y_true_processed[invalid_mask] = max_rank + 1
        
    # Winner match
    y_true_ranksort = np.argsort(y_true_processed)
    y_pred_ranksort = np.argsort(y_pred_proba, axis=0)[::-1]
    
    winner_match = y_true_ranksort[0] == y_pred_ranksort[0]
    
    # Top 3 Set Match - considers [1,3,2] and [2,3,1] as matching
    top3_set_match = set(y_true_ranksort[:3]) == set(y_pred_ranksort[:3])
    
    # Top 3 Exact Match - only considers exact matches like [1,3,2] and [1,3,2]
    top3_exact_match = np.array_equal(y_true_ranksort[:3], y_pred_ranksort[:3])
    
    return {
        'Winner Match': float(winner_match),
        'Top 3 Set Match': float(top3_set_match), 
        'Top 3 Exact Match': float(top3_exact_match)
    }


In [10]:
def evaluate_prediction_sets(eval_dict):
    """
    Evaluate different prediction sets against ground truth for each race and calculate mean metrics
    
    Args:
        eval_dict: Dictionary containing race data with ground truth and different prediction sets
        
    Returns:
        tuple: (eval_result, mean_results_df)
            - eval_result: Dictionary with detailed evaluation metrics for each race
            - mean_results_df: DataFrame comparing mean metrics across prediction types
    """
    # Initialize results dictionary with race_ids as first level keys
    eval_result = {race_id: {} for race_id in eval_dict}

    # Get prediction types from first race data
    first_race_id = next(iter(eval_dict))
    pred_types = [key for key in eval_dict[first_race_id].keys() if key != 'ground_truth']

    # Initialize dictionaries to store mean results
    mean_results = {pred_type: {} for pred_type in pred_types}

    # Loop through each race
    for race_id in eval_dict:
        race_data = eval_dict[race_id]
        ground_truth = race_data['ground_truth']
        
        # Evaluate each prediction type
        for pred_type in pred_types:
            pred_probs = race_data[pred_type]
            
            # Evaluate predictions for this race
            race_eval = evaluate_horse_race_positions(
                ground_truth,
                pred_probs
            )
            
            # Store results for this race under race_id first, then pred_type
            eval_result[race_id][pred_type] = race_eval

    # Calculate mean results for each prediction type
    for pred_type in pred_types:
        # Initialize dict to store means for each metric
        metric_means = {}
        
        # Get metrics from first race to know what metrics exist
        first_race = next(iter(eval_result.values()))
        metrics = first_race[pred_type].keys()
        
        # For each metric, calculate mean across all races
        for metric in metrics:
            total = 0
            num_races = 0
            for race_id in eval_result:
                total += eval_result[race_id][pred_type][metric]
                num_races += 1
            metric_means[metric] = total / num_races
            
        mean_results[pred_type] = metric_means
    
    # Convert mean results to DataFrame for easy comparison
    mean_results_df = pd.DataFrame(mean_results)
    
    return eval_result, mean_results_df

# # Run evaluation
# eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

# # Display mean results comparison
# print("\nMean Evaluation Metrics Comparison:")
# print(mean_results_df)


# lightgbm training function

In [47]:
def train_lightgbm_model(train_df, val_df, label_col, cat_features=None, params=None):
    """
    Train a LightGBM model for binary classification using LGBMClassifier
    
    Args:
        train_df: Training dataframe containing features and label
        val_df: Validation dataframe containing features and label  
        label_col: Name of label column (should contain binary values 0/1)
        cat_features: List of categorical feature names
        params: Dict of LightGBM parameters
        
    Returns:
        Trained model and validation predictions
    """
    # Default parameters if none provided
    if params is None:
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'n_estimators':150,
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,  # Column sampling
            'bagging_fraction': 0.8,  # Row sampling 
            'bagging_freq': 5,
            'verbose': -1,
            'max_depth': -1,
            'min_child_samples': 20,
            'reg_alpha': 0.0,
            'reg_lambda': 0.0,
            'is_unbalance': True  # Handle unbalanced datasets
        }

    # Separate features and labels
    features = [col for col in train_df.columns if col != label_col]
    X_train = train_df[features]
    y_train = train_df[label_col]
    X_val = val_df[features]
    y_val = val_df[label_col]

    # Initialize and train model
    model = LGBMClassifier(**params)
    
    # Fit model with early stopping
    model.fit(
        X_train, y_train,
        categorical_feature=cat_features if cat_features else 'auto'
    )
    
    # Make validation predictions
    val_preds = model.predict_proba(X_val)[:, 1]  # Get probability of positive class
    val_logloss = log_loss(y_val, val_preds)
    # val_acc = accuracy_score(y_val, val_preds > 0.5)  # Convert probs to binary predictions
    print(f'Validation LogLoss: {val_logloss:.4f}')
    # print(f'Validation Accuracy: {val_acc:.4f}')
    
    return model, val_preds

In [12]:
print(horse_race_df.shape)
print(basic_cat_ordinal_df.shape)

(29520, 44)
(29520, 6)


# lightgbm with basic featureset

In [13]:
train_years = ['2014','2015','2016']
val_years = ['2017']
train_idx = horse_race_df[horse_race_df['year'].isin(train_years)].index
val_idx = horse_race_df[horse_race_df['year'].isin(val_years)].index
print(len(train_idx), len(val_idx))

23232 6288


In [48]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators':1000,
    'num_leaves': 64,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': -1,
    'min_child_samples': 20,
    'is_unbalance': True  # Handle unbalanced datasets
}


# first past featureset:
target = 'is_winner'
basic_num_features = ['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
                  'clean_win_odds', 'race_distance', 'clean_position_mavg_3', 'clean_position_mavg_5',
                     'clean_position_mavg_7']


basic_cat_features = basic_cat_ordinal_df.columns.tolist()

df = pd.concat([horse_race_df[basic_num_features], basic_cat_ordinal_df], axis=1)
df[target] = horse_race_df[target]

train_df = df.loc[train_idx]
val_df = df.loc[val_idx]

print(train_df.shape, val_df.shape)


(23232, 15) (6288, 15)


In [49]:
model, val_preds = train_lightgbm_model(train_df, val_df, target,
                                        cat_features=basic_cat_features, params=params)

Validation LogLoss: 0.2753


In [50]:
# evaluation 
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col = f'lgbm_v0_preds_{target}'
val_horse_race_df[pred_col] = val_preds

In [51]:
check_race_id = val_horse_race_df['race_id'].sample(1).values[0]
use_cols = ['race_id', 'horse_id', 'clean_position', target, pred_col]
val_horse_race_df[val_horse_race_df['race_id']==check_race_id][use_cols]

Unnamed: 0,race_id,horse_id,clean_position,is_winner,lgbm_v0_preds_is_winner
3959,2016-615,V290,1,1,0.98646
3960,2016-615,A021,2,0,0.20894
3961,2016-615,A011,3,0,0.34026
3962,2016-615,V199,4,0,0.00063
3963,2016-615,S247,5,0,0.00042
3964,2016-615,V024,6,0,0.02747
3965,2016-615,V372,7,0,0.00049
3966,2016-615,T432,8,0,0.00238
3967,2016-615,V347,9,0,0.01804
3968,2016-615,A067,10,0,0.029


In [53]:
eval_dict = {}
ground_truth = {}
lgbm_v0_preds = [] 

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values

In [54]:
eval_result, lgbmv0_results_df = evaluate_prediction_sets(eval_dict)

In [55]:
lgbmv0_results_df

Unnamed: 0,lgbm_v0_preds_is_winner
Winner Match,0.37573
Top 3 Set Match,0.09589
Top 3 Exact Match,0.0274


In [56]:
compare_results = baseline_mean_results.copy()
compare_results[pred_col] = lgbmv0_results_df[pred_col]
compare_results

Unnamed: 0,random_probs,winning_odd_preds,lgbm_v0_preds_is_winner
Winner Match,0.07632,0.29354,0.37573
Top 3 Set Match,0.00783,0.05871,0.09589
Top 3 Exact Match,0.00196,0.00978,0.0274


In [70]:
reload(project_utils)

<module 'project_tools.project_utils' from '/home/yifan/playground/hk_horse_racing/project_tools/project_utils.py'>

# lightgbm feature importance

In [71]:
impt_df = project_utils.lgbm_feature_importance(model)

In [72]:
impt_df

Unnamed: 0,feature,feature_importance
0,clean_declared_horse_weight,10864
1,clean_win_odds,9867
2,clean_position_mavg_7,6806
3,clean_position_mavg_3,6798
4,clean_position_mavg_5,6455
5,clean_actual_weight,5351
6,horse_number,4325
7,trainer,3087
8,race_distance,2822
9,jockey,2814


# null hpyothesis feature selection

In [74]:
null_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators':100,
    'num_leaves': 16,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': -1,
    'min_child_samples': 20,
    'is_unbalance': True  # Handle unbalanced datasets
}

In [166]:
nround = 1

for i in range(nround):
    null_train_df = train_df.copy()
    null_train_df[target] = null_train_df[target].sample(frac=1).values
    features = [col for col in null_train_df.columns if col != target]
    x_train = null_train_df[features]
    y_train = null_train_df[target]
    round_model = LGBMClassifier(**params)
    round_model.fit(
        x_train, y_train,
        categorical_feature=basic_cat_features if basic_cat_features else 'auto'
    )
    round_imp_df = project_utils.lgbm_feature_importance(round_model)
    round_imp_df['run'] = i + 1

round_imp_df

Unnamed: 0,feature,feature_importance,run
0,clean_declared_horse_weight,10631,1
1,clean_win_odds,8585,1
2,clean_position_mavg_3,6769,1
3,clean_position_mavg_7,6358,1
4,clean_position_mavg_5,6327,1
5,clean_actual_weight,5449,1
6,horse_number,4338,1
7,jockey,3800,1
8,trainer,3700,1
9,race_distance,2772,1


In [162]:
null_train_df = train_df.copy()
null_train_df[target] = null_train_df[target].sample(frac=1.0).values
null_train_df[target].values[0:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

Unnamed: 0,feature,feature_importance,run
0,clean_declared_horse_weight,10654,1
1,clean_win_odds,8548,1
2,clean_position_mavg_3,6844,1
3,clean_position_mavg_7,6400,1
4,clean_position_mavg_5,6152,1
5,clean_actual_weight,5233,1
6,horse_number,4298,1
7,trainer,3799,1
8,jockey,3784,1
9,race_distance,2767,1


# ad-hoc data analysis

In [6]:
print(val_df.columns.tolist())

['finishing_position', 'horse_number', 'horse_name', 'horse_id', 'jockey', 'trainer', 'actual_weight', 'declared_horse_weight', 'draw', 'length_behind_winner', 'running_position_1', 'running_position_2', 'running_position_3', 'running_position_4', 'finish_time', 'win_odds', 'running_position_5', 'running_position_6', 'race_id', 'clean_actual_weight', 'clean_declared_horse_weight', 'clean_length_behind_winner', 'clean_finish_time', 'clean_win_odds', 'clean_position', 'is_winner', 'is_top3', 'src', 'race_date', 'race_course', 'race_number', 'race_class', 'race_distance', 'track_condition', 'race_name', 'track', 'sectional_time', 'incident_report', 'race_course_track', 'clean_race_date', 'clean_position_mavg_3', 'clean_position_mavg_5', 'clean_position_mavg_7', 'year']


In [None]:
project_utils.analyze_dataframe(val_df)