In [1]:
import requests
import os
import sys
import pandas as pd
import numpy as np


from typing import Optional
import glob
from scipy import stats
# Add the parent directory of this notebook to sys.path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)

from project_tools import project_utils, project_class

import datetime
import json
from tqdm.notebook import tqdm
import gc
# import ds_utils
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from scipy.stats import kendalltau, spearmanr
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, accuracy_score

%matplotlib inline

from importlib import reload
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# Or for more precise control
pd.set_option('display.float_format', '{:.5f}'.format)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


# load_data 

In [138]:
# train_horse_race_df = pd.read_parquet('../feature_data/train_horse_race_df.parquet')
# val_horse_race_df = pd.read_parquet('../feature_data/val_horse_race_df.parquet')                           
horse_race_df = pd.read_parquet('../feature_data/horse_race_df.parquet')

In [3]:
# load generated featureset
!ls ../feature_data/

basic_cat_ordinal_features.parquet
horse_class_feats.parquet
horse_feats.parquet
horse_race_df.parquet
horse_running_position_features.parquet
horse_track_running_position_features.parquet
race_course_features.parquet
trace_condition.parquet
train_horse_positions_df.parquet
train_horse_race_df.parquet
train_jockey_positions_df.parquet
train_trainer_positions_df.parquet
val_horse_race_df.parquet


In [4]:
# load features
basic_cat_ordinal_df = pd.read_parquet('../feature_data/basic_cat_ordinal_features.parquet')

In [5]:
print(horse_race_df.shape)
print(basic_cat_ordinal_df.shape)

(29520, 44)
(29520, 6)


In [6]:
# load baseline evaluation result

In [7]:
baseline_mean_results = pd.read_parquet('../evaluation_results/valdf_random_winodd_baseline_mean_results.parquet')

In [8]:
baseline_mean_results

Unnamed: 0,random_probs,winning_odd_preds
Winner Match,0.07632,0.29354
Top 3 Set Match,0.00783,0.05871
Top 3 Exact Match,0.00196,0.00978


# evaluation functions

In [9]:
def evaluate_horse_race_positions(y_true, y_pred_proba, dnf_value=99):
    """
    Evaluate predictions for a single race's finishing positions.
    
    Args:
        y_true: 1D array of true finishing positions
        y_pred_proba: 1D array of predicted probabilities
        dnf_value: Value used to indicate Did Not Finish
        
    Returns:
        Dictionary of evaluation metrics
    """
    # Handle NaN and DNF values in ground truth
    y_true_processed = y_true.copy()
    invalid_mask = np.logical_or(
        np.isnan(y_true),
        y_true == dnf_value
    )
    
    # Get max valid rank (excluding DNF values)
    valid_ranks = y_true[~invalid_mask]
    if len(valid_ranks) > 0:
        max_rank = np.max(valid_ranks)
        # Replace invalid values with max_rank + 1
        y_true_processed[invalid_mask] = max_rank + 1
        
    # Winner match
    y_true_ranksort = np.argsort(y_true_processed)
    y_pred_ranksort = np.argsort(y_pred_proba, axis=0)[::-1]
    
    winner_match = y_true_ranksort[0] == y_pred_ranksort[0]
    
    # Top 3 Set Match - considers [1,3,2] and [2,3,1] as matching
    top3_set_match = set(y_true_ranksort[:3]) == set(y_pred_ranksort[:3])
    
    # Top 3 Exact Match - only considers exact matches like [1,3,2] and [1,3,2]
    top3_exact_match = np.array_equal(y_true_ranksort[:3], y_pred_ranksort[:3])
    
    return {
        'Winner Match': float(winner_match),
        'Top 3 Set Match': float(top3_set_match), 
        'Top 3 Exact Match': float(top3_exact_match)
    }


In [10]:
def evaluate_prediction_sets(eval_dict):
    """
    Evaluate different prediction sets against ground truth for each race and calculate mean metrics
    
    Args:
        eval_dict: Dictionary containing race data with ground truth and different prediction sets
        
    Returns:
        tuple: (eval_result, mean_results_df)
            - eval_result: Dictionary with detailed evaluation metrics for each race
            - mean_results_df: DataFrame comparing mean metrics across prediction types
    """
    # Initialize results dictionary with race_ids as first level keys
    eval_result = {race_id: {} for race_id in eval_dict}

    # Get prediction types from first race data
    first_race_id = next(iter(eval_dict))
    pred_types = [key for key in eval_dict[first_race_id].keys() if key != 'ground_truth']

    # Initialize dictionaries to store mean results
    mean_results = {pred_type: {} for pred_type in pred_types}

    # Loop through each race
    for race_id in eval_dict:
        race_data = eval_dict[race_id]
        ground_truth = race_data['ground_truth']
        
        # Evaluate each prediction type
        for pred_type in pred_types:
            pred_probs = race_data[pred_type]
            
            # Evaluate predictions for this race
            race_eval = evaluate_horse_race_positions(
                ground_truth,
                pred_probs
            )
            
            # Store results for this race under race_id first, then pred_type
            eval_result[race_id][pred_type] = race_eval

    # Calculate mean results for each prediction type
    for pred_type in pred_types:
        # Initialize dict to store means for each metric
        metric_means = {}
        
        # Get metrics from first race to know what metrics exist
        first_race = next(iter(eval_result.values()))
        metrics = first_race[pred_type].keys()
        
        # For each metric, calculate mean across all races
        for metric in metrics:
            total = 0
            num_races = 0
            for race_id in eval_result:
                total += eval_result[race_id][pred_type][metric]
                num_races += 1
            metric_means[metric] = total / num_races
            
        mean_results[pred_type] = metric_means
    
    # Convert mean results to DataFrame for easy comparison
    mean_results_df = pd.DataFrame(mean_results)
    
    return eval_result, mean_results_df

# # Run evaluation
# eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

# # Display mean results comparison
# print("\nMean Evaluation Metrics Comparison:")
# print(mean_results_df)


# lightgbm training function

In [17]:
def train_lightgbm_model(train_df, val_df, label_col, cat_features=None, params=None):
    """
    Train a LightGBM model for binary classification using LGBMClassifier
    
    Args:
        train_df: Training dataframe containing features and label
        val_df: Validation dataframe containing features and label  
        label_col: Name of label column (should contain binary values 0/1)
        cat_features: List of categorical feature names
        params: Dict of LightGBM parameters
        
    Returns:
        Trained model and validation predictions
    """
    # Default parameters if none provided
    if params is None:
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'n_estimators':150,
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,  # Column sampling
            'bagging_fraction': 0.8,  # Row sampling 
            'bagging_freq': 5,
            'verbose': -1,
            'max_depth': -1,
            'min_child_samples': 20,
            'reg_alpha': 0.0,
            'reg_lambda': 0.0,
            'is_unbalance': True  # Handle unbalanced datasets
        }

    # Separate features and labels
    features = [col for col in train_df.columns if col != label_col]
    X_train = train_df[features]
    y_train = train_df[label_col]
    X_val = val_df[features]
    y_val = val_df[label_col]

    # Initialize and train model
    model = LGBMClassifier(**params)
    
    # Fit model with early stopping
    model.fit(
        X_train, y_train,
        categorical_feature=cat_features if cat_features else 'auto'
    )
    
    # Make validation predictions
    val_preds = model.predict_proba(X_val)[:, 1]  # Get probability of positive class
    val_logloss = log_loss(y_val, val_preds)
    # val_acc = accuracy_score(y_val, val_preds > 0.5)  # Convert probs to binary predictions
    print(f'Validation LogLoss: {val_logloss:.4f}')
    # print(f'Validation Accuracy: {val_acc:.4f}')
    
    return model, val_preds

In [18]:
print(horse_race_df.shape)
print(basic_cat_ordinal_df.shape)

(29520, 44)
(29520, 6)


# getting basic feature set

In [19]:
train_years = ['2014','2015','2016']
val_years = ['2017']
train_idx = horse_race_df[horse_race_df['year'].isin(train_years)].index
val_idx = horse_race_df[horse_race_df['year'].isin(val_years)].index
print(len(train_idx), len(val_idx))

23232 6288


In [20]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators':300,
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': -1,
    'min_child_samples': 20,
    'is_unbalance': True  # Handle unbalanced datasets
}


# first past featureset:
target = 'is_winner'
basic_num_features = ['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
                  'clean_win_odds', 'race_distance', 'clean_position_mavg_3', 'clean_position_mavg_5',
                     'clean_position_mavg_7']


basic_cat_features = basic_cat_ordinal_df.columns.tolist()

df = pd.concat([horse_race_df[basic_num_features], basic_cat_ordinal_df], axis=1)
df[target] = horse_race_df[target]

train_df = df.loc[train_idx]
val_df = df.loc[val_idx]

print(train_df.shape, val_df.shape)


(23232, 15) (6288, 15)


In [21]:
train_df.columns

Index(['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
       'clean_win_odds', 'race_distance', 'clean_position_mavg_3',
       'clean_position_mavg_5', 'clean_position_mavg_7', 'jockey', 'trainer',
       'race_course', 'race_course_track', 'race_class', 'track_condition',
       'is_winner'],
      dtype='object')

# lightgbm with basic featureset

In [22]:
model, val_preds = train_lightgbm_model(train_df, val_df, target,
                                        cat_features=basic_cat_features, params=params)

Validation LogLoss: 0.3241


In [23]:
# evaluation 
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col = f'lgbm_v0_preds_{target}'
val_horse_race_df[pred_col] = val_preds

In [24]:
check_race_id = val_horse_race_df['race_id'].sample(1).values[0]
use_cols = ['race_id', 'horse_id', 'clean_position', target, pred_col]
val_horse_race_df[val_horse_race_df['race_id']==check_race_id][use_cols]

Unnamed: 0,race_id,horse_id,clean_position,is_winner,lgbm_v0_preds
1439,2016-412,A170,1,1,0.26074
1440,2016-412,V017,2,0,0.24441
1441,2016-412,S419,3,0,0.03615
1442,2016-412,P310,4,0,0.00118
1443,2016-412,S240,5,0,0.0271
1444,2016-412,V235,6,0,0.12457
1445,2016-412,A100,7,0,0.803
1446,2016-412,T331,8,0,0.21937
1447,2016-412,A157,9,0,0.21365
1448,2016-412,T357,10,0,0.18894


In [25]:
eval_dict = {}
ground_truth = {}
lgbm_v0_preds = [] 

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values

In [26]:
eval_result, lgbmv0_results_df = evaluate_prediction_sets(eval_dict)

In [27]:
lgbmv0_results_df

Unnamed: 0,lgbm_v0_preds
Winner Match,0.3953
Top 3 Set Match,0.10568
Top 3 Exact Match,0.03131


In [28]:
compare_results = baseline_mean_results.copy()
compare_results[pred_col] = lgbmv0_results_df[pred_col]
compare_results

Unnamed: 0,random_probs,winning_odd_preds,lgbm_v0_preds
Winner Match,0.07632,0.29354,0.3953
Top 3 Set Match,0.00783,0.05871,0.10568
Top 3 Exact Match,0.00196,0.00978,0.03131


# top 3 modelling

In [29]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators':300,
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': -1,
    'min_child_samples': 20,
    'is_unbalance': True  # Handle unbalanced datasets
}


# first past featureset:
target = 'is_top3'
basic_num_features = ['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
                  'clean_win_odds', 'race_distance', 'clean_position_mavg_3', 'clean_position_mavg_5',
                     'clean_position_mavg_7']


basic_cat_features = basic_cat_ordinal_df.columns.tolist()

df = pd.concat([horse_race_df[basic_num_features], basic_cat_ordinal_df], axis=1)
df[target] = horse_race_df[target]

train_df = df.loc[train_idx]
val_df = df.loc[val_idx]

print(train_df.shape, val_df.shape)

model, val_preds = train_lightgbm_model(train_df, val_df, target,
                                        cat_features=basic_cat_features, params=params)

(23232, 15) (6288, 15)
Validation LogLoss: 0.4456


In [45]:
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col = f'lgbm_v0_preds_{target}'
val_horse_race_df[pred_col] = val_preds

In [46]:
check_race_id = val_horse_race_df['race_id'].sample(1).values[0]
use_cols = ['race_id', 'horse_id', 'clean_position', target, pred_col]
val_horse_race_df[val_horse_race_df['race_id']==check_race_id][use_cols]

Unnamed: 0,race_id,horse_id,clean_position,is_top3,lgbm_v0_preds_is_top3
4863,2016-690,V060,1,1,0.68253
4864,2016-690,S205,2,1,0.55973
4865,2016-690,S074,3,1,0.23369
4866,2016-690,V179,4,0,0.15813
4867,2016-690,T045,5,0,0.27722
4868,2016-690,T089,6,0,0.45844
4869,2016-690,T203,7,0,0.18228
4870,2016-690,V365,8,0,0.39994
4871,2016-690,P130,9,0,0.03846
4872,2016-690,V220,10,0,0.175


In [47]:
eval_dict = {}
ground_truth = {}
lgbm_v0_preds = [] 

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values

In [49]:
eval_result, lgbmv0_results_df = evaluate_prediction_sets(eval_dict)

In [50]:
lgbmv0_results_df

Unnamed: 0,lgbm_v0_preds_is_top3
Winner Match,0.41683
Top 3 Set Match,0.1272
Top 3 Exact Match,0.03327


In [52]:
compare_results = baseline_mean_results.copy()
compare_results[pred_col] = lgbmv0_results_df[pred_col]
compare_results

Unnamed: 0,random_probs,winning_odd_preds,lgbm_v0_preds_is_top3
Winner Match,0.07632,0.29354,0.41683
Top 3 Set Match,0.00783,0.05871,0.1272
Top 3 Exact Match,0.00196,0.00978,0.03327


# enhanced modelling with additional features

<module 'project_tools.project_class' from '/home/yifan/playground/hk_horse_racing/project_tools/project_class.py'>

In [53]:
!ls ../feature_data/

basic_cat_ordinal_features.parquet
horse_class_feats.parquet
horse_feats.parquet
horse_race_df.parquet
horse_running_position_features.parquet
horse_track_running_position_features.parquet
race_course_features.parquet
trace_condition.parquet
train_horse_positions_df.parquet
train_horse_race_df.parquet
train_jockey_positions_df.parquet
train_trainer_positions_df.parquet
val_horse_race_df.parquet


## uniform feature

In [222]:


basic_num_features = ['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
                  'clean_win_odds', 'race_distance', 'clean_position_mavg_3', 'clean_position_mavg_5',
                     'clean_position_mavg_7']


basic_cat_features = basic_cat_ordinal_df.columns.tolist()

df0 = pd.concat([horse_race_df[basic_num_features], basic_cat_ordinal_df], axis=1)
# df0[target] = horse_race_df[target]


In [223]:
basic_cat_ordinal_df.columns

Index(['jockey', 'trainer', 'race_course', 'race_course_track', 'race_class',
       'track_condition'],
      dtype='object')

In [224]:
reload(project_class)


# binary encoding
lbl_enc = project_class.DataFrameLabelTransformer()
lbl_df = lbl_enc.fit_transform(horse_race_df[basic_cat_features].copy())

binary_enc = project_class.DataFrameBinaryEncoder(cat_cols=basic_cat_features,  verbose=False)
bin_df = binary_enc.fit_transform(lbl_df)

freq_df = project_utils.freq_encoding(lbl_df)

print(lbl_df.shape, bin_df.shape,freq_df.shape)

(29520, 6) (29520, 27) (29520, 6)


##  feature from train df only

In [225]:
def convert_date_to_int(date_str: str) -> int:
    """
    Convert date string in YYYY-MM-DD format to integer that preserves ordering
    
    Args:
        date_str: Date string in YYYY-MM-DD format
        
    Returns:
        Integer in format YYYYMMDD
        
    Example:
        '2015-11-18' -> 20151118
        '2016-03-31' -> 20160331
    """
    # Remove hyphens and convert to integer
    return int(date_str.replace('-', ''))

In [226]:
race_df = pd.read_csv('../data/race-result-race.csv')
race_df['clean_race_date'] = race_df['race_date'].apply(lambda x:convert_date_to_int(x))
race_df['year'] = race_df['race_date'].apply(lambda x:x[0:4])
race_df['race_course_track'] = race_df['race_course'] + '_' + race_df['track']

In [227]:
train_years = ['2014','2015','2016']
val_years = ['2017']
train_race_df = race_df[race_df['year'].isin(train_years)]


In [228]:
# race_course_feature
# Pattern classification
def classify_race_pattern(times):
    times = times.astype(float)
    diffs = np.diff(times)
    if all(diffs < 0):
        return 'progressive_speedup'
    elif all(diffs > 0):
        return 'progressive_slowdown'
    elif diffs[0] < 0 and diffs[-1] > 0:
        return 'middle_burst'
    elif diffs[0] > 0 and diffs[-1] < 0:
        return 'strong_finish'
    return 'mixed'


race_course_features = []
for i,row in tqdm(train_race_df.iterrows(), total=len(train_race_df)):
    stime_items = np.array(row['sectional_time'].split(' ')).astype(float)    
    item_dict = dict()
    # item_dict['race_id'] = row['race_id']
    item_dict['race_course_track'] = row['race_course_track']
    item_dict['race_stime_min'] = stime_items.min()
    item_dict['race_stime_max'] = stime_items.max()
    item_dict['race_stime_mean'] = stime_items.mean()
    item_dict['race_stime_std'] = stime_items.std()
    stime_diffs = np.diff(stime_items)
    # item_dict['race_stime_diffs'] = stime_diffs
    item_dict['race_stime_pace_variance'] = np.var(stime_diffs)
    item_dict['race_stime_max_pace_change'] = np.abs(stime_diffs).max()
    item_dict['race_stime_skewness'] = stats.skew(stime_items)
    
    item_dict['race_stime_first_half_avg'] = np.mean(stime_items[:len(stime_items)//2].astype(float))
    item_dict['race_stime_second_half_avg'] = np.mean(stime_items[len(stime_items)//2:].astype(float))
    item_dict['race_stime_speed_bias'] = item_dict['race_stime_second_half_avg'] - item_dict['race_stime_first_half_avg']  # Negative means faster finish

    # Percentile-based features
    stime_percentiles = np.percentile(stime_items.astype(float), [25, 75])
    item_dict['race_stime_iqr'] = stime_percentiles[1] - stime_percentiles[0]

    # Section comparisons
    item_dict['race_stime_fastest_section_idx'] = np.argmin(stime_items.astype(float))
    item_dict['race_stime_slowest_section_idx'] = np.argmax(stime_items.astype(float))

    # Trend analysis
    item_dict['race_stime_faster'] = all(stime_diffs < 0)  # True if consistently speeding up
    item_dict['race_stime_slower'] = all(stime_diffs > 0)  # True if consistently slowing down

  
    # item_dict['race_pattern'] = classify_race_pattern(stime_items)
    race_course_features.append(item_dict)    
# break
# stime_

train_rc_features = pd.DataFrame.from_dict(race_course_features)
train_rc_features['race_stime_faster'] = train_rc_features['race_stime_faster'].astype(np.int8) 
train_rc_features['race_stime_slower'] = train_rc_features['race_stime_slower'].astype(np.int8) 

  0%|          | 0/1856 [00:00<?, ?it/s]

In [97]:
train_rc_features.head(5)

Unnamed: 0,race_course_track,race_stime_min,race_stime_max,race_stime_mean,race_stime_std,race_stime_pace_variance,race_stime_max_pace_change,race_stime_skewness,race_stime_first_half_avg,race_stime_second_half_avg,race_stime_speed_bias,race_stime_iqr,race_stime_fastest_section_idx,race_stime_slowest_section_idx,race_stime_faster,race_stime_slower
0,"Sha Tin_TURF - ""A"" COURSE",13.59,23.55,20.5825,4.07222,13.42247,8.49,-1.09679,17.835,23.33,5.495,3.2625,0,3,0,1
1,"Sha Tin_TURF - ""A"" COURSE",13.55,22.89,20.385,3.95432,15.75707,8.7,-1.1407,17.9,22.87,4.97,2.785,0,2,0,0
2,"Sha Tin_TURF - ""A"" COURSE",22.25,24.06,23.32333,0.77633,2.5921,1.81,-0.56894,24.06,22.955,-1.105,0.905,1,0,0,0
3,"Sha Tin_TURF - ""A"" COURSE",22.47,23.42,22.79,0.4455,0.21622,0.94,0.70684,23.42,22.475,-0.945,0.475,2,0,1,0
4,"Sha Tin_TURF - ""A"" COURSE",22.62,24.0,23.08667,0.64588,0.49,1.38,0.7066,24.0,22.63,-1.37,0.69,1,0,0,0


In [229]:
groupby_feats1 = ['race_course_track']
numfeats = [col for col in train_rc_features.columns if 'stime' in col]
stat = ['median', 'mean', 'std'] 
print(numfeats)

agg_recipies = [
    [groupby_feats1, numfeats, stat],
]

res_dfs = project_utils.groupby_agg_execution(agg_recipies, train_rc_features, verbose=False)

['race_stime_min', 'race_stime_max', 'race_stime_mean', 'race_stime_std', 'race_stime_pace_variance', 'race_stime_max_pace_change', 'race_stime_skewness', 'race_stime_first_half_avg', 'race_stime_second_half_avg', 'race_stime_speed_bias', 'race_stime_iqr', 'race_stime_fastest_section_idx', 'race_stime_slowest_section_idx', 'race_stime_faster', 'race_stime_slower']


In [111]:
res_dfs.keys()

dict_keys(['race_course_track'])

In [230]:
train_track_features = res_dfs['race_course_track']
train_track_features.shape

(11, 46)

In [114]:
train_track_features.head(2)

Unnamed: 0,race_course_track,race_course_track_race_stime_min_median,race_course_track_race_stime_min_mean,race_course_track_race_stime_min_std,race_course_track_race_stime_max_median,race_course_track_race_stime_max_mean,race_course_track_race_stime_max_std,race_course_track_race_stime_mean_median,race_course_track_race_stime_mean_mean,race_course_track_race_stime_mean_std,race_course_track_race_stime_std_median,race_course_track_race_stime_std_mean,race_course_track_race_stime_std_std,race_course_track_race_stime_pace_variance_median,race_course_track_race_stime_pace_variance_mean,race_course_track_race_stime_pace_variance_std,race_course_track_race_stime_max_pace_change_median,race_course_track_race_stime_max_pace_change_mean,race_course_track_race_stime_max_pace_change_std,race_course_track_race_stime_skewness_median,race_course_track_race_stime_skewness_mean,race_course_track_race_stime_skewness_std,race_course_track_race_stime_first_half_avg_median,race_course_track_race_stime_first_half_avg_mean,race_course_track_race_stime_first_half_avg_std,race_course_track_race_stime_second_half_avg_median,race_course_track_race_stime_second_half_avg_mean,race_course_track_race_stime_second_half_avg_std,race_course_track_race_stime_speed_bias_median,race_course_track_race_stime_speed_bias_mean,race_course_track_race_stime_speed_bias_std,race_course_track_race_stime_iqr_median,race_course_track_race_stime_iqr_mean,race_course_track_race_stime_iqr_std,race_course_track_race_stime_fastest_section_idx_median,race_course_track_race_stime_fastest_section_idx_mean,race_course_track_race_stime_fastest_section_idx_std,race_course_track_race_stime_slowest_section_idx_median,race_course_track_race_stime_slowest_section_idx_mean,race_course_track_race_stime_slowest_section_idx_std,race_course_track_race_stime_faster_median,race_course_track_race_stime_faster_mean,race_course_track_race_stime_faster_std,race_course_track_race_stime_slower_median,race_course_track_race_stime_slower_mean,race_course_track_race_stime_slower_std
0,"Happy Valley_TURF - ""A"" COURSE",22.995,20.71226,4.51233,24.39,25.36313,1.81016,23.48667,23.40191,1.92967,1.35544,1.84206,1.63844,2.00096,4.88883,5.87544,2.895,3.83187,3.26914,-0.12201,-0.07547,0.78228,23.825,22.49121,4.39373,23.64,23.81973,0.84688,-0.44,1.32852,4.04285,1.165,1.6084,1.52435,1.0,1.29808,1.09791,0.0,0.93269,1.11437,0.0,0.16346,0.37068,0.0,0.12019,0.32597
1,"Happy Valley_TURF - ""B"" COURSE",22.98,20.50457,4.514,24.415,25.51085,2.00364,23.485,23.34649,1.96114,1.58717,1.97962,1.63447,2.53601,5.55892,6.23762,3.495,4.16793,3.33367,-0.17046,-0.05671,0.83323,23.87,22.45639,4.53552,23.61,23.73327,0.83003,-0.525,1.27688,4.20237,1.28,1.69468,1.57438,1.0,1.24468,1.08155,0.0,0.90957,1.10773,0.0,0.15426,0.36216,0.0,0.14362,0.35164


##  train_df horse level groupby

In [231]:
# train_df horse level groupby
horse_race_df = pd.read_parquet('../feature_data/horse_race_df.parquet')
horse_race_df_train =  horse_race_df.loc[train_idx].copy()


In [232]:
groupby_feats1 = ['horse_id']
numfeats = ['clean_length_behind_winner', 'clean_actual_weight', 'clean_declared_horse_weight', 'clean_win_odds', 
            'running_position_1', 'running_position_2', 'running_position_3', 'running_position_4', 
            'running_position_5', 'running_position_6', 'race_distance', 'clean_position']
stat = ['median', 'mean', 'std'] 

agg_recipies = [
    [groupby_feats1, numfeats, stat],
]
res_dfs = project_utils.groupby_agg_execution(agg_recipies, horse_race_df_train, verbose=False)
train_horse_groupby_features = res_dfs['horse_id']
train_horse_groupby_features.shape

(1895, 37)

In [157]:
train_horse_groupby_features.head(2)

Unnamed: 0,horse_id,horse_id_clean_length_behind_winner_median,horse_id_clean_length_behind_winner_mean,horse_id_clean_length_behind_winner_std,horse_id_clean_actual_weight_median,horse_id_clean_actual_weight_mean,horse_id_clean_actual_weight_std,horse_id_clean_declared_horse_weight_median,horse_id_clean_declared_horse_weight_mean,horse_id_clean_declared_horse_weight_std,horse_id_clean_win_odds_median,horse_id_clean_win_odds_mean,horse_id_clean_win_odds_std,horse_id_running_position_1_median,horse_id_running_position_1_mean,horse_id_running_position_1_std,horse_id_running_position_2_median,horse_id_running_position_2_mean,horse_id_running_position_2_std,horse_id_running_position_3_median,horse_id_running_position_3_mean,horse_id_running_position_3_std,horse_id_running_position_4_median,horse_id_running_position_4_mean,horse_id_running_position_4_std,horse_id_running_position_5_median,horse_id_running_position_5_mean,horse_id_running_position_5_std,horse_id_running_position_6_median,horse_id_running_position_6_mean,horse_id_running_position_6_std,horse_id_race_distance_median,horse_id_race_distance_mean,horse_id_race_distance_std,horse_id_clean_position_median,horse_id_clean_position_mean,horse_id_clean_position_std
0,A001,4.0,4.0,,125.0,125.0,,1083.0,1083.0,,49.0,49.0,,9.0,9.0,,7.0,7.0,,11.0,11.0,,,,,,,,,,,1200.0,1200.0,,11.0,11.0,
1,A002,0.75,0.7,0.57663,130.0,129.0,3.60555,1096.0,1095.0,6.55744,3.69922,4.63281,2.43686,4.0,3.66667,1.52753,4.0,4.66667,1.1547,4.0,4.66667,1.1547,2.0,3.66667,2.88675,2.0,2.0,,2.0,2.0,,1600.0,1800.0,346.41016,2.0,2.0,0.0


##  train_df horse race-track level groupby

In [233]:
groupby_feats = ['horse_id', 'race_course_track']
numfeats = ['running_position_1', 'running_position_2', 'running_position_3', 'running_position_4', 
            'running_position_5', 'running_position_6', 'clean_finish_time', 'clean_win_odds']
stat = ['median', 'mean', 'std']  # {'ptp':np.ptp}]#{'sharp':project_utils.get_array_sharpe}]

agg_recipies = [
    [groupby_feats, numfeats, stat],
]
res_dfs = project_utils.groupby_agg_execution(agg_recipies, horse_race_df_train, verbose=False)
train_horse_track_groupby_stat = res_dfs['horse_id_race_course_track']
train_horse_track_groupby_stat.shape

(10665, 26)

## jockey, trainer performance features in train dataset

In [234]:
groupby_feats1 = ['jockey'] 
groupby_feats2 = ['trainer']
numfeats = ['clean_position']
stat = ['median', 'mean', 'std', 'count']  # {'ptp':np.ptp}]#{'sharp':project_utils.get_array_sharpe}]


agg_recipies = [
    [groupby_feats1, numfeats, stat],
    [groupby_feats2, numfeats, stat],
]
res_dfs = project_utils.groupby_agg_execution(agg_recipies, horse_race_df_train, verbose=False)

train_jockey_groupby_features = res_dfs['jockey']
train_trainer_groupby_features = res_dfs['trainer']
print(train_jockey_groupby_features.shape)
print(train_trainer_groupby_features.shape)

(99, 5)
(87, 5)


## feature derived from main horse data

In [235]:
horse_feats = pd.DataFrame()
horse_feats['awght_dwght_ratio'] = horse_race_df['clean_actual_weight'] / horse_race_df['clean_declared_horse_weight']
horse_feats['awght_dwght_delta'] = horse_race_df['clean_actual_weight'] - horse_race_df['clean_declared_horse_weight']

## final feature set aggregation

In [236]:
horse_race_df = pd.read_parquet('../feature_data/horse_race_df.parquet')

In [None]:
# concat: bin_df, freq_df, horse_feats
# merge: train_track_features, train_horse_groupby_features, train_horse_track_groupby_stat
#        train_jockey_groupby_features, train_trainer_groupby_features

In [237]:
print(df0.shape, bin_df.shape, freq_df.shape, horse_feats.shape)

(29520, 14) (29520, 27) (29520, 6) (29520, 2)


In [262]:
# concat
df1 = pd.concat([df0, bin_df, freq_df, horse_feats], axis=1)
df1.shape

(29520, 49)

In [265]:
# merge_id_cols = [['race_course_track', 'horse_id', 'jockey', 'trainer']
merge_dfs = [train_track_features, train_horse_groupby_features, train_horse_track_groupby_stat,
            train_jockey_groupby_features, train_trainer_groupby_features]
feature_cols = []
for mdf in merge_dfs:
    use_cols = [c for c in mdf.columns if c not in merge_id_cols]
    feature_cols += use_cols
print(len(feature_cols))

df2 = horse_race_df.copy()
df2 = df2.merge(right=train_track_features, on = 'race_course_track', how='left')
# df2 = df2.merge(right=train_horse_groupby_features, on = 'horse_id', how='left')
# df2 = df2.merge(right=train_horse_track_groupby_stat, on = ['horse_id', 'race_course_track'], how='left')
# df2 = df2.merge(right=train_jockey_groupby_features, on = 'jockey', how='left')
# df2 = df2.merge(right=train_trainer_groupby_features, on = 'trainer', how='left')

use_cols = [c for c in df2.columns if c in feature_cols]

df2 = df2[use_cols]
df2.shape

113


(29520, 45)

In [266]:
df3 = pd.concat([df1, df2], axis=1)

train_df = df3.loc[train_idx]
val_df = df3.loc[val_idx]


params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators':300,
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': -1,
    'min_child_samples': 20,
    'is_unbalance': True  # Handle unbalanced datasets
}

target1 = 'is_winner'
target2 = 'is_top3'
basic_cat_features = basic_cat_ordinal_df.columns.tolist()




In [245]:
# is_winner modelling
df_winner = df3.copy()
df_winner[target1] = horse_race_df[target1]

train_df = df_winner.loc[train_idx]
val_df = df_winner.loc[val_idx]

print(train_df.shape, val_df.shape)
model_winner, val_winner_preds = train_lightgbm_model(train_df, val_df, target1,
                                        cat_features=basic_cat_features, params=params)

(23232, 163) (6288, 163)
Validation LogLoss: 0.7199


In [251]:
# is_top3 modelling
df_top3 = df3.copy()
df_top3[target2] = horse_race_df[target2]

train_df = df_top3.loc[train_idx]
val_df = df_top3.loc[val_idx]

print(train_df.shape, val_df.shape)
model_top3, val_top3_preds = train_lightgbm_model(train_df, val_df, target2,
                                        cat_features=basic_cat_features, params=params)

(23232, 163) (6288, 163)
Validation LogLoss: 1.0500


## top 1 model evaluation

In [256]:
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col = f'lgbm_v1_preds_{target1}'
val_horse_race_df[pred_col] = val_winner_preds

eval_dict = {}
ground_truth = {}
lgbm_v1_preds = [] 

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values

In [257]:
eval_result, lgbmv1_iswinner_results_df = evaluate_prediction_sets(eval_dict)

In [258]:
lgbmv1_iswinner_results_df

Unnamed: 0,lgbm_v1_preds_is_winner
Winner Match,0.27593
Top 3 Set Match,0.06654
Top 3 Exact Match,0.0137


In [259]:
pred_col = f'lgbm_v1_preds_{target1}'
check_race_id = val_horse_race_df['race_id'].sample(1).values[0]
use_cols = ['race_id', 'horse_id', 'clean_position', target, pred_col]
val_horse_race_df[val_horse_race_df['race_id']==check_race_id][use_cols]

Unnamed: 0,race_id,horse_id,clean_position,is_top3,lgbm_v1_preds_is_winner
13,2016-298,T099,1,1,0.23011
14,2016-298,T345,2,1,0.96826
15,2016-298,T073,3,1,0.00903
16,2016-298,V306,4,0,0.99303
17,2016-298,T422,5,0,0.57832
18,2016-298,T091,6,0,0.01458
19,2016-298,A031,7,0,0.97403
20,2016-298,A007,8,0,0.98487
21,2016-298,A075,9,0,0.92352
22,2016-298,V317,10,0,0.00034


## top 3 model evaluation

In [260]:
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col = f'lgbm_v1_preds_{target2}'
val_horse_race_df[pred_col] = val_top3_preds

eval_dict = {}
ground_truth = {}
lgbm_v1_preds = [] 

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values

eval_result, lgbmv1_istop3_results_df = evaluate_prediction_sets(eval_dict)

In [261]:
lgbmv1_istop3_results_df

Unnamed: 0,lgbm_v1_preds_is_top3
Winner Match,0.27006
Top 3 Set Match,0.03327
Top 3 Exact Match,0.00783


In [124]:
y_pred_ranks = np.argsort(y_pred_proba)
y_pred_ranks

array([4, 0, 3, 2, 1])

In [112]:
np.argsort(y_pred_proba)[::-1]

array([1, 3, 2, 0, 4])

# ad-hoc data analysis

In [6]:
print(val_df.columns.tolist())

['finishing_position', 'horse_number', 'horse_name', 'horse_id', 'jockey', 'trainer', 'actual_weight', 'declared_horse_weight', 'draw', 'length_behind_winner', 'running_position_1', 'running_position_2', 'running_position_3', 'running_position_4', 'finish_time', 'win_odds', 'running_position_5', 'running_position_6', 'race_id', 'clean_actual_weight', 'clean_declared_horse_weight', 'clean_length_behind_winner', 'clean_finish_time', 'clean_win_odds', 'clean_position', 'is_winner', 'is_top3', 'src', 'race_date', 'race_course', 'race_number', 'race_class', 'race_distance', 'track_condition', 'race_name', 'track', 'sectional_time', 'incident_report', 'race_course_track', 'clean_race_date', 'clean_position_mavg_3', 'clean_position_mavg_5', 'clean_position_mavg_7', 'year']


In [None]:
project_utils.analyze_dataframe(val_df)