In [1]:
import requests
import os
import sys
import pandas as pd
import numpy as np


from typing import Optional
import glob
from scipy import stats
# Add the parent directory of this notebook to sys.path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)

from project_tools import project_utils, project_class

import datetime
import json
from tqdm.notebook import tqdm
import gc
# import ds_utils
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from scipy.stats import kendalltau, spearmanr
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, accuracy_score
from scipy import stats
%matplotlib inline

from importlib import reload
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# Or for more precise control
pd.set_option('display.float_format', '{:.5f}'.format)

# evaluation functions

In [2]:
def evaluate_horse_race_positions(y_true, y_pred_proba, dnf_value=99):
    """
    Evaluate predictions for a single race's finishing positions.
    
    Args:
        y_true: 1D array of true finishing positions
        y_pred_proba: 1D array of predicted probabilities
        dnf_value: Value used to indicate Did Not Finish
        
    Returns:
        Dictionary of evaluation metrics
    """
    # Handle NaN and DNF values in ground truth
    y_true_processed = y_true.copy()
    invalid_mask = np.logical_or(
        np.isnan(y_true),
        y_true == dnf_value
    )
    
    # Get max valid rank (excluding DNF values)
    valid_ranks = y_true[~invalid_mask]
    if len(valid_ranks) > 0:
        max_rank = np.max(valid_ranks)
        # Replace invalid values with max_rank + 1
        y_true_processed[invalid_mask] = max_rank + 1
        
    # Winner match
    y_true_ranksort = np.argsort(y_true_processed)
    y_pred_ranksort = np.argsort(y_pred_proba, axis=0)[::-1]
    
    winner_match = y_true_ranksort[0] == y_pred_ranksort[0]
    
    # Top 3 Set Match - considers [1,3,2] and [2,3,1] as matching
    top3_set_match = set(y_true_ranksort[:3]) == set(y_pred_ranksort[:3])
    
    # Top 3 Exact Match - only considers exact matches like [1,3,2] and [1,3,2]
    top3_exact_match = np.array_equal(y_true_ranksort[:3], y_pred_ranksort[:3])
    
    return {
        'Winner Match': float(winner_match),
        'Top 3 Set Match': float(top3_set_match), 
        'Top 3 Exact Match': float(top3_exact_match)
    }


In [3]:
def evaluate_prediction_sets(eval_dict):
    """
    Evaluate different prediction sets against ground truth for each race and calculate mean metrics
    
    Args:
        eval_dict: Dictionary containing race data with ground truth and different prediction sets
        
    Returns:
        tuple: (eval_result, mean_results_df)
            - eval_result: Dictionary with detailed evaluation metrics for each race
            - mean_results_df: DataFrame comparing mean metrics across prediction types
    """
    # Initialize results dictionary with race_ids as first level keys
    eval_result = {race_id: {} for race_id in eval_dict}

    # Get prediction types from first race data
    first_race_id = next(iter(eval_dict))
    pred_types = [key for key in eval_dict[first_race_id].keys() if key != 'ground_truth']

    # Initialize dictionaries to store mean results
    mean_results = {pred_type: {} for pred_type in pred_types}

    # Loop through each race
    for race_id in eval_dict:
        race_data = eval_dict[race_id]
        ground_truth = race_data['ground_truth']
        
        # Evaluate each prediction type
        for pred_type in pred_types:
            pred_probs = race_data[pred_type]
            
            # Evaluate predictions for this race
            race_eval = evaluate_horse_race_positions(
                ground_truth,
                pred_probs
            )
            
            # Store results for this race under race_id first, then pred_type
            eval_result[race_id][pred_type] = race_eval

    # Calculate mean results for each prediction type
    for pred_type in pred_types:
        # Initialize dict to store means for each metric
        metric_means = {}
        
        # Get metrics from first race to know what metrics exist
        first_race = next(iter(eval_result.values()))
        metrics = first_race[pred_type].keys()
        
        # For each metric, calculate mean across all races
        for metric in metrics:
            total = 0
            num_races = 0
            for race_id in eval_result:
                total += eval_result[race_id][pred_type][metric]
                num_races += 1
            metric_means[metric] = total / num_races
            
        mean_results[pred_type] = metric_means
    
    # Convert mean results to DataFrame for easy comparison
    mean_results_df = pd.DataFrame(mean_results)
    
    return eval_result, mean_results_df

# # Run evaluation
# eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

# # Display mean results comparison
# print("\nMean Evaluation Metrics Comparison:")
# print(mean_results_df)


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [4]:
def train_lightgbm_ranker(train_df, val_df=None, race_id_col='race_id', label_col='clean_position', 
                         cat_features=None, params=None, n_estimators=300):
    """
    Train a LightGBM ranking model for horse race position prediction.
    
    Args:
        train_df: Training DataFrame with features and labels
        val_df: Optional validation DataFrame with features and labels
        race_id_col: Column name for race identifier
        label_col: Column name containing position/rank labels
        cat_features: List of categorical feature names
        params: LightGBM parameters dict
        n_estimators: Number of boosting rounds
        
    Returns:
        Trained model and validation predictions (if val_df provided)
    """
    # Default ranking parameters if none provided
    if params is None:
        params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'max_position': 20,  # Maximum number of positions to consider
            'label_gain': list(range(20)), # Gain for each position 0-19
        }

    # Prepare features and group info
    features = [col for col in train_df.columns if col not in [label_col, race_id_col]]
    X_train = train_df[features]
    y_train = train_df[label_col]
    # Convert positions to gains (lower position = higher gain)
    max_pos = y_train.max()
    y_train = max_pos - y_train + 1
    
    # Get group sizes for training data
    train_groups = train_df.groupby(race_id_col).size().values

    # Create training dataset
    train_dataset = lgb.Dataset(
        X_train, 
        label=y_train,
        group=train_groups,
        categorical_feature=cat_features if cat_features else 'auto'
    )
    
    # Prepare validation data if provided
    if val_df is not None:
        X_val = val_df[features]
        y_val = val_df[label_col]
        y_val = max_pos - y_val + 1
        val_groups = val_df.groupby(race_id_col).size().values
        
        val_dataset = lgb.Dataset(
            X_val,
            label=y_val, 
            group=val_groups,
            reference=train_dataset,
            categorical_feature=cat_features if cat_features else 'auto'
        )
        valid_sets = [val_dataset]
    else:
        valid_sets = None
        X_val = None

    # Train model
    model = lgb.train(
        params,
        train_dataset,
        num_boost_round=n_estimators,
        valid_sets=None,
        # verbose_eval=100 if val_df is not None else -1
    )
    
    # Get validation predictions if validation data was provided
    val_preds = model.predict(X_val) if val_df is not None else None
    
    return model, val_preds


# load_data

In [5]:
horse_race_df = pd.read_parquet('../feature_data/horse_race_df.parquet')
horse_race_df = horse_race_df.sort_values(by=['clean_race_date','race_id'], ascending=True).reset_index(drop=True)
basic_cat_ordinal_df = pd.read_parquet('../feature_data/basic_cat_ordinal_features.parquet')

In [74]:
horse_race_df = horse_race_df.sort_values(by=['clean_race_date','race_id'], ascending=True).reset_index(drop=True)
horse_race_df['horse_jockey'] = horse_race_df['horse_id'] + '_' + horse_race_df['jockey']

In [7]:
horse_feats = pd.DataFrame()
horse_feats['awght_dwght_ratio'] = horse_race_df['clean_actual_weight'] / horse_race_df['clean_declared_horse_weight']
horse_feats['awght_dwght_delta'] = horse_race_df['clean_actual_weight'] - horse_race_df['clean_declared_horse_weight']

In [8]:
rid = horse_race_df['race_id'].sample(1).values[0]
use_cols = ['race_id','horse_id', 'clean_race_date', 'clean_position', 'draw', 'horse_number']
horse_race_df[horse_race_df['race_id']==rid][use_cols]

Unnamed: 0,race_id,horse_id,clean_race_date,clean_position,draw,horse_number
11933,2015-178,T348,20151114,1,9,7.0
11934,2015-178,S358,20151114,2,1,10.0
11935,2015-178,S419,20151114,3,12,2.0
11936,2015-178,T432,20151114,4,7,6.0
11937,2015-178,S247,20151114,5,8,3.0
11938,2015-178,T167,20151114,6,6,5.0
11939,2015-178,T196,20151114,7,14,1.0
11940,2015-178,S104,20151114,8,4,13.0
11941,2015-178,P351,20151114,9,13,14.0
11942,2015-178,T397,20151114,10,2,8.0


In [76]:
basic_num_features = ['horse_number', 'clean_actual_weight', 'clean_declared_horse_weight',
                  'clean_win_odds', 'race_distance']


basic_cat_features = basic_cat_ordinal_df.columns.tolist() + ['horse_jockey']
basic_cat_features 

['jockey',
 'trainer',
 'race_course',
 'race_course_track',
 'race_class',
 'track_condition',
 'horse_jockey']

In [77]:
# label_encoding
lbl_enc = project_class.DataFrameLabelTransformer()
race_enc_df = lbl_enc.fit_transform(horse_race_df[['race_id']].copy())

# binary encoding
lbl_enc = project_class.DataFrameLabelTransformer()
lbl_df = lbl_enc.fit_transform(horse_race_df[basic_cat_features].copy())

binary_enc = project_class.DataFrameBinaryEncoder(cat_cols=basic_cat_features,  verbose=False)
bin_df = binary_enc.fit_transform(lbl_df)

freq_df = project_utils.freq_encoding(lbl_df)

print(lbl_df.shape, bin_df.shape,freq_df.shape)

(29520, 7) (29520, 41) (29520, 7)


In [78]:
print(bin_df.columns.tolist())

['jockey_bin_0', 'jockey_bin_1', 'jockey_bin_2', 'jockey_bin_3', 'jockey_bin_4', 'jockey_bin_5', 'jockey_bin_6', 'trainer_bin_0', 'trainer_bin_1', 'trainer_bin_2', 'trainer_bin_3', 'trainer_bin_4', 'trainer_bin_5', 'trainer_bin_6', 'race_course_bin_0', 'race_course_track_bin_0', 'race_course_track_bin_1', 'race_course_track_bin_2', 'race_course_track_bin_3', 'race_class_bin_0', 'race_class_bin_1', 'race_class_bin_2', 'race_class_bin_3', 'track_condition_bin_0', 'track_condition_bin_1', 'track_condition_bin_2', 'track_condition_bin_3', 'horse_jockey_bin_0', 'horse_jockey_bin_1', 'horse_jockey_bin_2', 'horse_jockey_bin_3', 'horse_jockey_bin_4', 'horse_jockey_bin_5', 'horse_jockey_bin_6', 'horse_jockey_bin_7', 'horse_jockey_bin_8', 'horse_jockey_bin_9', 'horse_jockey_bin_10', 'horse_jockey_bin_11', 'horse_jockey_bin_12', 'horse_jockey_bin_13']


In [79]:
freq_df.columns

Index(['jockey_freq', 'trainer_freq', 'race_course_freq',
       'race_course_track_freq', 'race_class_freq', 'track_condition_freq',
       'horse_jockey_freq'],
      dtype='object')

# horse weight delta

In [13]:
def calculate_weight_delta(df, horse_id_col, weight_col):
    """
    Calculate weight change between consecutive races for each horse.
    
    Args:
        df: DataFrame containing horse race data
        horse_id_col: Column name containing horse IDs
        weight_col: Column name containing horse weights
        
    Returns:
        Numpy array containing weight deltas between consecutive races.
        For first race of each horse, delta will be NaN.
    """
    # Create array to store deltas, initialize with NaN
    weight_deltas = np.full(len(df), np.nan)
    
    # Get unique horses
    horses = df[horse_id_col].unique()
    
    # Calculate weight delta for each horse
    for horse in horses:
        # Get all races for this horse in chronological order
        horse_mask = df[horse_id_col] == horse
        horse_data = df[horse_mask].copy()
        
        if len(horse_data) > 1:  # Only calculate if horse has multiple races
            # Get weight values
            weights = horse_data[weight_col].values
            
            # Calculate deltas between consecutive races
            deltas = weights[1:] - weights[:-1]
            
            # Store deltas in result array, skipping first race
            horse_indices = horse_data.index[1:]  # Indices for all races except first
            weight_deltas[horse_indices] = deltas
            
    return weight_deltas

In [14]:
horse_dweight_delta = calculate_weight_delta(horse_race_df, 'horse_id','clean_declared_horse_weight')
horse_aweight_delta = calculate_weight_delta(horse_race_df, 'horse_id','clean_actual_weight')

horse_weigh_feats_df = pd.DataFrame(horse_race_df['horse_id'])
horse_weigh_feats_df['dweight_delta'] = horse_dweight_delta
horse_weigh_feats_df['aweight_delta'] = horse_aweight_delta

# horse race inteval 

In [60]:
def calculate_feature_delta(df, id_col, feat_col):

    # Create array to store deltas, initialize with NaN
    feature_deltas = np.full(len(df), np.nan)
    
    # Get unique horses
    ids = df[id_col].unique()
    # print(len(ids))
    # Calculate weight delta for each horse
    for tid in ids :
        # Get all races for this horse in chronological order
        tid_mask = df[id_col] == tid
        tid_data = df[tid_mask].copy()        
        if len(tid_data) > 1:  # Only calculate if horse has multiple races
            # Get weight values
            values = tid_data[feat_col].values            
            # Calculate deltas between consecutive races
            deltas = values[1:] - values[:-1]            
            # Store deltas in result array, skipping first race
            tid_indices = tid_data.index[1:]  # Indices for all races except first
            feature_deltas[tid_indices] = deltas            
    return feature_deltas

In [42]:
horse_date_delta = calculate_feature_delta(horse_race_df, 'horse_id', 'clean_race_date')

2155


In [59]:
jockey_date_delta = calculate_feature_delta(horse_race_df, 'jockey', 'clean_race_date')

105


In [63]:

horse_jockey_date_delta = calculate_feature_delta(horse_race_df, 'horse_jockey', 'clean_race_date')

In [65]:
key_cols = ['race_id', 'horse_id', 'horse_jockey', 'jockey','clean_race_date', 'clean_position']
feature_df = horse_race_df[key_cols].copy()

In [66]:
feature_df['horse_date_delta'] = horse_date_delta
feature_df['jockey_date_delta'] = jockey_date_delta
feature_df['horse_jockey_date_delta'] = horse_jockey_date_delta

In [71]:
feature_df[feature_df['horse_jockey']=='P120_H W Lai']

Unnamed: 0,race_id,horse_id,horse_jockey,jockey,clean_race_date,clean_position,horse_date_delta,jockey_date_delta,horse_jockey_date_delta
4322,2014-349,P120,P120_H W Lai,H W Lai,20150128,6,,0.0,
4630,2014-374,P120,P120_H W Lai,H W Lai,20150207,8,79.0,0.0,79.0
5498,2014-443,P120,P120_H W Lai,H W Lai,20150308,8,101.0,0.0,101.0
5843,2014-470,P120,P120_H W Lai,H W Lai,20150318,1,10.0,0.0,10.0
6332,2014-508,P120,P120_H W Lai,H W Lai,20150401,2,83.0,0.0,83.0
6914,2014-554,P120,P120_H W Lai,H W Lai,20150422,2,21.0,0.0,21.0
7978,2014-639,P120,P120_H W Lai,H W Lai,20150524,3,18.0,0.0,102.0
8577,2014-686,P120,P120_H W Lai,H W Lai,20150610,1,86.0,0.0,86.0
9169,2014-733,P120,P120_H W Lai,H W Lai,20150627,7,17.0,0.0,17.0
9717,2014-776,P120,P120_H W Lai,H W Lai,20150712,10,85.0,0.0,85.0


In [73]:
feature_df['horse_jockey'].nunique()

13959

# run model

In [157]:
# rank model benchmark features
df0 = pd.concat([horse_race_df[basic_num_features]], axis=1)
df0['race_id'] = race_enc_df['race_id']

horse_weight_delta_feats = ['dweight_delta', 'aweight_delta']
df1 = pd.concat([df0, lbl_df,  bin_df, freq_df, horse_feats, 
                 horse_weigh_feats_df[horse_weight_delta_feats]], axis=1)


key_cols = ['race_id', 'horse_id', 'horse_jockey', 'jockey','clean_race_date', 'clean_position']
feature_cols = [c for c in feature_df.columns if c not in key_cols]
df2 = pd.concat([df1, feature_df[feature_cols]], axis=1)

remove_cols = [c for c in df2.columns if 'horse_jockey_bin' in c]
remove_cols += ['horse_number']
df2 = df2.drop(remove_cols, axis=1)
print(df2.shape)

(29520, 53)


In [158]:
run_df = df2.copy()
# train_df = run_df.loc[train_idx].reset_index(drop=True)
# # train_df = train_df.loc[int(len(train_df)/2), len(train_df)]
# val_df = run_df.loc[val_idx]


params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    # 'n_estimators':300,
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Column sampling
    'bagging_fraction': 0.8,  # Row sampling 
    'bagging_freq': 5,
    'verbose': -1,
    'max_position': 20,  # Maximum number of positions to consider
    'label_gain': list(range(20))
}

target1 = 'clean_position'
# target2 = 'is_top3'
basic_cat_features = basic_cat_ordinal_df.columns.tolist() + ['horse_jockey']
use_cats = [c for c in run_df.columns if c in basic_cat_features]
print(use_cats)

df_rank = run_df.copy()
df_rank[target1] = horse_race_df[target1]
# df_rank = df_rank[df_rank['clean_position']!=99].reset_index(drop=True)

feature_names = [c for c in run_df.columns if c not in [target1]]
print(feature_names)

['jockey', 'trainer', 'race_course', 'race_course_track', 'race_class', 'track_condition', 'horse_jockey']
['clean_actual_weight', 'clean_declared_horse_weight', 'clean_win_odds', 'race_distance', 'race_id', 'jockey', 'trainer', 'race_course', 'race_course_track', 'race_class', 'track_condition', 'horse_jockey', 'jockey_bin_0', 'jockey_bin_1', 'jockey_bin_2', 'jockey_bin_3', 'jockey_bin_4', 'jockey_bin_5', 'jockey_bin_6', 'trainer_bin_0', 'trainer_bin_1', 'trainer_bin_2', 'trainer_bin_3', 'trainer_bin_4', 'trainer_bin_5', 'trainer_bin_6', 'race_course_bin_0', 'race_course_track_bin_0', 'race_course_track_bin_1', 'race_course_track_bin_2', 'race_course_track_bin_3', 'race_class_bin_0', 'race_class_bin_1', 'race_class_bin_2', 'race_class_bin_3', 'track_condition_bin_0', 'track_condition_bin_1', 'track_condition_bin_2', 'track_condition_bin_3', 'jockey_freq', 'trainer_freq', 'race_course_freq', 'race_course_track_freq', 'race_class_freq', 'track_condition_freq', 'horse_jockey_freq', 'awgh

In [159]:

train_years = ['2014','2015','2016']
val_years = ['2017']
train_idx = horse_race_df[horse_race_df['year'].isin(train_years)].index
val_idx = horse_race_df[horse_race_df['year'].isin(val_years)].index
print(len(train_idx), len(val_idx))

train_df = df_rank.loc[train_idx]
train_df = train_df[train_df['clean_position']!=99].reset_index(drop=True)
# train_df = train_df.loc[int(len(train_df)/2), len(train_df)-1]
# start_idx = int((len(train_df)/10)*2)
# train_df = train_df.loc[start_idx:]
val_df = df_rank.loc[val_idx]

group_col = 'race_id'
print(train_df.shape, val_df.shape)

print(f'building model for target - {target1}')
model_rank, val_rank_preds = train_lightgbm_ranker(train_df, val_df, group_col, target1,
                                        cat_features=use_cats, params=params, 
                                        n_estimators=300)

23232 6288
(23231, 54) (6288, 54)
building model for target - clean_position


## model evaluation

In [160]:
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col  = f'lgbm_v1_preds_ranker'
val_horse_race_df[pred_col] = val_rank_preds

eval_dict = {}
ground_truth = {}

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values


eval_result, lgbmv1_rank_results_df = evaluate_prediction_sets(eval_dict)
lgbmv1_rank_results_df


Unnamed: 0,lgbm_v1_preds_ranker
Winner Match,0.29354
Top 3 Set Match,0.08023
Top 3 Exact Match,0.01761


In [161]:
baseline_mean_results = pd.read_parquet('../evaluation_results/valdf_random_winodd_baseline_mean_results.parquet')
compare_results = baseline_mean_results.copy()
compare_results[pred_col_rank] = lgbmv1_rank_results_df
compare_results

Unnamed: 0,random_probs,winning_odd_preds,lgbm_v1_preds_ranker
Winner Match,0.07632,0.29354,0.29354
Top 3 Set Match,0.00783,0.05871,0.08023
Top 3 Exact Match,0.00196,0.00978,0.01761


# avg 

In [162]:
pairwise_pred = pd.read_parquet('../feature_data/model_results/val_results_lgbm_pairwise.parquet')

In [172]:
pairwise_pred_vals = pairwise_pred['prediction'].rank(pct=True).values
rank_pred_vals = project_utils.get_rank(val_rank_preds)

avg_df = val_horse_race_df[['race_id']].copy()
avg_df['pw_preds'] = pairwise_pred_vals
avg_df['rank_preds'] = rank_pred_vals
avg_df['preds'] = project_utils.get_rank(((pairwise_pred_vals + rank_pred_vals)/2))

In [173]:
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col  = f'pw_rank_avg'
val_horse_race_df[pred_col] = avg_df['preds']

eval_dict = {}
ground_truth = {}

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values


eval_result, pwrank_avg_results_df = evaluate_prediction_sets(eval_dict)
pwrank_avg_results_df

Unnamed: 0,pw_rank_avg
Winner Match,0.30528
Top 3 Set Match,0.07436
Top 3 Exact Match,0.01957


In [174]:
baseline_mean_results = pd.read_parquet('../evaluation_results/valdf_random_winodd_baseline_mean_results.parquet')
compare_results = baseline_mean_results.copy()
compare_results[pred_col_rank] = pwrank_avg_results_df
compare_results

Unnamed: 0,random_probs,winning_odd_preds,lgbm_v1_preds_ranker
Winner Match,0.07632,0.29354,0.30528
Top 3 Set Match,0.00783,0.05871,0.07436
Top 3 Exact Match,0.00196,0.00978,0.01957


# time-progress prediction

In [175]:
print(train_df.shape, val_df.shape)

(23231, 54) (6288, 54)


In [192]:
def make_progressive_predictions(train_df, val_df, date_col='clean_race_date', 
                                       race_id_col='race_id', target_col='clean_position',
                                        cat_features=None,
                                       params = None,  n_estimators=300,
                                       min_train_days=None, max_train_days=None,
                                       save_models=False, model_dir=None,
                                       verbose=True):
    """
    Enhanced version with progress bar and additional features
    """
    val_df = val_df.sort_values(date_col)
    unique_dates = val_df[date_col].unique()
    predictions_list = []
    current_train_df = train_df.copy()
    
    if verbose:
        print(f"Initial training set size: {len(current_train_df):,} rows")
        print(f"Processing {len(unique_dates)} dates...")
        
        if min_train_days or max_train_days:
            print(f"Training window: {min_train_days or 'None'} to {max_train_days or 'None'} days")
    
    # Process each date with tqdm progress bar
    for pred_date in tqdm(unique_dates, desc="Processing dates"):
        # Apply training window if specified
        if min_train_days or max_train_days:
            current_train_df['days_diff'] = (
                pd.to_datetime(pred_date) - pd.to_datetime(current_train_df[date_col])
            ).dt.days
            
            train_mask = True
            if min_train_days:
                train_mask &= current_train_df['days_diff'] >= min_train_days
            if max_train_days:
                train_mask &= current_train_df['days_diff'] <= max_train_days
                
            training_data = current_train_df[train_mask].drop('days_diff', axis=1)
        else:
            training_data = current_train_df
        
        # Get validation data for current date
        current_val_mask = val_df[date_col] == pred_date
        current_val_df = val_df[current_val_mask]
        
        # Train model and make predictions
        model, current_pred_val = train_lightgbm_ranker(training_data, current_val_df, race_id_col, 
                           target_col, cat_features, params, n_estimators)
        
        # Save model if requested
        # if save_models and model_dir:
        #     os.makedirs(model_dir, exist_ok=True)
        #     model_path = os.path.join(model_dir, f"model_{pred_date}.txt")
        #     model.save_model(model_path)
        
        # Store predictions with metadata
        current_preds = pd.DataFrame({
            'original_index': current_val_df.index,
            'predictions': current_pred_val,
            'pred_date': pred_date,
            'training_size': len(training_data)
        })
        predictions_list.append(current_preds)
        
        # Update training data
        current_train_df = pd.concat([
            current_train_df,
            current_val_df
        ]).reset_index(drop=True)
    
    # Combine and reorder predictions
    all_predictions = pd.concat(predictions_list)
    all_predictions = all_predictions.sort_values('original_index')
    
    if verbose:
        print("\nPrediction Summary:")
        print(f"Total predictions: {len(all_predictions):,}")
        print(f"Dates processed: {len(unique_dates):,}")
        print(f"Final training set size: {len(current_train_df):,}")
    
    return all_predictions['predictions'].values, all_predictions

In [194]:

train_years = ['2014','2015','2016']
val_years = ['2017']
train_idx = horse_race_df[horse_race_df['year'].isin(train_years)].index
val_idx = horse_race_df[horse_race_df['year'].isin(val_years)].index
print(len(train_idx), len(val_idx))

df_rank = run_df.copy()
df_rank['clean_race_date'] = horse_race_df['clean_race_date']
df_rank['clean_position'] = horse_race_df['clean_position']
train_df = df_rank.loc[train_idx]
train_df = train_df[train_df['clean_position']!=99].reset_index(drop=True)
# train_df = train_df.loc[int(len(train_df)/2), len(train_df)-1]
# start_idx = int((len(train_df)/10)*2)
# train_df = train_df.loc[start_idx:]
val_df = df_rank.loc[val_idx]

group_col = 'race_id'
print(train_df.shape, val_df.shape)

print(f'building model for target - {target1}')
all_pred_vals, all_predictions = make_progressive_predictions(train_df, val_df, 
                                       date_col='clean_race_date', 
                                       race_id_col='race_id', target_col='clean_position',
                                       cat_features=None,
                                       params = None,  n_estimators=300)

23232 6288
(23231, 55) (6288, 55)
building model for target - clean_position
Initial training set size: 23,231 rows
Processing 56 dates...


Processing dates:   0%|          | 0/56 [00:00<?, ?it/s]


Prediction Summary:
Total predictions: 6,288
Dates processed: 56
Final training set size: 29,519


In [195]:
all_pred_vals.shape

(6288,)

In [196]:
val_horse_race_df = horse_race_df.loc[val_idx].reset_index(drop=True)
pred_col  = f'lgbrank_timeprog_preds'
val_horse_race_df[pred_col] = all_pred_vals

eval_dict = {}
ground_truth = {}

for race in val_horse_race_df['race_id'].unique():
    race_df = val_horse_race_df[val_horse_race_df['race_id']==race]
    n_horse = race_df.shape[0]    
    eval_dict[race] = {}
    eval_dict[race]['ground_truth'] = race_df['clean_position'].values
    eval_dict[race][pred_col] = race_df[pred_col].values


eval_result, lgbrank_timeprog_results_df = evaluate_prediction_sets(eval_dict)
lgbrank_timeprog_results_df


Unnamed: 0,lgbrank_timeprog_preds
Winner Match,0.29159
Top 3 Set Match,0.08219
Top 3 Exact Match,0.00391


In [25]:
reload(project_utils)
impt_df = project_utils.lgbm_feature_importance(model_rank)
print(impt_df)

                        feature  feature_importance
0                clean_win_odds                1241
1                 dweight_delta                 875
2             awght_dwght_ratio                 810
3   clean_declared_horse_weight                 757
4                 aweight_delta                 674
5             awght_dwght_delta                 653
6                       trainer                 540
7                        jockey                 521
8           clean_actual_weight                 388
9                  horse_number                 349
10                race_distance                 291
11                  jockey_freq                 253
12       race_course_track_freq                 226
13            race_course_track                 200
14                 trainer_freq                 192
15              race_class_freq                 164
16         track_condition_freq                 108
17             race_class_bin_2                  62
18          