In [118]:
import requests
import os
import sys
import pandas as pd
import numpy as np


from typing import Optional
import glob
from scipy import stats
# Add the parent directory of this notebook to sys.path so that the
# `project_tools` package imported below can be found.
# `__file__` is not defined inside a notebook kernel; the quoted '__file__' used
# here before was just a relative path resolved against the current working
# directory, so the working directory is now used explicitly.
# Assumes the kernel was started in the notebook's own folder -- TODO confirm
# for non-standard launch setups.
notebook_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.dirname(notebook_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from project_tools import project_utils, project_class

import datetime
import json
from tqdm.notebook import tqdm
import gc
# import ds_utils
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from scipy.stats import kendalltau, spearmanr


%matplotlib inline

from importlib import reload
# Notebook-wide pandas display settings.
pd.set_option('display.max_colwidth', None)  # never truncate long cell contents
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Render floats with five decimals. This cell previously also installed a
# '%.0f' formatter, but it was overwritten straight away by this call, so only
# the effective setting is kept.
pd.set_option('display.float_format', '{:.5f}'.format)

# load data

In [16]:
# Read the pre-computed feature tables for the validation and training splits.
val_df = pd.read_parquet('../feature_data/val_horse_race_df.parquet')
train_df = pd.read_parquet('../feature_data/train_horse_race_df.parquet')

# evaluation function

In [165]:
def evaluate_horse_race_positions(y_true, y_pred_proba, dnf_value=99):
    """
    Score one race's predicted ordering against the true finishing order.

    Horses are identified by their index in the inputs. The true order is
    obtained by sorting ``y_true`` ascending (smallest position first); the
    predicted order by sorting ``y_pred_proba`` descending (highest score first).

    Args:
        y_true: 1D array-like of true finishing positions. NaN entries and
            entries equal to ``dnf_value`` are treated as "did not finish" and
            ranked behind every finisher. The caller's array is not modified.
        y_pred_proba: 1D array-like of prediction scores with the same length
            as ``y_true``; a higher score means a better predicted finish.
            NaN scores are ranked last, so a horse without a score can never
            become the predicted winner.
        dnf_value: Value used to indicate Did Not Finish.

    Returns:
        Dictionary of float flags (1.0 = match, 0.0 = no match):
            'Winner Match': predicted first horse is the true first horse.
            'Top 3 Set Match': predicted top 3 equals true top 3 in any order.
            'Top 3 Exact Match': predicted top 3 equals true top 3 in order.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    # np.array (not asarray) always copies, so the DNF rewrite below never
    # touches the caller's data. ravel() also accepts (n, 1) column vectors.
    true_pos = np.array(y_true, dtype=float).ravel()
    pred_score = np.array(y_pred_proba, dtype=float).ravel()

    if true_pos.size == 0 or true_pos.size != pred_score.size:
        raise ValueError(
            "y_true and y_pred_proba must be non-empty and of equal length "
            f"(got {true_pos.size} and {pred_score.size})"
        )

    # Push NaN / DNF entries behind the worst real finisher.
    dnf_mask = np.isnan(true_pos) | (true_pos == dnf_value)
    finishers = true_pos[~dnf_mask]
    if finishers.size > 0:
        true_pos[dnf_mask] = finishers.max() + 1

    # Stable sort keeps ties (e.g. several DNF horses) in input order.
    true_order = np.argsort(true_pos, kind='stable')

    # argsort places NaN at the end of an ascending sort, so reversing it would
    # put NaN scores FIRST. Map them to -inf so they end up last instead.
    pred_score = np.where(np.isnan(pred_score), -np.inf, pred_score)
    # Reversing an ascending stable sort keeps the previous tie behaviour:
    # among equal scores the later entry ranks higher.
    pred_order = np.argsort(pred_score, kind='stable')[::-1]

    winner_match = true_order[0] == pred_order[0]

    # Top 3 Set Match - considers [1,3,2] and [2,3,1] as matching
    top3_set_match = set(true_order[:3]) == set(pred_order[:3])

    # Top 3 Exact Match - only considers exact matches like [1,3,2] and [1,3,2]
    top3_exact_match = np.array_equal(true_order[:3], pred_order[:3])

    return {
        'Winner Match': float(winner_match),
        'Top 3 Set Match': float(top3_set_match),
        'Top 3 Exact Match': float(top3_exact_match),
    }


In [167]:
def evaluate_prediction_sets(eval_dict, dnf_value=99):
    """
    Evaluate every prediction set against ground truth for each race and
    average the resulting metrics over all races.

    Args:
        eval_dict: Dictionary keyed by race id. Each value is a dictionary with
            a 'ground_truth' entry plus one entry per prediction set. The
            prediction-set names are taken from the first race, so every race
            must provide those same keys.
        dnf_value: Value marking "did not finish" in the ground truth; it is
            forwarded to evaluate_horse_race_positions.

    Returns:
        tuple: (eval_result, mean_results_df)
            - eval_result: {race_id: {pred_type: {metric: value}}} with the
              per-race metrics.
            - mean_results_df: DataFrame with one column per prediction set and
              one row per metric, holding the mean over all races.
            For an empty ``eval_dict`` this is ``({}, empty DataFrame)``.
    """
    # Nothing to evaluate: return empty results instead of failing on next().
    if not eval_dict:
        return {}, pd.DataFrame()

    # Prediction set names come from the first race; 'ground_truth' is the target.
    first_race_id = next(iter(eval_dict))
    pred_types = [key for key in eval_dict[first_race_id] if key != 'ground_truth']

    # Per-race metrics, keyed by race id first and prediction set second.
    eval_result = {}
    for race_id, race_data in eval_dict.items():
        ground_truth = race_data['ground_truth']
        eval_result[race_id] = {
            pred_type: evaluate_horse_race_positions(
                ground_truth,
                race_data[pred_type],
                dnf_value=dnf_value,
            )
            for pred_type in pred_types
        }

    # Mean of every metric over all races, per prediction set. The metric names
    # are read from the first race's result.
    num_races = len(eval_result)
    first_race_result = eval_result[first_race_id]
    mean_results = {}
    for pred_type in pred_types:
        mean_results[pred_type] = {
            metric: sum(
                race_eval[pred_type][metric] for race_eval in eval_result.values()
            ) / num_races
            for metric in first_race_result[pred_type]
        }

    # Convert mean results to DataFrame for easy comparison
    mean_results_df = pd.DataFrame(mean_results)

    return eval_result, mean_results_df

# # Run evaluation
# eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

# # Display mean results comparison
# print("\nMean Evaluation Metrics Comparison:")
# print(mean_results_df)


# baseline model

In [133]:
# Columns shown when inspecting a single race.
view_cols = [
    'race_id',
    'clean_position',
    'horse_number',
    'horse_id',
    'draw',
    'clean_win_odds',
    'clean_finish_time',
    'is_winner',
    'is_top3',
]

In [134]:
# Order rows by race and, within each race, by draw; then renumber the index from 0.
val_df = val_df.sort_values(by=['race_id', 'draw']).reset_index(drop=True)

In [135]:
# Draw one race id at random and show the first rows of that race.
race_id = val_df['race_id'].sample(n=1).values[0]
val_df.loc[val_df['race_id'] == race_id, view_cols].head(10)

Unnamed: 0,race_id,clean_position,horse_number,horse_id,draw,clean_win_odds,clean_finish_time,is_winner,is_top3
5743,2016-765,6,6.0,P435,1,14.0,70.42,0,0
5744,2016-765,4,5.0,T210,2,6.39844,70.13,0,0
5745,2016-765,7,8.0,S216,3,9.79688,70.43,0,0
5746,2016-765,12,7.0,A195,4,99.0,71.03,0,0
5747,2016-765,1,3.0,V017,5,4.10156,69.89,1,1
5748,2016-765,9,1.0,T248,6,41.0,70.53,0,0
5749,2016-765,5,10.0,A007,7,5.30078,70.28,0,0
5750,2016-765,8,2.0,A163,8,99.0,70.47,0,0
5751,2016-765,11,4.0,P212,9,14.0,70.74,0,0
5752,2016-765,3,12.0,T427,10,17.0,70.1,0,1


In [154]:
# Build, for every validation race, the ground truth plus two baseline score
# vectors in the layout expected by evaluate_prediction_sets.
RANDOM_SEED = 42  # fixed so the random baseline gives the same metrics on every run
rng = np.random.default_rng(RANDOM_SEED)

eval_dict = {}
# Empty placeholders kept from the original cell; nothing in this cell fills them.
ground_truth = {}
random_probs = []
winning_odd_preds = []

# groupby(sort=False) visits races in order of first appearance (the same order
# as .unique()) without re-filtering the whole frame for every race.
for race, race_df in val_df.groupby('race_id', sort=False):
    n_horse = race_df.shape[0]
    eval_dict[race] = {
        'ground_truth': race_df['clean_position'].values,
        # Uniform random scores: a "no information" baseline.
        'random_probs': rng.random(n_horse),
        # Odds are negated so that the lowest odds receive the highest score.
        'winning_odd_preds': -1 * race_df['clean_win_odds'].values,
    }

In [168]:
# Run evaluation: score every prediction set in eval_dict on every race.
# eval_result holds the per-race metrics; mean_results_df holds their mean per
# prediction set.
eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

In [169]:
# Spot-check the per-race metrics of one randomly drawn validation race.
race_id = val_df['race_id'].sample(n=1).values[0]
print(race_id)
pd.DataFrame(eval_result[race_id])

2016-605


Unnamed: 0,random_probs,winning_odd_preds
Winner Match,0.0,0.0
Top 3 Set Match,0.0,0.0
Top 3 Exact Match,0.0,0.0


In [170]:
# Mean of each metric over all evaluated races, one column per prediction set.
mean_results_df

Unnamed: 0,random_probs,winning_odd_preds
Winner Match,0.07632,0.29354
Top 3 Set Match,0.00783,0.05871
Top 3 Exact Match,0.00196,0.00978


In [171]:
# Persist the per-race evaluation results to disk.
# NOTE(review): pickle_data is a project helper not visible here; judging by this
# cell's output it reports the file it creates and returns the path -- confirm.
save_file = '../evaluation_results/valdf_random_winodd_baseline_race_eval_result.pkl'
project_utils.pickle_data(save_file, eval_result)

creating file ../evaluation_results/valdf_random_winodd_baseline_race_eval_result.pkl


'../evaluation_results/valdf_random_winodd_baseline_race_eval_result.pkl'

In [172]:
# Save the mean-metric comparison table as a parquet file.
mean_results_df.to_parquet('../evaluation_results/valdf_random_winodd_baseline_mean_results.parquet')

# ad-hoc data analysis

In [4]:
# Per-column summary of the validation frame.
# NOTE(review): analyze_dataframe is a project helper not visible here; the
# rendered output suggests it reports missing counts, value types and basic
# statistics per feature -- confirm against project_utils.
# NOTE(review): this cell's execution count (4) predates the cell that loads
# val_df (16); re-run the notebook top to bottom to confirm the output is current.
project_utils.analyze_dataframe(val_df)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Unnamed: 0,feature,missing_count,missing_ratio,value_types,mean,median,min,25%,75%,max,std
0,finishing_position,0,0,[str],,,,,,,
1,horse_number,0,0,[float],7.0,7.0,1.0,4.0,10.0,14.0,4.0
2,horse_name,0,0,[str],,,,,,,
3,horse_id,0,0,[str],,,,,,,
4,jockey,0,0,[str],,,,,,,
5,trainer,0,0,[str],,,,,,,
6,actual_weight,0,0,[str],,,,,,,
7,declared_horse_weight,0,0,[str],,,,,,,
8,draw,0,0,[str],,,,,,,
9,length_behind_winner,0,0,[str],,,,,,,
