In [2]:
import requests
import os
import sys
import pandas as pd
import numpy as np

# import key sklearn ml algos
# import key sklearn metrics
# import 

from typing import Optional
import glob
# Add the parent directory of this notebook to sys.path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)
from project_tools import project_utils, project_class

import datetime
import json
from tqdm import tqdm
from tqdm.notebook import tqdm
import gc
# import ds_utils
import random
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
# Record which rows were originally missing as a 0/1 flag *before* imputing,
# so that information is not lost once the gaps are filled.
df['feature_missing'] = df['feature'].isnull().astype(int)
# Assign the filled column back rather than calling fillna(inplace=True) on
# the `df['feature']` selection: the in-place form operates on an intermediate
# object, emits a chained-assignment warning in pandas 2.x and leaves `df`
# untouched once Copy-on-Write is enabled.
df['feature'] = df['feature'].fillna(df['feature'].mean())

In [None]:
def convert_timestamp_to_seconds(timestamp: str) -> float:
    """
    Convert a text timestamp in "M.SS.ss" format to total seconds,
    where M=minutes, SS=seconds, ss=fractional seconds.

    The fractional part is scaled by its own number of digits, so "9" means
    0.9 s, "91" means 0.91 s and "915" means 0.915 s (a fixed division by 100
    would only be correct for exactly two digits).

    Examples:
        "1.41.91" -> 101.91 (1 min 41.91 sec)
        "1.40.12" -> 100.12 (1 min 40.12 sec)
        "0.58.41" -> 58.41 (58.41 sec)
        "1.41.9"  -> 101.9 (1 min 41.9 sec)

    Args:
        timestamp: String timestamp in M.SS.ss format. Surrounding whitespace
            is ignored.

    Returns:
        Float value representing total seconds

    Raises:
        ValueError: If the string does not have exactly three dot-separated
            parts, if the fractional part is not made of digits only, or if
            the minutes/seconds parts are not numeric.
    """
    parts = timestamp.strip().split('.')

    if len(parts) != 3:
        raise ValueError(f"Invalid timestamp format: {timestamp}. Expected format: M.SS.ss")

    minutes_text, seconds_text, fraction_text = parts
    fraction_text = fraction_text.strip()

    # The digit count below is only meaningful for a plain run of digits.
    if not fraction_text.isdigit():
        raise ValueError(f"Invalid timestamp format: {timestamp}. Expected format: M.SS.ss")

    minutes = float(minutes_text)
    seconds = float(seconds_text)
    # Scale by the number of digits actually present instead of assuming two.
    decimal = int(fraction_text) / 10 ** len(fraction_text)

    return minutes * 60 + seconds + decimal


In [None]:
def convert_date_to_int(date_str: str) -> int:
    """
    Convert a date string in YYYY-MM-DD format to an integer that preserves
    chronological ordering.

    Month and day are combined numerically rather than by string
    concatenation, so non-zero-padded input such as '2015-1-5' still maps to
    20150105 instead of 201515 (which would sort before every padded date).

    Args:
        date_str: Date string in YYYY-MM-DD format. Surrounding whitespace is
            ignored.

    Returns:
        Integer in format YYYYMMDD

    Raises:
        ValueError: If the components are not integers.

    Example:
        '2015-11-18' -> 20151118
        '2016-03-31' -> 20160331
        '2015-1-5'   -> 20150105
    """
    parts = date_str.strip().split('-')

    if len(parts) == 3:
        year, month, day = (int(part) for part in parts)
        return year * 10000 + month * 100 + day

    # Anything that is not three hyphen-separated parts (e.g. an already
    # compact '20151118') keeps the previous behaviour: drop hyphens and parse.
    return int(date_str.replace('-', ''))

# Example usage: convert a few sample dates and print both representations
dates = ['2015-11-18', '2015-03-25', '2016-03-31', '2015-07-05', '2016-11-06']
date_ints = list(map(convert_date_to_int, dates))
print(f"Original dates: {dates}")
print(f"Integer dates: {date_ints}")


In [None]:
# running position calculation instructions 
# given a horse race dataframe, each row provides information about one horse in a specific race, with the following relevant columns:
# - horse_id - id of the horse
# - is_winer - whether the horse won the race
# - is_top3 - whether the horse finished in the top 3

# the dataframe is sorted chronologically, with records of earlier races on top
# write a function that calculates and returns the following for each horse at each race:
# - running average of position over the last 3 races
# - running average of position over the last 5 races
# - running average of position over the last 7 races

# for each of the results above, output a column

# specific instructions:
# - if, at any given race, a given horse does not have enough races to make up the running window of 3, 5 or 7, return NaN
# - treat any position equal to 99 as a missing value




function instruction to calculate horse prediction metrics
given a horse racing dataframe, each row provides information about one horse in a specific race, with the following relevant columns:
- horse_id - id of the horse
- is_winer - whether the horse won the race - 0 or 1
- is_top3 - whether the horse finished in the top 3 - 0 or 1
- race_id - id of the race
- draw number - the draw number of the horse in the race

now, given a list of races and, for each race, a list of predictions (one per horse) of that horse winning the race:
- the predictions are ordered by draw number
- the higher the value, the more likely the horse is to win

propose the best metric for evaluating the quality of the top-1 and top-3 predictions across a list of races



do the following for each race:
- convert the list of predictions into ranks (rank 1 = highest predicted value)
- check whether the rank-1 prediction (lowest rank number, i.e. highest predicted value) matches the "is_winner" flag of that horse in the race
- check whether the top-3 ranked predictions match the "is_top3" flags of those horses in the race
- some races might have more than three top-3 finishers, i.e. more than three rows with is_top3 = 1; in this case, check whether the top 3 predictions contain the label "1"

return two dictionaries:
- one for top 1 prediction:





For evaluating a machine learning model predicting horse race outcomes, particularly the winner and top 3 finishers, along with winning probabilities for each horse, you should consider several evaluation metrics. Here's a comprehensive approach to evaluate such predictions:

## Evaluation Metrics

1. Log Loss: This metric is ideal for evaluating probabilistic predictions and penalizes confident misclassifications heavily[2].

2. Brier Score: Similar to log loss, it measures the accuracy of probabilistic predictions.

3. Accuracy: While not ideal for imbalanced datasets, it can still provide a basic measure of correctness for winner predictions[3].

4. Top-K Accuracy: This is particularly useful for evaluating predictions of top 3 finishers[3].

5. Mean Reciprocal Rank (MRR): This metric is valuable when you're interested in the rank of the correct prediction, especially useful for top 3 predictions.

6. ROC-AUC: This can be used to evaluate the model's ability to distinguish between classes, which is useful for binary classification (e.g., whether a horse finishes in the top 3 or not)[4].

## Python Implementation

Here's a Python example demonstrating how to calculate these metrics:

```python
import numpy as np
from sklearn.metrics import log_loss, brier_score_loss, accuracy_score, roc_auc_score

def evaluate_horse_race_predictions(y_true, y_pred_proba, top_k=3):
    """
    Score per-race winner predictions with probabilistic and ranking metrics.

    Args:
        y_true: 1-D sequence of integers; entry i is the column index of the
            true winner of race i.
        y_pred_proba: 2-D array-like of shape (n_races, n_horses); row i holds
            the predicted win probability of each horse in race i, so every
            race must have the same number of horses.
        top_k: How many of the highest-probability horses count as a hit for
            the Top-K accuracy (default 3).

    Returns:
        Dict with the keys 'Log Loss', 'Brier Score', 'Accuracy',
        'Top-{top_k} Accuracy', 'Mean Reciprocal Rank' and 'ROC-AUC (Top 3)'.
        The last key keeps its fixed name for compatibility even when top_k
        is not 3. Its value is NaN when every race (or no race) is a top-K
        hit, because ROC-AUC is undefined with a single class.
    """
    # Accept plain lists as well as arrays (.ravel() and fancy indexing below
    # need ndarrays).
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba, dtype=float)
    n_races, n_horses = y_pred_proba.shape

    # One-hot encode the winner index of every race
    y_true_onehot = np.eye(n_horses)[y_true]

    # Log Loss
    log_loss_value = log_loss(y_true_onehot, y_pred_proba)

    # Brier Score over every (race, horse) cell treated as a binary outcome
    brier_score_value = brier_score_loss(y_true_onehot.ravel(), y_pred_proba.ravel())

    # Accuracy (for winner prediction)
    y_pred = np.argmax(y_pred_proba, axis=1)
    accuracy_value = accuracy_score(y_true, y_pred)

    # Column indices sorted by ascending probability, one row per race
    ascending_order = np.argsort(y_pred_proba, axis=1)

    # Top-K Accuracy: is the true winner among the k most likely horses?
    top_k_predictions = ascending_order[:, -top_k:]
    winner_in_top_k = np.array(
        [int(y_true[i] in top_k_predictions[i]) for i in range(n_races)]
    )
    top_k_accuracy = np.mean(winner_in_top_k)

    # Mean Reciprocal Rank of the true winner (rank 1 = most likely horse)
    descending_order = ascending_order[:, ::-1]
    mrr = np.mean([
        1 / (np.where(descending_order[i] == y_true[i])[0][0] + 1)
        for i in range(n_races)
    ])

    # ROC-AUC of "winner landed in the top-k picks" against the model's peak
    # confidence for the race. roc_auc_score raises when only one class is
    # present (e.g. top_k >= n_horses makes every race a hit), so report NaN
    # for that degenerate case instead of aborting the whole evaluation.
    y_pred_top3 = np.max(y_pred_proba, axis=1)
    if np.unique(winner_in_top_k).size < 2:
        roc_auc_value = float('nan')
    else:
        roc_auc_value = roc_auc_score(winner_in_top_k, y_pred_top3)

    return {
        'Log Loss': log_loss_value,
        'Brier Score': brier_score_value,
        'Accuracy': accuracy_value,
        f'Top-{top_k} Accuracy': top_k_accuracy,
        'Mean Reciprocal Rank': mrr,
        'ROC-AUC (Top 3)': roc_auc_value
    }

# Example usage: five races with three horses each
y_true = np.array([2, 0, 1, 2, 1])  # column index of the true winner per race
y_pred_proba = np.array(
    [
        [0.1, 0.3, 0.6],
        [0.4, 0.3, 0.3],
        [0.2, 0.5, 0.3],
        [0.1, 0.4, 0.5],
        [0.3, 0.4, 0.3],
    ]
)

# Compute every metric, then print one "name: value" line per metric
results = evaluate_horse_race_predictions(y_true, y_pred_proba)
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")
```

This script calculates various metrics for evaluating horse race predictions:

1. Log Loss: Evaluates the quality of probabilistic predictions.
2. Brier Score: Measures the accuracy of probabilistic predictions.
3. Accuracy: Calculates the proportion of correct winner predictions[3].
4. Top-K Accuracy: Measures how often the true winner is in the top K predictions[3].
5. Mean Reciprocal Rank: Evaluates the ranking of the correct prediction.
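6. ROC-AUC: As computed in the code above, relates each race's highest predicted probability to whether the true winner landed in the model's top-K picks; it does not separate individual top 3 finishers from the other horses[4].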

When interpreting these metrics:

- Lower values are better for Log Loss and Brier Score.
- Higher values are better for Accuracy, Top-K Accuracy, Mean Reciprocal Rank, and ROC-AUC.

This comprehensive evaluation approach will give you a well-rounded assessment of your model's performance in predicting horse race outcomes, considering both the winner predictions and the probabilistic estimates for each horse.

Citations:
[1] https://github.com/dominicplouffe/HorseRacingPrediction
[2] https://www.reddit.com/r/MachineLearning/comments/15ek21a/d_model_design_for_outputting_reliable_multiclass/
[3] https://www.appsilon.com/post/machine-learning-evaluation-metrics-classification
[4] https://www.kdnuggets.com/2020/04/performance-evaluation-metrics-classification.html
[5] https://towardsdatascience.com/how-to-choose-the-best-evaluation-metric-for-classification-problems-638e845da334?gi=0b4482a290fa

# prediction evaluation when the prediction is a list of probabilities for each horse

Based on your request, I'll modify the evaluation function to handle the scenario where the ground truth for each race is the finishing position of each horse. This approach allows for a more comprehensive evaluation of the model's performance in predicting the entire race outcome, not just the winner. Here's the modified function with explanations:

```python
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from scipy.stats import kendalltau

def evaluate_horse_race_positions(y_true, y_pred_proba):
    """
    Compare predicted win scores with the true finishing order of each race.

    Args:
        y_true: Array-like of shape (n_races, n_horses); entry [i, j] is the
            finishing position of horse j in race i (1 = winner).
        y_pred_proba: Array-like of the same shape; a higher value means the
            horse is predicted as more likely to win.

    Returns:
        Dict with the keys 'Mean Squared Error', 'Mean Absolute Error',
        "Spearman's Rank Correlation", 'NDCG', 'Exact Match Accuracy' and
        'Top-1 Accuracy' / 'Top-2 Accuracy' / 'Top-3 Accuracy'.
    """
    # Imported locally because the surrounding snippet only imports kendalltau.
    from scipy.stats import spearmanr

    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba, dtype=float)
    n_races, n_horses = y_pred_proba.shape

    # Convert scores to predicted ranks (1 = highest score). argsort returns
    # *indices*, not ranks; taking argsort of that ordering inverts the
    # permutation and yields the rank of every horse. A stable sort keeps
    # tied scores in a deterministic (column) order.
    descending_order = np.argsort(-y_pred_proba, axis=1, kind='stable')
    y_pred_ranks = np.argsort(descending_order, axis=1, kind='stable') + 1

    # Mean Squared Error between true positions and predicted ranks
    mse = mean_squared_error(y_true, y_pred_ranks)

    # Mean Absolute Error between true positions and predicted ranks
    mae = mean_absolute_error(y_true, y_pred_ranks)

    # Spearman's rank correlation per race, averaged over races. The metric
    # is reported under the Spearman name, so compute Spearman rather than
    # Kendall's tau.
    spearman_corr = np.mean([
        spearmanr(y_true[i], y_pred_ranks[i])[0]
        for i in range(n_races)
    ])

    # NDCG: each race is its own query, and relevance must be *higher* for
    # better finishers. Using the raw position as relevance would reward
    # ranking the last-placed horse first, so flip it (worst finisher -> 0).
    relevance = y_true.max(axis=1, keepdims=True) - y_true
    ndcg = ndcg_score(relevance, y_pred_proba)

    # Exact Match Accuracy: share of races whose full order is predicted
    exact_match = np.mean(np.all(y_true == y_pred_ranks, axis=1))

    # Top-K Accuracy (for K=1, 2, 3): overlap between the k best-scored
    # horses and the k best finishers, as a fraction of k
    true_order = np.argsort(y_true, axis=1, kind='stable')
    top_k_accuracy = {}
    for k in [1, 2, 3]:
        top_k_pred = descending_order[:, :k]
        top_k_true = true_order[:, :k]
        # A race with fewer than k horses cannot overlap on more than
        # n_horses entries.
        max_overlap = min(k, n_horses)
        top_k_accuracy[f'Top-{k} Accuracy'] = np.mean([
            len(set(top_k_pred[i]) & set(top_k_true[i])) / max_overlap
            for i in range(n_races)
        ])

    return {
        'Mean Squared Error': mse,
        'Mean Absolute Error': mae,
        "Spearman's Rank Correlation": spearman_corr,
        'NDCG': ndcg,
        'Exact Match Accuracy': exact_match,
        **top_k_accuracy
    }

# Example usage: three races with five horses each.
# Finishing position of every horse (1 = winner), one row per race
y_true = np.array(
    [
        [4, 1, 3, 2, 5],
        [2, 5, 1, 4, 3],
        [3, 1, 5, 2, 4],
    ]
)

# Predicted win probability of every horse, aligned with y_true
y_pred_proba = np.array(
    [
        [0.1, 0.4, 0.2, 0.2, 0.1],
        [0.3, 0.1, 0.4, 0.1, 0.1],
        [0.2, 0.3, 0.1, 0.3, 0.1],
    ]
)

# Compute every metric, then print one "name: value" line per metric
results = evaluate_horse_race_positions(y_true, y_pred_proba)
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")
```

This modified function evaluates the predictions based on the full finishing order of each race. Here's an explanation of the metrics used:

1. **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual rankings.

2. **Mean Absolute Error (MAE)**: Measures the average absolute difference between predicted and actual rankings.

3. **Spearman's Rank Correlation**: Assesses the monotonic relationship between predicted and actual rankings. A higher value indicates better prediction of the overall order.

4. **Normalized Discounted Cumulative Gain (NDCG)**: Measures the quality of ranking, giving more importance to correct predictions at higher ranks.

5. **Exact Match Accuracy**: Calculates the proportion of races where the predicted order exactly matches the true order.

6. **Top-K Accuracy**: For K=1, 2, and 3, it measures how well the model predicts the top K finishers, regardless of their exact order.

This evaluation approach provides a comprehensive assessment of the model's performance in predicting horse race outcomes, considering the entire finishing order rather than just the winner or top few positions. It captures various aspects of ranking quality, from exact matches to overall correlation of rankings.

To use this function, provide the true finishing positions (`y_true`) and the predicted probabilities (`y_pred_proba`) for each horse in each race. The function will return a dictionary of evaluation metrics that you can use to assess and compare different models or prediction approaches.


In [None]:
# eval_dict maps race_id -> {'ground_truth', 'even_probs', 'random_probs',
# 'winning_odd_preds'}, each a 1-D array with one entry per horse in the race.
eval_dict = {}
# The four names below are not read anywhere in this cell; they are kept only
# so any other cell that expects them to exist keeps working.
ground_truth = {}
even_probs = []
random_probs = []
winning_odd_preds = []

# Seeded generator so the random baseline is identical on every
# Restart & Run All.
RANDOM_BASELINE_SEED = 42
baseline_rng = np.random.default_rng(RANDOM_BASELINE_SEED)

# groupby splits the frame once instead of re-scanning every row with a
# boolean mask per race; sort=False keeps races in first-appearance order,
# matching the order of val_df['race_id'].unique().
for race, race_df in val_df.groupby('race_id', sort=False):
    n_horse = len(race_df)
    win_odds = race_df['clean_win_odds'].to_numpy(dtype=float)

    eval_dict[race] = {
        'ground_truth': race_df['clean_position'].values,
        # Baseline 1: every horse equally likely
        'even_probs': np.full(n_horse, 1 / n_horse),
        # Baseline 2: random scores
        'random_probs': baseline_rng.random(n_horse),
        # Baseline 3: market odds. The evaluation treats a HIGHER score as
        # "more likely to win", so the odds are inverted before use.
        # NOTE(review): assumes clean_win_odds holds payout-style odds where a
        # smaller number marks a stronger favourite - confirm against the
        # data-cleaning step.
        'winning_odd_preds': 1.0 / win_odds,
    }


In [None]:
def evaluate_prediction_sets(eval_dict, pred_types: Optional[list] = None):
    """
    Evaluate different prediction sets against ground truth for each race and
    calculate mean metrics.

    Args:
        eval_dict: Dictionary keyed by race id. Each value is a dictionary
            holding a 'ground_truth' array plus one array per prediction type,
            all with one entry per horse in that race.
        pred_types: Names of the prediction sets to evaluate. Defaults to
            ['even_probs', 'random_probs', 'winning_odd_preds'].

    Returns:
        tuple: (eval_result, mean_results_df)
            - eval_result: Dictionary mapping each prediction type to a list
              with one metrics dictionary per race
            - mean_results_df: DataFrame comparing mean metrics across
              prediction types (one column per prediction type, one row per
              metric); it has no rows when eval_dict is empty
    """
    if pred_types is None:
        pred_types = ['even_probs', 'random_probs', 'winning_odd_preds']

    # One list of per-race metric dictionaries for each prediction type
    eval_result = {pred_type: [] for pred_type in pred_types}

    for race_data in eval_dict.values():
        # The evaluator is called with 2-D input, so each race becomes a
        # single-row matrix.
        ground_truth = np.asarray(race_data['ground_truth']).reshape(1, -1)

        for pred_type in pred_types:
            pred_probs = np.asarray(race_data[pred_type]).reshape(1, -1)
            eval_result[pred_type].append(
                evaluate_horse_race_positions(ground_truth, pred_probs)
            )

    # Average every metric over races, separately for each prediction type
    mean_results = {}
    for pred_type in pred_types:
        race_evals = eval_result[pred_type]
        if not race_evals:
            # No races were evaluated: there are no metric names to average.
            mean_results[pred_type] = {}
            continue
        # The first race's result supplies the metric names to average
        mean_results[pred_type] = {
            metric: np.mean([race_eval[metric] for race_eval in race_evals])
            for metric in race_evals[0]
        }

    # Convert mean results to DataFrame for easy comparison
    mean_results_df = pd.DataFrame(mean_results)

    return eval_result, mean_results_df

# Evaluate every race collected in eval_dict
eval_result, mean_results_df = evaluate_prediction_sets(eval_dict)

# Print the heading and the per-prediction-type metric averages
print("\nMean Evaluation Metrics Comparison:", mean_results_df, sep="\n")



# lightgbm training function prompt

