In [10]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRanker
from sklearn.model_selection import train_test_split

In [11]:
df = pd.read_csv("../data/cleaned_race_results_for_lightgbm.csv")
df = df.sort_values(['race_id', 'finish_position'])

In [12]:
features = ['odds', 'horse_weight', 'jockey_weight', 'age', 'final_time']
X = df[features]

race_ids = df['race_id'].unique()
train_races, test_races = train_test_split(race_ids, test_size=0.2, random_state=42)

train_df = df[df['race_id'].isin(train_races)]
test_df = df[df['race_id'].isin(test_races)]

X_train = train_df[features]
y_train = 17 - train_df['finish_position']
group_train = train_df.groupby('race_id').size().to_list()

X_test = test_df[features]
y_test = 17 - test_df['finish_position']
group_test = test_df.groupby('race_id').size().to_list()

In [13]:
param_sets = [
    {'learning_rate': 0.05, 'num_leaves': 31, 'min_child_samples': 20},
    {'learning_rate': 0.1, 'num_leaves': 63, 'min_child_samples': 10},
    {'learning_rate': 0.07, 'num_leaves': 40, 'min_child_samples': 15},
]

best_accuracy = 0
best_model = None
best_test_df = None

for params in param_sets:
    model = LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        n_estimators=200,
        random_state=42,
        **params
    )
    model.fit(X_train, y_train, group=group_train)
    test_df['pred_score'] = model.predict(X_test)
    test_df['pred_prob'] = test_df.groupby('race_id')['pred_score'].transform(lambda x: np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x))))
    test_df['predicted_rank'] = test_df.groupby('race_id')['pred_prob'].rank(ascending=False, method='first')
    top1_preds = test_df.groupby('race_id').apply(lambda g: g.loc[g['pred_prob'].idxmax()])
    top1_acc = (top1_preds['finish_position'] == 1).mean()
    print(f"Params: {params}, Top-1 Accuracy: {top1_acc:.4f}")
    if top1_acc > best_accuracy:
        best_accuracy = top1_acc
        best_model = model
        best_test_df = test_df.copy()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 13040, number of used features: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_score'] = model.predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_prob'] = test_df.groupby('race_id')['pred_score'].transform(lambda x: np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_r

Params: {'learning_rate': 0.05, 'num_leaves': 31, 'min_child_samples': 20}, Top-1 Accuracy: 0.4167
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 13040, number of used features: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_score'] = model.predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_prob'] = test_df.groupby('race_id')['pred_score'].transform(lambda x: np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_r

Params: {'learning_rate': 0.1, 'num_leaves': 63, 'min_child_samples': 10}, Top-1 Accuracy: 0.4902
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 13040, number of used features: 5
Params: {'learning_rate': 0.07, 'num_leaves': 40, 'min_child_samples': 15}, Top-1 Accuracy: 0.4412


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_score'] = model.predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_prob'] = test_df.groupby('race_id')['pred_score'].transform(lambda x: np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_r

In [14]:
results_df = best_test_df[['race_id', 'horse_id', 'finish_position', 'pred_score', 'pred_prob', 'predicted_rank']]
results_df.to_csv("../data/lightgbm_ranker_results.csv", index=False)

print(f"\nBest Top-1 Accuracy: {best_accuracy:.4f}")


Best Top-1 Accuracy: 0.4902
