In [15]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.calibration import CalibratedClassifierCV
import matplotlib.pyplot as plt

In [16]:
df = pd.read_csv("../data/cleaned_race_results.csv")
race_counts = df['race_id'].value_counts()
races_with_16 = race_counts[race_counts == 16].index
df = df[df['race_id'].isin(races_with_16)]

In [17]:
X = df[["track_distance", "total_weight", "age", "sex_c", "sex_f", "sex_g", "sex_h", "sex_m", "grade_g1", "grade_g2", "grade_g3", "weather_weather01", "weather_weather02", "weather_weather03", "weather_weather04", "speed_mps", "avg_speed_mps", "avg_final_time_hist"]]
y = df["top1"]
race_id = df["race_id"]
horse_id = df["horse_id"]
finish_position = df["finish_position"]

In [18]:
unique_race_id = race_id.unique()
train_races, test_races = train_test_split(unique_race_id, test_size=0.2)
train_index = race_id.isin(train_races)
test_index = race_id.isin(test_races)

In [19]:
X_train = X[train_index]
y_train = y[train_index]
X_test = X[test_index]
y_test = y[test_index]
race_id_train = race_id[train_index]
race_id_test = race_id[test_index]
horse_id_train = horse_id[train_index]
horse_id_test = horse_id[test_index]
finish_position_test = finish_position[test_index]

In [20]:
param_grid = {
    'max_depth': [3, 5],
    'num_leaves': [15, 31],
    'min_child_samples': [10, 20],
    'feature_fraction': [0.8, 1.0]
}

In [21]:
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", is_unbalance=True, force_row_wise=True, random_state=42)

In [22]:
group_kfold = GroupKFold(n_splits=3) #same groups stay together
model = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=group_kfold.split(X_train, y_train, groups=race_id_train),
    verbose=1,
    n_jobs=-1
)

In [23]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits

[LightGBM] [Info] Number of positive: 545, number of negative: 8143
[LightGBM] [Info] Number of positive: 543, number of negative: 8145
[LightGBM] [Info] Number of positive: 545, number of negative: 8143[LightGBM] [Info] Number of positive: 546, number of negative: 8158

[LightGBM] [Info] Number of positive: 543, number of negative: 8145
[LightGBM] [Info] Number of positive: 545, number of negative: 8143
[LightGBM] [Info] Number of positive: 543, number of negative: 8145
[LightGBM] [Info] Number of positive: 546, number of negative: 8158
[LightGBM] [Info] Total Bins 1013
[LightGBM] [Info] Total Bins 1013
[LightGBM] [Info] Total Bins 1013
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 8688, number of used features: 17
[LightGBM] [Info] Number of data points in the train set: 8688, number of used features: 17
[Light

In [24]:
X_model, X_cali, y_model, y_cali = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [25]:
best_lightgbm = model.best_estimator_
best_lightgbm.fit(X_model, y_model)

[LightGBM] [Info] Number of positive: 670, number of negative: 9762
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 10432, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064225 -> initscore=-2.678975
[LightGBM] [Info] Start training from score -2.678975
