In [256]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, roc_auc_score
from sklearn.metrics import plot_roc_curve
from category_encoders import OneHotEncoder

from sklearn.preprocessing import KBinsDiscretizer

from trueskill import Rating, quality_1vs1, rate_1vs1

In [257]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set

<function seaborn.rcmod.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)>

In [258]:
horse = pd.read_csv('race_result_horse.csv')

In [259]:
race = pd.read_csv('race_result_race.csv')

In [260]:
race.columns

Index(['src', 'race_date', 'race_course', 'race_number', 'race_id',
       'race_class', 'race_distance', 'track_condition', 'race_name', 'track',
       'sectional_time', 'incident_report'],
      dtype='object')

In [261]:
del race['incident_report']

In [262]:
profiles = pd.read_csv('horse_profiles_2.csv', index_col = 0)

In [263]:
horse = horse.merge(race, on='race_id', how='left')

In [264]:
horse = horse.merge(profiles, on='horse_id', how='left')

In [265]:
del horse['src']; del horse['import_type']; del horse['age']; 
del horse['sectional_time']; del horse['length_behind_winner']; del horse['race_name']; 
del horse['color']; del horse['sex']; del horse['dam']; del horse['dam_sire']; del horse['horse_id'];

In [266]:
for i in range(1,7):
    del horse['running_position_{}'.format(i)]

### Create Target (Finishing 1st)

In [267]:
target = 'win'
horse[target] = (horse['finishing_position'].isin(['1', '1 DH'])).astype(np.int)

In [268]:
horse.drop(index = 17242, inplace = True)

In [269]:
horse.drop(index = 11053, inplace = True)

### Win Odds Probability

In [270]:
horse.shape

(30187, 21)

In [271]:
horse = horse[horse['win_odds'] != '---']

In [272]:
horse['win_odds'] = horse['win_odds'].astype(float)

In [273]:
unique_race_ids = list(horse['race_id'].unique())

In [274]:
horse_win_odds_prob = []
for i in unique_race_ids:
    race = horse[horse.race_id == i]
    race_win_odds = race.win_odds
    inverse_odds = race_win_odds.map(lambda x: 1/x)
    inverse_odds_sum = inverse_odds.sum()
    scaled_inverse_odds = list(inverse_odds.map(lambda x: x/inverse_odds_sum))
    horse_names = list(race.horse_name)
    len_horses = list(range(len(horse_names)))
    dates = list(race.race_date)
    for x in len_horses:
        new_horse = {'horse_name': horse_names[x], 'win_odds_prob': scaled_inverse_odds[x], 'race_date': dates[x]}
        horse_win_odds_prob.append(new_horse)

In [275]:
df = pd.DataFrame(horse_win_odds_prob)

In [276]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [277]:
horse.shape

(29598, 22)

In [278]:
horse['finish_time'] = horse['finish_time'].str.split('.')

In [279]:
horse = horse[horse['finish_time'].apply(lambda x: len(x)) == 3]

In [280]:
horse['finish_time'] = horse['finish_time'].apply(lambda x: int(x[0])*60.0 + int(x[1]) + int(x[2])*0.01)

In [281]:
horse['speed'] = horse['race_distance'] / horse['finish_time']

### Probabilities from Speed

In [282]:
horse[horse['race_id'] == '2015-030']['speed']

10098    16.823216
10099    16.752757
10100    16.738736
10101    16.729402
10102    16.701461
10103    16.696814
10104    16.657413
10105    16.641243
10106    16.634322
10107    16.599806
10108    16.463164
10109    16.319869
Name: speed, dtype: float64

### Horse TrueSkill

In [283]:
from trueskill import Rating, quality_1vs1, rate_1vs1

In [284]:
horse = horse.replace(to_replace =["1 DH"], value ="1")
horse = horse.replace(to_replace =["2 DH"], value ="2")
horse = horse.replace(to_replace =["3 DH"], value ="3")
horse = horse.replace(to_replace =["4 DH"], value ="4")
horse = horse.replace(to_replace =["5 DH"], value ="5")
horse = horse.replace(to_replace =["6 DH"], value ="6")
horse = horse.replace(to_replace =["7 DH"], value ="7")
horse = horse.replace(to_replace =["8 DH"], value ="8")
horse = horse.replace(to_replace =["9 DH"], value ="9")
horse = horse.replace(to_replace =["10 DH"], value ="10")
horse = horse.replace(to_replace =["11 DH"], value ="11")
horse = horse.replace(to_replace =["12 DH"], value ="12")
horse = horse.replace(to_replace =["13 DH"], value ="13")
horse = horse.replace(to_replace =["14 DH"], value ="14")
horse = horse[horse.finishing_position != 'WV']
horse = horse[horse.finishing_position != 'TNP']
horse = horse[horse.finishing_position != 'DISQ']
horse = horse[horse.finishing_position != 'WX-A']
horse = horse[horse.finishing_position != 'DNF']
horse = horse[horse.finishing_position != 'WV-A']
horse = horse[horse.finishing_position != 'WX']
horse = horse[horse.finishing_position != 'PU']
horse = horse[horse.finishing_position != 'UR']
horse = horse[horse.finishing_position != 'FE']

In [285]:
horse['finishing_position'] = horse['finishing_position'].astype(int)

In [286]:
import trueskill
ts = trueskill.TrueSkill()
horses = horse['horse_name'].unique().ravel()
rating_dict = dict()
for x in horses:
    rating_dict[x] = ts.create_rating()
race_groups = horse.groupby('race_id')
def update_ratings(new_ratings, rating_dict):
    for key in new_ratings.keys():
        rating_dict[key] = new_ratings[key]
    return rating_dict
feature_list = []
mu_list = []
sigma_list = []
k_list = []
true_skill_dicts = []
for i, group in enumerate(race_groups):
    horse_ids = group[1]['horse_name'].values.tolist()
    date = group[1]['race_date'].values.tolist()[0]
    horse_dict = [(rating_dict[k],) for k in horse_ids if k in rating_dict]
    for k in horse_ids:
        mu_list.append(rating_dict[k].mu)
        sigma_list.append(rating_dict[k].sigma)
        k_list.append(k)
        true_skill_values = {'mu_horse': rating_dict[k].mu, 'sigma_horse': rating_dict[k].sigma, 'horse_name': k, 'race_date' : date}
        true_skill_dicts.append(true_skill_values)
        
    # for ranks 0 is winner
    ranks = (group[1]['finishing_position']-1).astype(np.int).values.tolist()
    updated_horses = ts.rate(horse_dict, ranks=ranks)
    updated_horses = dict(zip(horse_ids,list(sum(updated_horses, ()))))    
    # update the rating dictionary
    rating_dict = update_ratings(updated_horses, rating_dict)

In [287]:
# rating_dict

In [288]:
df = pd.DataFrame(true_skill_dicts)

In [289]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [290]:
horse.shape

(29519, 25)

In [291]:
horse['horse_rank'] = horse.mu_horse - (3 * horse.sigma_horse)

In [292]:
# horse[horse['horse_name'] == 'MASSIVE'].sort_values(by = 'race_date')

### Jockey TrueSkill

In [293]:
import trueskill
ts = trueskill.TrueSkill()
jockeys = horse['jockey'].unique().ravel()
rating_dict = dict()
for x in jockeys:
    rating_dict[x] = ts.create_rating()
race_groups = horse.groupby('race_id')
def update_ratings(new_ratings, rating_dict):
    for key in new_ratings.keys():
        rating_dict[key] = new_ratings[key]
    return rating_dict
feature_list = []
mu_list = []
sigma_list = []
k_list = []
true_skill_dicts = []
for i, group in enumerate(race_groups):
    jockey_ids = group[1]['jockey'].values.tolist()
    date = group[1]['race_date'].values.tolist()[0]
    horse_id = group[1]['horse_name'].values.tolist()
    jockey_dict = [(rating_dict[k],) for k in jockey_ids if k in rating_dict]
    for x, k in enumerate(jockey_ids):
        mu_list.append(rating_dict[k].mu)
        sigma_list.append(rating_dict[k].sigma)
        k_list.append(k)
        true_skill_values = {'mu_jockey': rating_dict[k].mu, 'sigma_jockey': rating_dict[k].sigma, 'race_date' : date, 'jockey': k, 'horse_name':horse_id[x]}
        true_skill_dicts.append(true_skill_values)
        
    # for ranks 0 is winner
    ranks = (group[1]['finishing_position']-1).astype(np.int).values.tolist()
    updated_horses = ts.rate(jockey_dict, ranks=ranks)
    updated_horses = dict(zip(jockey_ids,list(sum(updated_horses, ()))))    
    # update the rating dictionary
    rating_dict = update_ratings(updated_horses, rating_dict)

In [294]:
df = pd.DataFrame(true_skill_dicts)

In [295]:
# df.info()

In [296]:
# horse.shape

In [297]:
horse = horse.merge(df, on=['horse_name', 'race_date', 'jockey'], how='left')

In [298]:
horse.shape

(29519, 28)

In [299]:
# horse.columns

In [300]:
horse['jockey_rank'] = horse.mu_jockey - (3 * horse.sigma_jockey)

In [301]:
horse[horse['jockey'] == 'B Prebble']

Unnamed: 0,finishing_position,horse_number,horse_name,jockey,trainer,actual_weight,declared_horse_weight,draw,finish_time,win_odds,...,sire,win,win_odds_prob,speed,mu_horse,sigma_horse,horse_rank,mu_jockey,sigma_jockey,jockey_rank
0,1,1.0,DOUBLE DRAGON,B Prebble,D Cruz,133,1032,1,82.33,3.8,...,Lord Of Warriors,1,0.214701,17.004737,25.000000,8.333333,0.000000,25.000000,8.333333,0.000000
23,11,7.0,AUTUMN GOLD,B Prebble,S Woods,123,1011,14,82.34,21.0,...,Ransom O'War,0,0.038864,17.002672,25.000000,8.333333,0.000000,22.880915,1.913473,17.140495
78,13,4.0,EXAGGERATION,B Prebble,J Moore,127,1141,4,57.74,57.0,...,Exceed And Excel,0,0.014221,17.319016,25.000000,8.333333,0.000000,39.351758,5.409320,23.123797
90,11,8.0,BEST TANGO,B Prebble,W Y So,123,1089,2,82.78,8.0,...,Mujahid,0,0.101781,16.912298,25.000000,8.333333,0.000000,25.441136,3.484735,14.986930
100,8,7.0,CULTURAL CITY,B Prebble,W Y So,124,1070,9,83.64,41.0,...,Elusive City,0,0.019912,16.738403,25.000000,8.333333,0.000000,22.261409,2.777814,13.927966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29456,14,9.0,HAPPY HAPPY,B Prebble,L Ho,124,1123,7,86.23,71.0,...,Rip Van Winkle,0,0.011438,16.235649,16.969617,3.078301,7.734713,25.470013,0.612413,23.632774
29459,3,7.0,ASHKIYR,B Prebble,J Moore,126,1076,4,111.63,15.0,...,Rock Of Gibraltar,0,0.054094,16.124698,24.850663,0.761025,22.567589,25.247236,0.612940,23.408415
29476,6,9.0,REMARKABLE,B Prebble,J Size,123,1077,3,84.91,41.0,...,More Than Ready,0,0.019967,16.488046,18.460840,3.020453,9.399480,25.392828,0.612691,23.554754
29487,4,5.0,GONNA RUN,B Prebble,C Fownes,131,1057,9,96.58,16.0,...,Hurricane Cat,0,0.050654,16.566577,29.678225,0.834493,27.174746,25.435152,0.612329,23.598166


### Elapsed Time Since Previous Race

In [302]:
horse['race_date'] = pd.to_datetime(horse['race_date'], format='%Y-%m-%d', errors='ignore')

In [303]:
list_of_horses = list(horse.horse_name.unique())
horse_time_since_by_race = []
for i in list_of_horses:
    time_since_last_race = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'].diff())
    len_dates = list(range(len(time_since_last_race)))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    for x in len_dates:
        new_horse = {'horse_name': i, 'days_since_last_race': time_since_last_race[x], 'race_date': dates[x]}
        horse_time_since_by_race.append(new_horse)

In [304]:
df = pd.DataFrame(horse_time_since_by_race)

In [305]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [306]:
horse['days_since_last_race'] = horse['days_since_last_race'].astype(int)/(24*60*60*(10**9))

Remove null values.

In [307]:
# horse = horse[horse.days_since_last_race.notnull()]

### Keep only 1200m Races

In [308]:
horse = horse[horse.race_distance == 1200]

### Horse Winning Percentage

In [309]:
list_of_horses = list(horse.horse_name.unique())
winning_percentage = []
for i in list_of_horses:
    wins = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['win'].cumsum().shift())
    races = list(range(len(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['win'])))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    len_races = list(range(len(races)))
    for x in len_races:
        try:
            win_perc = wins[x]/races[x]
            new_horse = {'horse_name': i, 'horse_win_perc': win_perc, 'race_date': dates[x]}
            winning_percentage.append(new_horse)
        except:
            new_horse = {'horse_name': i, 'horse_win_perc': None, 'race_date': dates[x]}
            winning_percentage.append(new_horse)

In [310]:
df = pd.DataFrame(winning_percentage)

In [311]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [312]:
horse.shape

(9923, 31)

### Jockey Winning Percentage

In [313]:
list_of_jockeys = list(horse.jockey.unique())
winning_percentage = []
for i in list_of_jockeys:
    wins = list(horse.loc[horse['jockey'] == i].sort_values(by = ['race_date'])['win'].cumsum().shift())
    races = list(range(len(horse.loc[horse['jockey'] == i].sort_values(by = ['race_date'])['win'])))
    dates = list(horse.loc[horse['jockey'] == i].sort_values(by = ['race_date'])['race_date'])
    horses = list(horse.loc[horse['jockey'] == i].sort_values(by = ['race_date'])['horse_name'])
    len_races = list(range(len(races)))
    for x in len_races:
        try:
            win_perc = wins[x]/races[x]
            new_jockey = {'jockey': i, 'jockey_win_perc': win_perc, 'race_date': dates[x], 'horse_name': horses[x]}
            winning_percentage.append(new_jockey)
        except:
            new_jockey = {'jockey': i, 'jockey_win_perc': None, 'race_date': dates[x], 'horse_name': horses[x]}
            winning_percentage.append(new_jockey)

In [314]:
df = pd.DataFrame(winning_percentage)

In [315]:
horse = horse.merge(df, on=['horse_name', 'race_date', 'jockey'], how='left')

In [316]:
horse.shape

(9923, 32)

### Trainer Winning Percentage

In [317]:
list_of_trainers = list(horse.trainer.unique())
winning_percentage_trainer = []
for i in list_of_trainers:
    wins = list(horse.loc[horse['trainer'] == i].sort_values(by = ['race_date'])['win'].cumsum().shift())
    races = list(range(len(horse.loc[horse['trainer'] == i].sort_values(by = ['race_date'])['win'])))
    dates = list(horse.loc[horse['trainer'] == i].sort_values(by = ['race_date'])['race_date'])
    horses = list(horse.loc[horse['trainer'] == i].sort_values(by = ['race_date'])['horse_name'])
    len_races = list(range(len(races)))
    for x in len_races:
        try:
            win_perc_t = wins[x]/races[x]
            new_trainer = {'trainer': i, 'trainer_win_perc': win_perc_t, 'race_date': dates[x], 'horse_name': horses[x]}
            winning_percentage_trainer.append(new_trainer)
        except:
            new_trainer = {'trainer': i, 'trainer_win_perc': None, 'race_date': dates[x], 'horse_name': horses[x]}
            winning_percentage_trainer.append(new_trainer)

In [318]:
df = pd.DataFrame(winning_percentage_trainer)

In [319]:
horse = horse.merge(df, on=['horse_name', 'race_date', 'trainer'], how='left')

In [320]:
horse.shape

(9923, 33)

### Find Weight Loss From Previous Race

Drop all rows with missing Declared Horse Weights.

In [321]:
horse = horse[horse['declared_horse_weight'] != '-']

In [322]:
list_of_horses = list(horse.horse_name.unique())
horse_weight_change_by_race = []
for i in list_of_horses:
    horse_weight_change = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['declared_horse_weight'].astype('int').pct_change())
    len_weights = list(range(len(horse_weight_change)))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    for x in len_weights:
        new_horse = {'horse_name': i, 'weight_change': horse_weight_change[x], 'race_date': dates[x]}
        horse_weight_change_by_race.append(new_horse)

In [323]:
df = pd.DataFrame(horse_weight_change_by_race)

In [324]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

Remove null values.

In [71]:
# horse = horse[horse.weight_change.notnull()]

In [72]:
# horse.shape

### Remove Unwanted Fields

In [73]:
# del horse['src']; del horse['incident_report']; del horse['import_type']; del horse['age']; 
# del horse['sectional_time']; del horse['length_behind_winner']; del horse['race_name']; 
# del horse['color']; del horse['sex']; del horse['dam']; del horse['dam_sire']; del horse['horse_id'];

In [74]:
# for i in range(1,7):
#     del horse['running_position_{}'.format(i)]

### Turn Finish Time into Seconds

In [75]:
# horse['finish_time'] = horse['finish_time'].str.split('.')

In [76]:
# horse = horse[horse['finish_time'].apply(lambda x: len(x)) == 3]

In [77]:
# horse['finish_time'] = horse['finish_time'].apply(lambda x: int(x[0])*60.0 + int(x[1]) + int(x[2])*0.01)

### Tracks

In [78]:
horse.race_course.value_counts()

Sha Tin         5854
Happy Valley    4069
Name: race_course, dtype: int64

### Create Average Speed Category

In [79]:
# horse['speed'] = horse['race_distance'] / horse['finish_time']

In [80]:
# gp = horse[['horse_name', 'speed']].groupby('horse_name').mean()

In [81]:
# gp2 = horse[['horse_name', 'speed']].groupby('horse_name').std()
# gp2 = gp2.rename({'speed':'speed_std'}, axis=1)

# del horse['speed']
# horse = horse.merge(gp, on='horse_name', how='left')
# horse = horse.merge(gp2, on='horse_name', how='left')

# horse.head()

In [82]:
# gp = horse[['race_id', 'speed']].groupby('race_id').mean()
# gp = gp.rename({'speed':'race_avg_speed'}, axis=1)
# gp

In [83]:
# horse = horse.merge(gp, on='race_id', how='left')

# horse.head()

In [84]:
# horse['race_rel_speed'] = horse['speed'] - horse['race_avg_speed']

# horse.head()

In [325]:
gp = horse[['draw', target]].groupby('draw').mean().rename({target:'draw_encoded'},axis=1)
# gp

In [326]:
horse = horse.merge(gp, on='draw', how='left')

In order to avoid using future data to predict a race, I created a column for average speed at a particular distance for each horse only using past data.

In [327]:
horse.loc[horse['horse_name'] == 'MASSIVE'].sort_values(by = ['race_date'])['speed'].rolling(1).mean().shift(1)

1             NaN
196     17.101325
521     17.076989
979     17.030940
2647    16.968326
3035    17.067274
3458    16.882386
Name: speed, dtype: float64

In [328]:
horse.loc[horse['horse_name'] == 'MASSIVE'].sort_values(by = ['race_date'])['speed']

1       17.101325
196     17.076989
521     17.030940
979     16.968326
2647    17.067274
3035    16.882386
3458    16.944366
Name: speed, dtype: float64

In [329]:
list_of_horses = list(horse.horse_name.unique())
horse_average_speed_by_race = []
for i in list_of_horses:
    horse_speeds = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['speed'].expanding().mean().shift(1))
    len_speeds = list(range(len(horse_speeds)))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    for x in len_speeds:
        new_horse = {'horse_name': i, 'average_speed': horse_speeds[x], 'race_date': dates[x]}
        horse_average_speed_by_race.append(new_horse)

In [330]:
df = pd.DataFrame(horse_average_speed_by_race)

In [331]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [332]:
# horse.sort_values(by='horse_name')

In [333]:
horse.shape

(9923, 36)

In [334]:
horse_relative_average_speed = []
for i in unique_race_ids:
    race = horse[horse.race_id == i]
    race_average_speed = race.average_speed.mean()
    average_speed_diff = list(-1 * (race.average_speed - race_average_speed))
    horse_names = list(race.horse_name)
    len_horses = list(range(len(horse_names)))
    dates = list(race.race_date)
    for x in len_horses:
        new_horse = {'horse_name': horse_names[x], 'average_speed_diff': average_speed_diff[x], 'race_date': dates[x]}
        horse_relative_average_speed.append(new_horse)

In [335]:
# horse_relative_average_speed

In [336]:
df = pd.DataFrame(horse_relative_average_speed)

In [337]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [338]:
horse.shape

(9923, 37)

### Max Speed Differential

In [339]:
list_of_horses = list(horse.horse_name.unique())
horse_max_speed = []
for i in list_of_horses:
    horse_max_speeds = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['speed'].cummax().shift(1))
    len_speeds = list(range(len(horse_max_speeds)))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    for x in len_speeds:
        new_horse = {'horse_name': i, 'max_speed': horse_max_speeds[x], 'race_date': dates[x]}
        horse_max_speed.append(new_horse)

In [340]:
df = pd.DataFrame(horse_max_speed)

In [341]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [342]:
unique_race_ids = list(horse.race_id.unique())

In [343]:
horse_relative_max_speed = []
for i in unique_race_ids:
    race = horse[horse.race_id == i]
    race_max_speed = race.max_speed.max()
    max_speed_diff = list(-1 * (race.max_speed - race_max_speed))
    horse_names = list(race.horse_name)
    len_horses = list(range(len(horse_names)))
    dates = list(race.race_date)
    for x in len_horses:
        new_horse = {'horse_name': horse_names[x], 'max_speed_diff': max_speed_diff[x], 'race_date': dates[x]}
        horse_relative_max_speed.append(new_horse)

In [344]:
df = pd.DataFrame(horse_relative_max_speed)

In [345]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [346]:
horse.shape

(9923, 39)

### Min Speed Differential

In [347]:
list_of_horses = list(horse.horse_name.unique())
horse_min_speed = []
for i in list_of_horses:
    horse_min_speeds = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['speed'].cummin().shift(1))
    len_speeds = list(range(len(horse_min_speeds)))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    for x in len_speeds:
        new_horse = {'horse_name': i, 'min_speed': horse_min_speeds[x], 'race_date': dates[x]}
        horse_min_speed.append(new_horse)

In [348]:
df = pd.DataFrame(horse_min_speed)

In [349]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [350]:
horse_relative_min_speed = []
for i in unique_race_ids:
    race = horse[horse.race_id == i]
    race_min_speed = race.min_speed.min()
    min_speed_diff = list(race.min_speed - race_min_speed)
    horse_names = list(race.horse_name)
    len_horses = list(range(len(horse_names)))
    dates = list(race.race_date)
    for x in len_horses:
        new_horse = {'horse_name': horse_names[x], 'min_speed_diff': min_speed_diff[x], 'race_date': dates[x]}
        horse_relative_min_speed.append(new_horse)

In [351]:
df = pd.DataFrame(horse_relative_min_speed)

In [352]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [353]:
horse.shape

(9923, 41)

### Find Horse Weight at Max Speed

In [114]:
# horse.columns

In [115]:
# import math

In [116]:
# list_of_horses = list(horse.horse_name.unique())
# max_speed_weight = []
# # for i in list_of_horses:
# horse_max_speeds = list(horse.loc[horse['horse_name'] == 'LUCKY BUBBLES'].sort_values(by = ['race_date'])['speed'].cummax().shift(1))
# horse_speeds = list(horse.loc[horse['horse_name'] == 'LUCKY BUBBLES'].sort_values(by = ['race_date'])['speed'].shift(1))
# horse_weights = list(horse.loc[horse['horse_name'] == 'LUCKY BUBBLES'].sort_values(by = ['race_date'])['declared_horse_weight'].shift(1))

# print(horse_max_speeds)
# print(horse_speeds)
# print(horse_weights)
# # num_of_races = list(range(len(horse_max_speeds)))
# # for x in num_of_races:
# #     if math.isnan(horse_max_speeds[x]):
# #         max_speed_weight.append[None]
# #     elif horse_max_speeds[x] == horse_speeds[x]:
# #         k = x
# #         max_speed_weight.append[horse_weights[x]]
# #     elif horse_max_speeds[x] > horse_speeds[x]:
# #         max_speed_weight.append[horse_weights[k]]

# # len_speeds = list(range(len(horse_max_speeds)))
# # dates = list(horse.loc[horse['horse_name'] == 'MASSIVE'].sort_values(by = ['race_date'])['race_date'])
# # for x in len_speeds:
# #     new_horse = {'horse_name': 'MASSIVE', 'max_speed': horse_max_speeds[x], 'race_date': dates[x]}
# #     horse_max_speed.append(new_horse)

In [117]:
# print(horse_max_speed)

In [118]:
# horse.loc[horse['horse_name'] == 'MASSIVE'].sort_values(by = ['race_date'])

In [119]:
# max_speed_weights

### Previous Finishing Position at Same Distance

In [354]:
list_of_horses = list(horse.horse_name.unique())
previous_finishing_position = []
for i in list_of_horses:
    prev_fin_pos = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['finishing_position'].shift())
    len_pos = list(range(len(prev_fin_pos)))
    dates = list(horse.loc[horse['horse_name'] == i].sort_values(by = ['race_date'])['race_date'])
    for x in len_pos:
        new_horse = {'horse_name': i, 'previous_finishing_position': prev_fin_pos[x], 'race_date': dates[x]}
        previous_finishing_position.append(new_horse)

In [355]:
df = pd.DataFrame(previous_finishing_position)

In [356]:
horse = horse.merge(df, on=['horse_name', 'race_date'], how='left')

In [357]:
horse.shape

(9923, 42)

### Cleaning the Data

In [124]:
# horse.win_odds = horse.win_odds.astype('float')

In [358]:
horse = horse[horse.average_speed.notnull()]

In [359]:
horse = horse[horse.jockey_win_perc.notnull()]

In [360]:
horse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8274 entries, 144 to 9922
Data columns (total 42 columns):
finishing_position             8274 non-null int64
horse_number                   8274 non-null float64
horse_name                     8274 non-null object
jockey                         8274 non-null object
trainer                        8274 non-null object
actual_weight                  8274 non-null object
declared_horse_weight          8274 non-null object
draw                           8274 non-null object
finish_time                    8274 non-null float64
win_odds                       8274 non-null float64
race_id                        8274 non-null object
race_date                      8274 non-null datetime64[ns]
race_course                    8274 non-null object
race_number                    8274 non-null int64
race_class                     8274 non-null object
race_distance                  8274 non-null int64
track_condition                8274 non-null object

### Decision Tree

In [372]:
import category_encoders as ce
from sklearn.compose import ColumnTransformer

In [386]:
horse.columns

Index(['finishing_position', 'horse_number', 'horse_name', 'jockey', 'trainer',
       'actual_weight', 'declared_horse_weight', 'draw', 'finish_time',
       'win_odds', 'race_id', 'race_date', 'race_course', 'race_number',
       'race_class', 'race_distance', 'track_condition', 'track', 'origin',
       'sire', 'win', 'win_odds_prob', 'speed', 'mu_horse', 'sigma_horse',
       'horse_rank', 'mu_jockey', 'sigma_jockey', 'jockey_rank',
       'days_since_last_race', 'horse_win_perc', 'jockey_win_perc',
       'trainer_win_perc', 'weight_change', 'draw_encoded', 'average_speed',
       'average_speed_diff', 'max_speed', 'max_speed_diff', 'min_speed',
       'min_speed_diff', 'previous_finishing_position'],
      dtype='object')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [378]:
cat_cols = ['track_condition', 'track', 'race_class']
num_cols = ['draw_encoded', 'race_number', 'max_speed_diff', 'min_speed_diff', 'horse_rank', 'jockey_win_perc','actual_weight', 'days_since_last_race', 'win_odds_prob']

In [388]:
used_cols = num_cols + cat_cols

In [389]:
num_imputer = SimpleImputer(strategy='median')
scaler = RobustScaler()
encoder = ce.TargetEncoder(cols=cat_cols, handle_missing="value")

num_transformer = make_pipeline(num_imputer, scaler)
cat_transformer = make_pipeline(encoder)

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_cols),('cat', cat_transformer, cat_cols)])

model = LogisticRegression()

dt_pipe = make_pipeline(preprocessor, model)

X, y = horse[used_cols], horse[target]

scores = cross_val_score(dt_pipe, X, y, cv=5, scoring = 'roc_auc')
scores.mean(), scores.std()

(0.7773045569727846, 0.030766007759518824)

In [380]:
N=3
dt = DecisionTreeClassifier(class_weight='balanced', criterion='entropy', 
                            random_state=42, 
                            max_depth=N, max_leaf_nodes=2**N-1)

imputer = SimpleImputer()
dt_pipe = make_pipeline(imputer, dt)

used_cols = ['draw_encoded', 'race_number', 'max_speed_diff', 'min_speed_diff', 'horse_rank', 'jockey_win_perc','actual_weight', 'days_since_last_race', 'win_odds']
X, y = horse[used_cols], horse[target]

scores = cross_val_score(dt_pipe, X.values, y.values, cv=5, scoring='roc_auc', n_jobs=-1)
scores.mean(), scores.std()

(0.7717660260328614, 0.025640466024998625)

Working model

In [366]:
N=3
dt = DecisionTreeClassifier(class_weight='balanced', criterion='entropy', 
                            random_state=42, 
                            max_depth=N, max_leaf_nodes=2**N-1)

imputer = SimpleImputer()
dt_pipe = make_pipeline(imputer, dt)

used_cols = ['draw_encoded', 'race_number', 'max_speed_diff', 'min_speed_diff', 'horse_rank', 'jockey_win_perc','actual_weight', 'days_since_last_race', 'win_odds']
X, y = horse[used_cols], horse[target]

scores = cross_val_score(dt_pipe, X.values, y.values, cv=5, scoring='roc_auc', n_jobs=-1)
scores.mean(), scores.std()

(0.7717660260328614, 0.025640466024998625)

In [158]:
N=3
dt = DecisionTreeClassifier(class_weight='balanced', criterion='gini', 
                            random_state=42, 
                            max_depth=N, max_leaf_nodes=2**N-1)
imputer = SimpleImputer()
dt_pipe = make_pipeline(imputer, dt)

used_cols = ['draw_encoded', 'race_number', 'max_speed_diff', 'min_speed_diff', 'horse_rank', 'jockey_win_perc','actual_weight', 'days_since_last_race', 'win_odds']
X, y = horse[used_cols], horse[target]

scores = cross_val_score(dt_pipe, X.values, y.values, cv=5, scoring='roc_auc', n_jobs=-1)
scores.mean(), scores.std()

(0.770644530768352, 0.025739661886030798)

Gini appears to produce a higher AUC and a lower standard deviation.

### Random Forest

In [183]:
from sklearn.ensemble import RandomForestClassifier

In [184]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [185]:
rf = RandomForestClassifier()

In [197]:
rf = RandomForestClassifier(n_estimators=50, criterion='entropy', 
                            max_depth=8, min_samples_leaf=5, random_state=42, 
                            class_weight='balanced', n_jobs=-1)

scores = cross_val_score(rf, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7638081779879858 +/- 0.02980962325833098


In [187]:
rf = RandomForestClassifier()

grid_params = {'criterion' : ['gini', 'entropy'], 
              'n_estimators' : [10, 25, 50, 100], 
              'max_depth': [4, 6, 8, 10, 12], 
              'min_samples_leaf' : [1, 3, 5, 7]}
grid_search = GridSearchCV(rf, grid_params, cv = 3, scoring = 'roc_auc')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [188]:
rf_grid = grid_search.best_estimator_
rf_grid.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [199]:
rf = RandomForestClassifier(n_estimators=50, criterion='entropy', 
                            max_depth=4, min_samples_leaf=7, random_state=42, 
                            class_weight='balanced', n_jobs=-1)

scores = cross_val_score(rf, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7738580093858992 +/- 0.026604119394645003


### LogReg Baseline 

In [134]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score
from category_encoders import WOEEncoder
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer, FunctionTransformer
from sklearn.model_selection import cross_val_score

In [222]:
used_cols = ['draw_encoded', 'race_number', 'max_speed_diff', 'min_speed_diff', 
             'horse_rank', 'jockey_win_perc','actual_weight', 'days_since_last_race', 
             'win_odds']
X, y = horse[used_cols], horse[target]

In [223]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [224]:
scaler = MinMaxScaler()
lr = LogisticRegression(class_weight='balanced', penalty='l1', C=0.99, solver='liblinear') # LASSO in the LogReg
pipe = make_pipeline(scaler, lr)

pipe.fit(X_train, y_train)

train_preds = pipe.predict(X_train)
test_preds = pipe.predict(X_test)

In [136]:
print("TRAIN:")
print(confusion_matrix(y_train, train_preds))
print('')
print("TEST:")
print(confusion_matrix(y_test, test_preds))

TRAIN:
[[3235 2417]
 [  97  456]]

TEST:
[[1073  836]
 [  31  129]]


In [137]:
print("Accuracy:"); print("="*len("Accuracy:"))
print("TRAIN:", accuracy_score(y_train, train_preds))
print("TEST:", accuracy_score(y_test, test_preds))

Accuracy:
TRAIN: 0.594842868654311
TEST: 0.5809569840502659


In [138]:
print("Balanced Accuracy:"); print("="*len("Balanced Accuracy:"))
print("TRAIN:", balanced_accuracy_score(y_train, train_preds))
print("TEST:", balanced_accuracy_score(y_test, test_preds))

Balanced Accuracy:
TRAIN: 0.6984784467147604
TEST: 0.6841621922472498


In [139]:
train_preds = pipe.predict_proba(X_train)[:,1]
test_preds = pipe.predict_proba(X_test)[:,1]

print("AUC:"); print("="*len("AUC:"))
print("TRAIN:", roc_auc_score(y_train, train_preds))
print("TEST:", roc_auc_score(y_test, test_preds))

AUC:
====
TRAIN: 0.780353319537388
TEST: 0.7528810895756941


In [180]:
def stringify(data):
    df = pd.DataFrame(data)
    for c in df.columns.tolist():
        df[c] = df[c].astype(str)
    return df

binner = KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')
objectify = FunctionTransformer(func=stringify, 
                                validate=False)
encoder = WOEEncoder()
scorecard = make_pipeline(binner, objectify, encoder, lr)

scores = cross_val_score(scorecard, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.6702921348326661 +/- 0.05755549580530418


In [181]:
# X, y = make_classification(n_samples=9001, n_features=20, n_redundant=2, n_informative=5, random_state=0, n_clusters_per_class=3)

print("LOGIT:")
scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())
print()
print("SCORECARD:")
scores = cross_val_score(scorecard, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

LOGIT:
0.771805485285606 +/- 0.02735373718207819

SCORECARD:
0.6702921348326661 +/- 0.05755549580530418


### Support Vector Machine (SVM)

In [None]:
used_cols = ['draw_encoded', 'race_number', 'max_speed_diff', 'min_speed_diff', 'horse_rank', 'jockey_win_perc','actual_weight', 'days_since_last_race', 'win_odds']
X, y = horse[used_cols], horse[target]

In [142]:
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

from sklearn.svm import SVC

In [144]:
# used_cols = [c for c in df_10.columns.tolist() if c not in [target]]
# X, y = df_10[used_cols], df_10[target]

svm = SVC(probability=True, class_weight='balanced', random_state=42)

scores = cross_val_score(svm, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7807590540541118 +/- 0.029166041137835302


### AdaBoost (Very Slow)

In [145]:
clf = AdaBoostClassifier(svm, n_estimators=50, 
                         learning_rate=0.1, random_state=42)

scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7609080495905245 +/- 0.032781309275707396


### Gradient Boosted Classifier

In [155]:
clf = GradientBoostingClassifier(n_estimators=2000,
                                 learning_rate=0.05, random_state=42, 
                                 subsample=0.9, max_depth=5, max_features=5,
                                 min_samples_leaf=2, min_samples_split=5,
                                 validation_fraction=0.20, # use 20% of the data as hold-out for early stopping
                                 n_iter_no_change=50, # allowed to go 50 iterations without improvement to hold-out score
                                 verbose=0)

scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7679734740089599 +/- 0.022059818490738937


In [156]:
clf = GradientBoostingClassifier(n_estimators=2000,
                                 learning_rate=0.01, random_state=42, 
                                 subsample=0.9, max_depth=5, max_features=5,
                                 min_samples_leaf=2, min_samples_split=5,
                                 validation_fraction=0.20, # use 20% of the data as hold-out for early stopping
                                 n_iter_no_change=50, # allowed to go 50 iterations without improvement to hold-out score
                                 verbose=0)

scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7762193765085372 +/- 0.024414924446882504


In [157]:
clf = GradientBoostingClassifier(n_estimators=2000,
                                 learning_rate=0.005, random_state=42, 
                                 subsample=0.9, max_depth=5, max_features=5,
                                 min_samples_leaf=2, min_samples_split=5,
                                 validation_fraction=0.20, # use 20% of the data as hold-out for early stopping
                                 n_iter_no_change=50, # allowed to go 50 iterations without improvement to hold-out score
                                 verbose=0)

scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(scores.mean(), "+/-", scores.std())

0.7771058078794784 +/- 0.024645114444199423


Lowering the learning rate increases the mean AUC but also increases the standard deviation.