# Horse Racing Classification
### Jamie Heneghan, Melka Konshie, Kyle Duffy, Walter Robson, Sam Allison
### CSE 40647

**Getting the Data**

In [42]:
import pandas as pd
import numpy as np
runs = pd.read_csv("./runs.csv")
races = pd.read_csv("./races.csv")

In [43]:
runs

Unnamed: 0,race_id,horse_no,horse_id,result,won,lengths_behind,horse_age,horse_country,horse_type,horse_rating,...,time2,time3,time4,time5,time6,finish_time,win_odds,place_odds,trainer_id,jockey_id
0,0,1,3917,10,0.0,8.00,3,AUS,Gelding,60,...,21.59,23.86,24.62,,,83.92,9.7,3.7,118,2
1,0,2,2157,8,0.0,5.75,3,NZ,Gelding,60,...,21.99,23.30,23.70,,,83.56,16.0,4.9,164,57
2,0,3,858,7,0.0,4.75,3,NZ,Gelding,60,...,21.59,23.90,24.22,,,83.40,3.5,1.5,137,18
3,0,4,1853,9,0.0,6.25,3,SAF,Gelding,60,...,21.83,23.70,24.00,,,83.62,39.0,11.0,80,59
4,0,5,2796,6,0.0,3.75,3,GB,Gelding,60,...,21.75,23.22,23.50,,,83.24,50.0,14.0,9,154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79442,6348,10,1238,6,0.0,1.25,5,AUS,Gelding,87,...,23.98,23.53,22.52,,,96.17,99.0,,73,40
79443,6348,11,985,7,0.0,2.25,5,NZ,Gelding,84,...,24.10,23.77,22.61,,,96.30,99.0,,164,63
79444,6348,12,3147,10,0.0,3.00,5,NZ,Gelding,83,...,24.06,23.37,22.43,,,96.44,99.0,,111,68
79445,6348,13,1982,8,0.0,2.50,3,AUS,Gelding,82,...,24.02,23.45,22.29,,,96.34,4.6,,118,95


In [44]:
races.head()

Unnamed: 0,race_id,date,venue,race_no,config,surface,distance,going,horse_ratings,prize,...,place_combination3,place_combination4,place_dividend1,place_dividend2,place_dividend3,place_dividend4,win_combination1,win_dividend1,win_combination2,win_dividend2
0,0,1997-06-02,ST,1,A,0,1400,GOOD TO FIRM,40-15,485000.0,...,6.0,,36.5,25.5,18.0,,8,121.0,,
1,1,1997-06-02,ST,2,A,0,1200,GOOD TO FIRM,40-15,485000.0,...,4.0,,12.5,47.0,33.5,,5,23.5,,
2,2,1997-06-02,ST,3,A,0,1400,GOOD TO FIRM,60-40,625000.0,...,13.0,,23.0,23.0,59.5,,11,70.0,,
3,3,1997-06-02,ST,4,A,0,1200,GOOD TO FIRM,120-95,1750000.0,...,10.0,,14.0,24.5,16.0,,5,52.0,,
4,4,1997-06-02,ST,5,A,0,1600,GOOD TO FIRM,60-40,625000.0,...,1.0,,15.5,28.0,17.5,,2,36.5,,


In [45]:
## Take the features we want (This can be changed)

In [46]:
runs_features = runs[['race_id', 'won', 'horse_age', 'horse_country', 'horse_type',
                      'horse_rating', 'horse_gear', 'declared_weight', 'actual_weight',
                      'draw', 'win_odds', 'horse_id']]

races_features = races[['race_id', 'venue', 'config', 'distance', 'going', 'race_class']]


In [47]:
## Merge the races and runs by race ID

In [48]:
df = pd.merge(runs_features, races_features, on='race_id')

In [49]:
## Drop Missing Values (this can be changed too, just easier this way)

In [50]:
df = df.dropna()

In [51]:
df.shape

(79445, 17)

**Clean & Process the Data**

In [52]:
## Convert gear -> binary
df['horse_gear'] = np.where(df['horse_gear'] == '--', 0, 1)

df.columns

Index(['race_id', 'won', 'horse_age', 'horse_country', 'horse_type',
       'horse_rating', 'horse_gear', 'declared_weight', 'actual_weight',
       'draw', 'win_odds', 'horse_id', 'venue', 'config', 'distance', 'going',
       'race_class'],
      dtype='object')

In [53]:
## Categorical -> Numerical (one hot encoding)
df = pd.get_dummies(df)

In [54]:
df.head()

Unnamed: 0,race_id,won,horse_age,horse_rating,horse_gear,declared_weight,actual_weight,draw,win_odds,horse_id,...,going_FAST,going_GOOD,going_GOOD TO FIRM,going_GOOD TO YIELDING,going_SLOW,going_SOFT,going_WET FAST,going_WET SLOW,going_YIELDING,going_YIELDING TO SOFT
0,0,0.0,3,60,0,1020.0,133,7,9.7,3917,...,0,0,1,0,0,0,0,0,0,0
1,0,0.0,3,60,0,980.0,133,12,16.0,2157,...,0,0,1,0,0,0,0,0,0,0
2,0,0.0,3,60,0,1082.0,132,8,3.5,858,...,0,0,1,0,0,0,0,0,0,0
3,0,0.0,3,60,0,1118.0,127,13,39.0,1853,...,0,0,1,0,0,0,0,0,0,0
4,0,0.0,3,60,0,972.0,131,14,50.0,2796,...,0,0,1,0,0,0,0,0,0,0


In [55]:
df.shape

(79445, 55)

### Models

In [110]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

## This is predicting whether a horse wins the race or not.
X_data = df.drop(columns=['won','race_id','win_odds'])
#drop 'won' cause that's the Y, drop race_id cause its noise?
## Theres definitely a better way to do this. We should be predicting which horse out of n horses in race x wins, not if horse is a winner in general
## Maybe we can compare the "likelihood of winning" for all n horses in race X, choose winner based on that, but then we need a model which gives us likelihood

## I hope this works, im trying to get a df of only the win_odds for testing purposes later
Y_data = df[['won', 'win_odds']]


## Probably a better sampling method, but idk this is easiest
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=.2, random_state=69)

only_odds = y_test.drop(columns=['won'])
y_train = y_train.drop(columns=['win_odds'])
y_train = y_train.values.ravel()
#print(y_train)
y_test = y_test.drop(columns=['win_odds'])
y_test = y_test.values.ravel()
only_odds = only_odds.values.ravel()


In [99]:
print("X_train.shape", X_train.shape)
print("y_train.shape", len(y_train), "#Winners:", sum(y_train))
print("X_test.shape", X_test.shape)
print("y_test.shape", len(y_test), "#Winners:", sum(y_test))

X_train.shape (63556, 52)
y_train.shape 63556 #Winners: 5075.0
X_test.shape (15889, 52)
y_test.shape 15889 #Winners: 1285.0


In [134]:
clf_knn = KNeighborsClassifier()
clf_id3 = tree.DecisionTreeClassifier(criterion="entropy", max_depth=10)
clf_id3_overfit = tree.DecisionTreeClassifier(criterion="entropy")
clf_cart = tree.DecisionTreeClassifier(max_depth=10)
clf_bayes = GaussianNB()
clf_rbf = SVC(probability=True)
clf_forest = RandomForestClassifier()
clf_boost = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2), random_state=69)
clf_mlp = MLPClassifier(hidden_layer_sizes=(50,50,), max_iter=1000, tol=0.001, random_state=420)

clf_knn.fit(X_train, y_train)
clf_id3.fit(X_train, y_train)
clf_id3_overfit.fit(X_train, y_train)
clf_cart.fit(X_train, y_train)
clf_bayes.fit(X_train, y_train)
clf_rbf.fit(X_train, y_train)
clf_forest.fit(X_train, y_train)
clf_boost.fit(X_train, y_train)
clf_mlp.fit(X_train, y_train)

y_pred_knn = clf_knn.predict(X_test)
y_pred_id3 = clf_id3.predict(X_test)
y_pred_id3_of = clf_id3_overfit.predict(X_test)
y_pred_cart = clf_cart.predict(X_test)
y_pred_bayes = clf_bayes.predict(X_test)
y_pred_rbf = clf_rbf.predict(X_test)
y_pred_forest = clf_forest.predict(X_test)
y_pred_boost = clf_boost.predict(X_test)
y_pred_mlp = clf_mlp.predict(X_test)


In [135]:
print("Accuracy:")
print("- KNN:", metrics.accuracy_score(y_test, y_pred_knn))
print("- ID3:", metrics.accuracy_score(y_test, y_pred_id3))
print("- ID3 (overfitting):", metrics.accuracy_score(y_test, y_pred_id3_of))
print("- CART:", metrics.accuracy_score(y_test, y_pred_cart))
print("- Naive Bayes:", metrics.accuracy_score(y_test, y_pred_bayes))
print("- RBF Kernel SVC:", metrics.accuracy_score(y_test, y_pred_rbf))
print("- Random Forest:", metrics.accuracy_score(y_test, y_pred_forest))
print("- AdaBoost:", metrics.accuracy_score(y_test, y_pred_boost))
print("- MLP:", metrics.accuracy_score(y_test, y_pred_mlp))

Accuracy:
- KNN: 0.9149096859462521
- ID3: 0.9156019887972812
- ID3 (overfitting): 0.8431619359305179
- CART: 0.9156019887972812
- Naive Bayes: 0.8904273396689534
- RBF Kernel SVC: 0.919126439675247
- Random Forest: 0.9188746931839638
- AdaBoost: 0.9187488199383221
- MLP: 0.919126439675247


In [136]:
## More Metrics!
print("- KNN:\n", metrics.classification_report(y_test, y_pred_knn))
print("- ID3:\n", metrics.classification_report(y_test, y_pred_id3))
print("")
print("- ID3 (overfitting):\n", metrics.classification_report(y_test, y_pred_id3_of))
print("")
print("- CART:\n", metrics.classification_report(y_test, y_pred_cart))
print("")
print("- Naive Bayes:\n", metrics.classification_report(y_test, y_pred_bayes))
print("")
print("- RBF Kernel SVC:\n", metrics.classification_report(y_test, y_pred_rbf))
print("")
print("- Random Forest:\n", metrics.classification_report(y_test, y_pred_forest, zero_division=0))
print("")
print("- AdaBoost:\n", metrics.classification_report(y_test, y_pred_boost))
print("")
print("- MLP:\n", metrics.classification_report(y_test, y_pred_mlp))
print("")

- KNN:
               precision    recall  f1-score   support

         0.0       0.92      0.99      0.96     14604
         1.0       0.19      0.02      0.03      1285

    accuracy                           0.91     15889
   macro avg       0.56      0.51      0.49     15889
weighted avg       0.86      0.91      0.88     15889

- ID3:
               precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     14604
         1.0       0.05      0.00      0.00      1285

    accuracy                           0.92     15889
   macro avg       0.48      0.50      0.48     15889
weighted avg       0.85      0.92      0.88     15889


- ID3 (overfitting):
               precision    recall  f1-score   support

         0.0       0.92      0.91      0.91     14604
         1.0       0.09      0.11      0.10      1285

    accuracy                           0.84     15889
   macro avg       0.51      0.51      0.51     15889
weighted avg       0.85      0.84  

  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
## Maybe we should do regression on 'won' feature so we can get a probability value instead of binary value
## Idk, this is just a start

### Betting \$1 on Every Predicted Winner

In [None]:
## loop through every run in the test data
## predict whether its a winner or not, and then use the odds of the run to add/subtract profit
bankroll_knn = 0
bankroll_id3 = 0
bankroll_id3_of = 0
bankroll_cart = 0
bankroll_bayes = 0
bankroll_rbf = 0
bankroll_forest = 0
bankroll_boost = 0
bankroll_mlp = 0

#print(y_test.shape)
#print(only_odds.shape)
#print(y_test[61253])
i = 0

for index, run in X_test.iterrows():
    #print(index)
    #bet a dollar to place (need to change this to win...)
    prediction_knn = clf_knn.predict([run])
    prediction_id3 = clf_id3.predict([run])
    prediction_id3_of = clf_id3_overfit.predict([run])
    prediction_cart = clf_cart.predict([run])
    prediction_bayes = clf_bayes.predict([run])
    prediction_rbf = clf_rbf.predict([run])
    prediction_forest = clf_forest.predict([run])
    prediction_boost = clf_boost.predict([run])
    prediction_mlp = clf_mlp.predict([run])
    
    #print(prediction)
    odds = only_odds[i]
    if prediction_knn:
        if (y_test[i] == 1):
            bankroll_knn += odds
        else:
            bankroll_knn -= 1
            
    if prediction_id3:
        if (y_test[i] == 1):
            bankroll_id3 += odds
        else:
            bankroll_id3 -= 1
    
    if prediction_id3_of:
        if (y_test[i] == 1):
            bankroll_id3_of += odds
        else:
            bankroll_id3_of -= 1
    
    if prediction_cart:
        if (y_test[i] == 1):
            bankroll_cart += odds
        else:
            bankroll_cart -= 1
    
    if prediction_bayes:
        if (y_test[i] == 1):
            bankroll_bayes += odds
        else:
            bankroll_bayes -= 1
            
    if prediction_rbf:
        if (y_test[i] == 1):
            bankroll_rbf += odds
        else:
            bankroll_rbf -= 1
            
    if prediction_forest:
        if (y_test[i] == 1):
            bankroll_forest += odds
        else:
            bankroll_forest -= 1
            
    if prediction_boost:
        if (y_test[i] == 1):
            bankroll_boost += odds
        else:
            bankroll_boost -= 1
    
    if prediction_mlp:
        if (y_test[i] == 1):
            bankroll_mlp += odds
        else:
            bankroll_mlp -= 1

    i += 1
    

        

In [132]:
print("-----Betting 1 on Every Predicted Winner-----")
print("Bankroll KNN: $", round(bankroll_knn,2))
print("Bankroll ID3: $", round(bankroll_id3,2))
print("Bankroll ID3_OF: $", round(bankroll_id3_of,2))
print("Bankroll CART: $", round(bankroll_cart,2))
print("Bankroll Bayes: $", round(bankroll_bayes,2))
print("Bankroll rbf: $", round(bankroll_rbf,2))
print("Bankroll Forest: $", round(bankroll_forest,2))
print("Bankroll boost: $", round(bankroll_boost,2))
print("Bankroll mlp: $", round(bankroll_mlp,2))

-----Betting 1 on Every Predicted Winner-----
Bankroll KNN: $ 33.6
Bankroll ID3: $ -23.9
Bankroll ID3_OF: $ -250.8
Bankroll CART: $ -2.7
Bankroll Bayes: $ -66.8
Bankroll rbf: $ 0
Bankroll Forest: $ 5.9
Bankroll boost: $ -5.1
Bankroll mlp: $ 0


## Betting $1 on Every Horse whose "True Odds" are better than Odds Given

In [158]:
## loop through every run in the test data
## predict whether its a winner or not, and then use the odds of the run to add/subtract profit

# I only really care about TP and FP cause we only bet on a predicted positive
bankroll_knn = 0
knn_TP, knn_FP = 0, 0

bankroll_id3 = 0
id3_TP, id3_FP = 0, 0

bankroll_id3_of = 0
id3_of_TP, id3_of_FP = 0, 0

bankroll_cart = 0
cart_TP, cart_FP = 0, 0

bankroll_bayes = 0
bayes_TP, bayes_FP = 0, 0

bankroll_rbf = 0
rbf_TP, rbf_FP = 0, 0

bankroll_forest = 0
forest_TP, forest_FP = 0, 0

bankroll_boost = 0
boost_TP, boost_FP = 0, 0

bankroll_mlp = 0
mlp_TP, mlp_FP = 0, 0

#print(y_test.shape)
#print(only_odds.shape)
#print(y_test[61253])
i = 0

print(only_odds[0])

#Probably a better way to do this
for index, run in X_test.iterrows():
    #print(index)
    #bet a dollar to place (need to change this to win...)
    prediction_knn = clf_knn.predict_proba([run])
    prediction_id3 = clf_id3.predict_proba([run])
    prediction_id3_of = clf_id3_overfit.predict_proba([run])
    prediction_cart = clf_cart.predict_proba([run])
    prediction_bayes = clf_bayes.predict_proba([run])
    prediction_rbf = clf_rbf.predict_proba([run])
    prediction_forest = clf_forest.predict_proba([run])
    prediction_boost = clf_boost.predict_proba([run])
    prediction_mlp = clf_mlp.predict_proba([run])
    
    #print(prediction)
    
    ## Decimal odds -> implied odds = 1/(decimal odds)
    implied_odds = 1.0/only_odds[i]
    odds = only_odds[i]
    
    
    if prediction_knn[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_knn += odds
            knn_TP += 1
        else:
            bankroll_knn -= 1
            knn_FP += 1
            
    if prediction_id3[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_id3 += odds
            id3_TP += 1
        else:
            bankroll_id3 -= 1
            id3_FP += 1
    
    if prediction_id3_of[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_id3_of += odds
            id3_of_TP += 1
        else:
            bankroll_id3_of -= 1
            id3_of_FP += 1
    
    if prediction_cart[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_cart += odds
            cart_TP += 1
        else:
            bankroll_cart -= 1
            cart_FP += 1
    
    if prediction_bayes[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_bayes += odds
            bayes_TP += 1
        else:
            bankroll_bayes -= 1
            bayes_FP += 1
            
    if prediction_rbf[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_rbf += odds
            rbf_TP += 1
        else:
            bankroll_rbf -= 1
            rbf_FP += 1
            
    if prediction_forest[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_forest += odds
            forest_TP += 1
        else:
            bankroll_forest -= 1
            forest_FP += 1
            
    if prediction_boost[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_boost += odds
            boost_TP += 1
        else:
            bankroll_boost -= 1
            boost_FP += 1
    
    if prediction_mlp[0][1] > implied_odds:
        if (y_test[i] == 1):
            bankroll_mlp += odds
            mlp_TP += 1
        else:
            bankroll_mlp -= 1
            mlp_FP += 1

    i += 1
    


99.0


In [160]:
print("-----Betting 1 on Every Predicted Winner-----")
print("Bankroll KNN: $", round(bankroll_knn,2))
print("Bankroll ID3: $", round(bankroll_id3,2))
print("Bankroll ID3_OF: $", round(bankroll_id3_of,2))
print("Bankroll CART: $", round(bankroll_cart,2))
print("Bankroll Bayes: $", round(bankroll_bayes,2))
print("Bankroll rbf: $", round(bankroll_rbf,2))
print("Bankroll Forest: $", round(bankroll_forest,2))
print("Bankroll boost: $", round(bankroll_boost,2))
print("Bankroll mlp: $", round(bankroll_mlp,2),"\n")

print("KNN TP: ", knn_TP, "KNN FP: ", knn_FP)
print("ID3 TP: ", id3_TP, "ID3 FP: ", id3_FP)
print("ID3_OF TP: ", id3_of_TP, "ID3_OF FP: ", id3_of_FP)
print("CART TP: ", cart_TP, "CART FP: ", cart_FP)
print("BAYES TP: ", bayes_TP, "BAYES FP: ", bayes_FP)
print("rbf TP: ", rbf_TP, "rbf FP: ", rbf_FP)
print("forest TP: ", forest_TP, "forest FP: ", forest_FP)
print("boost TP: ", boost_TP, "boost FP: ", boost_FP)
print("mlp TP: ", mlp_TP, "mlp FP: ", mlp_FP)

-----Betting 1 on Every Predicted Winner-----
Bankroll KNN: $ -580.8
Bankroll ID3: $ -2478.1
Bankroll ID3_OF: $ -307.0
Bankroll CART: $ -2443.3
Bankroll Bayes: $ -2544.3
Bankroll rbf: $ -2382.0
Bankroll Forest: $ -2358.8
Bankroll boost: $ -2636.2
Bankroll mlp: $ -2454.5 

KNN TP:  327 KNN FP:  4241
ID3 TP:  243 ID3 FP:  7852
ID3_OF TP:  1479 ID3_OF FP:  0
CART TP:  247 CART FP:  8085
BAYES TP:  597 BAYES FP:  11033
rbf TP:  244 rbf FP:  8699
forest TP:  283 forest FP:  8171
boost TP:  1209 boost FP:  14513
mlp TP:  261 mlp FP:  8865


In [156]:
# so that didn't work ^
# We can try using a betting Criterion (Kelly Criterion) which changes amount bet based on perceived advantage.
#
# we could also try regression on time to complete the race, and compare the predicted time for each horse in a race
#
# we should also optimize the models, experiment with more / different features (avg speed of last N races for example)
#
# Features used in Bolton and Chapman:
# Lifetime Win %, Average Speed Rating, Winnings/Race past year, Weight, Post Position, New Distance?, Last Speed Rating, Jockey Win Percent, Jockey Win #
# I think most of these can be calculated form our data. 

#this is just a start