# Horse Racing Classification
### Jamie Heneghan, Melka Konshie, Kyle Duffy, Walter Robson, Sam Allison
### CSE 40647

**Getting the Data**

In [3]:
import pandas as pd
import numpy as np
runs = pd.read_csv("./runs.csv")
races = pd.read_csv("./races.csv")

In [4]:
runs

Unnamed: 0,race_id,horse_no,horse_id,result,won,lengths_behind,horse_age,horse_country,horse_type,horse_rating,...,time2,time3,time4,time5,time6,finish_time,win_odds,place_odds,trainer_id,jockey_id
0,0,1,3917,10,0.0,8.00,3,AUS,Gelding,60,...,21.59,23.86,24.62,,,83.92,9.7,3.7,118,2
1,0,2,2157,8,0.0,5.75,3,NZ,Gelding,60,...,21.99,23.30,23.70,,,83.56,16.0,4.9,164,57
2,0,3,858,7,0.0,4.75,3,NZ,Gelding,60,...,21.59,23.90,24.22,,,83.40,3.5,1.5,137,18
3,0,4,1853,9,0.0,6.25,3,SAF,Gelding,60,...,21.83,23.70,24.00,,,83.62,39.0,11.0,80,59
4,0,5,2796,6,0.0,3.75,3,GB,Gelding,60,...,21.75,23.22,23.50,,,83.24,50.0,14.0,9,154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79442,6348,10,1238,6,0.0,1.25,5,AUS,Gelding,87,...,23.98,23.53,22.52,,,96.17,99.0,,73,40
79443,6348,11,985,7,0.0,2.25,5,NZ,Gelding,84,...,24.10,23.77,22.61,,,96.30,99.0,,164,63
79444,6348,12,3147,10,0.0,3.00,5,NZ,Gelding,83,...,24.06,23.37,22.43,,,96.44,99.0,,111,68
79445,6348,13,1982,8,0.0,2.50,3,AUS,Gelding,82,...,24.02,23.45,22.29,,,96.34,4.6,,118,95


In [5]:
races.head()

Unnamed: 0,race_id,date,venue,race_no,config,surface,distance,going,horse_ratings,prize,...,place_combination3,place_combination4,place_dividend1,place_dividend2,place_dividend3,place_dividend4,win_combination1,win_dividend1,win_combination2,win_dividend2
0,0,1997-06-02,ST,1,A,0,1400,GOOD TO FIRM,40-15,485000.0,...,6.0,,36.5,25.5,18.0,,8,121.0,,
1,1,1997-06-02,ST,2,A,0,1200,GOOD TO FIRM,40-15,485000.0,...,4.0,,12.5,47.0,33.5,,5,23.5,,
2,2,1997-06-02,ST,3,A,0,1400,GOOD TO FIRM,60-40,625000.0,...,13.0,,23.0,23.0,59.5,,11,70.0,,
3,3,1997-06-02,ST,4,A,0,1200,GOOD TO FIRM,120-95,1750000.0,...,10.0,,14.0,24.5,16.0,,5,52.0,,
4,4,1997-06-02,ST,5,A,0,1600,GOOD TO FIRM,60-40,625000.0,...,1.0,,15.5,28.0,17.5,,2,36.5,,


In [6]:
## Take the features we want (This can be changed)

In [7]:
runs_features = runs[['race_id', 'won', 'horse_age', 'horse_country', 'horse_type',
                      'horse_rating', 'horse_gear', 'declared_weight', 'actual_weight',
                      'draw', 'place_odds', 'horse_id']]

races_features = races[['race_id', 'venue', 'config', 'distance', 'going', 'race_class']]


In [8]:
## Merge the races and runs by race ID

In [9]:
df = pd.merge(runs_features, races_features, on='race_id')

In [10]:
## Drop Missing Values (this can be changed too, just easier this way)

In [11]:
df = df.dropna()

In [12]:
df.shape

(75710, 17)

**Clean & Process the Data**

In [13]:
## Convert gear -> binary
df['horse_gear'] = np.where(df['horse_gear'] == '--', 0, 1)

df.columns

Index(['race_id', 'won', 'horse_age', 'horse_country', 'horse_type',
       'horse_rating', 'horse_gear', 'declared_weight', 'actual_weight',
       'draw', 'place_odds', 'horse_id', 'venue', 'config', 'distance',
       'going', 'race_class'],
      dtype='object')

In [14]:
## Categorical -> Numerical (one hot encoding)
df = pd.get_dummies(df)

In [15]:
df.head()

Unnamed: 0,race_id,won,horse_age,horse_rating,horse_gear,declared_weight,actual_weight,draw,place_odds,horse_id,...,going_FAST,going_GOOD,going_GOOD TO FIRM,going_GOOD TO YIELDING,going_SLOW,going_SOFT,going_WET FAST,going_WET SLOW,going_YIELDING,going_YIELDING TO SOFT
0,0,0.0,3,60,0,1020.0,133,7,3.7,3917,...,0,0,1,0,0,0,0,0,0,0
1,0,0.0,3,60,0,980.0,133,12,4.9,2157,...,0,0,1,0,0,0,0,0,0,0
2,0,0.0,3,60,0,1082.0,132,8,1.5,858,...,0,0,1,0,0,0,0,0,0,0
3,0,0.0,3,60,0,1118.0,127,13,11.0,1853,...,0,0,1,0,0,0,0,0,0,0
4,0,0.0,3,60,0,972.0,131,14,14.0,2796,...,0,0,1,0,0,0,0,0,0,0


In [16]:
df.shape

(75710, 54)

### Models

In [17]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

## This is predicting whether a horse wins the race or not.
X_data = df.drop(columns=['won','race_id']) #drop 'won' cause that's the Y, drop race_id cause its noise?
## Theres definitely a better way to do this. We should be predicting which horse out of n horses in race x wins, not if horse is a winner in general
## Maybe we can compare the "likelihood of winning" for all n horses in race X, choose winner based on that, but then we need a model which gives us likelihood

Y_data = df['won']


## Probably a sampling method, but idk this is easiest
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=.2, random_state=69)

In [18]:
print("X_train.shape", X_train.shape)
print("y_train.shape", len(y_train), "#Winners:", sum(y_train))
print("X_test.shape", X_test.shape)
print("y_test.shape", len(y_test), "#Winners:", sum(y_test))

X_train.shape (60568, 52)
y_train.shape 60568 #Winners: 4845.0
X_test.shape (15142, 52)
y_test.shape 15142 #Winners: 1214.0


In [19]:
clf_knn = KNeighborsClassifier()
clf_id3 = tree.DecisionTreeClassifier(criterion="entropy", max_depth=10)
clf_id3_overfit = tree.DecisionTreeClassifier(criterion="entropy")
clf_cart = tree.DecisionTreeClassifier(max_depth=10)
clf_bayes = GaussianNB()
clf_rbf = SVC()
clf_forest = RandomForestClassifier()
clf_boost = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2), random_state=69)
clf_mlp = MLPClassifier(hidden_layer_sizes=(50,50,), max_iter=1000, tol=0.001, random_state=420)

clf_knn.fit(X_train, y_train)
clf_id3.fit(X_train, y_train)
clf_id3_overfit.fit(X_train, y_train)
clf_cart.fit(X_train, y_train)
clf_bayes.fit(X_train, y_train)
clf_rbf.fit(X_train, y_train)
clf_forest.fit(X_train, y_train)
clf_boost.fit(X_train, y_train)
clf_mlp.fit(X_train, y_train)

y_pred_knn = clf_knn.predict(X_test)
y_pred_id3 = clf_id3.predict(X_test)
y_pred_id3_of = clf_id3_overfit.predict(X_test)
y_pred_cart = clf_cart.predict(X_test)
y_pred_bayes = clf_bayes.predict(X_test)
y_pred_rbf = clf_rbf.predict(X_test)
y_pred_forest = clf_forest.predict(X_test)
y_pred_boost = clf_boost.predict(X_test)
y_pred_mlp = clf_mlp.predict(X_test)


In [20]:
print("Accuracy:")
print("- KNN:", metrics.accuracy_score(y_test, y_pred_knn))
print("- ID3:", metrics.accuracy_score(y_test, y_pred_id3))
print("- ID3 (overfitting):", metrics.accuracy_score(y_test, y_pred_id3_of))
print("- CART:", metrics.accuracy_score(y_test, y_pred_cart))
print("- Naive Bayes:", metrics.accuracy_score(y_test, y_pred_bayes))
print("- RBF Kernel SVC:", metrics.accuracy_score(y_test, y_pred_rbf))
print("- Random Forest:", metrics.accuracy_score(y_test, y_pred_forest))
print("- AdaBoost:", metrics.accuracy_score(y_test, y_pred_boost))
print("- MLP:", metrics.accuracy_score(y_test, y_pred_mlp))

Accuracy:
- KNN: 0.9131554616299036
- ID3: 0.9159952450138688
- ID3 (overfitting): 0.8581429137498349
- CART: 0.9162594109100515
- Naive Bayes: 0.7425703341698586
- RBF Kernel SVC: 0.9198256505085194
- Random Forest: 0.9192973187161537
- AdaBoost: 0.9192973187161537
- MLP: 0.919693567560428


In [22]:
## More Metrics!
print("- KNN:\n", metrics.classification_report(y_test, y_pred_knn))
print("- ID3:\n", metrics.classification_report(y_test, y_pred_id3))
print("")
print("- ID3 (overfitting):\n", metrics.classification_report(y_test, y_pred_id3_of))
print("")
print("- CART:\n", metrics.classification_report(y_test, y_pred_cart))
print("")
print("- Naive Bayes:\n", metrics.classification_report(y_test, y_pred_bayes))
print("")
print("- RBF Kernel SVC:\n", metrics.classification_report(y_test, y_pred_rbf))
print("")
print("- Random Forest:\n", metrics.classification_report(y_test, y_pred_forest, zero_division=0))
print("")
print("- AdaBoost:\n", metrics.classification_report(y_test, y_pred_boost))
print("")
print("- MLP:\n", metrics.classification_report(y_test, y_pred_mlp))
print("")

- KNN:
               precision    recall  f1-score   support

         0.0       0.92      0.99      0.95     13928
         1.0       0.15      0.02      0.03      1214

    accuracy                           0.91     15142
   macro avg       0.54      0.50      0.49     15142
weighted avg       0.86      0.91      0.88     15142

[[13805   123]
 [ 1192    22]]
- ID3:
               precision    recall  f1-score   support

         0.0       0.92      0.99      0.96     13928
         1.0       0.31      0.04      0.07      1214

    accuracy                           0.92     15142
   macro avg       0.62      0.52      0.51     15142
weighted avg       0.87      0.92      0.88     15142


- ID3 (overfitting):
               precision    recall  f1-score   support

         0.0       0.93      0.92      0.92     13928
         1.0       0.15      0.17      0.16      1214

    accuracy                           0.86     15142
   macro avg       0.54      0.54      0.54     15142
weig

In [2]:
## Maybe we should do regression on 'won' feature so we can get a probability value instead of binary value
## Idk, this is just a start