In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
# Function to download the dataset
def download_horseracing_dataset(
    dataset_name: str = "gunner38/horseracing",
    dataset_path: str = "data/",
) -> None:
    """Download a Kaggle dataset and unzip it into a local folder.

    Authenticates with the Kaggle API, makes sure ``dataset_path`` exists, and
    downloads/unzips every file of ``dataset_name`` into it. Progress is
    reported with ``print``.

    Args:
        dataset_name: Kaggle dataset slug in ``owner/dataset`` form.
        dataset_path: Folder the files are extracted into; created if missing.
    """
    api = KaggleApi()
    # NOTE(review): authenticate() presumably reads the user's Kaggle
    # credentials from the environment or a config file -- confirm the setup.
    api.authenticate()

    os.makedirs(dataset_path, exist_ok=True)

    print(f"Downloading the dataset '{dataset_name}'...")
    api.dataset_download_files(dataset_name, path=dataset_path, unzip=True)
    print(f"Dataset saved in the folder: {dataset_path}")

# Download the dataset only when the CSV is not already on disk, so re-running
# the cell does not hit the Kaggle API again.
if not os.path.exists("data/tips.csv"):
    download_horseracing_dataset()

# Load the dataset with adjusted encoding
try:
    data = pd.read_csv("data/tips.csv", encoding="latin1")  # Adjust the encoding if necessary
except UnicodeDecodeError as e:
    print(f"Error loading the dataset: {e}")
    print("Check the file's encoding.")
    # Re-raise: without a loaded frame the preprocessing below would fail with
    # an unrelated NameError (or silently reuse a stale `data` from an earlier run).
    raise
else:
    print("Dataset successfully loaded. First rows:")
    print(data.head())

# Basic preprocessing (adjust as needed)
data = data.dropna()  # Remove rows that contain any missing value


Downloading the dataset 'gunner38/horseracing'...
Dataset URL: https://www.kaggle.com/datasets/gunner38/horseracing
Dataset saved in the folder: data/
Dataset successfully loaded. First rows:
   UID  ID    Tipster        Date      Track              Horse Bet Type  \
0    1   1  Tipster A  24/07/2015      Ascot          Fredricka      Win   
1    2   2  Tipster A  24/07/2015     Thirsk      Spend A Penny      Win   
2    3   3  Tipster A  24/07/2015       York  Straightothepoint      Win   
3    4   4  Tipster A  24/07/2015  Newmarket     Miss Inga Sock      Win   
4    5   5  Tipster A  25/07/2015      Ascot              Peril      Win   

   Odds Result  TipsterActive  
0  8.00   Lose           True  
1  4.50   Lose           True  
2  7.00   Lose           True  
3  5.00   Lose           True  
4  4.33    Win           True  


In [3]:
# Preview the first five rows of `data` using the notebook's rich display.
data.head(n=5)

Unnamed: 0,UID,ID,Tipster,Date,Track,Horse,Bet Type,Odds,Result,TipsterActive
0,1,1,Tipster A,24/07/2015,Ascot,Fredricka,Win,8.0,Lose,True
1,2,2,Tipster A,24/07/2015,Thirsk,Spend A Penny,Win,4.5,Lose,True
2,3,3,Tipster A,24/07/2015,York,Straightothepoint,Win,7.0,Lose,True
3,4,4,Tipster A,24/07/2015,Newmarket,Miss Inga Sock,Win,5.0,Lose,True
4,5,5,Tipster A,25/07/2015,Ascot,Peril,Win,4.33,Win,True


In [4]:
# Encode categorical variables
# Each text column gets its own LabelEncoder; the fitted encoders are kept in
# `label_encoders`, keyed by column name.
# NOTE(review): this overwrites the columns of `data` in place, so re-running
# the cell re-fits the encoders on the integer codes rather than the original text.
label_encoders = {}
for column in ["Tipster", "Track", "Horse", "Bet Type", "Result"]:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Feature and target selection
X = data[["Tipster", "Track", "Horse", "Bet Type", "Odds", "TipsterActive"]]
y = data["Result"]

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out test split
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, predictions))

# Save the trained model
# Create the output folder first: open(..., "wb") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
model_path = "model/horse_racing_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE(review): only `model` is pickled; the fitted `label_encoders` are not
# saved, so inference code presumably needs them persisted elsewhere -- confirm.
with open(model_path, "wb") as f:
    pickle.dump(model, f)

print("Model trained and saved in 'model/horse_racing_model.pkl'")


Accuracy: 0.7783006535947713
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87      6128
           1       0.34      0.13      0.19      1522

    accuracy                           0.78      7650
   macro avg       0.58      0.53      0.53      7650
weighted avg       0.72      0.78      0.74      7650

Model trained and saved in 'model/horse_racing_model.pkl'
