In [9]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [10]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [None]:
# Relative paths based on the notebook's location
# RAW_DATA_PATH: the raw CSV that the loader checks for and reads
RAW_DATA_PATH = "../data/01_raw/Horse Racing Results.CSV"
# PROCESSED_DATA_PATH: where the cleaned (NaN-free) copy of the dataset is written
PROCESSED_DATA_PATH = "../data/02_processed/clean_Horse Racing Results.CSV"

# Function to download (when missing) and load the dataset
def load_horseracing_dataset(raw_path=None):
    """Load the raw horse racing results CSV into a DataFrame.

    If the CSV is not present on disk, the Kaggle dataset
    ``bogdandoicin/horse-racing-results-2017-2020`` is downloaded and unzipped
    into the directory that should contain the CSV, after calling
    ``KaggleApi.authenticate()``.

    Args:
        raw_path: Path of the CSV file to read. Defaults to the module-level
            ``RAW_DATA_PATH`` when omitted, which preserves the original
            zero-argument call.

    Returns:
        pandas.DataFrame parsed from the file, read as latin1-encoded,
        semicolon-separated text; malformed lines are skipped.

    Raises:
        RuntimeError: If reading the CSV fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    if raw_path is None:
        raw_path = RAW_DATA_PATH

    if not os.path.exists(raw_path):
        # Derive the download folder from the CSV path so the two cannot drift apart
        download_dir = os.path.dirname(raw_path) or "."
        os.makedirs(download_dir, exist_ok=True)

        api = KaggleApi()
        api.authenticate()
        # Download and unzip the dataset next to where the CSV is expected
        api.dataset_download_files(
            "bogdandoicin/horse-racing-results-2017-2020",
            path=download_dir,
            unzip=True,
        )

    try:
        # on_bad_lines="skip" drops rows that cannot be parsed instead of failing
        return pd.read_csv(raw_path, encoding="latin1", sep=";", on_bad_lines="skip", engine="python")
    except Exception as e:
        # Chain explicitly so the underlying parser/IO error stays in the traceback
        raise RuntimeError(f"Failed to load dataset: {e}") from e

# Load the raw dataset and preview its first rows
data = load_horseracing_dataset()
print("Dataset successfully loaded. First rows:", data.head(), sep="\n")

# Basic preprocessing: keep only the rows that have no missing values
df = data.dropna()
print(f"Dataset shape: {df.shape!s}")
print(f"Columns: {df.columns!s}")

# Save the cleaned dataset under `data/02_processed`, creating the folder first if needed
processed_dir = os.path.dirname(PROCESSED_DATA_PATH)
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(PROCESSED_DATA_PATH, index=False)
print("Clean dataset saved at '" + PROCESSED_DATA_PATH + "'")


Dataset URL: https://www.kaggle.com/datasets/bogdandoicin/horse-racing-results-2017-2020
Dataset successfully loaded. First rows:
         Dato    Track  Race Number  Distance Surface  Prize money  \
0  03.09.2017  Sha Tin           10      1400   Gress      1310000   
1  16.09.2017  Sha Tin           10      1400   Gress      1310000   
2  14.10.2017  Sha Tin           10      1400   Gress      1310000   
3  11.11.2017  Sha Tin            9      1600   Gress      1310000   
4  26.11.2017  Sha Tin            9      1600   Gress      1310000   

   Starting position         Jockey  Jockey weight  Country  Horse age  \
0                  6      K C Leung             52  Sverige          7   
1                 14         C Y Ho             52  Sverige          7   
2                  8         C Y Ho             52  Sverige          7   
3                 13  Brett Prebble             54  Sverige          7   
4                  9         C Y Ho             52  Sverige          7   

  Tr

In [14]:
# Rich preview of the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,Dato,Track,Race Number,Distance,Surface,Prize money,Starting position,Jockey,Jockey weight,Country,Horse age,TrainerName,Race time,Path,Final place,FGrating,Odds,RaceType,HorseId,JockeyId,TrainerID
0,03.09.2017,Sha Tin,10,1400,Gress,1310000,6,K C Leung,52,Sverige,7,CH Yip,8338,2,9,110,22,Handicap,1736,8656,6687
1,16.09.2017,Sha Tin,10,1400,Gress,1310000,14,C Y Ho,52,Sverige,7,CH Yip,8156,3,4,124,48,Handicap,1736,8659,6687
2,14.10.2017,Sha Tin,10,1400,Gress,1310000,8,C Y Ho,52,Sverige,7,CH Yip,8236,1,6,118,11,Handicap,1736,8659,6687
3,11.11.2017,Sha Tin,9,1600,Gress,1310000,13,Brett Prebble,54,Sverige,7,CH Yip,9653,0,8,107,11,Handicap,1736,8453,6687
4,26.11.2017,Sha Tin,9,1600,Gress,1310000,9,C Y Ho,52,Sverige,7,CH Yip,9417,0,3,123,40,Handicap,1736,8659,6687
