In [30]:
import pandas as pd
import numpy as np
import os

DIST = 265

pd.set_option('display.max_columns', None)

In [31]:
def load_dataset(low_memory=True):
    file_name = fr'C:\machine learning\gb_greyhound\grayhound\data\train\datasets\dataset_{DIST}.0.csv'
    return pd.read_csv(file_name, low_memory=low_memory)

main_df = load_dataset(low_memory=False)

In [32]:
columns_to_drop = ['SP', 'dogBorn', 'dogColour', 'dogId', 'dogSeason', 
           'dogSex', 'dogSire', 'dogSire', 'meetingId', 
           'ownerName', 'raceForecast', 'raceId', 'raceNumber', 
           'raceHandicap', 'racePrizes', 'raceTricast', 'raceType', 
           'resultAdjustedTime', 'resultMarketCnt', 'resultMarketPos',
           'trainerName', 'trapHandicap', 'raceTime', 'raceTime',
            'resultSectionalTime', 'raceDistance', 'resultComment',
            'trackName'
]

df = main_df.drop(columns_to_drop, axis=1)
df.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,trapNumber
0,Murlens Crash,OR,12/01/2014,,,34.9,1.0,4.0,5.0,16.17,6
1,A New Venture,OR,12/01/2014,,5 1/2,27.8,2.0,1.0,4.0,16.62,1
2,Lynton Kingpin,OR,12/01/2014,,1/2,31.5,3.0,1.0,10.0,16.65,2
3,Matching Kian,OR,12/01/2014,,2,34.1,4.0,1.0,2.0,16.81,3
4,Jumeirah Lark,OR,12/01/2014,,1 3/4,32.6,5.0,1.0,25.0,16.96,5


In [33]:
columns = df.columns

for col in columns:
    print(col, ':', df[col].nunique())

dogName : 3115
raceClass : 12
raceDate : 1831
raceGoing : 15
resultBtnDistance : 72
resultDogWeight : 209
resultPosition : 7
resultPriceDenominator : 13
resultPriceNumerator : 26
resultRunTime : 269
trapNumber : 6


In [34]:
df.replace(['', ' ', np.nan], np.nan)
df_cleaned = df.dropna(subset=['dogName', 'raceClass', 'resultPosition', 'resultPriceDenominator', 'resultPriceNumerator', 'raceDate'])

In [35]:
df_cleaned['resultPosition'].value_counts()

resultPosition
1.0    7233
2.0    7204
3.0    7204
4.0    7164
5.0    7113
6.0    6379
0.0      20
Name: count, dtype: int64

In [36]:
df_cleaned = df_cleaned[df_cleaned['resultPosition'] != 0.0]

In [37]:
len(df_cleaned)

42297

In [38]:
df_cleaned['raceClass'].unique()

array(['OR', 'D2', 'D3', 'IT', 'D1', 'HD1', 'D4', 'D5', 'IV', 'OR3',
       'OR1'], dtype=object)

In [39]:
len(df_cleaned[df_cleaned['raceClass'] == 'HD1'])

618

In [40]:
df_cleaned = df_cleaned[df_cleaned['raceClass'] != 'HD1']

In [41]:
df_cleaned.loc[:, 'raceDate'] = pd.to_datetime(df_cleaned['raceDate'], format='%d/%m/%Y')
df_cleaned.loc[:, 'forecast'] = df_cleaned['resultPriceNumerator'] / df_cleaned['resultPriceDenominator']

In [42]:
def process_dog_data(df):
    dog_names = df['dogName'].unique()

    results = []
    
    for dog in dog_names:
        dog_data = df[df['dogName'] == dog].sort_values(by='raceDate')
        
        dog_results = []
        for i, row in dog_data.iterrows():
            current_race_date = row['raceDate']
            
            previous_races = dog_data[dog_data['raceDate'] < current_race_date].tail(5)
            
            by = previous_races['resultBtnDistance'].tolist()
            finished = previous_races['resultPosition'].tolist()
            going = previous_races['raceGoing'].tolist()
            price_dens = previous_races['resultPriceDenominator'].tolist()
            price_nums = previous_races['resultPriceNumerator'].tolist()
            race_grade = previous_races['raceClass'].tolist()
            run_time = previous_races['resultRunTime'].tolist()
            trap = previous_races['trapNumber'].tolist()
            weight = previous_races['resultDogWeight'].tolist()

            while len(by) < 5:
                by.append(np.nan)
                finished.append(np.nan)
                going.append(np.nan)
                price_dens.append(np.nan)
                price_nums.append(np.nan)
                race_grade.append(np.nan)
                run_time.append(np.nan)
                trap.append(np.nan)
                weight.append(np.nan)

            result = (by + finished + going + 
                      price_dens + price_nums +
                      race_grade + run_time + 
                      trap + weight)

            dog_results.append(result)
        
        dog_data_results = pd.DataFrame(dog_results, columns=[
            'by_1', 'by_2', 'by_3', 'by_4', 'by_5',
            'finished_1', 'finished_2', 'finished_3', 'finished_4', 'finished_5',
            'going_1', 'going_2', 'going_3', 'going_4', 'going_5',
            'price_dens_1', 'price_dens_2', 'price_dens_3', 'price_dens_4', 'price_dens_5',
            'price_nums_1', 'price_nums_2', 'price_nums_3', 'price_nums_4', 'price_nums_5',
            'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5',
            'run_time_1', 'run_time_2', 'run_time_3', 'run_time_4', 'run_time_5',
            'trap_1', 'trap_2', 'trap_3', 'trap_4', 'trap_5',
            'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5'
        ])
        dog_data.reset_index(drop=True, inplace=True)
        dog_data_results.reset_index(drop=True, inplace=True)
        
        combined_data = pd.concat([dog_data, dog_data_results], axis=1)
        results.append(combined_data)
    
    final_df = pd.concat(results).sort_index()

    return final_df

df_cleaned = df_cleaned.replace(['', ' ', None], np.nan)
df_full = process_dog_data(df_cleaned)

  df_cleaned = df_cleaned.replace(['', ' ', None], np.nan)


In [43]:
df_full = df_full.dropna(subset=['finished_1'])

In [44]:
df_full.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,trapNumber,forecast,by_1,by_2,by_3,by_4,by_5,finished_1,finished_2,finished_3,finished_4,finished_5,going_1,going_2,going_3,going_4,going_5,price_dens_1,price_dens_2,price_dens_3,price_dens_4,price_dens_5,price_nums_1,price_nums_2,price_nums_3,price_nums_4,price_nums_5,race_grade_1,race_grade_2,race_grade_3,race_grade_4,race_grade_5,run_time_1,run_time_2,run_time_3,run_time_4,run_time_5,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5
1,Drimeen Heights,D3,2021-01-04,-40.0,DIS,30.1,6.0,1.0,6.0,,5,6.0,,,,,,1.0,,,,,0.0,,,,,1.0,,,,,6.0,,,,,D4,,,,,17.23,,,,,6.0,,,,,29.2,,,,
1,Rough Conor,D5,2022-09-26,-5.0,3/4,34.7,3.0,1.0,3.0,17.42,2,3.0,HD,,,,,4.0,,,,,-10.0,,,,,1.0,,,,,4.0,,,,,D5,,,,,17.66,,,,,2.0,,,,,34.0,,,,
1,Russmur Bound,D2,2018-10-08,-5.0,3,31.8,6.0,1.0,6.0,17.51,4,6.0,1,,,,,6.0,,,,,-5.0,,,,,2.0,,,,,11.0,,,,,D2,,,,,17.6,,,,,4.0,,,,,31.7,,,,
1,Killieford Novo,D1,2021-01-08,-5.0,1 1/2,34.4,3.0,8.0,11.0,16.99,4,1.375,1/2,,,,,2.0,,,,,0.0,,,,,8.0,,,,,13.0,,,,,D1,,,,,16.73,,,,,3.0,,,,,34.7,,,,
1,Clomantagh Moth,D3,2018-10-09,-10.0,,24.1,1.0,3.0,10.0,17.37,3,3.333333,1 1/4,,,,,4.0,,,,,-5.0,,,,,2.0,,,,,9.0,,,,,D3,,,,,17.79,,,,,3.0,,,,,23.7,,,,


In [45]:
df_full = df_full.replace([None], np.nan)

In [46]:
def fill_miss_values(row):
    last_results = None
    size = None
    start_index = None

    for i in range(2, 6):
        if pd.isna(row[f'finished_{i}']):
            last_results = row[[f'by_{i - 1}', f'finished_{i - 1}', 
                                f'going_{i - 1}', f'price_dens_{i - 1}', f'price_nums_{i - 1}',
                                f'race_grade_{i - 1}', f'run_time_{i - 1}', f'trap_{i - 1}', f'weight_{i - 1}']]
            size = 5 - i + 1
            start_index = i
            break
    
    if size is not None:
        for j in range(size):
            idx = start_index + j
            row[f'by_{idx}'] = last_results[f'by_{start_index - 1}']
            row[f'finished_{idx}'] = last_results[f'finished_{start_index - 1}']
            row[f'going_{idx}'] = last_results[f'going_{start_index - 1}']
            row[f'price_dens_{idx}'] = last_results[f'price_dens_{start_index - 1}']
            row[f'price_nums_{idx}'] = last_results[f'price_nums_{start_index - 1}']
            row[f'race_grade_{idx}'] = last_results[f'race_grade_{start_index - 1}']
            row[f'run_time_{idx}'] = last_results[f'run_time_{start_index - 1}']
            row[f'trap_{idx}'] = last_results[f'trap_{start_index - 1}']
            row[f'weight_{idx}'] = last_results[f'weight_{start_index - 1}']
    
    return row

In [47]:
df_full = df_full.apply(fill_miss_values, axis=1)

In [48]:
for i in range(5):
    df_full[f'odds_{i+1}'] = df_full[f'price_nums_{i+1}'] / df_full[f'price_dens_{i+1}']

columns_to_drop = ['dogName', 'raceDate', 'resultBtnDistance', 
                   'raceGoing', 'resultPriceDenominator', 
                   'resultPriceNumerator', 'resultRunTime',
                   'resultDogWeight', 'price_dens_1', 'price_dens_2',
                   'price_dens_3', 'price_dens_4', 'price_dens_5',
                   'price_nums_1', 'price_nums_2', 'price_nums_3',
                   'price_nums_4', 'price_nums_5']
df_full = df_full.drop(columns_to_drop, axis=1)

In [49]:
df_full.head()

Unnamed: 0,raceClass,resultPosition,trapNumber,forecast,by_1,by_2,by_3,by_4,by_5,finished_1,finished_2,finished_3,finished_4,finished_5,going_1,going_2,going_3,going_4,going_5,race_grade_1,race_grade_2,race_grade_3,race_grade_4,race_grade_5,run_time_1,run_time_2,run_time_3,run_time_4,run_time_5,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5,odds_1,odds_2,odds_3,odds_4,odds_5
1,D3,6.0,5,6.0,,,,,,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,D4,D4,D4,D4,D4,17.23,17.23,17.23,17.23,17.23,6.0,6.0,6.0,6.0,6.0,29.2,29.2,29.2,29.2,29.2,6.0,6.0,6.0,6.0,6.0
1,D5,3.0,2,3.0,HD,HD,HD,HD,HD,4.0,4.0,4.0,4.0,4.0,-10.0,-10.0,-10.0,-10.0,-10.0,D5,D5,D5,D5,D5,17.66,17.66,17.66,17.66,17.66,2.0,2.0,2.0,2.0,2.0,34.0,34.0,34.0,34.0,34.0,4.0,4.0,4.0,4.0,4.0
1,D2,6.0,4,6.0,1,1,1,1,1,6.0,6.0,6.0,6.0,6.0,-5.0,-5.0,-5.0,-5.0,-5.0,D2,D2,D2,D2,D2,17.6,17.6,17.6,17.6,17.6,4.0,4.0,4.0,4.0,4.0,31.7,31.7,31.7,31.7,31.7,5.5,5.5,5.5,5.5,5.5
1,D1,3.0,4,1.375,1/2,1/2,1/2,1/2,1/2,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,D1,D1,D1,D1,D1,16.73,16.73,16.73,16.73,16.73,3.0,3.0,3.0,3.0,3.0,34.7,34.7,34.7,34.7,34.7,1.625,1.625,1.625,1.625,1.625
1,D3,1.0,3,3.333333,1 1/4,1 1/4,1 1/4,1 1/4,1 1/4,4.0,4.0,4.0,4.0,4.0,-5.0,-5.0,-5.0,-5.0,-5.0,D3,D3,D3,D3,D3,17.79,17.79,17.79,17.79,17.79,3.0,3.0,3.0,3.0,3.0,23.7,23.7,23.7,23.7,23.7,4.5,4.5,4.5,4.5,4.5


In [50]:
import re

def convert_dist_by(sp):
    try:
        if isinstance(sp, float) and pd.isna(sp):
            return np.nan
        elif isinstance(sp, (int, float)):
            return sp
        elif isinstance(sp, str):
            parts = sp.split()
            
            if len(parts) == 2:  # Формат "3 3/4"
                whole_part = int(parts[0])
                fraction_part = parts[1]
                
                if '/' in fraction_part:
                    num, den = fraction_part.split('/')
                    fraction_value = int(num) / int(den)
                    return whole_part + fraction_value
                else:
                    return np.nan
            
            elif len(parts) == 1:  # Формат "3/4" или "3"
                if '/' in parts[0]:  # Если это дробь
                    num, den = parts[0].split('/')
                    return int(num) / int(den)
                else:  # Если это просто целое число
                    return float(parts[0])
            else:
                return np.nan
        else:
            return np.nan
    except (ValueError, IndexError) as e:
        return np.nan
    except Exception as e:
        return np.nan

In [51]:
for i in range(5):
    df_full.loc[:, f'by_{i+1}'] = df_full[f'by_{i+1}'].apply(convert_dist_by).round(2)

In [52]:
def set_adv_lagg(pos, by):
    if by is None or pd.isna(by):
        return np.nan
    result = np.round(by * 0.8, 2) if pos == 1 else np.round(by * -0.8, 2)
    return result

In [53]:
for i in range(5):
    df_full[f'by_{i+1}'] = df_full.apply(lambda row: set_adv_lagg(row[f'finished_{i+1}'], row[f'by_{i+1}']), axis=1)

In [54]:
import joblib

def save_encoder(encoder) -> None:
    dir_name = os.path.join("grayhound", "encoders")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(encoder, os.path.join(dir_name, f"encoder_{DIST}.pkl"))

In [55]:
from sklearn.preprocessing import OrdinalEncoder

columns_cat = ['raceClass', 'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5']
df_cat = df_full[columns_cat]

encoder = OrdinalEncoder()
encoder.fit(df_cat)

save_encoder(encoder)

df_encoded = encoder.transform(df_cat)
df_encoded_df = pd.DataFrame(df_encoded, columns=columns_cat)
df_full[columns_cat] = df_encoded_df

In [56]:
y = df_full['resultPosition']
df_full = df_full.drop(['resultPosition'], axis=1)

In [57]:
def save_imputer(imputer):
    dir_name = os.path.join("grayhound", "imputers")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(imputer, os.path.join(dir_name, f"imputer_{DIST}.pkl"))

In [58]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3, weights='distance', keep_empty_features=False)
imputer.fit(df_full)

save_imputer(imputer)

X = imputer.transform(df_full)

In [33]:
df_ready = pd.DataFrame(X, columns=df_full.columns)

In [34]:
def save_dataset(dataset):
    dir_name = os.path.join("grayhound", "datasets_to_use")
    os.makedirs(dir_name, exist_ok=True)
    file_path = os.path.join(dir_name, f"dataset_{DIST}.csv")
    dataset.to_csv(file_path)

In [35]:
save_dataset(df_ready)

In [36]:
from sklearn.model_selection import train_test_split

X = df_ready.drop(['resultPosition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'n_estimators': [3, 10, 30, 50, 100, 120],
        'max_features': [5, 10, 15, 25, 44],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10, 15, 20],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10, 30, 50, 100, 120],
        'max_features': [5, 10, 15, 25, 44],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10, 15, 20],
    }
]

forest_reg = RandomForestClassifier(random_state=42)
grd_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_log_loss')
grd_search.fit(X, y)



In [38]:
final_model = grd_search.best_estimator_

In [39]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(final_model, 
                         X_test, y_test,
                         scoring="neg_log_loss", cv=10)
forest_scores = np.sqrt(-scores)

In [40]:
def display_scores(scores):
    print("Scores' sum:", scores),
    print("Mean:", scores.mean()),
    print("Std:", scores.std())

display_scores(forest_scores)

Scores' sum: [1.3226498  1.31017044 1.31678911 1.32076701 1.30998194 1.31799933
 1.31042505 1.31515906 1.31543396 1.31315752]
Mean: 1.3152533227619632
Std: 0.00419972728047283


In [41]:
def save_model(model):
    file_name = f"model_{DIST}.pkl"
    file_path = os.path.join("grayhound", "models", "random_forest_class")
    os.makedirs(file_path, exist_ok=True)
    joblib.dump(model, os.path.join(file_path, file_name))

In [42]:
grd_search.best_params_

{'max_depth': 10,
 'max_features': 25,
 'min_samples_split': 15,
 'n_estimators': 120}