In [42]:
import pandas as pd
import numpy as np
import os

DIST = 250

In [43]:
def load_dataset(low_memory=True):
    file_name = fr'C:\machine learning\gb_greyhound\grayhound\data\train\datasets\dataset_{DIST}.0.csv'
    return pd.read_csv(file_name, low_memory=low_memory)

main_df = load_dataset(low_memory=False)

In [44]:
main_df.head()

Unnamed: 0,SP,dogBorn,dogColour,dogId,dogName,dogSeason,dogSex,dogSire,meetingId,ownerName,...,resultMarketPos,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,resultSectionalTime,trackName,trainerName,trapHandicap,trapNumber
0,4/1,May-2011,be,417324.0,Rastafari Jon,,d,Hondo Black,282648,Mrs P J Collins,...,3.0,1.0,1.0,4.0,15.37,,Peterborough,R B York,,5
1,6/1,Aug-2010,bk,412773.0,Sisnemesis Ned,,d,Droopys Kewell,282648,Mr G E Marshall,...,4.0,2.0,1.0,6.0,15.54,,Peterborough,P A Braithwaite,,6
2,6/1,Jun-2009,wbk,384230.0,Knockrour Pele,,d,Droopys Corelli,282648,Mr P Ward,...,4.0,3.0,1.0,6.0,15.64,,Peterborough,P Ward,,3
3,9/4,Oct-2011,wbk,426757.0,Pat My Boy,,d,Royal Impact,282648,Mrs W M Scoles,...,2.0,4.0,4.0,9.0,15.81,,Peterborough,W M Scoles,,1
4,20/1,Jun-2010,bk,404601.0,Cheers Buddy,,d,Johnny Gatillo,282648,The Spanish Olive Shop,...,6.0,5.0,1.0,20.0,15.9,,Peterborough,L J Pruhs,,4


In [45]:
columns_to_drop = ['SP', 'dogBorn', 'dogColour', 'dogId', 'dogSeason', 
           'dogSex', 'dogSire', 'dogSire', 'meetingId', 
           'ownerName', 'raceForecast', 'raceId', 'raceNumber', 
           'raceHandicap', 'racePrizes', 'raceTricast', 'raceType', 
           'resultAdjustedTime', 'resultMarketCnt', 'resultMarketPos',
           'trainerName', 'trapHandicap', 'raceTime', 'raceTime',
            'resultSectionalTime', 'raceDistance', 'resultComment',
]

df = main_df.drop(columns_to_drop, axis=1)
df.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,trackName,trapNumber
0,Rastafari Jon,D3,10/01/2014,-5.0,,34.9,1.0,1.0,4.0,15.37,Peterborough,5
1,Sisnemesis Ned,D3,10/01/2014,-5.0,2 1/4,28.3,2.0,1.0,6.0,15.54,Peterborough,6
2,Knockrour Pele,D3,10/01/2014,-5.0,1 1/4,33.3,3.0,1.0,6.0,15.64,Peterborough,3
3,Pat My Boy,D3,10/01/2014,-5.0,2,29.6,4.0,4.0,9.0,15.81,Peterborough,1
4,Cheers Buddy,D3,10/01/2014,-5.0,1,31.2,5.0,1.0,20.0,15.9,Peterborough,4


In [46]:
columns = df.columns

for col in columns:
    print(col, ':', df[col].nunique())

dogName : 1068
raceClass : 8
raceDate : 904
raceGoing : 16
resultBtnDistance : 56
resultDogWeight : 175
resultPosition : 7
resultPriceDenominator : 13
resultPriceNumerator : 26
resultRunTime : 206
trackName : 3
trapNumber : 6


In [47]:
df.replace(['', ' ', np.nan], np.nan)
df_cleaned = df.dropna(subset=['dogName', 'raceClass', 'resultPosition', 'resultPriceDenominator', 'resultPriceNumerator', 'raceDate', 'trackName'])

In [48]:
df_cleaned['resultPosition'].value_counts()

resultPosition
2.0    1784
1.0    1781
3.0    1779
4.0    1770
5.0    1749
6.0    1350
0.0       5
Name: count, dtype: int64

In [49]:
df_cleaned[df_cleaned['resultPosition'] == 0.0]

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,trackName,trapNumber
9128,Ballymac Payoff,D2,16/08/2019,,,36.4,0.0,1.0,4.0,,Peterborough,1
9129,Abigails Boy,D2,16/08/2019,,,30.7,0.0,1.0,3.0,,Peterborough,2
9130,Garlyn Grace,D2,16/08/2019,,,30.0,0.0,1.0,3.0,,Peterborough,3
9131,Rolling Stone,D2,16/08/2019,,,32.0,0.0,4.0,7.0,,Peterborough,4
9132,Cash Is Gold,D2,16/08/2019,,,33.3,0.0,4.0,9.0,,Peterborough,5


In [50]:
df_cleaned = df_cleaned.drop(index=[9128, 9129, 9130, 9131, 9132])

In [51]:
df_cleaned['raceClass'].unique()

array(['D3', 'D2', 'A15', 'OR', 'D1', 'IT', 'D4', 'D5'], dtype=object)

In [52]:
len(df_cleaned[df_cleaned['raceClass'] == 'A15'])

35

In [53]:
len(df_cleaned)

10213

In [54]:
df_cleaned = df_cleaned[df_cleaned['raceClass'] != 'A15']

In [55]:
len(df_cleaned)

10178

In [56]:
df_cleaned['raceDate'] = pd.to_datetime(df_cleaned['raceDate'], format='%d/%m/%Y')
df_cleaned['forecast'] = df_cleaned['resultPriceNumerator'] / df_cleaned['resultPriceDenominator']

In [57]:
def process_dog_data(df):
    dog_names = df['dogName'].unique()

    results = []
    
    for dog in dog_names:
        dog_data = df[df['dogName'] == dog].sort_values(by='raceDate')
        
        dog_results = []
        for i, row in dog_data.iterrows():
            current_race_date = row['raceDate']
            
            previous_races = dog_data[dog_data['raceDate'] < current_race_date].tail(5)
            
            by = previous_races['resultBtnDistance'].tolist()
            finished = previous_races['resultPosition'].tolist()
            going = previous_races['raceGoing'].tolist()
            price_dens = previous_races['resultPriceDenominator'].tolist()
            price_nums = previous_races['resultPriceNumerator'].tolist()
            race_grade = previous_races['raceClass'].tolist()
            run_time = previous_races['resultRunTime'].tolist()
            trap = previous_races['trapNumber'].tolist()
            weight = previous_races['resultDogWeight'].tolist()

            while len(by) < 5:
                by.append(np.nan)
                finished.append(np.nan)
                going.append(np.nan)
                price_dens.append(np.nan)
                price_nums.append(np.nan)
                race_grade.append(np.nan)
                run_time.append(np.nan)
                trap.append(np.nan)
                weight.append(np.nan)

            result = (by + finished + going + 
                      price_dens + price_nums +
                      race_grade + run_time + 
                      trap + weight)

            dog_results.append(result)
        
        dog_data_results = pd.DataFrame(dog_results, columns=[
            'by_1', 'by_2', 'by_3', 'by_4', 'by_5',
            'finished_1', 'finished_2', 'finished_3', 'finished_4', 'finished_5',
            'going_1', 'going_2', 'going_3', 'going_4', 'going_5',
            'price_dens_1', 'price_dens_2', 'price_dens_3', 'price_dens_4', 'price_dens_5',
            'price_nums_1', 'price_nums_2', 'price_nums_3', 'price_nums_4', 'price_nums_5',
            'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5',
            'run_time_1', 'run_time_2', 'run_time_3', 'run_time_4', 'run_time_5',
            'trap_1', 'trap_2', 'trap_3', 'trap_4', 'trap_5',
            'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5'
        ])
        dog_data.reset_index(drop=True, inplace=True)
        dog_data_results.reset_index(drop=True, inplace=True)
        
        combined_data = pd.concat([dog_data, dog_data_results], axis=1)
        results.append(combined_data)
    
    final_df = pd.concat(results).sort_index()

    return final_df

df_cleaned = df_cleaned.replace(['', ' ', None], np.nan)
df_full = process_dog_data(df_cleaned)

In [58]:
df_full.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,...,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5
0,Rastafari Jon,D3,2014-01-10,-5.0,,34.9,1.0,1.0,4.0,15.37,...,,,,,,,,,,
0,Unique Lillian,D2,2017-02-04,-10.0,1 1/2,27.3,6.0,1.0,6.0,15.86,...,,,,,,,,,,
0,Oi Oi Leemac,D2,2017-02-10,5.0,1/2,33.9,4.0,1.0,6.0,15.57,...,,,,,,,,,,
0,Reaping Reward,D4,2017-12-08,5.0,3/4,26.5,4.0,1.0,7.0,15.86,...,,,,,,,,,,
0,Headford Suzie,D3,2017-12-06,0.0,1 1/4,23.0,4.0,1.0,3.0,15.85,...,,,,,,,,,,


In [59]:
df_full = df_full.dropna(subset=['finished_1'])

In [60]:
df_full.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,...,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5
1,Still Thinking,D3,2017-11-29,0.0,2 1/4,28.8,3.0,2.0,7.0,15.42,...,4.0,,,,,28.7,,,,
1,Caz Rage,OR,2014-10-29,,NK,32.7,4.0,4.0,11.0,15.49,...,2.0,,,,,32.6,,,,
1,Bilkos Victory,D3,2017-06-28,-10.0,1/2,33.8,2.0,4.0,9.0,15.46,...,3.0,,,,,34.2,,,,
1,Burnpark Trump,D4,2017-11-03,0.0,SH,33.3,4.0,2.0,7.0,15.96,...,4.0,,,,,33.7,,,,
1,Headford Master,D4,2020-02-29,,,32.1,1.0,4.0,6.0,15.45,...,6.0,,,,,32.3,,,,


In [61]:
df_full = df_full.replace([None], np.nan)

In [62]:
def fill_miss_values(row):
    last_results = None
    size = None
    start_index = None

    for i in range(2, 6):
        if pd.isna(row[f'finished_{i}']):
            last_results = row[[f'by_{i - 1}', f'finished_{i - 1}', 
                                f'going_{i - 1}', f'price_dens_{i - 1}', f'price_nums_{i - 1}',
                                f'race_grade_{i - 1}', f'run_time_{i - 1}', f'trap_{i - 1}', f'weight_{i - 1}']]
            size = 5 - i + 1
            start_index = i
            break
    
    if size is not None:
        for j in range(size):
            idx = start_index + j
            row[f'by_{idx}'] = last_results[f'by_{start_index - 1}']
            row[f'finished_{idx}'] = last_results[f'finished_{start_index - 1}']
            row[f'going_{idx}'] = last_results[f'going_{start_index - 1}']
            row[f'price_dens_{idx}'] = last_results[f'price_dens_{start_index - 1}']
            row[f'price_nums_{idx}'] = last_results[f'price_nums_{start_index - 1}']
            row[f'race_grade_{idx}'] = last_results[f'race_grade_{start_index - 1}']
            row[f'run_time_{idx}'] = last_results[f'run_time_{start_index - 1}']
            row[f'trap_{idx}'] = last_results[f'trap_{start_index - 1}']
            row[f'weight_{idx}'] = last_results[f'weight_{start_index - 1}']
    
    return row

In [63]:
df_full = df_full.apply(fill_miss_values, axis=1)

In [64]:
for i in range(5):
    df_full[f'odds_{i+1}'] = df_full[f'price_nums_{i+1}'] / df_full[f'price_dens_{i+1}']

columns_to_drop = ['dogName', 'raceDate', 'resultBtnDistance', 
                   'raceGoing', 'resultPriceDenominator', 
                   'resultPriceNumerator', 'resultRunTime',
                   'resultDogWeight', 'price_dens_1', 'price_dens_2',
                   'price_dens_3', 'price_dens_4', 'price_dens_5',
                   'price_nums_1', 'price_nums_2', 'price_nums_3',
                   'price_nums_4', 'price_nums_5']
df_full = df_full.drop(columns_to_drop, axis=1)

In [65]:
df_full.head()

Unnamed: 0,raceClass,resultPosition,trackName,trapNumber,forecast,by_1,by_2,by_3,by_4,by_5,...,weight_1,weight_2,weight_3,weight_4,weight_5,odds_1,odds_2,odds_3,odds_4,odds_5
1,D3,3.0,Peterborough,4,3.5,2,2,2,2,2,...,28.7,28.7,28.7,28.7,28.7,7.0,7.0,7.0,7.0,7.0
1,OR,4.0,Peterborough,6,2.75,,,,,,...,32.6,32.6,32.6,32.6,32.6,1.1,1.1,1.1,1.1,1.1
1,D3,2.0,Peterborough,4,2.25,SH,SH,SH,SH,SH,...,34.2,34.2,34.2,34.2,34.2,3.5,3.5,3.5,3.5,3.5
1,D4,4.0,Peterborough,4,3.5,1,1,1,1,1,...,33.7,33.7,33.7,33.7,33.7,2.5,2.5,2.5,2.5,2.5
1,D4,1.0,Peterborough,6,1.5,2 3/4,2 3/4,2 3/4,2 3/4,2 3/4,...,32.3,32.3,32.3,32.3,32.3,8.0,8.0,8.0,8.0,8.0


In [66]:
import re

def convert_dist_by(sp):
    try:
        if isinstance(sp, float) and pd.isna(sp):
            return np.nan
        elif isinstance(sp, (int, float)):
            return sp
        elif isinstance(sp, str):
            parts = sp.split()
            
            if len(parts) == 2:  # Формат "3 3/4"
                whole_part = int(parts[0])
                fraction_part = parts[1]
                
                if '/' in fraction_part:
                    num, den = fraction_part.split('/')
                    fraction_value = int(num) / int(den)
                    return whole_part + fraction_value
                else:
                    return np.nan
            
            elif len(parts) == 1:  # Формат "3/4" или "3"
                if '/' in parts[0]:  # Если это дробь
                    num, den = parts[0].split('/')
                    return int(num) / int(den)
                else:  # Если это просто целое число
                    return float(parts[0])
            else:
                return np.nan
        else:
            return np.nan
    except (ValueError, IndexError) as e:
        return np.nan
    except Exception as e:
        return np.nan

In [67]:
for i in range(5):
    df_full.loc[:, f'by_{i+1}'] = df_full[f'by_{i+1}'].apply(convert_dist_by).round(2)

In [68]:
def set_adv_lagg(pos, by):
    if by is None or pd.isna(by):
        return np.nan
    result = np.round(by * 0.8, 2) if pos == 1 else np.round(by * -0.8, 2)
    return result

In [69]:
for i in range(5):
    df_full[f'by_{i+1}'] = df_full.apply(lambda row: set_adv_lagg(row[f'finished_{i+1}'], row[f'by_{i+1}']), axis=1)

In [70]:
import joblib

def save_encoder(encoder) -> None:
    dir_name = os.path.join("grayhound", "encoders")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(encoder, os.path.join(dir_name, f"encoder_{DIST}.pkl"))

In [71]:
df_full = df_full.drop('trackName', axis=1)

In [72]:
from sklearn.preprocessing import OrdinalEncoder

columns_cat = ['raceClass', 'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5']
df_cat = df_full[columns_cat]

encoder = OrdinalEncoder()
encoder.fit(df_cat)

save_encoder(encoder)

df_encoded = encoder.transform(df_cat)
df_encoded_df = pd.DataFrame(df_encoded, columns=columns_cat)
df_full[columns_cat] = df_encoded_df

In [73]:
def save_imputer(imputer):
    dir_name = os.path.join("grayhound", "imputers")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(imputer, os.path.join(dir_name, f"imputer_{DIST}.pkl"))

In [74]:
y = df_full['resultPosition']
df_full = df_full.drop(['resultPosition'], axis=1)

In [75]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3, weights='distance', keep_empty_features=False)
imputer.fit(df_full)

save_imputer(imputer)

X = imputer.transform(df_full)

In [78]:
df_ready = pd.DataFrame(X, columns=df_full.columns)

In [79]:
def save_dataset(dataset):
    dir_name = os.path.join("grayhound", "datasets_to_use")
    os.makedirs(dir_name, exist_ok=True)
    file_path = os.path.join(dir_name, f"dataset_{DIST}.csv")
    dataset.to_csv(file_path)

In [80]:
save_dataset(df_ready)

In [81]:
from sklearn.model_selection import train_test_split

X = df_ready.drop(['resultPosition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'n_estimators': [3, 10, 30], #grow
        'max_features': [5, 10, 15, 25, 44],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],  # groww
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10, 30], # groww
        'max_features': [5, 10, 15, 25, 44],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10], # groww
    }
]

forest_reg = RandomForestClassifier(random_state=42)
grd_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_log_loss')
grd_search.fit(X, y)

In [83]:
final_model = grd_search.best_estimator_

In [84]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(final_model, 
                         X_test, y_test,
                         scoring="neg_log_loss", cv=10)
forest_scores = np.sqrt(-scores)

In [85]:
def display_scores(scores):
    print("Scores' sum:", scores),
    print("Mean:", scores.mean()),
    print("Std:", scores.std())

display_scores(forest_scores)

Scores' sum: [1.32871368 1.33585682 1.32051799 1.33650323 1.33259799 1.31653525
 1.33793905 1.32980075 1.32003723 1.32877397]
Mean: 1.328727592942424
Std: 0.007098200969729281


In [86]:
def save_model(model):
    file_name = f"model_{DIST}.pkl"
    file_path = os.path.join("grayhound", "models", "random_forest_class")
    os.makedirs(file_path, exist_ok=True)
    joblib.dump(model, os.path.join(file_path, file_name))

In [87]:
grd_search.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 10,
 'min_samples_split': 10,
 'n_estimators': 30}

In [88]:
save_model(final_model)