In [27]:
import pandas as pd
import numpy as np
import os

DIST = 264

pd.set_option('display.max_columns', None)

In [28]:
def load_dataset(low_memory=True):
    file_name = fr'C:\machine learning\gb_greyhound\grayhound\data\train\datasets\dataset_{DIST}.0.csv'
    return pd.read_csv(file_name, low_memory=low_memory)

main_df = load_dataset(low_memory=False)

In [29]:
columns_to_drop = ['SP', 'dogBorn', 'dogColour', 'dogId', 'dogSeason', 
           'dogSex', 'dogSire', 'dogSire', 'meetingId', 
           'ownerName', 'raceForecast', 'raceId', 'raceNumber', 
           'raceHandicap', 'racePrizes', 'raceTricast', 'raceType', 
           'resultAdjustedTime', 'resultMarketCnt', 'resultMarketPos',
           'trainerName', 'trapHandicap', 'raceTime', 'raceTime',
            'resultSectionalTime', 'raceDistance', 'resultComment',
]

df = main_df.drop(columns_to_drop, axis=1)
df.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,trackName,trapNumber
0,Andlyns Asabat,OR,25/01/2014,0.0,,29.7,1.0,1.0,1.0,15.51,Monmore,1
1,Kirk Superstar,OR,25/01/2014,0.0,1,34.7,2.0,4.0,6.0,15.58,Monmore,6
2,Luminous Walk,OR,25/01/2014,0.0,4,31.9,3.0,1.0,5.0,15.89,Monmore,2
3,Yoga Man,OR,25/01/2014,0.0,1 1/2,36.3,4.0,1.0,7.0,16.01,Monmore,3
4,Catunda Kevin,OR,25/01/2014,0.0,1 1/2,34.7,5.0,1.0,8.0,16.12,Monmore,5


In [30]:
columns = df.columns

for col in columns:
    print(col, ':', df[col].nunique())

dogName : 2298
raceClass : 10
raceDate : 1569
raceGoing : 18
resultBtnDistance : 51
resultDogWeight : 181
resultPosition : 7
resultPriceDenominator : 13
resultPriceNumerator : 23
resultRunTime : 220
trackName : 1
trapNumber : 6


In [31]:
df = df.drop('trackName', axis=1)

In [32]:
df.replace(['', ' ', np.nan], np.nan)
df_cleaned = df.dropna(subset=['dogName', 'raceClass', 'resultPosition', 'resultPriceDenominator', 'resultPriceNumerator', 'raceDate'])

In [33]:
df_cleaned['resultPosition'].value_counts()

resultPosition
1.0    3509
2.0    3497
3.0    3497
5.0    3480
4.0    3471
6.0    3229
Name: count, dtype: int64

In [34]:
df_cleaned['raceClass'].unique()

array(['OR', 'IT', 'D1', 'D2', 'D3', 'D5', 'D4', 'OR3', 'IV', 'OR2'],
      dtype=object)

In [35]:
df_cleaned.loc[:, 'raceDate'] = pd.to_datetime(df_cleaned['raceDate'], format='%d/%m/%Y')
df_cleaned.loc[:, 'forecast'] = df_cleaned['resultPriceNumerator'] / df_cleaned['resultPriceDenominator']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'forecast'] = df_cleaned['resultPriceNumerator'] / df_cleaned['resultPriceDenominator']


In [36]:
def process_dog_data(df):
    dog_names = df['dogName'].unique()

    results = []
    
    for dog in dog_names:
        dog_data = df[df['dogName'] == dog].sort_values(by='raceDate')
        
        dog_results = []
        for i, row in dog_data.iterrows():
            current_race_date = row['raceDate']
            
            previous_races = dog_data[dog_data['raceDate'] < current_race_date].tail(5)
            
            by = previous_races['resultBtnDistance'].tolist()
            finished = previous_races['resultPosition'].tolist()
            going = previous_races['raceGoing'].tolist()
            price_dens = previous_races['resultPriceDenominator'].tolist()
            price_nums = previous_races['resultPriceNumerator'].tolist()
            race_grade = previous_races['raceClass'].tolist()
            run_time = previous_races['resultRunTime'].tolist()
            trap = previous_races['trapNumber'].tolist()
            weight = previous_races['resultDogWeight'].tolist()

            while len(by) < 5:
                by.append(np.nan)
                finished.append(np.nan)
                going.append(np.nan)
                price_dens.append(np.nan)
                price_nums.append(np.nan)
                race_grade.append(np.nan)
                run_time.append(np.nan)
                trap.append(np.nan)
                weight.append(np.nan)

            result = (by + finished + going + 
                      price_dens + price_nums +
                      race_grade + run_time + 
                      trap + weight)

            dog_results.append(result)
        
        dog_data_results = pd.DataFrame(dog_results, columns=[
            'by_1', 'by_2', 'by_3', 'by_4', 'by_5',
            'finished_1', 'finished_2', 'finished_3', 'finished_4', 'finished_5',
            'going_1', 'going_2', 'going_3', 'going_4', 'going_5',
            'price_dens_1', 'price_dens_2', 'price_dens_3', 'price_dens_4', 'price_dens_5',
            'price_nums_1', 'price_nums_2', 'price_nums_3', 'price_nums_4', 'price_nums_5',
            'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5',
            'run_time_1', 'run_time_2', 'run_time_3', 'run_time_4', 'run_time_5',
            'trap_1', 'trap_2', 'trap_3', 'trap_4', 'trap_5',
            'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5'
        ])
        dog_data.reset_index(drop=True, inplace=True)
        dog_data_results.reset_index(drop=True, inplace=True)
        
        combined_data = pd.concat([dog_data, dog_data_results], axis=1)
        results.append(combined_data)
    
    final_df = pd.concat(results).sort_index()

    return final_df

df_cleaned = df_cleaned.replace(['', ' ', None], np.nan)
df_full = process_dog_data(df_cleaned)

  df_cleaned = df_cleaned.replace(['', ' ', None], np.nan)


In [37]:
df_full = df_full.dropna(subset=['finished_1'])

In [38]:
df_full.head()

Unnamed: 0,dogName,raceClass,raceDate,raceGoing,resultBtnDistance,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,trapNumber,forecast,by_1,by_2,by_3,by_4,by_5,finished_1,finished_2,finished_3,finished_4,finished_5,going_1,going_2,going_3,going_4,going_5,price_dens_1,price_dens_2,price_dens_3,price_dens_4,price_dens_5,price_nums_1,price_nums_2,price_nums_3,price_nums_4,price_nums_5,race_grade_1,race_grade_2,race_grade_3,race_grade_4,race_grade_5,run_time_1,run_time_2,run_time_3,run_time_4,run_time_5,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5
1,Uptown Girl,D2,2023-11-18,0.0,,24.7,1.0,1.0,4.0,15.55,5,4.0,1 1/2,,,,,4.0,,,,,-5.0,,,,,1.0,,,,,3.0,,,,,D2,,,,,15.85,,,,,4.0,,,,,24.5,,,,
1,Chauffeur Ossie,D5,2019-08-07,,HD,30.7,6.0,1.0,10.0,16.46,1,10.0,1/2,,,,,5.0,,,,,,,,,,1.0,,,,,10.0,,,,,D5,,,,,16.52,,,,,1.0,,,,,30.7,,,,
1,Knockalton Moe,D1,2019-11-30,,1,31.2,5.0,1.0,6.0,15.89,4,6.0,1 1/4,,,,,4.0,,,,,,,,,,1.0,,,,,12.0,,,,,D1,,,,,15.96,,,,,6.0,,,,,31.5,,,,
1,Passing Sail,D4,2018-08-15,0.0,1/2,34.5,2.0,4.0,11.0,16.04,1,2.75,3 1/4,,,,,3.0,,,,,-10.0,,,,,1.0,,,,,4.0,,,,,D4,,,,,16.11,,,,,1.0,,,,,35.4,,,,
1,Ballymac Odie,D1,2022-12-21,0.0,1/2,33.4,6.0,1.0,6.0,16.33,2,6.0,3/4,,,,,4.0,,,,,-15.0,,,,,1.0,,,,,3.0,,,,,D1,,,,,15.92,,,,,1.0,,,,,33.1,,,,


In [39]:
df_full = df_full.replace([None], np.nan)

In [40]:
def fill_miss_values(row):
    last_results = None
    size = None
    start_index = None

    for i in range(2, 6):
        if pd.isna(row[f'finished_{i}']):
            last_results = row[[f'by_{i - 1}', f'finished_{i - 1}', 
                                f'going_{i - 1}', f'price_dens_{i - 1}', f'price_nums_{i - 1}',
                                f'race_grade_{i - 1}', f'run_time_{i - 1}', f'trap_{i - 1}', f'weight_{i - 1}']]
            size = 5 - i + 1
            start_index = i
            break
    
    if size is not None:
        for j in range(size):
            idx = start_index + j
            row[f'by_{idx}'] = last_results[f'by_{start_index - 1}']
            row[f'finished_{idx}'] = last_results[f'finished_{start_index - 1}']
            row[f'going_{idx}'] = last_results[f'going_{start_index - 1}']
            row[f'price_dens_{idx}'] = last_results[f'price_dens_{start_index - 1}']
            row[f'price_nums_{idx}'] = last_results[f'price_nums_{start_index - 1}']
            row[f'race_grade_{idx}'] = last_results[f'race_grade_{start_index - 1}']
            row[f'run_time_{idx}'] = last_results[f'run_time_{start_index - 1}']
            row[f'trap_{idx}'] = last_results[f'trap_{start_index - 1}']
            row[f'weight_{idx}'] = last_results[f'weight_{start_index - 1}']
    
    return row

In [41]:
df_full = df_full.apply(fill_miss_values, axis=1)

In [42]:
for i in range(5):
    df_full[f'odds_{i+1}'] = df_full[f'price_nums_{i+1}'] / df_full[f'price_dens_{i+1}']

columns_to_drop = ['dogName', 'raceDate', 'resultBtnDistance', 
                   'raceGoing', 'resultPriceDenominator', 
                   'resultPriceNumerator', 'resultRunTime',
                   'resultDogWeight', 'price_dens_1', 'price_dens_2',
                   'price_dens_3', 'price_dens_4', 'price_dens_5',
                   'price_nums_1', 'price_nums_2', 'price_nums_3',
                   'price_nums_4', 'price_nums_5']
df_full = df_full.drop(columns_to_drop, axis=1)

In [43]:
df_full.head()

Unnamed: 0,raceClass,resultPosition,trapNumber,forecast,by_1,by_2,by_3,by_4,by_5,finished_1,finished_2,finished_3,finished_4,finished_5,going_1,going_2,going_3,going_4,going_5,race_grade_1,race_grade_2,race_grade_3,race_grade_4,race_grade_5,run_time_1,run_time_2,run_time_3,run_time_4,run_time_5,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5,odds_1,odds_2,odds_3,odds_4,odds_5
1,D2,1.0,5,4.0,1 1/2,1 1/2,1 1/2,1 1/2,1 1/2,4.0,4.0,4.0,4.0,4.0,-5.0,-5.0,-5.0,-5.0,-5.0,D2,D2,D2,D2,D2,15.85,15.85,15.85,15.85,15.85,4.0,4.0,4.0,4.0,4.0,24.5,24.5,24.5,24.5,24.5,3.0,3.0,3.0,3.0,3.0
1,D5,6.0,1,10.0,1/2,1/2,1/2,1/2,1/2,5.0,5.0,5.0,5.0,5.0,,,,,,D5,D5,D5,D5,D5,16.52,16.52,16.52,16.52,16.52,1.0,1.0,1.0,1.0,1.0,30.7,30.7,30.7,30.7,30.7,10.0,10.0,10.0,10.0,10.0
1,D1,5.0,4,6.0,1 1/4,1 1/4,1 1/4,1 1/4,1 1/4,4.0,4.0,4.0,4.0,4.0,,,,,,D1,D1,D1,D1,D1,15.96,15.96,15.96,15.96,15.96,6.0,6.0,6.0,6.0,6.0,31.5,31.5,31.5,31.5,31.5,12.0,12.0,12.0,12.0,12.0
1,D4,2.0,1,2.75,3 1/4,3 1/4,3 1/4,3 1/4,3 1/4,3.0,3.0,3.0,3.0,3.0,-10.0,-10.0,-10.0,-10.0,-10.0,D4,D4,D4,D4,D4,16.11,16.11,16.11,16.11,16.11,1.0,1.0,1.0,1.0,1.0,35.4,35.4,35.4,35.4,35.4,4.0,4.0,4.0,4.0,4.0
1,D1,6.0,2,6.0,3/4,3/4,3/4,3/4,3/4,4.0,4.0,4.0,4.0,4.0,-15.0,-15.0,-15.0,-15.0,-15.0,D1,D1,D1,D1,D1,15.92,15.92,15.92,15.92,15.92,1.0,1.0,1.0,1.0,1.0,33.1,33.1,33.1,33.1,33.1,3.0,3.0,3.0,3.0,3.0


In [44]:
import re

def convert_dist_by(sp):
    try:
        if isinstance(sp, float) and pd.isna(sp):
            return np.nan
        elif isinstance(sp, (int, float)):
            return sp
        elif isinstance(sp, str):
            parts = sp.split()
            
            if len(parts) == 2:  # Формат "3 3/4"
                whole_part = int(parts[0])
                fraction_part = parts[1]
                
                if '/' in fraction_part:
                    num, den = fraction_part.split('/')
                    fraction_value = int(num) / int(den)
                    return whole_part + fraction_value
                else:
                    return np.nan
            
            elif len(parts) == 1:  # Формат "3/4" или "3"
                if '/' in parts[0]:  # Если это дробь
                    num, den = parts[0].split('/')
                    return int(num) / int(den)
                else:  # Если это просто целое число
                    return float(parts[0])
            else:
                return np.nan
        else:
            return np.nan
    except (ValueError, IndexError) as e:
        return np.nan
    except Exception as e:
        return np.nan

In [45]:
for i in range(5):
    df_full.loc[:, f'by_{i+1}'] = df_full[f'by_{i+1}'].apply(convert_dist_by).round(2)

In [46]:
def set_adv_lagg(pos, by):
    if by is None or pd.isna(by):
        return np.nan
    result = np.round(by * 0.8, 2) if pos == 1 else np.round(by * -0.8, 2)
    return result

In [47]:
for i in range(5):
    df_full[f'by_{i+1}'] = df_full.apply(lambda row: set_adv_lagg(row[f'finished_{i+1}'], row[f'by_{i+1}']), axis=1)

In [48]:
import joblib

def save_encoder(encoder) -> None:
    dir_name = os.path.join("grayhound", "encoders")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(encoder, os.path.join(dir_name, f"encoder_{DIST}.pkl"))

In [49]:
from sklearn.preprocessing import OrdinalEncoder

columns_cat = ['raceClass', 'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5']
df_cat = df_full[columns_cat]

encoder = OrdinalEncoder()
encoder.fit(df_cat)

save_encoder(encoder)

df_encoded = encoder.transform(df_cat)
df_encoded_df = pd.DataFrame(df_encoded, columns=columns_cat)
df_full[columns_cat] = df_encoded_df

In [50]:
y = df_full['resultPosition']

In [51]:
df_full = df_full.drop(['resultPosition'], axis=1)

In [52]:
def save_imputer(imputer):
    dir_name = os.path.join("grayhound", "imputers")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(imputer, os.path.join(dir_name, f"imputer_{DIST}.pkl"))

In [53]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3, weights='distance', keep_empty_features=False)
imputer.fit(df_full)

save_imputer(imputer)

X = imputer.transform(df_full)

In [30]:
df_ready = pd.DataFrame(X, columns=df_full.columns)

In [31]:
def save_dataset(dataset):
    dir_name = os.path.join("grayhound", "datasets_to_use")
    os.makedirs(dir_name, exist_ok=True)
    file_path = os.path.join(dir_name, f"dataset_{DIST}.csv")
    dataset.to_csv(file_path)

In [32]:
save_dataset(df_ready)

In [33]:
from sklearn.model_selection import train_test_split


X = df_ready.drop(['resultPosition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [5, 10, 15, 25, 44],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10, 30],
        'max_features': [5, 10, 15, 25, 44],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }
]

forest_reg = RandomForestClassifier(random_state=42)
grd_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_log_loss')
grd_search.fit(X, y)

In [35]:
final_model = grd_search.best_estimator_

In [36]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(final_model, 
                         X_test, y_test,
                         scoring="neg_log_loss", cv=10)
forest_scores = np.sqrt(-scores)

In [37]:
def display_scores(scores):
    print("Scores' sum:", scores),
    print("Mean:", scores.mean()),
    print("Std:", scores.std())

display_scores(forest_scores)

Scores' sum: [1.33556391 1.32986552 1.32319054 1.33135428 1.32503644 1.31396284
 1.33075137 1.32212868 1.32463138 1.32793855]
Mean: 1.3264423515027177
Std: 0.005744827865913883


In [38]:
def save_model(model):
    file_name = f"model_{DIST}.pkl"
    file_path = os.path.join("grayhound", "models", "random_forest_class")
    os.makedirs(file_path, exist_ok=True)
    joblib.dump(model, os.path.join(file_path, file_name))

In [39]:
grd_search.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 15,
 'min_samples_split': 5,
 'n_estimators': 30}

In [40]:
save_model(final_model)

In [None]:
le_loaded = joblib.load('label_encoder.pkl')