In [93]:
import pandas as pd
import numpy as np
import os

In [2]:
pd.set_option('display.max_columns', None)

In [50]:
DIST = 480

In [3]:
def load_test() -> None:
    dir_path = r'C:\machine learning\gb_greyhound\grayhound\data\test'
    file_name = 'prepared_test_2024-08-28.csv'
    file_path = os.path.join(dir_path, file_name)
    return pd.read_csv(file_path)

def load_dataset() -> None:
    dir_path = r'C:\machine learning\gb_greyhound\grayhound\data\train\datasets'
    file_name = 'dataset_480.0.csv'
    file_path = os.path.join(dir_path, file_name)
    return pd.read_csv(file_path, low_memory=False)

In [4]:
testing = load_dataset()

In [5]:
to_drop = ['SP', 'dogBorn', 'dogColour', 'dogId', 'dogSeason', 
           'dogSex', 'dogSire', 'dogSire', 'meetingId', 
           'ownerName', 'raceForecast', 'raceId', 'raceNumber', 
           'raceHandicap', 'racePrizes', 'raceTricast', 'raceType', 
           'resultAdjustedTime', 'resultMarketCnt', 'resultMarketPos',
           'trainerName', 'trapHandicap']

testing_tr = testing.drop(to_drop, axis=1)

In [6]:
testing_tr['raceDate'] = pd.to_datetime(testing_tr['raceDate'], format='%d/%m/%Y')

In [7]:
testing_tr['forecast'] = testing_tr['resultPriceNumerator'] / testing_tr['resultPriceDenominator']

In [8]:
def get_last_5_races(row, df):
    current_race_date = row['raceDate']
    current_dog_name = row['dogName']
    
    try:
        valid_races = df[(df['dogName'] == current_dog_name) & df['raceDate'].notna()]
        valid_races = valid_races[valid_races['raceDate'] < current_race_date]

        previous_races = valid_races.sort_values(by='raceDate', ascending=False)
        last_5_races = previous_races.head(5)
        
        by = []
        comments = []
        dist = []
        finished = []
        going = []
        price_dens = []
        price_nums = []
        race_grade = []
        run_time = []
        sec_time = []
        trap = []
        weight = []
        
        for _, race in last_5_races.iterrows():
            by.append(race['resultBtnDistance'])
            comments.append(race['resultComment'])
            dist.append(race['raceDistance'])
            finished.append(race['resultPosition'])
            going.append(race['raceGoing'])
            price_dens.append(race['resultPriceDenominator'])
            price_nums.append(race['resultPriceNumerator'])
            race_grade.append(race['raceClass'])
            run_time.append(race['resultRunTime'])
            sec_time.append(race['resultSectionalTime'])
            trap.append(race['trapNumber'])
            weight.append(race['resultDogWeight'])

        # Если количество гонок меньше 5, заполняем недостающие элементы None
        while len(run_time) < 5:
            by.append(None)
            comments.append(None)
            dist.append(None)
            finished.append(None)
            going.append(None)
            price_dens.append(None)
            price_nums.append(None)
            race_grade.append(None)
            run_time.append(None)
            sec_time.append(None)
            trap.append(None)
            weight.append(None)

        result = (by + comments + dist + 
                  finished + going + 
                  price_dens + price_nums
                 + race_grade + run_time + 
                  sec_time + trap + weight)

        return result
    except Exception as e:
        print(f"Error at row {row.name}: {e}")
        raise

In [9]:
results = testing_tr.apply(lambda row: get_last_5_races(row, testing_tr), axis=1)
race_results_full = pd.DataFrame(results.tolist(), columns=[
    'by_1', 'by_2', 'by_3', 'by_4', 'by_5',
    'comments_1', 'comments_2', 'comments_3', 'comments_4', 'comments_5',
    'dist_1', 'dist_2', 'dist_3', 'dist_4', 'dist_5',
    'finished_1', 'finished_2', 'finished_3', 'finished_4', 'finished_5',
    'going_1', 'going_2', 'going_3', 'going_4', 'going_5',
    'price_dens_1', 'price_dens_2', 'price_dens_3', 'price_dens_4', 'price_dens_5',
    'price_nums_1', 'price_nums_2', 'price_nums_3', 'price_nums_4', 'price_nums_5',
    'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5',
    'run_time_1', 'run_time_2', 'run_time_3', 'run_time_4', 'run_time_5',
    'sec_time_1', 'sec_time_2', 'sec_time_3', 'sec_time_4', 'sec_time_5',
    'trap_1', 'trap_2', 'trap_3', 'trap_4', 'trap_5',
    'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5'
])

In [10]:
testing_full = pd.concat([testing_tr, race_results_full], axis=1)

In [11]:
testing_full.tail()

Unnamed: 0,dogName,raceClass,raceDate,raceDistance,raceGoing,raceTime,resultBtnDistance,resultComment,resultDogWeight,resultPosition,resultPriceDenominator,resultPriceNumerator,resultRunTime,resultSectionalTime,trackName,trapNumber,forecast,by_1,by_2,by_3,by_4,by_5,comments_1,comments_2,comments_3,comments_4,comments_5,dist_1,dist_2,dist_3,dist_4,dist_5,finished_1,finished_2,finished_3,finished_4,finished_5,going_1,going_2,going_3,going_4,going_5,price_dens_1,price_dens_2,price_dens_3,price_dens_4,price_dens_5,price_nums_1,price_nums_2,price_nums_3,price_nums_4,price_nums_5,race_grade_1,race_grade_2,race_grade_3,race_grade_4,race_grade_5,run_time_1,run_time_2,run_time_3,run_time_4,run_time_5,sec_time_1,sec_time_2,sec_time_3,sec_time_4,sec_time_5,trap_1,trap_2,trap_3,trap_4,trap_5,weight_1,weight_2,weight_3,weight_4,weight_5
766256,Edermine King,A8,2024-08-03,480.0,0.0,22:11:00,3,"SAw,EP,VeryWide4",27.1,2.0,4.0,9.0,29.68,3.71,Perry Barr,5,2.25,1/2,,,,,"SAwEP,FcdCk&Bmp1",,,,,480.0,,,,,3.0,,,,,-10.0,,,,,1.0,,,,,7.0,,,,,A8,,,,,30.16,,,,,3.76,,,,,5.0,,,,,27.2,,,,
766257,Rubys Charm,A8,2024-08-03,480.0,0.0,22:11:00,1,"EP,Mid,Ld- 1/2",24.8,3.0,8.0,15.0,29.76,3.59,Perry Barr,3,1.875,3 1/2,2,2 1/2,4 1/4,4,"EP,Crd1&2,FcdCk 1/2,Stb4","SAwEP,Crd1&2","EP,Mid,Ld1-RunIn","BlkStt,Crd 3/4","CrdStt,EP,Mid",480.0,480.0,480.0,480.0,480.0,5.0,4.0,3.0,6.0,6.0,0.0,0.0,0.0,-40.0,10.0,1.0,1.0,1.0,1.0,1.0,5.0,20.0,7.0,12.0,7.0,A8,A7,A7,A7,A7,30.19,29.84,29.74,30.54,29.94,3.51,3.62,3.56,3.69,3.64,4.0,3.0,4.0,4.0,4.0,24.4,24.9,24.4,24.3,24.5
766258,Ballarue Girl,A8,2024-08-03,480.0,0.0,22:11:00,3 3/4,"SAw,Wide",28.0,4.0,1.0,28.0,30.06,3.78,Perry Barr,6,28.0,1/2,2 1/4,3/4,4 1/2,3/4,"CrdStt&2,Wide","VSAw,Wide","Wide,FcdCk1","SAw,Wide","VSAw,Wide",480.0,480.0,480.0,480.0,480.0,6.0,4.0,6.0,5.0,4.0,0.0,0.0,10.0,10.0,-10.0,1.0,1.0,1.0,1.0,1.0,14.0,28.0,28.0,18.0,18.0,A9,A9,A9,A9,A9,30.53,30.44,30.61,30.73,30.83,3.87,3.9,3.61,3.82,3.83,5.0,6.0,5.0,6.0,6.0,28.1,28.1,27.8,27.7,27.5
766259,Circleofdreams,A8,2024-08-03,480.0,0.0,22:11:00,3/4,"Rls,Eased2&3",25.6,5.0,1.0,4.0,30.12,3.7,Perry Barr,1,4.0,2 3/4,,1/2,1 3/4,3/4,"EP,Rls,RanOn","Rls,LdRunIn-NrLine","LckEP,Eased3,RanOnLate","StbStt,Eased3,RanOn","Rls,Eased&Crd3",480.0,480.0,480.0,480.0,480.0,3.0,2.0,4.0,3.0,5.0,0.0,,10.0,-10.0,10.0,1.0,1.0,8.0,1.0,1.0,5.0,7.0,15.0,2.0,10.0,A8,A8,A8,A8,A8,29.82,29.96,30.1,30.36,29.94,3.72,3.68,3.81,3.95,3.71,1.0,1.0,1.0,2.0,2.0,26.2,26.2,26.2,27.0,25.9
766260,Rathsilla Puno,A8,2024-08-03,480.0,0.0,22:11:00,DNF,"Mid,Ck&Fell 1/2",27.8,6.0,1.0,6.0,,3.69,Perry Barr,4,6.0,1 3/4,HD,1/2,1 3/4,3/4,"CrdRunUp,Crd&FcdCk1","SAw,Mid,Crd1","Mid,Crd1,Ld2-RunIn,Crd","EP,Mid,Bmp1","EP,Mid,Ld 1/2-NrLn",480.0,480.0,480.0,480.0,480.0,6.0,4.0,3.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,2.0,1.0,9.0,5.0,4.0,7.0,4.0,A8,A8,A8,A8,A8,30.41,30.03,29.89,30.0,29.72,3.72,3.66,3.73,3.64,3.68,4.0,4.0,4.0,4.0,3.0,27.8,27.3,27.2,27.7,27.3


In [46]:
grades = testing_full['raceClass'].unique()

In [49]:
stats = {}
for grade in grades:
    grade_len = len(testing_full[testing_full['raceClass'] == grade])
    grade_rows = testing_full[testing_full['raceClass'] == grade]
    grade_stat = grade_rows[['resultRunTime', 'resultSectionalTime', 'resultDogWeight', 'raceGoing', 'forecast']].agg(['mean', 'min', 'max'])
    stats[grade] = {'count': grade_len, 'statistics': grade_stat}



In [53]:
comparison_results = {}

for grade, data in stats.items():
    if grade == 'A1':
        continue
        
    a1_stats = stats['A1']['statistics']
    current_stats = data['statistics']
    diff = current_stats - a1_stats
    comparison_results[grade] = {'len': data['count'], 'diff': diff}

for grade, diff in comparison_results.items():
    print(f"Разница между A1 и {grade}:")
    print(diff)
    print("\n")

Разница между A1 и A4:
{'len': 91631, 'diff':       resultRunTime  resultSectionalTime  resultDogWeight  raceGoing  forecast
mean       0.411647            -0.076568        -1.561533   2.017591 -0.097576
min      -11.700000             0.000000         0.000000 -30.000000  0.100000
max        0.210000             8.740000         1.400000 -10.000000  7.000000}


Разница между A1 и A8:
{'len': 65201, 'diff':       resultRunTime  resultSectionalTime  resultDogWeight  raceGoing  forecast
mean       0.930182            -0.095353        -2.728733    4.26684  0.066042
min        0.420000             0.000000         0.000000  -10.00000  0.063636
max        0.790000           -30.010000         1.500000    0.00000  7.000000}


Разница между A1 и A6:
{'len': 84966, 'diff':       resultRunTime  resultSectionalTime  resultDogWeight  raceGoing  \
mean       0.621022              0.00745        -2.130492   2.627718   
min        0.170000              0.00000         0.000000 -60.000000   
max     

In [51]:
output_df = pd.concat({k: v['statistics'] for k, v in stats.items()}, axis=1)

import json

with open(f'{DIST}_statistics.json', 'w') as f:
    json.dump({k: {'count': v['count'], 'statistics': v['statistics'].to_dict()} for k, v in stats.items()}, f, indent=4)

In [55]:
diff_df = pd.concat({k: v['diff'] for k, v in comparison_results.items()}, axis=1)
with open(f'{DIST}_comparison_results.json', 'w') as f:
        json.dump({k: {'count': v['len'], 'diff': v['diff'].to_dict()} for k, v in comparison_results.items()}, f, indent=4)

In [56]:
grades

array(['A4', 'A8', 'A6', 'A2', 'A5', 'A3', 'A1', 'A7', 'HP', 'A9', 'OR',
       'A10', 'IT', 'P4', 'P3', 'B1', 'P1', 'P6', 'B5', 'IV', 'P2', 'B6',
       'B4', 'P5', 'S2', 'H1', 'D4', 'H2', 'KS', 'P8', 'B8', 'B7', 'P7',
       'P9', 'B3', 'D5', 'T3', 'B2', 'D2', 'T2', 'T1', 'S1', 'OR1', 'OR3',
       'OR2', 'P10'], dtype=object)

In [64]:
to_drop = ['OR', 'IT', 'P4', 'P3', 'P1', 'P6', 'IV', 'P2', 'P5', 'S2', 'H1', 'D4', 'H2', 'KS', 'P8', 'P7', 'P9', 'D5', 'T3', 'D2', 'T2', 'T1', 'S1', 'P10']
testing_full_dropped = testing_full[~testing_full['raceClass'].isin(to_drop)]
testing_full_dropped = testing_full_dropped[~testing_full_dropped['race_grade_1'].isin(to_drop)]
testing_full_dropped = testing_full_dropped[~testing_full_dropped['race_grade_2'].isin(to_drop)]
testing_full_dropped = testing_full_dropped[~testing_full_dropped['race_grade_3'].isin(to_drop)]
testing_full_dropped = testing_full_dropped[~testing_full_dropped['race_grade_4'].isin(to_drop)]
testing_full_dropped = testing_full_dropped[~testing_full_dropped['race_grade_5'].isin(to_drop)]

In [65]:
print(len(testing_full))
print(len(testing_full_dropped))

766261
668013


# Отделение

In [94]:
testing_full_dropped.dropna(subset=['dogName'], axis=0, inplace=True)

In [95]:
testing_full_dropped = testing_full_dropped.replace(['', ' ', None, 'NaN'], np.nan)

In [96]:
def convert_dist_by(dist_by):
    shorthand_mapping = {
        'HD': 0.1,
        'SH': 0.2,
        'NK': 0.3,
        'DIS': 30.0,
        'DH': 0.0,
        'DNF': 50,
    }

    if dist_by is None or pd.isna(dist_by) or dist_by == '.':
        return None
    else:
        try:
            if dist_by in shorthand_mapping:
                return shorthand_mapping[dist_by]
            parts = dist_by.split()
            if len(parts) == 1:  # If there's only one value in the string
                if '/' in parts[0]:  # If it's a fraction (e.g., '3/4')
                    num, denom = parts[0].split('/')
                    return float(num) / float(denom)
                else:  # If it's a whole number (e.g., '1')
                    return float(parts[0])
            elif len(parts) == 2:  # If there are two parts (e.g., '1 3/4')
                whole_num = float(parts[0])
                num, denom = parts[1].split('/')
                return whole_num + (float(num) / float(denom))
            else:
                raise ValueError(f"Unexpected format: {dist_by}")
        except Exception as e:
            print(f"Error converting distance: {dist_by}, {e}")
            return None

In [102]:
for i in range(5):
    testing_full_dropped.loc[:, f'by_{i+1}'] = testing_full_dropped[f'by_{i+1}'].apply(convert_dist_by).round(2)

In [104]:
def set_adv_lagg(pos, by):
    if by is None or pd.isna(by):
        return None
    result = np.round(by * 0.8, 2) if pos == 1 else np.round(by * -0.8, 2)
    print(f"pos: {pos}, by: {by}, result: {result}")  # Для отладки
    return result

In [105]:
for i in range(5):
    testing_full_dropped[f'by_{i+1}'] = testing_full_dropped.apply(lambda row: set_adv_lagg(row[f'finished_{i+1}'], row[f'by_{i+1}']), axis=1)

In [106]:
testing_full_dropped = testing_full_dropped.dropna(subset=['dist_1', 'dist_2', 'dist_3'], axis=0)

In [107]:
def convert_dist_to_int(dist):
    if isinstance(dist, str):
        cleaned_dist = re.sub(r'[^\d]', '', dist)
        if cleaned_dist:
            return int(cleaned_dist)
        else:
            return np.nan
    elif isinstance(dist, float) and pd.isna(dist):
        return np.nan
    else:
        return dist

In [108]:
for i in range(5):
    testing_full_dropped.loc[:, f'dist_{i+1}'] = testing_full_dropped[f'dist_{i+1}'].apply(lambda dist: convert_dist_to_int(dist))

In [109]:
for i in range(5):
    testing_full_dropped[f'odds_{i+1}'] = testing_full_dropped[f'price_nums_{i+1}'] / testing_full_dropped[f'price_dens_{i+1}']
    

In [110]:
def convert_forecast(sp):
    if isinstance(sp, str):
        sp = re.sub(r'[^\d/]', '', sp)
        if sp != '':
            num, den = sp.split('/')
            return int(num) / int(den)
        else:
            return np.nan
    elif isinstance(sp, float) and pd.isna(sp):
        return np.nan
    else:
        return sp

In [111]:
testing_full_dropped.loc[:, 'forecast'] = testing_full_dropped['forecast'].apply(lambda sp: convert_forecast(sp))

for i in range(5):
    testing_full_dropped.loc[:, f'odds_{i+1}'] = testing_full_dropped[f'odds_{i+1}'].apply(lambda sp: convert_forecast(sp))

In [112]:
def convert_signed_string_to_number(gng):
    try:
        return int(gng.strip())
    except ValueError:
        return np.nan

In [113]:
for i in range(5):
    testing_full_dropped.loc[:, f'going_{i+1}'] = pd.to_numeric(testing_full_dropped[f'going_{i+1}'], errors='coerce').fillna(0).astype('int32')

In [114]:
import joblib
def save_encoder(encoder, dist):
    dir_name = os.path.join("grayhound", "encoders")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(encoder, os.path.join(dir_name, f"encoder_{dist}.pkl"))

In [115]:
testing_full_dropped = testing_full_dropped.dropna(subset=['resultPosition'])

In [116]:
from sklearn.preprocessing import OrdinalEncoder

columns_cat = ['raceClass', 'race_grade_1', 'race_grade_2', 'race_grade_3', 'race_grade_4', 'race_grade_5', 'trackName']
testing_cat = testing_full_dropped[columns_cat]

encoder = OrdinalEncoder()
encoder.fit(testing_cat)

save_encoder(encoder, 480)

testing_encoded = encoder.transform(testing_cat)
testing_encoded_df = pd.DataFrame(testing_encoded, columns=columns_cat)
testing_full_dropped[columns_cat] = testing_encoded_df

# -------------------------------------

In [117]:
to_drop = ['dogName', 'raceDate', 'raceTime', 
           'resultBtnDistance', 'resultPriceDenominator', 
           'resultPriceNumerator', 'resultRunTime', 
           'resultSectionalTime', 'resultDogWeight',
           'raceGoing', 'price_dens_1', 'price_dens_2', 'price_dens_3', 'price_dens_4', 'price_dens_5', 
           'price_nums_1', 'price_nums_2', 'price_nums_3', 'price_nums_4', 'price_nums_5',
           'resultComment', 'comments_1', 'comments_2', 'comments_3', 'comments_4', 'comments_5']

testing_full_dropped = testing_full_dropped.drop(to_drop, axis=1)
testing_full_dropped = testing_full_dropped.dropna(subset=['resultPosition'])

In [118]:
def save_imputer(imputer, dist):
    dir_name = os.path.join("grayhound", "imputers")
    os.makedirs(dir_name, exist_ok=True)
    joblib.dump(imputer, os.path.join(dir_name, f"imputer_{dist}.pkl"))

In [120]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3, weights='distance', keep_empty_features=False)
imputer.fit(testing_full_dropped)

save_imputer(imputer, 480)

X = imputer.transform(testing_full_dropped)

In [121]:
testing_ready = pd.DataFrame(X, columns=testing_full_dropped.columns)

ValueError: Shape of passed values is (587786, 51), indices imply (587786, 56)

In [None]:
for i in range(5):
    testing_ready.loc[testing_ready[f'finished_{i+1}'] == 1, f'by_{i+1}'] *= -1

In [None]:
for i in range(5):
    testing_ready.loc[:, f'dist_{i+1}'] = testing_ready.loc[:, f'dist_{i+1}'].round()
    testing_ready.loc[:, f'finished_{i+1}'] = testing_ready.loc[:, f'finished_{i+1}'].round()
    testing_ready.loc[:, f'trap_{i+1}'] = testing_ready.loc[:, f'trap_{i+1}'].round()

# ------------------------

In [287]:
from sklearn.ensemble import RandomForestClassifier

In [288]:
testing_train = testing_ready.drop(['raceDistance', 'trackName', 'dist_1', 'dist_2', 'dist_3', 'dist_4', 'dist_5'], axis=1)

In [293]:
testing_train.to_csv(r'C:\machine learning\gb_greyhound\grayhound\data\test\TO_CONTINUE.csv')

In [3]:
testing_train = pd.read_csv(r'C:\machine learning\gb_greyhound\grayhound\data\test\TO_CONTINUE.csv')

In [4]:
from sklearn.model_selection import train_test_split

y = testing_train['resultPosition']
X = testing_train.drop(['resultPosition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [221]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
testing_sclaed = scaler.fit_transform(X_train)

In [230]:
from sklearn.preprocessing import MinMaxScaler

min_max = MinMaxScaler()
testing_min_max = min_max.fit_transform(X_train)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [5, 10, 15, 25, 44]},
    {'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_features': [5, 10, 15, 25, 44]}
]

model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, 
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

In [147]:
grid_search.fit(X, y)

In [148]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 10, 'n_estimators': 30}

In [149]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

2.293614216529695 {'max_features': 5, 'n_estimators': 3}
2.1564952081930397 {'max_features': 5, 'n_estimators': 10}
2.0912015732146845 {'max_features': 5, 'n_estimators': 30}
2.2761022859685123 {'max_features': 10, 'n_estimators': 3}
2.139694017414631 {'max_features': 10, 'n_estimators': 10}
2.0745004263145095 {'max_features': 10, 'n_estimators': 30}
2.2662990417044164 {'max_features': 15, 'n_estimators': 3}
2.12480769528505 {'max_features': 15, 'n_estimators': 10}
2.0689174008607703 {'max_features': 15, 'n_estimators': 30}
2.2648915205342854 {'max_features': 25, 'n_estimators': 3}
2.1227849301717194 {'max_features': 25, 'n_estimators': 10}
2.0776708014669447 {'max_features': 25, 'n_estimators': 30}
2.259771392306807 {'max_features': 44, 'n_estimators': 3}
2.126517715471941 {'max_features': 44, 'n_estimators': 10}
2.0688613037793804 {'max_features': 44, 'n_estimators': 30}
2.273012244553945 {'bootstrap': False, 'max_features': 5, 'n_estimators': 3}
2.1396108101780853 {'bootstrap': Fals

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=44),
    }

forest_reg = RandomForestClassifier(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=20, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X, y)

In [13]:
rnd_search.best_params_

{'max_features': 2, 'n_estimators': 192}

In [14]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

2.4348628209065906 {'max_features': 39, 'n_estimators': 180}
2.3714949263556107 {'max_features': 29, 'n_estimators': 15}
2.4317992789149283 {'max_features': 43, 'n_estimators': 72}
2.3818483200803637 {'max_features': 21, 'n_estimators': 103}
2.3846193682129755 {'max_features': 19, 'n_estimators': 75}
2.3376220129814422 {'max_features': 11, 'n_estimators': 88}
2.4253978829330034 {'max_features': 36, 'n_estimators': 104}
2.3968088693390626 {'max_features': 24, 'n_estimators': 131}
2.4065007959324136 {'max_features': 22, 'n_estimators': 53}
2.120148008868904 {'max_features': 2, 'n_estimators': 88}
2.4063698619406018 {'max_features': 30, 'n_estimators': 38}
2.1073016926369927 {'max_features': 2, 'n_estimators': 192}
2.3759072188515975 {'max_features': 21, 'n_estimators': 161}
2.3288964597621717 {'max_features': 12, 'n_estimators': 58}
2.399822122363429 {'max_features': 22, 'n_estimators': 89}
2.392485726021249 {'max_features': 27, 'n_estimators': 59}
2.440877087969891 {'max_features': 42, 

In [8]:
final_model = rnd_search.best_estimator_

In [9]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(final_model, 
                         X_test, y_test,
                         scoring="neg_mean_squared_error", cv=10)
forest_scores = np.sqrt(-scores)

In [10]:
def display_scores(scores):
    print("Scores' sum:", scores),
    print("Mean:", scores.mean()),
    print("Std:", scores.std())

display_scores(forest_scores)

Scores' sum: [2.11909794 2.10991665 2.10953783 2.11138392 2.12827092 2.09466283
 2.1248259  2.10960097 2.10851152 2.10941154]
Mean: 2.1125220032263377
Std: 0.00901602545946166


In [11]:
import joblib
import os

def save_model(model, dist):
    file_name = f"rnd_frst_mdl_{dist}.pkl"
    file_path = os.path.join("grayhound", "models", "random_forest_class")
    os.makedirs(file_path, exist_ok=True)
    joblib.dump(model, os.path.join(file_path, file_name))

In [12]:
save_model(final_model, 480)