In [2]:
import pandas as pd
import numpy as np

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score

### Выгрузим данные

In [5]:
train = pd.read_csv('train.csv')

In [4]:
test = pd.read_csv('test.csv')

In [6]:
train = train.dropna(subset=['price'])
test = test.dropna(subset=['price'])

In [7]:
train = train.fillna(train.median())
test = test.fillna(train.median())

In [8]:
train.drop(['id', 'date'], axis=1, inplace=True)
X_train = train.drop("price",axis=1).values
y_train = train["price"].values

In [9]:
test.drop(['id', 'date'], axis=1, inplace=True)
X_test = test.drop("price",axis=1).values
y_test = test["price"].values

In [81]:
rf = RandomForestRegressor()
y_pred = rf.fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.09656161092104198

Подбор гиперпараметров с помощью RandomizedSearch

In [76]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 20)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = list(range(1, 11))
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [77]:
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 30,
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = 3)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   27.2s
[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed:   59.7s finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=30, n_jobs=3,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110, 120,
                                                      130, 140, 150, 160, 170,
                                                      180, 190, 200, None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'n_estimators': [10, 20, 30, 40, 50, 60,
                                                         70, 80, 90, 100, 110,
                                                         120, 130, 140, 150,
       

In [78]:
rf_random.best_params_

{'n_estimators': 190,
 'min_samples_leaf': 6,
 'max_features': 'log2',
 'max_depth': 160,
 'bootstrap': False}

In [79]:
y_pred = rf_random.best_estimator_.predict(X_test)

In [80]:
r2_score(y_test, y_pred)

0.13341085614906156

###  Используем hyper_opt

Для этого нужно определить 4 вещи:
    
    Оптимизируемую функцию
    Пространство параметров
    Алгоритм поиска
    Кол-во шагов оптимизации

In [19]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 2.4 MB/s eta 0:00:01
Collecting py4j
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 8.6 MB/s eta 0:00:01
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.5


In [62]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from sklearn.model_selection import train_test_split, cross_val_score

In [33]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.20, random_state=3)

In [68]:
def objective(search_space):
    model = RandomForestRegressor(**search_space, random_state=42)
    accuracy = cross_val_score(model, X_train, y_train, cv=3, scoring='r2').mean()
    return {'loss': -accuracy, 'status': STATUS_OK}

In [69]:
search_space = {
    'n_estimators': hp.randint('n_estimators',10,200),
    'max_depth': hp.randint('max_depth',10,200),           
    'min_samples_leaf': hp.randint('min_samples_leaf',1,10),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2']),
    'bootstrap': hp.choice('bootstrap',[True, False])
             }

In [70]:
algorithm=tpe.suggest  # использовать Tree of Parzen Estimators


In [71]:
best_params = fmin(
  fn=objective,
  space=search_space,
  algo=algorithm,
  max_evals=30
)

100%|██████████| 30/30 [03:07<00:00,  6.27s/trial, best loss: -0.13597090846141854]


In [72]:
space_eval(search_space, best_params)

{'bootstrap': False,
 'max_depth': 108,
 'max_features': 'log2',
 'min_samples_leaf': 9,
 'n_estimators': 169}

In [73]:
best_params

{'bootstrap': 1,
 'max_depth': 108,
 'max_features': 2,
 'min_samples_leaf': 9,
 'n_estimators': 169}

In [74]:
y_pred = RandomForestRegressor(**best_params).fit(X_train, y_train).predict(X_test)

In [75]:
r2_score(y_test, y_pred)

0.1309053772551546