# Задание

Попробуйте настроить параметры градиентного бустинга на этом датасете, и посмотрите, как изменение отдельных параметров влияет на результат.

#### Конечная цель - сказать значение параметров xgb, при которых будет самое хорошее качество на кроссвалидации

In [69]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import pandas
import numpy as np
import scipy
import signal

bioresponce = pandas.read_csv('../../ml2018jan_feb/seminar01/bioresponse.csv', header=0, sep=',')
bioresponce_target = bioresponce.Activity.values
bioresponce_data = bioresponce.iloc[:, 1:]

#def signal_handler(signum, frame):
#    raise Exception("Timed out!")

def estimate_params(params):
#    signal.signal(signal.SIGALRM, signal_handler)
#    signal.alarm(30)
    estimator = xgb.XGBClassifier(**params)

    try:
        score = np.mean(cross_val_score(
            estimator, bioresponce_data, bioresponce_target,
            scoring = 'accuracy', cv = 3
        ))
    except:
        score = None
    
    return score

In [11]:
print(estimate_params({
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'n_estimators': 100, 
    'min_child_weight': 3,
    'seed': 42
}))

0.791254356515


In [39]:
print(estimate_params({
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'n_estimators': 60, 
    'min_child_weight': 3,
    'seed': 42
}))

0.789919744205


In [17]:
print(estimate_params({
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'n_estimators': 100000, 
    'min_child_weight': 3,
    'seed': 42
}))

None


In [18]:
print(estimate_params({
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'n_estimators': 1000, 
    'min_child_weight': 3,
    'seed': 42
}))

0.792053503864


# Подбор параметров

##### Сначала проводится GridSearchCV по отдельным параметрам для того, чтобы приблизительно определить границы,потом наилучший вариант определяется через RandomizedSearchCV

In [26]:
params = {
    'learning_rate': [0.1], 
    'max_depth': [5],
    'n_estimators': np.arange(100, 1000, 100), 
    'min_child_weight': [3],
    'seed': 42
}

In [20]:
model = xgb.XGBClassifier()

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [27]:
grid_test = GridSearchCV(estimator = model, param_grid = params, cv = 3, scoring = 'accuracy')

In [28]:
grid_test.fit(bioresponce_data, bioresponce_target)
print(grid_test.best_params_, grid_test.best_score_)

{'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 800, 'seed': 42} 0.795521194348


In [43]:
values = grid_test.cv_results_['param_n_estimators'].data
scores = grid_test.cv_results_['mean_test_score']
temp = scores.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(scores),0,-1)
np.transpose([values,scores,ranks])

array([[100, 0.7912556651559585, 7],
       [200, 0.7893894961343642, 9],
       [300, 0.7915222607304718, 6],
       [400, 0.7901892828579046, 8],
       [500, 0.7939216209010931, 3],
       [600, 0.7941882164756066, 2],
       [700, 0.7928552386030392, 5],
       [800, 0.7955211943481738, 1],
       [900, 0.7936550253265796, 4]], dtype=object)

In [52]:
params2 = {
    'learning_rate': np.arange(0.01,1,0.01),
    'max_depth': [5],
    'n_estimators': [100], 
    'min_child_weight': [3],
    'seed': [42]
}

In [53]:
grid_test2 = GridSearchCV(estimator = model, param_grid = params2, cv = 3, scoring = 'accuracy')

In [54]:
grid_test2.fit(bioresponce_data, bioresponce_target)
print(grid_test2.best_params_, grid_test2.best_score_)

{'learning_rate': 0.19, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'seed': 42} 0.796054385497


In [66]:
values = grid_test2.cv_results_['param_learning_rate'].data
scores = grid_test2.cv_results_['mean_test_score']
temp = scores.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(scores),0,-1)
np.transpose([values,scores,ranks])

array([[0.01, 0.7739269528125833, 88],
       [0.02, 0.7797920554518795, 66],
       [0.029999999999999999, 0.7864569448147161, 35],
       [0.040000000000000001, 0.7861903492402026, 39],
       [0.050000000000000003, 0.7883231138363103, 24],
       [0.060000000000000005, 0.7899226872833911, 14],
       [0.069999999999999993, 0.7883231138363103, 25],
       [0.080000000000000002, 0.7885897094108237, 23],
       [0.089999999999999997, 0.790989069581445, 8],
       [0.099999999999999992, 0.7912556651559585, 6],
       [0.11, 0.7920554518794988, 5],
       [0.12, 0.7885897094108237, 19],
       [0.13, 0.7907224740069315, 9],
       [0.14000000000000001, 0.7893894961343642, 15],
       [0.15000000000000002, 0.783524393495068, 51],
       [0.16, 0.790455878432418, 11],
       [0.17000000000000001, 0.790455878432418, 12],
       [0.18000000000000002, 0.7952545987736603, 3],
       [0.19, 0.7960543854972008, 1],
       [0.20000000000000001, 0.7885897094108237, 21],
       [0.21000000000000002

In [97]:
params3 = {
    'learning_rate': [0.1],
    'max_depth': [5],
    'n_estimators': [100], 
    'min_child_weight': np.arange(1,10,1),
    #'seed': [42]
}

In [98]:
grid_test3 = GridSearchCV(estimator = model, param_grid = params3, cv = 3, scoring = 'accuracy')

In [99]:
grid_test3.fit(bioresponce_data, bioresponce_target)
print(grid_test3.best_params_, grid_test3.best_score_)

{'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100} 0.791255665156


In [100]:
values = grid_test3.cv_results_['param_min_child_weight'].data
scores = grid_test3.cv_results_['mean_test_score']
temp = scores.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(scores),0,-1)
np.transpose([values,scores,ranks])

array([[1, 0.786990135963743, 7],
       [2, 0.790455878432418, 2],
       [3, 0.7912556651559585, 1],
       [4, 0.7864569448147161, 8],
       [5, 0.7888563049853372, 3],
       [6, 0.7845907757931219, 9],
       [7, 0.7883231138363103, 5],
       [8, 0.7883231138363103, 4],
       [9, 0.7877899226872834, 6]], dtype=object)

In [121]:
params_rand = {
    'learning_rate': scipy.stats.uniform(0.05,0.3),
    'max_depth': scipy.stats.randint(3, 10),
    'n_estimators': scipy.stats.randint(10, 1000, 10), 
    'min_child_weight': scipy.stats.randint(2, 4),
    'seed': [42]
}

In [126]:
%%time
rand_opt = RandomizedSearchCV(model, param_distributions=params_rand, cv=3, random_state=42, scoring = 'accuracy', n_iter = 200).fit(bioresponce_data, bioresponce_target)
print(rand_opt.best_params_, rand_opt.best_score_)

{'learning_rate': 0.12978137748927168, 'max_depth': 7, 'min_child_weight': 2, 'n_estimators': 759, 'seed': 42} 0.795521194348
Wall time: 5h 53min 1s
