In [1]:
import numpy as np
import sklearn
import xgboost as xgb

from sklearn.model_selection import GridSearchCV 
from data_generation import get_data

In [2]:
np.random.seed(seed=123)
N = 1000
y_data, d_data, x_data = get_data(N)

XGBRegressor(objective='reg:logistic') and XGBClassifier(objective='binary:logistic') are the same algorithms under the hood. 

In [3]:
xgb_model_m = xgb.XGBRegressor(objective='reg:logistic', n_estimators=25, max_depth=3)
xgb_model_m.fit(X=x_data, y=d_data)
xgb_model_m.predict(x_data[-15:])

array([0.89653105, 0.53149956, 0.9260376 , 0.73061   , 0.57376504,
       0.79329354, 0.8108675 , 0.93139756, 0.58864594, 0.7045877 ,
       0.85751104, 0.8302279 , 0.6523029 , 0.4519204 , 0.5249612 ],
      dtype=float32)

In [4]:
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic', n_estimators=25, max_depth=3)
xgb_model_m.fit(X=x_data, y=d_data)
xgb_model_m.predict_proba(x_data[-15:])[:,1]

array([0.89653105, 0.53149956, 0.9260376 , 0.73061   , 0.57376504,
       0.79329354, 0.8108675 , 0.93139756, 0.58864594, 0.7045877 ,
       0.85751104, 0.8302279 , 0.6523029 , 0.4519204 , 0.5249612 ],
      dtype=float32)

For our purposes, eval_metric does not change anything.

In [6]:
xgb_model_m = xgb.XGBRegressor(objective='reg:logistic', n_estimators=25, max_depth=3, early_stopping_rounds=10)
xgb_model_m.fit(X=x_data, y=d_data, eval_set=[(x_data, y_data)], verbose=True)
xgb_model_m.predict(x_data[-15:])

[0]	validation_0-rmse:1.25899
[1]	validation_0-rmse:1.24150
[2]	validation_0-rmse:1.22811
[3]	validation_0-rmse:1.21620
[4]	validation_0-rmse:1.20903
[5]	validation_0-rmse:1.19743
[6]	validation_0-rmse:1.19167
[7]	validation_0-rmse:1.18830
[8]	validation_0-rmse:1.18273
[9]	validation_0-rmse:1.17783
[10]	validation_0-rmse:1.17502
[11]	validation_0-rmse:1.17247
[12]	validation_0-rmse:1.17180
[13]	validation_0-rmse:1.17059
[14]	validation_0-rmse:1.16843
[15]	validation_0-rmse:1.16665
[16]	validation_0-rmse:1.16583
[17]	validation_0-rmse:1.16557
[18]	validation_0-rmse:1.16343
[19]	validation_0-rmse:1.16276
[20]	validation_0-rmse:1.16255
[21]	validation_0-rmse:1.16152
[22]	validation_0-rmse:1.16199
[23]	validation_0-rmse:1.16124
[24]	validation_0-rmse:1.16055


array([0.89653105, 0.53149956, 0.9260376 , 0.73061   , 0.57376504,
       0.79329354, 0.8108675 , 0.93139756, 0.58864594, 0.7045877 ,
       0.85751104, 0.8302279 , 0.6523029 , 0.4519204 , 0.5249612 ],
      dtype=float32)

In [7]:
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic', n_estimators=25, max_depth=3, early_stopping_rounds=10)
xgb_model_m.fit(X=x_data, y=d_data, eval_set=[(x_data, y_data)], verbose=True)
xgb_model_m.predict_proba(x_data[-15:])[:,1]

[0]	validation_0-logloss:0.53838
[1]	validation_0-logloss:0.44264
[2]	validation_0-logloss:0.36141
[3]	validation_0-logloss:0.28300
[4]	validation_0-logloss:0.23498
[5]	validation_0-logloss:0.15838
[6]	validation_0-logloss:0.12010
[7]	validation_0-logloss:0.09572
[8]	validation_0-logloss:0.05021
[9]	validation_0-logloss:0.00682
[10]	validation_0-logloss:-0.01535
[11]	validation_0-logloss:-0.03301
[12]	validation_0-logloss:-0.04292
[13]	validation_0-logloss:-0.05150
[14]	validation_0-logloss:-0.06602
[15]	validation_0-logloss:-0.08914
[16]	validation_0-logloss:-0.09661
[17]	validation_0-logloss:-0.09632
[18]	validation_0-logloss:-0.12322
[19]	validation_0-logloss:-0.12919
[20]	validation_0-logloss:-0.13275
[21]	validation_0-logloss:-0.14188
[22]	validation_0-logloss:-0.14079
[23]	validation_0-logloss:-0.14776
[24]	validation_0-logloss:-0.15560


array([0.89653105, 0.53149956, 0.9260376 , 0.73061   , 0.57376504,
       0.79329354, 0.8108675 , 0.93139756, 0.58864594, 0.7045877 ,
       0.85751104, 0.8302279 , 0.6523029 , 0.4519204 , 0.5249612 ],
      dtype=float32)

In [8]:
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic', eval_metric='rmse', n_estimators=25, max_depth=3, early_stopping_rounds=10)
xgb_model_m.fit(X=x_data, y=d_data, eval_set=[(x_data, y_data)], verbose=True)
xgb_model_m.predict_proba(x_data[-15:])[:,1]

[0]	validation_0-rmse:1.25899
[1]	validation_0-rmse:1.24150
[2]	validation_0-rmse:1.22811
[3]	validation_0-rmse:1.21620
[4]	validation_0-rmse:1.20903
[5]	validation_0-rmse:1.19743
[6]	validation_0-rmse:1.19167
[7]	validation_0-rmse:1.18830
[8]	validation_0-rmse:1.18273
[9]	validation_0-rmse:1.17783
[10]	validation_0-rmse:1.17502
[11]	validation_0-rmse:1.17247
[12]	validation_0-rmse:1.17180
[13]	validation_0-rmse:1.17059
[14]	validation_0-rmse:1.16843
[15]	validation_0-rmse:1.16665
[16]	validation_0-rmse:1.16583
[17]	validation_0-rmse:1.16557
[18]	validation_0-rmse:1.16343
[19]	validation_0-rmse:1.16276
[20]	validation_0-rmse:1.16255
[21]	validation_0-rmse:1.16152
[22]	validation_0-rmse:1.16199
[23]	validation_0-rmse:1.16124
[24]	validation_0-rmse:1.16055


array([0.89653105, 0.53149956, 0.9260376 , 0.73061   , 0.57376504,
       0.79329354, 0.8108675 , 0.93139756, 0.58864594, 0.7045877 ,
       0.85751104, 0.8302279 , 0.6523029 , 0.4519204 , 0.5249612 ],
      dtype=float32)

In GridSearchCV, StratifiedKFold is used for classifiers, usual KFold for regressors.

In [9]:
param_grid = {
    'n_estimators': [5, 10, 25, 50],
    'max_depth': [2, 3, 4, 5]
}

In [10]:
xgb_model_m = xgb.XGBRegressor(objective='reg:logistic')
grid_search_m = GridSearchCV(estimator=xgb_model_m, param_grid=param_grid, cv=5, 
                             scoring='neg_brier_score')
grid_search_m.fit(X=x_data, y=d_data)
print(grid_search_m.best_params_)
print(grid_search_m.best_score_)

Traceback (most recent call last):
  File "C:\Users\henry\MA_CausalML\dml\Lib\site-packages\sklearn\model_selection\_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\henry\MA_CausalML\dml\Lib\site-packages\sklearn\metrics\_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\henry\MA_CausalML\dml\Lib\site-packages\sklearn\metrics\_scorer.py", line 344, in _score
    response_method = _check_response_method(estimator, self._response_method)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\henry\MA_CausalML\dml\Lib\site-packages\sklearn\utils\validation.py", line 2106, in _check_response_method
    raise AttributeError(
AttributeError: XGBRegressor 

{'max_depth': 2, 'n_estimators': 5}
nan




In [11]:
xgb_model_m = xgb.XGBRegressor(objective='reg:logistic')
grid_search_m = GridSearchCV(estimator=xgb_model_m, param_grid=param_grid, cv=5, 
                             scoring='neg_mean_squared_error')
grid_search_m.fit(X=x_data, y=d_data)
print(grid_search_m.best_params_)
print(grid_search_m.best_score_)

{'max_depth': 3, 'n_estimators': 10}
-0.19262917532502372


In [14]:
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic')
grid_search_m = GridSearchCV(estimator=xgb_model_m, param_grid=param_grid, cv=5, 
                             scoring='neg_log_loss')
grid_search_m.fit(X=x_data, y=d_data)
print(grid_search_m.best_params_)
print(grid_search_m.best_score_)

{'max_depth': 2, 'n_estimators': 25}
-0.5672723602064291


In [18]:
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic')
grid_search_m = GridSearchCV(estimator=xgb_model_m, param_grid=param_grid, cv=5, 
                             scoring='neg_brier_score')
grid_search_m.fit(X=x_data, y=d_data)
print(grid_search_m.best_params_)
print(grid_search_m.best_score_)

{'max_depth': 2, 'n_estimators': 25}
-0.1925595245332532
