In [18]:
from collections import Counter, OrderedDict
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score

from imblearn.under_sampling import RandomUnderSampler

from hyperopt import hp
from hyperopt.pyll.base import scope

from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from ray.tune.sklearn import TuneSearchCV

import tools

In [6]:
# Run n iterations of hyperparameter tuning
# Run Boruta
# Repeat until the accuracy measure doesn't improve for n loops
# Save the best parameters and features

In [4]:
# Read the raw dataset from the CSV file in the working directory.
water = pd.read_csv(filepath_or_buffer='water_potability.csv')

In [5]:
# Feature matrix: the six columns used for modelling.
X = water.loc[:, [
    'Hardness',
    'Solids',
    'Chloramines',
    'Conductivity',
    'Organic_carbon',
    'Turbidity',
]]
# Target column.
y = water.loc[:, 'Potability']

# Seeded random undersampling; X_res / y_res are the resampled copies used downstream.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

In [14]:
# Hyperopt search space for the XGBoost classifier.
# Tree count, depth and learning rate are spelled out individually; the four
# row/column sampling fractions all use the same 0.3-1.0 grid in steps of 0.1.
xgboost_params_hyperopt = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 20, 1000, 10)),
    max_depth=hp.uniformint('max_depth', 2, 16),
    learning_rate=hp.uniform('learning_rate', 0.05, 1.0),
)
xgboost_params_hyperopt.update(
    (fraction_name, hp.quniform(fraction_name, 0.3, 1.0, 0.1))
    for fraction_name in ('subsample',
                          'colsample_bytree',
                          'colsample_bylevel',
                          'colsample_bynode')
)
# Regularisation terms are currently excluded from the search:
#    'reg_alpha': hp.quniform('reg_alpha', 1, 100, .5),
#    'reg_lambda': scope.int(hp.quniform('reg_lambda', 1, 100, 1))

# Hyperopt search space for the LightGBM classifier.
# NOTE(review): LightGBM's 'rf' boosting type needs bagging or feature
# subsampling to be active; subsample_freq is not part of this space, so
# confirm that 'rf' trials can actually run.
lgbm_params_hyperopt = dict(
    boosting_type=hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf']),
    max_depth=hp.uniformint('max_depth', 2, 25),
    learning_rate=hp.uniform('learning_rate', 0.05, 1.0),
    n_estimators=scope.int(hp.quniform('n_estimators', 20, 1000, 10)),
    subsample=hp.quniform('subsample', 0.3, 1.0, 0.1),
    colsample_bytree=hp.quniform('colsample_bytree', 0.3, 1.0, 0.1),
)
# Candidate parameters that are not searched yet:
#    'num_leaves': hp.uniformint('num_leaves', 2, 25),
#    'subsample_for_bin',
#    'class_weight',
#    'min_split_gain',
#    'min_child_weight',
#    'min_child_samples',
#    'subsample_freq': hp.uniformint('subsample_freq', 2, 16),
#    'reg_alpha': hp.quniform('reg_alpha', 1, 100, .5),
#    'reg_lambda': scope.int(hp.quniform('reg_lambda', 1, 100, 1))

# Hyperopt search space for the CatBoost classifier.
catboost_params_hyperopt = dict(
    iterations=scope.int(hp.quniform('iterations', 10, 1000, 10)),
    learning_rate=hp.uniform('learning_rate', 0.05, 1.0),
    depth=hp.uniformint('depth', 2, 16),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', .0001, 1000.0),
    grow_policy=hp.choice('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
)
# Candidate parameters that are not searched yet:
#    'random_strength',
#    'bagging_temperature',
#    'border_count': hp.uniformint('border_count', 2, 254),
#    'has_time',
#    'rsm': hp.quniform('rsm', 0.3, 1.0, 0.1),
#    'boosting_type': hp.choice('boosting_type', ['Ordered', 'Plain'])
#    'min_data_in_leaf': hp.uniformint('min_data_in_leaf', 2, 64),
#    'max_leaves': hp.uniformint('max_leaves', 2, 64)   (only with lossguide tree growth)


In [15]:
# Base estimators handed to the hyperparameter searches. All three share the
# same seed so that runs are comparable.

# XGBoost: use_label_encoder=False is an XGBoost-specific switch.
clf_xgb = XGBClassifier(random_state=42,
                        verbosity=0,
                        use_label_encoder=False)

# LightGBM: use_label_encoder is not a LightGBM parameter, so it is not passed
# here (LightGBM would forward it to the booster as an unknown parameter).
clf_lgbm = LGBMClassifier(objective='binary',
                          force_col_wise=True,
                          random_state=42,
                          verbosity=0)

# CatBoost: verbose=0 silences per-iteration training output.
clf_cb = CatBoostClassifier(random_state=42,
                            verbose=0)


In [16]:
# Cross-validation scheme: plain 5-fold stratified split.
# Repeated variant kept for reference:
#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
cv = StratifiedKFold(n_splits=5)

# Hyperopt-driven search over the CatBoost space. Four metrics are recorded
# per trial; the final model is refit on the configuration with the best F1.
search = TuneSearchCV(
    estimator=clf_cb,
    param_distributions=catboost_params_hyperopt,
    search_optimization='hyperopt',
    n_trials=5,
    cv=cv,
    scoring={metric: metric for metric in ("f1", "precision", "recall", "accuracy")},
    refit='f1',
    return_train_score=False,
    early_stopping=False,
    verbose=1,
    n_jobs=8,
)

In [17]:
# Run the search on the undersampled data; the fitted search object is the cell output.
search.fit(X=X_res, y=y_res)

TuneSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             early_stopping=False,
             estimator=<catboost.core.CatBoostClassifier object at 0x7f3b23ab3af0>,
             loggers=[<class 'ray.tune.logger.JsonLogger'>,
                      <class 'ray.tune.logger.CSVLogger'>],
             mode='max', n_jobs=8, n_trials=5,
             param_distributions={'depth': <hyperopt.pyll.base.Apply object at 0x7f3b23b404...
                                  'l2_leaf_reg': <hyperopt.pyll.base.Apply object at 0x7f3b23b40610>,
                                  'learning_rate': <hyperopt.pyll.base.Apply object at 0x7f3b23b40340>},
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=binary),
                      'precision': make_scorer(precision_score, average=binary),
                      'recall': make_scorer(recall_score, average=binary)},
             search_o

In [15]:
# Summarise the fitted search with the project-local `tools.get_results` helper.
# NOTE(review): the helper is defined outside this notebook; presumably it
# collects the best parameters and the refit-metric scores — confirm in tools.py.
results = tools.get_results(search)
# Bare expression so the notebook displays the summary.
results

{'params': {'depth': 13,
  'grow_policy': 'SymmetricTree',
  'iterations': 220,
  'l2_leaf_reg': 63.720346615093185,
  'learning_rate': 0.7308730412159931},
 'refit_metric': 'f1',
 'best_refit_index': 4,
 'f1_refit': 0.4827106299356596,
 'precision_refit_best': 0.4938983327614458,
 'recall_refit_best': 0.4726102941176471,
 'accuracy_refit_best': 0.49412992294520547}

In [15]:
# Bayesian search over `params` for an XGBoost random-forest classifier,
# scored by F1 on a repeated stratified 5-fold split (2 repeats = 10 fits per candidate).
# NOTE(review): `params` is defined in later cells and BayesSearchCV is not
# imported (the skopt import is commented out further down), so this cell
# depends on state from an earlier kernel session.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
#cv = StratifiedKFold(n_splits=5)

search = BayesSearchCV(estimator=XGBRFClassifier(use_label_encoder=False,verbosity=0,), 
                       search_spaces=params, cv=cv, scoring='f1', refit='f1', n_iter=500, 
                       n_jobs=8, verbose=1, random_state=42, return_train_score=True)
search.fit(X_res.values, y_res)
# report the best result
print(search.best_score_)
print(search.best_params_)

# Keep the winning configuration for the cells that rebuild the model.
bp = search.best_params_
# Stored as `bp_names` rather than `hp`: rebinding `hp` would shadow the
# hyperopt import that the search-space cell relies on.
bp_names = list(bp.keys())
v = list(bp.values())

Fitting 10 folds for each of 1 candidates, totalling 10 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


KeyboardInterrupt: 

In [None]:
# Rebuild the XGBoost random forest from the best configuration found by the search.
# `bp` holds the tuned values for XGBRFClassifier, so it is applied as-is via
# set_params instead of being mapped onto scikit-learn RandomForestClassifier
# argument names (criterion, min_samples_split, ...), which XGBoost does not define.
clf = XGBRFClassifier(use_label_encoder=False,
                      verbosity=0,
                      n_jobs=-1)
clf.set_params(**bp)

In [7]:
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y)
label_encoded_y = label_encoder.transform(y)

In [94]:
# Random forest rebuilt from the tuned values stored in `bp`.
# Every tuned argument is looked up by name; the remaining three are fixed.
clf = RandomForestClassifier(
    **{
        tuned_name: bp[tuned_name]
        for tuned_name in (
            "n_estimators",
            "criterion",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "min_weight_fraction_leaf",
            "max_features",
            "min_impurity_decrease",
            "bootstrap",
        )
    },
    max_leaf_nodes=None,
    oob_score=False,
    n_jobs=-1,
)

In [12]:
# Baseline: 5-fold F1 of the untuned XGBoost classifier on the undersampled data.
cross_val_score(estimator=clf_xgb, X=X_res, y=y_res, scoring='f1', cv=5)

array([0.5440613 , 0.45360825, 0.49206349, 0.45714286, 0.4852071 ])

In [35]:
# Hand-configured random forest (no tuned values involved).
clf = RandomForestClassifier(
    # forest size and split criterion
    n_estimators=1000,
    criterion="gini",
    # tree growth limits
    max_depth=None,
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    # sampling of features and rows
    max_features=None,
    bootstrap=True,
    oob_score=False,
    # use every available core
    n_jobs=-1,
)

In [88]:
# Random-forest search space as an ordered mapping: tuples are (low, high)
# ranges, lists are categorical choices.
params = OrderedDict([
    ("n_estimators", (20, 1000)),
    ("criterion", ["gini", "entropy"]),
    ("max_depth", (1, 9)),
    ("min_samples_split", (2, 100)),
    ("min_samples_leaf", (1, 100)),
    ("min_weight_fraction_leaf", (0.0, 0.5)),
    ("max_features", ["sqrt", "log2", None]),
    # ("max_leaf_nodes", None),  -- not searched
    ("min_impurity_decrease", (0.0, 1.0)),
    ("bootstrap", [True, False]),
])

In [32]:
# Hold out 30% of the undersampled data for validation.
# The split is shuffled and stratified on the target so both parts keep the
# class balance; an unshuffled split takes the last 30% of rows verbatim, which
# is unrepresentative if the resampled rows are grouped by class.
# random_state makes the split reproducible.
X_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(
    X_res.values, y_res, test_size=0.3, shuffle=True, stratify=y_res, random_state=42)

In [18]:
# Hand-configured XGBoost random forest for the binary target.
# NOTE(review): XGBoost's random-forest documentation uses learning_rate=1 for
# forests; 0.1 is kept as written — confirm it is intentional.
clf = XGBRFClassifier(n_estimators=1000,
                      max_depth=None, 
                      #max_leaves=0, 
                      #max_bin=0,
                      grow_policy='depthwise',
                      learning_rate=0.1,
                      # Classification objective; a squared-error regression
                      # objective does not produce class probabilities.
                      objective='binary:logistic',
                      booster='gbtree',
                      tree_method='exact',
                      #gamma=0.0,
                      #min_child_weight=0.0,
                      #max_delta_step=0.0,
                      subsample=0.7,
                      colsample_bytree=0.7,
                      colsample_bylevel=0.7,
                      colsample_bynode=0.7,
                      #reg_alpha=0.0,
                      #reg_lambda=0.0,
                      #scale_pos_weight=0.0,
                      use_label_encoder=False,
                      verbosity=0,       
                      n_jobs=-1)

In [70]:
# Random-forest search space: tuples are (low, high) ranges, lists are
# categorical choices.
params = dict(
    n_estimators=(20, 1000),
    criterion=["gini", "entropy"],
    max_depth=(1, 9),
    min_samples_split=(2, 100),
    min_samples_leaf=(1, 100),
    min_weight_fraction_leaf=(0.0, 0.5),
    max_features=["sqrt", "log2", None],
    # max_leaf_nodes=None,  -- not searched
    min_impurity_decrease=(0.0, 1.0),
    bootstrap=[True, False],
)

In [122]:
#xgboost search space
# Tuples are (low, high) ranges, lists are categorical choices.
# Entries are written into `xgboost_params_bayesian` itself, so the space does
# not depend on (or inherit keys from) whatever `params` held before.
xgboost_params_bayesian = OrderedDict()
xgboost_params_bayesian["n_estimators"] = (20, 1000)
xgboost_params_bayesian["max_depth"] = (1, 50)
#xgboost_params_bayesian["max_leaves"] = (1, 9)
#xgboost_params_bayesian["max_bin"] = (2, 100)
xgboost_params_bayesian["grow_policy"] = ['depthwise', 'lossguide']
xgboost_params_bayesian["learning_rate"] = (0.0001, 0.99)
#xgboost_params_bayesian["objective"] = 
#xgboost_params_bayesian["booster"] = ["gbtree", "gblinear", "dart"]
xgboost_params_bayesian["tree_method"] = ["exact", "approx", "hist"]
xgboost_params_bayesian["gamma"] = (0.0, 1000.0)
xgboost_params_bayesian["min_child_weight"] = (0.0, 500.0)
#xgboost_params_bayesian["max_delta_step"] = (0.0, 1.0)
# Lower bound is 0.1, not 0.0: XGBoost requires subsample > 0.
xgboost_params_bayesian["subsample"] = (0.1, 1.0)
#xgboost_params_bayesian["sampling_method"] = ['uniform', 'gradient']
xgboost_params_bayesian["colsample_bytree"] = (0.1, 1.0)
xgboost_params_bayesian["colsample_bylevel"] = (0.1, 1.0)
xgboost_params_bayesian["colsample_bynode"] = (0.1, 1.0)
xgboost_params_bayesian["reg_alpha"] = (0.0, 100.0)
xgboost_params_bayesian["reg_lambda"] = (0.0, 100.0)
#xgboost_params_bayesian["scale_pos_weight"] = (0.0, 1.0)
#xgboost_params_bayesian["base_score"] = (0.0, 1.0)

# The search cell reads its space from `params`; point it at this space.
params = xgboost_params_bayesian

In [14]:
#import skopt
#from skopt import BayesSearchCV

In [76]:
# Small XGBoost search space: lists are categorical choices, tuples are
# (low, high) ranges.
params = dict(
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=(0.1, 1.0),
    colsample_bytree=(0.1, 1.0),
    max_depth=[3, 4, 5],
)