In [None]:
from __future__ import division

import gc
from time import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin, space_eval
from scipy.stats import randint as sp_randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split

# Wall-clock reference; the elapsed time is reported at the end of the script.
start = time()

# Load the training CSV and replace every missing value with -1.
train = pd.read_csv('D:/Driver/ohe_train_v2.csv').fillna(-1)
# test = pd.read_csv('D:/Driver/ohe_test_v2.csv',na_values=-1)

# Remove every column whose name starts with 'ps_calc_'.
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(unwanted, axis='columns')
# test = test.drop(unwanted, axis=1)

# Everything except the row identifier and the label is a model input.
features = train.columns.drop(['id', 'target'])
X = train[features].values
y = train['target'].values

# Stratified 80/20 split: the 20% part is the hold-out set that objective()
# scores each candidate model on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def objective(space):
    """Score one hyperparameter sample for the hyperopt search.

    A GradientBoostingClassifier configured from ``space`` is fitted on the
    module-level ``X_train`` / ``y_train`` and evaluated with ROC AUC on the
    module-level ``X_test`` / ``y_test``.

    Args:
        space: Mapping holding the sampled values for ``n_estimators``,
            ``subsample``, ``max_features``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.

    Returns:
        dict: ``{'loss': 1 - auc, 'status': STATUS_OK}``. hyperopt minimises
        the loss, so a higher AUC yields a lower loss.
    """
    tuned = ('n_estimators', 'subsample', 'max_features', 'max_depth',
             'min_samples_split', 'min_samples_leaf')
    params = {name: space[name] for name in tuned}

    # Fixed seed so that two trials differ only by their hyperparameters.
    model = GradientBoostingClassifier(random_state=2017, verbose=5, **params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score of the second class.
    second_class_proba = model.predict_proba(X_test)[:, 1]
    holdout_auc = roc_auc_score(y_test, second_class_proba)
    return {'loss': 1 - holdout_auc, 'status': STATUS_OK}

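# Alternative scoring for objective() (disabled): replace the single hold-out
# evaluation with the mean ROC AUC over a 5-fold stratified cross-validation.
# NOTE: StratifiedKFold only uses random_state when shuffle=True is also set.
#     skf = StratifiedKFold(n_splits=5, random_state=2017)
#     scores = cross_val_score(clf, X, y,
#                              cv=skf, scoring='roc_auc', n_jobs=-1)
#     return {'loss': 1 - scores.mean(), 'status': STATUS_OK}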


# Hyperparameter search space handed to hyperopt. Each key matches the
# GradientBoostingClassifier argument that objective() reads from the sample.
space = dict(
    # 200, 300, ..., 1000 boosting stages.
    n_estimators=hp.choice('n_estimators',
                           np.arange(200, 1100, 100, dtype=int)),
    # Tree depth from 3 to 15 inclusive.
    max_depth=hp.choice('max_depth', np.arange(3, 16, dtype=int)),
    # Fraction of features per split: 0.4 to 1.0 in steps of 0.05.
    max_features=hp.quniform('max_features', 0.4, 1.0, 0.05),
    # Integer sample counts: 2..500 to split a node, 1..500 per leaf.
    min_samples_split=hp.choice('min_samples_split', np.arange(2, 501, 1)),
    min_samples_leaf=hp.choice('min_samples_leaf', np.arange(1, 501, 1)),
    # Row subsampling ratio per stage: 0.6 to 1.0 in steps of 0.05.
    subsample=hp.quniform('subsample', 0.6, 1.0, 0.05),
)

# Run the TPE search over ``space``, minimising the loss returned by
# objective(). ``trials`` keeps the record of every evaluation.
trials = Trials()
best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=1,  # change: one evaluation is only a smoke test
            trials=trials)

# For hp.choice dimensions fmin reports the *index* of the selected option,
# not the option itself (e.g. n_estimators would show 3 instead of 500), so
# printing ``best`` directly is misleading. space_eval maps the result back
# onto the search space to recover the actual hyperparameter values.
best_params = space_eval(space, best)
print(best_params)
print('Time: {} mins'.format((time() - start) / 60))