In [1]:
import os
import pickle
import itertools
import pandas as pd
import numpy as np
#from modelgym import model
import functools
import modelgym
from modelgym.util import TASK_CLASSIFICATION
from modelgym.trainer import Trainer
from modelgym.tracker import ProgressTrackerFile, ProgressTrackerMongo
from sklearn.metrics import roc_auc_score
from hyperopt.mongoexp import MongoTrials
from modelgym.util import split_and_preprocess
from sklearn.model_selection import train_test_split
from collections import OrderedDict

In [2]:
# Experiment presets, selected through the EXP_CONFIG environment variable
# (defaults to 'test'). Each tuple holds, in order:
#   NROWS        - number of rows requested from read_data (None = all rows)
#   N_ESTIMATORS - n_estimators handed to the Trainer
#   N_PROBES     - hyperopt_evals handed to the Trainer
#   TEST_SIZE    - test_size handed to train_test_split
#   N_CV_SPLITS  - n_splits handed to split_and_preprocess
#   OPTIMIZER    - algo_name handed to crossval_optimize_params
########### NROWS, N_ESTIMATORS, N_PROBES, TEST_SIZE, N_CV_SPLITS, OPTIMIZER
config_tuple = {
    'test': (1000, 100,  2, 0.5, 2, 'random'),
    'pror': (None, 1000, 100, 0.5, 2, 'random'), # production with random hyperopt suggestor
    'prot': (None, 1000, 100, 0.5, 2, 'tpe'),    # production with tpe hyperopt suggestor
    'demi': (10000, 100, 5, 0.5, 2, 'random')
}
CONFIG = os.environ.get('EXP_CONFIG', 'test')
if CONFIG not in config_tuple:
    # Same exception type as the plain dict lookup, but with an actionable message.
    raise KeyError("Unknown EXP_CONFIG {!r}; expected one of {}".format(CONFIG, sorted(config_tuple)))
NROWS, N_ESTIMATORS, N_PROBES, TEST_SIZE, N_CV_SPLITS, OPTIMIZER = config_tuple[CONFIG]

# Models to compare, in the order they are trained and reported.
CANDIDATES = OrderedDict([
    ('XGBoost', modelgym.XGBModel),
    ('LightGBM', modelgym.LGBModel),
    ('RandomForestClassifier', modelgym.RFModel)
])
RESULTS_DIR = "results"
LOAD_CACHE = False  # when True, the training loop first restores saved tracker state

# The Mongo tracker is used only when MONGO_PORT_27017_TCP_ADDR is set;
# otherwise progress is tracked in files under RESULTS_DIR.
if 'MONGO_PORT_27017_TCP_ADDR' in os.environ:
    mongo_host = os.environ['MONGO_PORT_27017_TCP_ADDR']
    mongo_port = int(os.environ.get('MONGO_PORT_27017_TCP_PORT', 27017))
    mongo_db = os.environ.get('MONGO_DB', 'trials')
    tracker_factory = functools.partial(ProgressTrackerMongo, mongo_host, mongo_port, mongo_db, config_key=CONFIG)
    print("Using Mongo as backend for tracking")
else:
    tracker_factory = functools.partial(ProgressTrackerFile, RESULTS_DIR, config_key=CONFIG)
    print("Using File as backend for tracking")

print("Running experiment cofiguration:", CONFIG)

Using File as backend for tracking
Running experiment cofiguration: test


## Download & read data file

In [3]:
%%bash
# Fetch and decompress the pickled dataset unless a non-empty copy already exists.
# Abort on the first failing command, including a failure inside the pipe.
set -euo pipefail
DATA_FILE=data/XY2d.pickle
if [ ! -s "$DATA_FILE" ] ; then
    mkdir -p data
    # Write to a temporary name and rename only on success, so an interrupted
    # or failed transfer never leaves a truncated pickle that blocks a retry.
    trap 'rm -f "$DATA_FILE.part"' EXIT
    # -f: fail on HTTP errors instead of piping an error page into gunzip
    # -L: follow redirects
    curl -fL https://cernbox.cern.ch/index.php/s/N1dpSAPgl30szYM/download | gunzip -c > "$DATA_FILE.part"
    mv "$DATA_FILE.part" "$DATA_FILE"
fi
ls -l data

total 266224
-rw-r--r--  1 macbook  staff  136304022 Aug 11 14:40 XY2d.pickle


In [4]:
def read_data(fname, nrows=None, shuffle=True, seed=None):
    """Load a pickled ``(X, y)`` pair and return a row subset with unit weights.

    Parameters
    ----------
    fname : str
        Path to a pickle file containing a two-element ``(X, y)`` sequence.
    nrows : int or None
        Maximum number of rows to return. ``None`` returns all rows. Values
        larger than the number of available rows are clamped, so the three
        returned arrays always have the same length.
    shuffle : bool
        If true, rows are drawn in a random order; otherwise the first
        ``nrows`` rows are returned in file order.
    seed : int or None
        Optional seed for a private random generator used for the shuffle.
        ``None`` uses NumPy's global random state.

    Returns
    -------
    (X, y, weights)
        The selected rows of ``X`` and ``y`` plus an all-ones weight vector
        with one entry per selected row.
    """
    # NOTE(security): pickle.load can execute arbitrary code -- only use it on
    # files from a trusted source.
    with open(fname, 'rb') as fh:
        X, y = pickle.load(fh, encoding='bytes')

    n_total = X.shape[0]
    # Clamp so that weights cannot end up longer than the rows actually returned.
    n_selected = n_total if nrows is None else min(nrows, n_total)

    index = np.arange(n_total)
    if shuffle:
        rng = np.random if seed is None else np.random.RandomState(seed)
        index = rng.permutation(index)
    selected = index[:n_selected]

    weights = np.ones(n_selected)  # placeholder: every row gets weight 1
    return X[selected], y[selected], weights


# Seed NumPy's global random state before loading: read_data shuffles rows with
# np.random, so without a fixed seed a different subsample is drawn on every
# Restart & Run All.
DATA_SEED = 0
np.random.seed(DATA_SEED)
X, y, weights = read_data("data/XY2d.pickle", nrows=NROWS)

In [5]:
# Pin the split so that, for the same input arrays, the same rows land in the
# train and test partitions on every run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, weights, test_size=TEST_SIZE, random_state=SPLIT_SEED)

In [6]:
# Build the cross-validation folds (cv_pairs) and the final train/test datasets.
# Copies of the feature matrices are passed in; presumably this protects
# X_train / X_test from in-place preprocessing -- confirm against
# split_and_preprocess.
# NOTE(review): w_train / w_test are not passed here -- confirm the weights are
# intentionally unused.
cv_pairs, (dtrain, dtest) = split_and_preprocess(
    X_train.copy(), y_train,
    X_test.copy(), y_test,
    cat_cols=[],
    n_splits=N_CV_SPLITS,
)

## Train, tune and evaluate every candidate model

In [7]:
# Progress trackers, keyed by candidate model id; filled in by the training loop.
trackers = {}


def init_keys_dict():
    """Return a new dict that maps every candidate model id to None."""
    return dict.fromkeys(CANDIDATES.keys())


# One slot per candidate for each stage of the experiment; None means
# "not computed yet".
default_cv_result = init_keys_dict()
tuned_cv_result = init_keys_dict()
default_test_result = init_keys_dict()
tuned_test_result = init_keys_dict()
trials = init_keys_dict()

# Shared trainer configured from the selected experiment preset.
trainer = Trainer(hyperopt_evals=N_PROBES, n_estimators=N_ESTIMATORS)

In [8]:
# For every candidate model run four stages, skipping any stage whose result is
# already present (e.g. restored from the tracker when LOAD_CACHE is set):
#   1. cross-validation with default parameters
#   2. fit on train / evaluate on test with the default-CV parameters
#   3. hyperparameter optimisation on the CV folds
#   4. fit on train / evaluate on test with the tuned parameters
# The tracker state is saved after each computed stage and once more at the end.
for model_id, model_class in CANDIDATES.items():
    model = model_class(TASK_CLASSIFICATION)
    rule = "~" * 20
    print(rule, model.get_name(), rule)

    tracker = tracker_factory(model_name=model.get_name())
    trackers[model_id] = tracker

    if LOAD_CACHE:
        (default_cv_result[model_id],
         default_test_result[model_id],
         tuned_cv_result[model_id],
         tuned_test_result[model_id],
         trials[model_id]) = tracker.load_state(as_list=True)

    # Stage 1: default parameters, cross-validation.
    default_cv = default_cv_result[model_id]
    if default_cv is None:
        default_cv = trainer.crossval_fit_eval(model, cv_pairs)
        default_cv_result[model_id] = default_cv
        tracker.save_state(default_cv=default_cv)
    trainer.print_result(default_cv,
                         'Default {} result on CV'.format(model.get_name()))

    # Stage 2: default parameters, held-out test set.
    default_test = default_test_result[model_id]
    if default_test is None:
        default_test = trainer.fit_eval(
            model, dtrain, dtest,
            default_cv['params'],
            default_cv['best_n_estimators'],
            custom_metric={'roc_auc': roc_auc_score})
        default_test_result[model_id] = default_test
        tracker.save_state(default_test=default_test)
    trainer.print_result(default_test,
                         'Default {} result on TEST'.format(model.get_name()),
                         extra_keys=['roc_auc'])

    # Stage 3: hyperparameter search on the CV folds.
    tuned_cv = tuned_cv_result[model_id]
    if tuned_cv is None:
        print('Hyperopt iterations:\n\n')
        tuned_cv = trainer.crossval_optimize_params(
            model, cv_pairs,
            algo_name=OPTIMIZER,
            trials=trials[model_id],
            tracker=tracker)
        tuned_cv_result[model_id] = tuned_cv
        tracker.save_state(tuned_cv=tuned_cv)
    trainer.print_result(tuned_cv,
                         'Tuned {} result on cv'.format(model.get_name()))

    # Stage 4: tuned parameters, held-out test set.
    tuned_test = tuned_test_result[model_id]
    if tuned_test is None:
        tuned_test = trainer.fit_eval(
            model, dtrain, dtest,
            tuned_cv['params'],
            tuned_cv['best_n_estimators'],
            custom_metric={'roc_auc': roc_auc_score})
        tuned_test_result[model_id] = tuned_test
        tracker.save_state(tuned_test=tuned_test)
    trainer.print_result(tuned_test,
                         'Tuned {} result on test'.format(model.get_name()),
                         extra_keys=['roc_auc'])

    # Final full snapshot of everything gathered for this model.
    # NOTE(review): trials[model_id] is never reassigned in this loop, so unless
    # LOAD_CACHE filled it, the value saved here is the one passed to the
    # optimiser (None on a fresh run) -- confirm that is intended.
    tracker.save_state(default_cv=default_cv, default_test=default_test,
                       tuned_cv=tuned_cv, tuned_test=tuned_test,
                       trials=trials[model_id])
    

~~~~~~~~~~~~~~~~~~~~ XGBoost ~~~~~~~~~~~~~~~~~~~~
BST  <xgboost.core.Booster object at 0x1159fc6d8>
RES [0.63879, 0.594076, 0.555677, 0.528966, 0.503484, 0.483325, 0.463834, 0.448553, 0.435343, 0.424926, 0.414957, 0.407857, 0.400873, 0.394292, 0.387076, 0.383874, 0.38081, 0.372378, 0.367894, 0.363748, 0.362701, 0.36271, 0.361124, 0.357371, 0.352849, 0.351781, 0.352419, 0.34834, 0.348148, 0.347298, 0.346438, 0.343527, 0.343406, 0.337836, 0.336972, 0.334423, 0.336547, 0.338204, 0.336691, 0.337057, 0.339666, 0.335501, 0.334719, 0.333408, 0.33419, 0.334643, 0.335052, 0.334504, 0.33321, 0.333896, 0.334879, 0.334047, 0.333691, 0.333594, 0.331351, 0.332904, 0.332131, 0.332774, 0.333414, 0.333466, 0.333126, 0.33299, 0.333708, 0.331236, 0.333603, 0.33439, 0.334471, 0.335698, 0.33436, 0.336219, 0.336308, 0.337511, 0.338087, 0.339563, 0.338635, 0.339184, 0.339584, 0.340892, 0.342912, 0.344065, 0.344881, 0.346565, 0.346565, 0.348872, 0.349709, 0.348517, 0.349556, 0.350273, 0.350932, 0.349453, 0.34

BST  <xgboost.core.Booster object at 0x1159fc860>
RES [0.581495, 0.515429, 0.473935, 0.442495, 0.415819, 0.395727, 0.379557, 0.374789, 0.368373, 0.36314, 0.359892, 0.363029, 0.364177, 0.36978]
saved state to results/tracker_test_XGBoost.pickle
Tuned XGBoost result on test:

loss = 0.36978
n_estimators = 14
params = {'alpha': 0, 'colsample_bylevel': 0.7405046763756069, 'colsample_bytree': 0.6151399892026908, 'eta': 0.22398969443165967, 'gamma': 5.745168482871384e-07, 'lambda': 0.019440307789833522, 'max_depth': 10, 'min_child_weight': 0.16853765259109013, 'subsample': 0.6197753137030573, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'silent': 1}
roc_auc = 0.822144
saved state to results/tracker_test_XGBoost.pickle
~~~~~~~~~~~~~~~~~~~~ LightGBM ~~~~~~~~~~~~~~~~~~~~
saved state to results/tracker_test_LightGBM.pickle
Default LightGBM result on CV:

loss = 0.353675349257
best_n_estimators = 28
params = {'boosting_type': 'gbdt', 'colsample_bytree': 1, 'drop_rate': 0.1, 'is_unbal

{'criterion': 'gini', 'max_depth': 5, 'max_features': 3, 'n_estimators': 16, 'normalize': 0, 'scale': 1}
{'max_depth': 1, 'max_features': 4, 'n_estimators': 10, 'criterion': 'gini', 'verbose': 0}
[0.85264868999808752, 0.74909160451329126, 0.84064830751577746, 0.84461656148403141, 0.78490151080512527, 0.82066360680818506, 0.84863262574105958, 0.79699751386498363, 0.84466437177280562, 0.85260087970931353, 0.78514056224899598, 0.8087588449034232, 0.78509275196022177, 0.84863262574105958, 0.84863262574105958, 0.80885446548097162, 0.83280742015681775, 0.84863262574105958, 0.85260087970931353, 0.84863262574105958, 0.84863262574105958, 0.84863262574105958, 0.67742398164084905, 0.80474278064639515, 0.84475999235035371, 0.8287435456110156, 0.84466437177280562, 0.84064830751577746, 0.84471218206157961, 0.80488621151271766, 0.84863262574105958, 0.84069611780455167, 0.8366322432587493, 0.84863262574105958, 0.83668005354752351, 0.82095046854082998, 0.84069611780455145, 0.85666475425511568, 0.844664

{'criterion': 'entropy', 'max_depth': 12, 'max_features': 4, 'n_estimators': 5, 'normalize': 1, 'scale': 0}
{'max_depth': 1, 'max_features': 4, 'n_estimators': 10, 'criterion': 'gini', 'verbose': 0}
[0.84601880576196997, 0.85200682971406583, 0.84999879758555175, 0.84999879758555175, 0.74805810066613754, 0.84801481374600185, 0.84603082990645218, 0.83599066926388188, 0.84003078180987423, 0.83997066108746365, 0.84999879758555175, 0.84999879758555175, 0.81799052497414815, 0.85403891013154409, 0.84797874131255568, 0.85599884568212969, 0.70802972368515993, 0.84003078180987423, 0.81605463771252673, 0.84601880576196997, 0.85599884568212969, 0.85803092609960796, 0.84601880576196997, 0.84998677344106977, 0.85801890195512598, 0.8459947574730059, 0.8220065892311762, 0.85599884568212969, 0.84999879758555175, 0.83603876584181036, 0.84999879758555175, 0.84603082990645218, 0.85203087800303001, 0.84997474929658756, 0.84600678161748799, 0.84203881393838831, 0.83600269340836386, 0.82807878219464692, 0.84

AttributeError: 'XYCDataset' object has no attribute 'get_label'

## Compare default and tuned results across models

In [None]:
# Collect the default and tuned test-set results of every candidate into
# full_results[model_id] = {'tuned': ..., 'default': ...}.
metric, mes_min = 'roc_auc', False
full_results = {}
for candidate_id in CANDIDATES:
    if candidate_id not in trackers:
        # No tracker from the training loop in this session: build one and
        # restore its saved state.
        tracker = tracker_factory(model_name=candidate_id)
        tracker.load_state()
    else:
        tracker = trackers[candidate_id]
    full_results[candidate_id] = {
        'tuned': tracker.state['tuned_test'],
        'default': tracker.state['default_test'],
    }

In [None]:
def plot_metric_results(full_results, index, metric, is_min_better=True):
    """Print and plot one metric for the default and tuned variant of each model.

    Parameters
    ----------
    full_results : mapping
        ``full_results[model_id]['default'][metric]`` and
        ``full_results[model_id]['tuned'][metric]`` must hold the metric values.
    index : iterable of str
        Model ids to include. Any iterable is accepted (list, dict keys view,
        generator); it is materialised once internally.
    metric : str
        Key of the metric to compare; also used as the y-axis label.
    is_min_better : bool
        If true, the smallest value is the baseline and results are sorted in
        ascending order; otherwise the largest value is the baseline and the
        order is descending.

    Raises
    ------
    ValueError
        If ``index`` is empty.

    Prints a table of ``value (relative difference to the baseline in %)`` and
    draws a scatter plot of all results, best first, through the ``pyplot``
    name that must be available in the notebook namespace.
    """
    # index is iterated several times and passed to len(); materialise it so
    # that one-shot iterables work as well.
    index = list(index)
    if not index:
        raise ValueError("index must contain at least one model id")

    test_results = np.array([
        [full_results[model_id]['default'][metric], full_results[model_id]['tuned'][metric]]
        for model_id in index
    ])

    baseline = test_results.min() if is_min_better else test_results.max()
    # Relative difference to the best result, in percent.
    diff = 100 * test_results / baseline - 100
    test_results_formatted = [
        ['{:.6f} ({:+.2f}%)'.format(test_results[i, j], diff[i, j]) for j in range(2)]
        for i in range(len(index))
    ]

    print(pd.DataFrame(test_results_formatted, columns=['default', 'tuned'], index=index))

    # itertools.product yields names in the same row-major order as flatten().
    full_names = [" ".join(pair) for pair in itertools.product(index, ['default', 'tuned'])]
    sorted_results = sorted(zip(full_names, test_results.flatten()),
                            key=lambda item: item[1],
                            reverse=not is_min_better)
    xticks = ['%s\n%.5f' % (name, loss) for name, loss in sorted_results]

    positions = range(len(full_names))
    pyplot.figure(figsize=(20, 7))
    pyplot.scatter(positions, [value for _, value in sorted_results], s=150)
    pyplot.xticks(positions, xticks, fontsize=15)
    pyplot.yticks(fontsize=12)
    pyplot.title('Comparison', fontsize=20)
    pyplot.ylabel(metric, fontsize=16)

In [None]:
%pylab inline --no-import-all
metric, is_min_better = 'roc_auc', False
plot_metric_results(full_results, CANDIDATES.keys(), metric, is_min_better=is_min_better)