In [1]:
from modelgym.models import CtBClassifier, LGBMClassifier
from modelgym.utils import XYCDataset
from modelgym.trainers import TpeTrainer
from modelgym.metrics import Accuracy
from modelgym.report import Report
from modelgym.utils import ModelSpace
from hyperopt import hp
from pathlib import Path
import numpy as np
from wonderlandClient import ModelGymClient
from skopt.space import Integer, Categorical, Real
from modelgym.trainers import TpeTrainer, GPTrainer, RFTrainer, RandomTrainer
from modelgym.metrics import Accuracy, RocAuc, F1
import math


  return f(*args, **kwds)


## ModelGymClient 

ModelGymClient connects to the wonderland server during initialization. By default it reads its settings from the config file `~/.wonder/config.yaml`, but you can specify your own config instead.


In [2]:
# Create the client and show the configuration it loaded.
client = ModelGymClient()

# Substrings that mark a config entry as a secret. Values of matching entries
# are masked so that credentials never end up in the saved notebook output.
SENSITIVE_MARKERS = ("acc_key", "password", "secret", "token")

for param, val in client.config.items():
    is_secret = any(marker in str(param).lower() for marker in SENSITIVE_MARKERS)
    print(param + ":", "<redacted>" if is_secret else val)

connect_to: lambda-yadro-cpu1.westeurope.cloudapp.azure.com:50051
local_project_root: ~/repo-storage/test
user: imusinov
azurefs_acc_name: mylake
azurefs_acc_key: <redacted>
azurefs_share: myshare
client_cert: ~/.wonder/credentials/imusinov.crt
client_key: ~/.wonder/credentials/imusinov.key
ca_cert: ~/.wonder/credentials/wonderland.crt
max_msg_size_megabytes: 515


## Prepare data

The data must be in CSV format (`*.csv`), and the target column must be named `y`.

In [3]:
#standard sample
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the sklearn breast-cancer sample and assemble a single frame:
# the feature columns first, then the target in a column named "y".
data = load_breast_cancer()
X = pd.DataFrame(data.data)
y = pd.DataFrame({"y": data.target})
df = X.join(y)  # both frames share the same default index, so rows line up one-to-one

## Hyperparameter ranges

You don't have to specify every parameter, but make sure that any parameter you leave out has a suitable default value.

In [19]:
# CatBoost search space: one skopt dimension per tuned hyperparameter.
catboost_space = [
    Integer(100, 500, name='iterations'),
    Integer(1, 11, name='depth'),
    # learning rate is drawn log-uniformly between e^-5 and 0.1
    Real(math.exp(-5), 0.1, prior='log-uniform', name='learning_rate'),
    Real(0, 1, prior='uniform', transform='identity', name='rsm'),
    Categorical(('Newton', 'Gradient'), prior=None, name='leaf_estimation_method'),
    Integer(1, 10, name='l2_leaf_reg'),
    Real(0, 2, prior='uniform', transform='identity', name='bagging_temperature'),
]

# Pair the CatBoost classifier with the space defined above.
# NOTE(review): space_update=False presumably means "use this space as-is
# rather than merging it into the model's default space" — confirm in modelgym.
model_ctb = ModelSpace(CtBClassifier, space=catboost_space, space_update=False)

In [20]:
# LightGBM search space: one skopt dimension per tuned hyperparameter.
lgbm_space = [
    Real(math.exp(-7), 1, prior='log-uniform', name='learning_rate'),
    # round(e^1) = 3 and round(e^7) = 1097
    Integer(round(math.exp(1)), round(math.exp(7)), name='num_leaves'),
    Real(0.5, 1, name='feature_fraction'),
    Real(0.5, 1, name='bagging_fraction'),
    # round(e^6) = 403
    Integer(1, round(math.exp(6)), name='min_data_in_leaf'),
    # the remaining three dimensions are drawn log-uniformly, starting at e^-16
    Real(math.exp(-16), math.exp(5), prior='log-uniform', name='min_sum_hessian_in_leaf'),
    Real(math.exp(-16), math.exp(2), prior='log-uniform', name='lambda_l1'),
    Real(math.exp(-16), math.exp(2), prior='log-uniform', name='lambda_l2'),
]

# Pair the LightGBM classifier with the space defined above.
# NOTE(review): space_update=False presumably means "use this space as-is
# rather than merging it into the model's default space" — confirm in modelgym.
model_lgbm = ModelSpace(LGBMClassifier, space=lgbm_space, space_update=False)

In [21]:
# Trainer that uses the GP optimization algorithm, built over the LightGBM
# model space (model_lgbm).
# NOTE(review): "GP" presumably stands for Gaussian process — confirm in modelgym.
# NOTE(review): model_ctb is defined above but is not passed to the trainer
# here — confirm that this is intended.
trainer = GPTrainer(model_lgbm)

## Training



In [22]:
# Cross-validated hyperparameter search, submitted through the cluster client.
# NOTE(review): the output recorded under this cell is
# "ValueError: Dataset doesn't have 'y' column" — confirm which dataset form
# the trainer accepts here (in-memory DataFrame vs. path to a CSV file).
best = trainer.crossval_optimize_params(
    dataset=df,            # data or path to the data
    opt_metric=RocAuc(),   # metric that is optimized
    metrics=[RocAuc()],    # every metric that gets calculated
    cv=3,
    opt_evals=5,           # number of optimization iterations
    workers=1,             # number of parallel jobs on the same iteration
    timeout=400,           # timeout for one job
    client=client,         # only needed for cluster optimization
)

ValueError: Dataset doesn't have 'y' column

In [9]:
# Leave the results as the cell's last expression so Jupyter pretty-prints
# the nested dict instead of print() dumping it on a single line.
trainer.get_best_results()

{'LGBMClassifier': {'result': {'output': {'loss': -0.9115528132916405, 'metric_cv_results': [{'roc_auc': 0.8935816428333888}, {'roc_auc': 0.9225517890772127}, {'roc_auc': 0.9185250079643198}], 'params': {'learning_rate': 0.12544100545133802, 'num_leaves': 453, 'feature_fraction': 0.5604591957913877, 'bagging_fraction': 0.700983639133073, 'min_data_in_leaf': 169, 'min_sum_hessian_in_leaf': 10.38977065214724, 'lambda_l1': 6.438433262746808, 'lambda_l2': 0.009275802375336071}, 'status': 'ok', 'loss_variance': 0.012813429602372648}, 'result_model_path': PosixPath('/home/igor/repo-storage/test/imusinov/model-AFSoUsYC9JW6/model.pickle')}, 'model_space': <modelgym.utils.model_space.ModelSpace object at 0x7f6ab932c6a0>}}


In [10]:
# Keep the object returned by trainer.get_best_model() for later use.
best_model = trainer.get_best_model()
# Bare last expression: the cell output shows the model's repr.
best_model

<modelgym.models.lightgbm_model.LGBMClassifier at 0x7f6ababb5cf8>