Paper: https://arxiv.org/pdf/2007.04074

Manual: https://automl.github.io/auto-sklearn/master/examples/20_basic/example_classification.html#sphx-glr-examples-20-basic-example-classification-py

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pprint import pprint

import sklearn.datasets
import sklearn.metrics

import autosklearn.classification

In [3]:
# Display the installed auto-sklearn version so the run environment is recorded in the notebook.
autosklearn.__version__

'0.15.0'

1. Data Processing

In [4]:
#Set code path (absolute, machine-specific location)
# NOTE(review): alg_path is not referenced by any other cell visible in this notebook — confirm it is still needed.
alg_path = '/home/korawich/Desktop/AutoML/Algorithm/Auto-sklearn'

In [5]:
#Datafile
# Directory that holds the Bioresponse train / validation / test CSV splits.
save_path = '/home/korawich/Desktop/AutoML/Dataset/Bioresponse'

# Full path of each split file inside save_path.
train_save_path = f"{save_path}/data_train.csv"
val_save_path = f"{save_path}/data_val.csv"
test_save_path = f"{save_path}/data_test.csv"

# Read the three splits in order; the first CSV column becomes the DataFrame index.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_save_path, val_save_path, test_save_path)
)

#set X&y
# Features: every column except 'target', as a 2-D NumPy array.
# Labels: the 'target' column as a 1-D array of shape (n_samples,).
# Selecting the column by name (instead of a boolean column mask) avoids the
# (n_samples, 1) column vector the mask produced, and raises a KeyError if the
# 'target' column is missing rather than silently yielding an empty label array.
X_train = train_df.drop(columns='target').values
y_train = train_df['target'].values

X_val = val_df.drop(columns='target').values
y_val = val_df['target'].values

X_test = test_df.drop(columns='target').values
y_test = test_df['target'].values

2. Autosklearn

In [6]:
#Set search space -> same as in table 18 of Autosklearn 2.0 paper (2020)

# Keys are pipeline component types; values list the component names allowed for that type.
search_space_dict = dict(
    classifier=[
        "extra_trees",
        "gradient_boosting",
        "mlp",
        "passive_aggressive",
        "random_forest",
        "sgd",
    ],
    feature_preprocessor=["no_preprocessing"],
)

In [7]:
#use custom metrics
def mcc(y_test, prediction):
    """Return the Matthews correlation coefficient of `prediction` against `y_test`.

    Thin wrapper around ``sklearn.metrics.matthews_corrcoef``; both arguments
    are forwarded unchanged, ground-truth labels first.
    """
    return sklearn.metrics.matthews_corrcoef(y_test, prediction)

# Wrap mcc as an auto-sklearn scorer that works on hard class predictions.
# MCC ranges over [-1, 1], so the worst possible result is -1. Without it,
# make_scorer falls back to its default of 0, and a failed run would be
# assigned a better cost than a model with a negative MCC.
mcc_scorer = autosklearn.metrics.make_scorer(
    name="mcc",
    score_func=mcc,
    optimum=1,
    worst_possible_result=-1,
    greater_is_better=True,
    needs_proba=False,
    needs_threshold=False
)

In [8]:
# Configure the auto-sklearn search.
automl = autosklearn.classification.AutoSklearnClassifier(
    # Ensemble of size 1 (as used in the paper): always choose the current best model.
    # Passed through ensemble_kwargs because the top-level `ensemble_size`
    # argument is deprecated in auto-sklearn 0.15 and only forwarded here anyway.
    ensemble_kwargs={"ensemble_size": 1},
    initial_configurations_via_metalearning=0,
    allow_string_features=False,
    time_left_for_this_task=1200,
    per_run_time_limit=120,
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 10},
    metric=mcc_scorer,
    seed=1,
    include=search_space_dict,
    exclude=None,
    # Keep the temporary run folder after the search finishes.
    delete_tmp_folder_after_terminate=False
)

  del sys.path[0]


In [9]:
# Run the search on the training split; the dataset name is what the search statistics report.
automl.fit(X=X_train, y=y_train, dataset_name="bioresponse")

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

AutoSklearnClassifier(allow_string_features=False,
                      delete_tmp_folder_after_terminate=False,
                      ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      ensemble_kwargs={'ensemble_size': 1}, ensemble_size=1,
                      include={'classifier': ['extra_trees',
                                              'gradient_boosting', 'mlp',
                                              'passive_aggressive',
                                              'random_forest', 'sgd'],
                               'feature_preprocessor': ['no_preprocessing']},
                      initial_configurations_via_metalearning=0, metric=mcc,
                      per_run_time_limit=120, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 10},
                      time_left_for_this_task=1200)

In [10]:
# Print the leaderboard of models kept by the search.
leaderboard_table = automl.leaderboard()
print(leaderboard_table)

          rank  ensemble_weight           type      cost  duration
model_id                                                          
2            1              1.0  random_forest  0.437622  57.59792


In [11]:
#During fit(), models are fit on individual cross-validation folds.
#refit() retrains the selected models on the whole training set before predicting.
automl.refit(X_train.copy(), y_train)
predictions = automl.predict(X_test)

# Score the held-out test split with the same metric used for the search.
test_mcc = sklearn.metrics.matthews_corrcoef(y_test, predictions)
print("MCC score:", test_mcc)

In [12]:
#Report the models found by Auto-sklearn
found_models = automl.show_models()
pprint(found_models, indent=4)

{   2: {   'cost': 0.43762151632703433,
           'ensemble_weight': 1.0,
           'estimators': [   {   'balancing': Balancing(random_state=1),
                                 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f297b8a3c50>,
                                 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f297b97c5c0>,
                                 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f297b8a3860>,
                                 'sklearn_classifier': RandomForestClassifier(max_features=20, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
                             {   'balancing': Balancing(random_state=1),
                                 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f297babb518>,

In [13]:
#Report statistics about the search
search_summary = automl.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: bioresponse
  Metric: mcc
  Best validation score: 0.562378
  Number of target algorithm runs: 23
  Number of successful target algorithm runs: 18
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 4
  Number of target algorithms that exceeded the memory limit: 1

