Paper (Auto-Sklearn 2.0: Hands-free AutoML via Meta-Learning): https://arxiv.org/pdf/2007.04074

Manual: https://automl.github.io/auto-sklearn/master/examples/20_basic/example_classification.html#sphx-glr-examples-20-basic-example-classification-py

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pprint import pprint

import sklearn.datasets
import sklearn.metrics

import autosklearn

from autosklearn.experimental.askl2 import AutoSklearn2Classifier

In [3]:
# Display the installed auto-sklearn version.
autosklearn.__version__

'0.15.0'

1. Data Processing

In [4]:
# Code path: absolute, machine-specific directory for the Auto-sklearn algorithm code.
# NOTE(review): alg_path is not referenced by any other cell visible in this
# notebook -- confirm it is still needed.
alg_path = '/home/korawich/Desktop/AutoML/Algorithm/Auto-sklearn'

In [5]:
# Dataset directory (absolute, machine-specific) holding the pre-split CSV files.
save_path = '/home/korawich/Desktop/AutoML/Dataset/Bioresponse'

# One CSV file per split, all located directly under save_path.
train_save_path = f'{save_path}/data_train.csv'
val_save_path = f'{save_path}/data_val.csv'
test_save_path = f'{save_path}/data_test.csv'

# Read the three splits in train/val/test order; the first CSV column is
# used as the row index of each frame.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_save_path, val_save_path, test_save_path)
)

def _split_features_target(frame, target='target'):
    """Split a DataFrame into a feature matrix and a 1-D label vector.

    Parameters
    ----------
    frame : pandas.DataFrame
        One data split; must contain the column named by ``target``.
    target : str, default 'target'
        Name of the label column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with every column except ``target`` (2-D), and ``y`` with the
        values of ``target`` as a 1-D array of shape ``(n_samples,)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``frame``. The previous boolean-mask
        selection would instead silently yield an empty target and keep
        every column as a feature.
    """
    features = frame.drop(columns=target).to_numpy()
    # Select the column by label so y is 1-D; masking the columns with
    # `columns == 'target'` produced an (n_samples, 1) column vector.
    labels = frame[target].to_numpy()
    return features, labels


# Build X / y for each split.
X_train, y_train = _split_features_target(train_df)
X_val, y_val = _split_features_target(val_df)
X_test, y_test = _split_features_target(test_df)

2. Auto-sklearn

In [19]:
# Configure the Auto-sklearn 2.0 search.
automl = AutoSklearn2Classifier(
    # Ensemble of size 1, as used in the paper: always choose the current
    # best model. Passed via ensemble_kwargs because the top-level
    # `ensemble_size` argument is deprecated in auto-sklearn 0.15 (it is only
    # copied into ensemble_kwargs, with a deprecation warning).
    ensemble_kwargs={'ensemble_size': 1},
    allow_string_features=False,
    time_left_for_this_task=1200,  # overall search budget, in seconds
    per_run_time_limit=120,  # limit for a single model evaluation, in seconds
    seed=1,
)

In [20]:
# Run the search on the training split; dataset_name is the label reported
# as "Dataset name" in the search statistics.
automl.fit(X_train, y_train, dataset_name="bioresponse")

  f"{self.__class__.__name__} is executed with {num_workers} workers only. "


AutoSklearn2Classifier(allow_string_features=False,
                       ensemble_kwargs={'ensemble_size': 1}, ensemble_size=1,
                       metric=accuracy, per_run_time_limit=120,
                       time_left_for_this_task=1200)

In [21]:
# Print the model leaderboard (rank, ensemble weight, model type, cost, duration).
print(automl.leaderboard())

          rank  ensemble_weight         type      cost    duration
model_id                                                          
22           1              1.0  extra_trees  0.208252  115.965575


In [22]:
# During fit(), models are fit on individual cross-validation folds, so
# refit the selected model(s) on the whole training set before predicting.
automl.refit(X_train.copy(), y_train)

# Score the test-split predictions with the Matthews correlation coefficient.
predictions = automl.predict(X_test)
mcc = sklearn.metrics.matthews_corrcoef(y_test, predictions)
print("MCC score:", mcc)

MCC score: 0.6011409324671427


In [23]:
# Report the models found by Auto-sklearn as a dict keyed by model id.
pprint(automl.show_models(), indent=4)

{   22: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f06439549e8>,
            'cost': 0.20825242718446604,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f064314a160>,
            'ensemble_weight': 1.0,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f0643954e48>,
            'model_id': 22,
            'rank': 1,
            'sklearn_classifier': ExtraTreesClassifier(criterion='entropy', max_features=174,
                     min_samples_split=14, n_estimators=32, n_jobs=1,
                     random_state=1, warm_start=True)}}


In [24]:
# Report summary statistics about the search (metric, best validation score,
# and counts of successful / crashed / timed-out runs).
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: bioresponse
  Metric: accuracy
  Best validation score: 0.791748
  Number of target algorithm runs: 27
  Number of successful target algorithm runs: 26
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0

