In [30]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import numpy as np
import warnings

# Digits dataset

Load the digits dataset:

In [31]:
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Train with gradient boosting without hyperparameter tuning:

In [32]:
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))

accuracy on training set: 1.000000
accuracy on test set: 0.962222


Train with autosklearn for 2 minutes:

In [33]:
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=120)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    automl.fit(X_train, y_train)

y_hat_train = automl.predict(X_train)
y_hat_test = automl.predict(X_test)

Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (118.939937)


In [34]:
print("accuracy on training set: %f" % sklearn.metrics.accuracy_score(y_train, y_hat_train))
print("accuracy on test set: %f" % sklearn.metrics.accuracy_score(y_test, y_hat_test))

accuracy on training set: 0.997030
accuracy on test set: 0.993333


Show best performing model with its hyperparameters:

In [35]:
automl.cv_results_['params'][np.argmax(automl.cv_results_['mean_test_score'])]

{'balancing:strategy': 'none',
 'categorical_encoding:__choice__': 'one_hot_encoding',
 'classifier:__choice__': 'libsvm_svc',
 'imputation:strategy': 'median',
 'preprocessor:__choice__': 'select_rates',
 'rescaling:__choice__': 'standardize',
 'categorical_encoding:one_hot_encoding:use_minimum_fraction': 'True',
 'classifier:libsvm_svc:C': 870.2240970463429,
 'classifier:libsvm_svc:gamma': 0.010682839357128344,
 'classifier:libsvm_svc:kernel': 'poly',
 'classifier:libsvm_svc:max_iter': -1,
 'classifier:libsvm_svc:shrinking': 'False',
 'classifier:libsvm_svc:tol': 2.4851608604406576e-05,
 'preprocessor:select_rates:alpha': 0.4608103694360143,
 'preprocessor:select_rates:mode': 'fdr',
 'preprocessor:select_rates:score_func': 'f_classif',
 'categorical_encoding:one_hot_encoding:minimum_fraction': 0.010000000000000004,
 'classifier:libsvm_svc:coef0': 0.5325949351918051,
 'classifier:libsvm_svc:degree': 3}

In [36]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: d74860caaa557f473ce23908ff7ba369
  Metric: accuracy
  Best validation score: 0.986517
  Number of target algorithm runs: 5
  Number of successful target algorithm runs: 4
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0



# Breast cancer dataset

Load the breast cancer dataset:

In [37]:
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

Train with gradient boosting without hyperparameter tuning:

In [38]:
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))

accuracy on training set: 1.000000
accuracy on test set: 0.958042


Train with autosklearn for 2 minutes:

In [40]:
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=120)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    automl.fit(X_train, y_train)
y_hat_train = automl.predict(X_train)
y_hat_test = automl.predict(X_test)

Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (118.803135)


In [41]:
print("accuracy on training set: %f" % sklearn.metrics.accuracy_score(y_train, y_hat_train))
print("accuracy on test set: %f" % sklearn.metrics.accuracy_score(y_test, y_hat_test))

accuracy on training set: 0.992958
accuracy on test set: 0.965035


Show best performing model with its hyperparameters:

In [43]:
automl.cv_results_['params'][np.argmax(automl.cv_results_['mean_test_score'])]

{'balancing:strategy': 'weighting',
 'categorical_encoding:__choice__': 'no_encoding',
 'classifier:__choice__': 'extra_trees',
 'imputation:strategy': 'median',
 'preprocessor:__choice__': 'polynomial',
 'rescaling:__choice__': 'minmax',
 'classifier:extra_trees:bootstrap': 'False',
 'classifier:extra_trees:criterion': 'gini',
 'classifier:extra_trees:max_depth': 'None',
 'classifier:extra_trees:max_features': 0.5670424455696162,
 'classifier:extra_trees:max_leaf_nodes': 'None',
 'classifier:extra_trees:min_impurity_decrease': 0.0,
 'classifier:extra_trees:min_samples_leaf': 8,
 'classifier:extra_trees:min_samples_split': 16,
 'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
 'classifier:extra_trees:n_estimators': 100,
 'preprocessor:polynomial:degree': 3,
 'preprocessor:polynomial:include_bias': 'True',
 'preprocessor:polynomial:interaction_only': 'False'}

In [44]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: 528ab82dffffb5039325fc43a2c64979
  Metric: accuracy
  Best validation score: 0.978723
  Number of target algorithm runs: 22
  Number of successful target algorithm runs: 21
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0

