## Setting up auto-sklearn and scikit-learn

In [None]:
!sudo apt-get install build-essential swig
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
swig is already the newest version (3.0.12-1).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   207  100   207    0     0    862      0 --:--:-- --:--:-- --:--:--   862


In [None]:
# Pin scikit-learn to 0.24.0 (run after auto-sklearn has been installed above).
# NOTE(review): presumably this pin is required for compatibility with the
# installed auto-sklearn release — confirm and record the reason here.
!pip install scikit-learn==0.24.0



In [None]:
# Confirm which scikit-learn version the kernel actually imports; the recorded
# output (0.24.0) matches the version pinned in the previous cell.
import sklearn

print(sklearn.__version__)

0.24.0


## Loading required libraries

In [None]:
# example of auto-sklearn regression on the movie ratings dataset (ratings.csv)
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from autosklearn.regression import AutoSklearnRegressor
from autosklearn.metrics import mean_absolute_error as auto_mean_absolute_error

## Loading data

In [None]:
# read the ratings CSV (path relative to the working directory) into a DataFrame
url = "ratings.csv"
dataframe = read_csv(filepath_or_buffer=url)

In [None]:
# Preview the loaded frame; the recorded output shows userId, movieId and
# rating columns.
dataframe

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


## Splitting the dataset into train and test sets

In [None]:
# cast every column to float32, then use the last column as the target
# and all preceding columns as the input features
data = dataframe.values.astype('float32')
X = data[:, :-1]
y = data[:, -1]
# hold out 33% of the rows for testing; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# configure the search: overall budget of 60*60, at most 30 per candidate run,
# 8 parallel jobs (budgets are presumably in seconds — confirm against the
# auto-sklearn docs)
# NOTE(review): no metric is passed here and the recorded statistics report
# "Metric: r2", while auto_mean_absolute_error is imported but never used in
# the visible cells — confirm which metric the search is meant to optimize.
model = AutoSklearnRegressor(
    time_left_for_this_task=60 * 60,
    per_run_time_limit=30,
    n_jobs=8,
)
# run the search on the training split
model.fit(X_train, y_train)
# print the search statistics
print(model.sprint_statistics())
# score the fitted model on the held-out split
y_hat = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_hat)
print("MAE: %.3f" % (mae,))

auto-sklearn results:
  Dataset name: 98b85bc476de34e3b5187670f5f8e445
  Metric: r2
  Best validation score: 0.171401
  Number of target algorithm runs: 238
  Number of successful target algorithm runs: 138
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 86
  Number of target algorithms that exceeded the memory limit: 14

MAE: 0.733


In [None]:
# Root-mean-squared error on the held-out split: squared=False makes
# mean_squared_error return the root of the MSE instead of the MSE itself.
# mean_squared_error is imported in the notebook's import cell rather than
# mid-notebook, so all dependencies are visible in one place.
rms = mean_squared_error(y_test, y_hat, squared=False)

In [None]:
# recompute the mean absolute error on the test split and show the unrounded value
mae = mean_absolute_error(y_true=y_test, y_pred=y_hat)
mae

0.7334022431695156

In [None]:
# Display the root-mean-squared error computed for the test split.
rms

0.942036577903963

In [None]:
# Show the models found by the search; the recorded output is a string listing
# (weight, SimpleRegressionPipeline(...)) entries with their configurations.
model.show_models()

"[(0.620000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_regression', 'regressor:__choice__': 'k_nearest_neighbors', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.07091618793106647, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 839, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:extra_trees_preproc_for_regression:bootstrap': 'False', 'feature_preprocessor:extra_trees_preproc_for_regression:criterion': 'friedman

In [None]:
# List the fields available in cv_results_ (the recorded output includes
# 'mean_test_score', 'mean_fit_time', 'params', 'rank_test_scores', 'status',
# 'budgets' and one 'param_*' entry per hyperparameter).
model.cv_results_.keys()

dict_keys(['mean_test_score', 'mean_fit_time', 'params', 'rank_test_scores', 'status', 'budgets', 'param_data_preprocessing:categorical_transformer:categorical_encoding:__choice__', 'param_data_preprocessing:categorical_transformer:category_coalescence:__choice__', 'param_data_preprocessing:numerical_transformer:imputation:strategy', 'param_data_preprocessing:numerical_transformer:rescaling:__choice__', 'param_feature_preprocessor:__choice__', 'param_regressor:__choice__', 'param_data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction', 'param_data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles', 'param_data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution', 'param_data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_max', 'param_data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_min', 'param_feature_preprocessor:extra_trees_preproc_for_regress

In [None]:
# Same show_models() call as the earlier cell, but printed so the string is
# rendered without the surrounding quotes of its repr.
print(model.show_models())

[(0.620000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_regression', 'regressor:__choice__': 'k_nearest_neighbors', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.07091618793106647, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 839, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:extra_trees_preproc_for_regression:bootstrap': 'False', 'feature_preprocessor:extra_trees_preproc_for_regression:criterion': 'friedman_

In [None]:
# Inspect the raw run history: the recorded output is an OrderedDict mapping
# RunKey(config_id, instance_id, seed, budget) to RunValue(cost, time, status,
# starttime, endtime, additional_info).
model.automl_.runhistory_.data

OrderedDict([(RunKey(config_id=1, instance_id='{"task_id": "98b85bc476de34e3b5187670f5f8e445"}', seed=0, budget=0.0),
              RunValue(cost=1.0, time=41.5925931930542, status=<StatusType.TIMEOUT: 2>, starttime=1610648954.898724, endtime=1610648997.6069982, additional_info={'error': 'Timeout', 'configuration_origin': 'Initial design'})),
             (RunKey(config_id=2, instance_id='{"task_id": "98b85bc476de34e3b5187670f5f8e445"}', seed=0, budget=0.0),
              RunValue(cost=1.0, time=42.134257078170776, status=<StatusType.TIMEOUT: 2>, starttime=1610648956.8437953, endtime=1610649000.0539107, additional_info={'error': 'Timeout', 'configuration_origin': 'Initial design'})),
             (RunKey(config_id=3, instance_id='{"task_id": "98b85bc476de34e3b5187670f5f8e445"}', seed=0, budget=0.0),
              RunValue(cost=1.0, time=41.31968021392822, status=<StatusType.TIMEOUT: 2>, starttime=1610648958.8135774, endtime=1610649001.2766078, additional_info={'error': 'Timeout', 'conf

In [None]:
# Tabulate the ensemble performance history recorded during the search.
# pd is imported in the notebook's import cell rather than mid-notebook, so
# all dependencies are visible in one place.
ensemble_performance_frame = pd.DataFrame(model.automl_.ensemble_performance_history)

In [None]:
# Display the ensemble history; the recorded output has a Timestamp column and
# an ensemble_optimization_score column.
ensemble_performance_frame

Unnamed: 0,Timestamp,ensemble_optimization_score
0,2021-01-14 18:29:59.071398,-0.000020
1,2021-01-14 18:30:24.921803,0.001117
2,2021-01-14 18:31:00.361686,0.017356
3,2021-01-14 18:31:22.981627,0.059274
4,2021-01-14 18:32:38.791573,0.140459
...,...,...
72,2021-01-14 19:15:39.423482,0.172883
73,2021-01-14 19:15:57.392736,0.172883
74,2021-01-14 19:17:04.858183,0.172883
75,2021-01-14 19:19:02.056250,0.172883
