In [1]:
import argparse
import logging
import multiprocessing as mp
import os
import random as rn
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig

In [2]:
# Run configuration: input location, column names, and the AutoML time budget.
train_path = "train/nyc_taxi_outliers_level_0.csv"
n_train_rows = 50  # number of leading CSV rows used for training
n_pred = 10  # number of future periods to forecast
dt = "timestamp"  # name of the time column
target = "value"  # name of the column to forecast
time_limit_min = 15  # passed to AutoML as experiment_timeout_minutes

# Read only the rows that are needed instead of parsing the whole file and
# slicing it afterwards. This also yields an independent frame, so the column
# assignment below does not write into a slice of a larger DataFrame.
df_train = pd.read_csv(train_path, nrows=n_train_rows)
df_train[dt] = pd.to_datetime(df_train[dt])
# NOTE(review): the displayed frame contains an "Unnamed: 0" column (looks like
# a saved index); it is handed to AutoML together with the other columns —
# confirm that this is intended.
df_train

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844.0
1,2014-07-01 00:30:00,8127.0
2,2014-07-01 01:00:00,6210.0
3,2014-07-01 01:30:00,4656.0
4,2014-07-01 02:00:00,3820.0
5,2014-07-01 02:30:00,2873.0
6,2014-07-01 03:00:00,2369.0
7,2014-07-01 03:30:00,2064.0
8,2014-07-01 04:00:00,2221.0
9,2014-07-01 04:30:00,2158.0


In [3]:
# Forecasting-specific settings; "auto" leaves the choice of horizon, lags and
# rolling window size to AutoML.
time_series_settings = {
    "time_column_name": dt,
    "max_horizon": "auto",
    "target_lags": "auto",
    "target_rolling_window_size": "auto",
}

# Project folder for AutoML artifacts: $TMPDIR when it is set, otherwise the
# current directory. Indexing os.environ directly raised KeyError on machines
# where TMPDIR is not defined.
automl_project_dir = os.environ.get("TMPDIR", ".")

automl_config = AutoMLConfig(
    task="forecasting",
    training_data=df_train,
    label_column_name=target,
    n_cross_validations=5,
    max_cores_per_iteration=-1,
    path=automl_project_dir,
    enable_early_stopping=True,
    experiment_timeout_minutes=time_limit_min,
    **time_series_settings,
)

# Workspace details are read from the local Azure ML config file.
ws = Workspace.from_config()
experiment = Experiment(ws, "experiment")

# Keep a handle on the submitted run so it can still be inspected after the
# best child run and its fitted model have been retrieved.
automl_run = experiment.submit(automl_config, show_output=True)
best_run, fitted_model = automl_run.get_output()

Running on local machine
Parent Run ID: AutoML_543e1079-5d3a-44c4-8cc7-9d2ebaeab822

Current status: DatasetFeaturization. Beginning to featurize the dataset.




Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Heuristic parameters: Target_Lag = '[0]', Target_Rolling_Window = '0', Max_Horizon = '2'.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturi



RobustScaler DecisionTree                      0:03:04       0.1141    0.1107
        19   StandardScalerWrapper DecisionTree             0:00:36       0.1175    0.1107
        20   MinMaxScaler DecisionTree                      0:00:36       0.1141    0.1107
        21   StandardScalerWrapper DecisionTree             0:00:19       0.1175    0.1107
        22   MinMaxScaler DecisionTree                      0:00:31       0.1175    0.1107
        23   StandardScalerWrapper DecisionTree             0:00:27       0.1221    0.1107
        24   StandardScalerWrapper DecisionTree             0:00:26       0.1299    0.1107
        25   StandardScalerWrapper DecisionTree             0:00:21       0.1175    0.1107
        26   StandardScalerWrapper RandomForest             0:00:25       0.1513    0.1107
        27   StandardScalerWrapper DecisionTree             0:00:26       0.1175    0.1107
        28   RobustScaler DecisionTree                      0:00:25       0.1255    0.1107
        29  

In [57]:
# Print the steps of the best model found by AutoML. When the second step of
# the fitted pipeline wraps an ensemble, list the ensemble's member estimators;
# otherwise list the steps of the fitted pipeline itself.
print("Best pipeline:")
try:
    ensemble = vars(fitted_model.steps[1][1])["_wrappedEnsemble"]
    print(ensemble.__class__)
    steps = ensemble.estimators_
except (AttributeError, IndexError, KeyError, TypeError):
    # Only the lookup failures that mean "this is not a wrapped ensemble" are
    # handled here; anything else (e.g. KeyboardInterrupt) is left to propagate
    # instead of being silently swallowed by a bare `except`.
    steps = fitted_model.steps

# One numbered line per step, each terminated by a newline.
best_pipeline = "".join(f"{i}. {step!s}\n" for i, step in enumerate(steps))
print(best_pipeline)

Best pipeline:
<class 'sklearn.ensemble._voting.VotingRegressor'>
0. Pipeline(memory=None,
         steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse',
                                       max_depth=None, max_features=0.6,
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=0.008504540352732381,
                                       min_samples_split=0.0008991789964660114,
                                       min_weight_fraction_leaf=0.0,
                                       presort='deprecated', random_state=None,
                                       splitter='best'))],
         verbose=False)
1. Pipeline(memory=None,
         steps=[('minmaxscal

In [4]:
# NOTE(review): everything in this cell is commented out, so it executes nothing.
# The first part is a debugging variant of the "Best pipeline" printout; the
# second part builds a forecast for the next `n_pred` timestamps and writes it
# to CSV. `pred_path` is not defined in any cell visible here — define it
# before re-enabling the forecasting lines.
# try:
#     print("!")
#     print(vars(fitted_model.steps[1][1]))
#     print("!!")
#     print(vars(fitted_model.steps[1][1])["estimators"])
#     print("!!!")
#     print(vars(vars(fitted_model.steps[1][1])["estimators"][1]))
#     steps = vars(vars(fitted_model.steps[1][1])["estimators"][1])["steps"]
# except:
#     print("!!!!")
#     steps = fitted_model.steps
# print(steps)
# best_pipeline = "Best pipeline:"
# for i, step in enumerate(steps):
#     best_pipeline += f"\n{i}. {str(step[1])}"
# print(best_pipeline)

# x_pred = pd.date_range(df_train[dt].iloc[-1], periods=n_pred+1, freq=pd.infer_freq(df_train[dt]))[1:]
# y_pred = fitted_model.forecast(pd.DataFrame({dt: x_pred}))[0]

# df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
# df_pred.to_csv(pred_path, index=False)

!
{'_wrappedEnsemble': VotingRegressor(estimators=[('9',
                             Pipeline(memory=None,
                                      steps=[('minmaxscaler',
                                              MinMaxScaler(copy=True,
                                                           feature_range=(0,
                                                                          1))),
                                             ('decisiontreeregressor',
                                              DecisionTreeRegressor(ccp_alpha=0.0,
                                                                    criterion='friedman_mse',
                                                                    max_depth=None,
                                                                    max_features=0.6,
                                                                    max_leaf_nodes=None,
                                                                    min_impurity_decrease=0.0,
 