diff --git a/.github/workflows/python-sdk-tutorial-auto-ml-forecasting-data-preparation.yml b/.github/workflows/python-sdk-tutorial-auto-ml-forecasting-data-preparation.yml new file mode 100644 index 0000000000..1190004eae --- /dev/null +++ b/.github/workflows/python-sdk-tutorial-auto-ml-forecasting-data-preparation.yml @@ -0,0 +1,64 @@ +name: auto-ml-forecasting-data-preparation +# This file is generated by v1/python-sdk/tutorials/automl-with-azureml/generate_workflows.py +on: + workflow_dispatch: + schedule: + - cron: "0 10 * * 2" + pull_request: + branches: + - main + paths: + - v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/** + - v1/python-sdk/tutorials/automl-with-azureml/automl_env_linux.yml + - .github/workflows/python-sdk-tutorial-auto-ml-forecasting-data-preparation.yml +jobs: + build: + runs-on: ${{vars.V1_UBUNTU_RUNNER}} + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: Run Install packages + run: | + chmod +x ./v1/scripts/install-packages.sh + ./v1/scripts/install-packages.sh + shell: bash + - name: create automl conda environment + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: azure_automl + environment-file: v1/python-sdk/tutorials/automl-with-azureml/automl_env_linux.yml + auto-activate-base: false + - name: install papermill and set up the IPython kernel + run: | + pip install papermill==2.4.0 + python -m ipykernel install --user --name azure_automl --display-name "Python (azure_automl)" + pip list + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZ_CREDS}} + - name: Run update-azure-extensions + run: | + chmod +x ./v1/scripts/update-azure-extensions.sh + ./v1/scripts/update-azure-extensions.sh + shell: bash + - name: attach to workspace + run: az ml folder attach -w main -g azureml-examples + - name: run auto-ml-forecasting-data-preparation.ipynb + run: papermill -k python auto-ml-forecasting-data-preparation.ipynb auto-ml-forecasting-data-preparation.output.ipynb + working-directory: v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: auto-ml-forecasting-data-preparation + path: v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation diff --git a/.github/workflows/python-sdk-tutorial-auto-ml-forecasting-demand-tcn.yml b/.github/workflows/python-sdk-tutorial-auto-ml-forecasting-demand-tcn.yml new file mode 100644 index 0000000000..11f6e3eff4 --- /dev/null +++ b/.github/workflows/python-sdk-tutorial-auto-ml-forecasting-demand-tcn.yml @@ -0,0 +1,64 @@ +name: auto-ml-forecasting-demand-tcn +# This file is generated by v1/python-sdk/tutorials/automl-with-azureml/generate_workflows.py +on: + workflow_dispatch: + schedule: + - cron: "0 11 * * 2" + pull_request: + branches: + - main + paths: + - v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/** + - v1/python-sdk/tutorials/automl-with-azureml/automl_env_linux.yml + - .github/workflows/python-sdk-tutorial-auto-ml-forecasting-demand-tcn.yml +jobs: + build: + runs-on: ${{vars.V1_UBUNTU_RUNNER}} + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + 
with: + python-version: "3.8" + - name: Run Install packages + run: | + chmod +x ./v1/scripts/install-packages.sh + ./v1/scripts/install-packages.sh + shell: bash + - name: create automl conda environment + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: azure_automl + environment-file: v1/python-sdk/tutorials/automl-with-azureml/automl_env_linux.yml + auto-activate-base: false + - name: install papermill and set up the IPython kernel + run: | + pip install papermill==2.4.0 + python -m ipykernel install --user --name azure_automl --display-name "Python (azure_automl)" + pip list + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZ_CREDS}} + - name: Run update-azure-extensions + run: | + chmod +x ./v1/scripts/update-azure-extensions.sh + ./v1/scripts/update-azure-extensions.sh + shell: bash + - name: attach to workspace + run: az ml folder attach -w main -g azureml-examples + - name: run auto-ml-forecasting-demand-tcn.ipynb + run: papermill -k python auto-ml-forecasting-demand-tcn.ipynb auto-ml-forecasting-demand-tcn.output.ipynb + working-directory: v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: auto-ml-forecasting-demand-tcn + path: v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn diff --git a/v1/python-sdk/README.md b/v1/python-sdk/README.md index d274d0b1fa..8ead142881 100644 --- a/v1/python-sdk/README.md +++ b/v1/python-sdk/README.md @@ -54,7 +54,7 @@ These concepts are sufficient to understand all examples in this repository, whi path|status|notebooks|description -|-|-|- -[automl-with-azureml](tutorials/automl-with-azureml)|[![automl-nlp-text-classification-multiclass](https://github.com/Azure/azureml-examples/workflows/automl-nlp-text-classification-multiclass/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-nlp-text-classification-multiclass.yml)
[![automl-nlp-text-classification-multilabel](https://github.com/Azure/azureml-examples/workflows/automl-nlp-text-classification-multilabel/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-nlp-text-classification-multilabel.yml)
[![automl-nlp-ner](https://github.com/Azure/azureml-examples/workflows/automl-nlp-ner/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-nlp-ner.yml)
[![auto-ml-classification-bank-marketing-all-features](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-bank-marketing-all-features/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-bank-marketing-all-features.yml)
[![auto-ml-classification-credit-card-fraud](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-credit-card-fraud/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-credit-card-fraud.yml)
[![auto-ml-classification-text-dnn](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-text-dnn/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-text-dnn.yml)
[![auto-ml-continuous-retraining](https://github.com/Azure/azureml-examples/workflows/auto-ml-continuous-retraining/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-continuous-retraining.yml)
[![auto-ml-forecasting-backtest-many-models](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-backtest-many-models/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-backtest-many-models.yml)
[![auto-ml-forecasting-backtest-single-model](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-backtest-single-model/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-backtest-single-model.yml)
[![auto-ml-forecasting-bike-share](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-bike-share/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-bike-share.yml)
[![auto-ml-forecasting-energy-demand](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-energy-demand/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-energy-demand.yml)
[![auto-ml-forecasting-function](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-function/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-function.yml)
[![auto-ml-forecasting-github-dau](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-github-dau/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-github-dau.yml)
[![auto-ml-forecasting-hierarchical-timeseries](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-hierarchical-timeseries/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-hierarchical-timeseries.yml)
[![auto-ml-forecasting-many-models](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-many-models/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-many-models.yml)
[![auto-ml-forecasting-orange-juice-sales](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-orange-juice-sales/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-orange-juice-sales.yml)
[![auto-ml-forecasting-pipelines](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-pipelines/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-pipelines.yml)
[![auto-ml-forecasting-univariate-recipe-experiment-settings](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-univariate-recipe-experiment-settings/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-univariate-recipe-experiment-settings.yml)
[![auto-ml-forecasting-univariate-recipe-run-experiment](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-univariate-recipe-run-experiment/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-univariate-recipe-run-experiment.yml)
[![auto-ml-image-classification-multiclass-batch-scoring](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-classification-multiclass-batch-scoring/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-classification-multiclass-batch-scoring.yml)
[![auto-ml-image-classification-multiclass](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-classification-multiclass/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-classification-multiclass.yml)
[![auto-ml-image-classification-multilabel](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-classification-multilabel/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-classification-multilabel.yml)
[![auto-ml-image-instance-segmentation](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-instance-segmentation/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-instance-segmentation.yml)
[![auto-ml-image-object-detection](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-object-detection/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-object-detection.yml)
[![auto-ml-classification-credit-card-fraud-local](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-credit-card-fraud-local/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-credit-card-fraud-local.yml)
[![binary-classification-metric-and-confidence-interval](https://github.com/Azure/azureml-examples/workflows/binary-classification-metric-and-confidence-interval/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-binary-classification-metric-and-confidence-interval.yml)
[![auto-ml-regression-explanation-featurization](https://github.com/Azure/azureml-examples/workflows/auto-ml-regression-explanation-featurization/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-regression-explanation-featurization.yml)
[![auto-ml-regression](https://github.com/Azure/azureml-examples/workflows/auto-ml-regression/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-regression.yml)|[automl-nlp-text-classification-multiclass.ipynb](tutorials/automl-with-azureml/automl-nlp-multiclass/automl-nlp-text-classification-multiclass.ipynb)
[automl-nlp-text-classification-multilabel.ipynb](tutorials/automl-with-azureml/automl-nlp-multilabel/automl-nlp-text-classification-multilabel.ipynb)
[automl-nlp-ner.ipynb](tutorials/automl-with-azureml/automl-nlp-ner/automl-nlp-ner.ipynb)
[auto-ml-classification-bank-marketing-all-features.ipynb](tutorials/automl-with-azureml/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb)
[auto-ml-classification-credit-card-fraud.ipynb](tutorials/automl-with-azureml/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb)
[auto-ml-classification-text-dnn.ipynb](tutorials/automl-with-azureml/classification-text-dnn/auto-ml-classification-text-dnn.ipynb)
[auto-ml-continuous-retraining.ipynb](tutorials/automl-with-azureml/continuous-retraining/auto-ml-continuous-retraining.ipynb)
[auto-ml-forecasting-backtest-many-models.ipynb](tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb)
[auto-ml-forecasting-backtest-single-model.ipynb](tutorials/automl-with-azureml/forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb)
[auto-ml-forecasting-bike-share.ipynb](tutorials/automl-with-azureml/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb)
[auto-ml-forecasting-energy-demand.ipynb](tutorials/automl-with-azureml/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb)
[auto-ml-forecasting-function.ipynb](tutorials/automl-with-azureml/forecasting-forecast-function/auto-ml-forecasting-function.ipynb)
[auto-ml-forecasting-github-dau.ipynb](tutorials/automl-with-azureml/forecasting-github-dau/auto-ml-forecasting-github-dau.ipynb)
[auto-ml-forecasting-hierarchical-timeseries.ipynb](tutorials/automl-with-azureml/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.ipynb)
[auto-ml-forecasting-many-models.ipynb](tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb)
[auto-ml-forecasting-orange-juice-sales.ipynb](tutorials/automl-with-azureml/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb)
[auto-ml-forecasting-pipelines.ipynb](tutorials/automl-with-azureml/forecasting-pipelines/auto-ml-forecasting-pipelines.ipynb)
[auto-ml-forecasting-univariate-recipe-experiment-settings.ipynb](tutorials/automl-with-azureml/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-experiment-settings.ipynb)
[auto-ml-forecasting-univariate-recipe-run-experiment.ipynb](tutorials/automl-with-azureml/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-run-experiment.ipynb)
[auto-ml-image-classification-multiclass-batch-scoring.ipynb](tutorials/automl-with-azureml/image-classification-multiclass-batch-scoring/auto-ml-image-classification-multiclass-batch-scoring.ipynb)
[auto-ml-image-classification-multiclass.ipynb](tutorials/automl-with-azureml/image-classification-multiclass/auto-ml-image-classification-multiclass.ipynb)
[auto-ml-image-classification-multilabel.ipynb](tutorials/automl-with-azureml/image-classification-multilabel/auto-ml-image-classification-multilabel.ipynb)
[auto-ml-image-instance-segmentation.ipynb](tutorials/automl-with-azureml/image-instance-segmentation/auto-ml-image-instance-segmentation.ipynb)
[auto-ml-image-object-detection.ipynb](tutorials/automl-with-azureml/image-object-detection/auto-ml-image-object-detection.ipynb)
[auto-ml-classification-credit-card-fraud-local.ipynb](tutorials/automl-with-azureml/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb)
[binary-classification-metric-and-confidence-interval.ipynb](tutorials/automl-with-azureml/metrics/binary-classification-metric-and-confidence-interval.ipynb)
[auto-ml-regression-explanation-featurization.ipynb](tutorials/automl-with-azureml/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb)
[auto-ml-regression.ipynb](tutorials/automl-with-azureml/regression/auto-ml-regression.ipynb)|Tutorials showing how to build high quality machine learning models using Azure Automated Machine Learning. +[automl-with-azureml](tutorials/automl-with-azureml)|[![automl-nlp-text-classification-multiclass](https://github.com/Azure/azureml-examples/workflows/automl-nlp-text-classification-multiclass/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-nlp-text-classification-multiclass.yml)
[![automl-nlp-text-classification-multilabel](https://github.com/Azure/azureml-examples/workflows/automl-nlp-text-classification-multilabel/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-nlp-text-classification-multilabel.yml)
[![automl-nlp-ner](https://github.com/Azure/azureml-examples/workflows/automl-nlp-ner/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-nlp-ner.yml)
[![auto-ml-classification-bank-marketing-all-features](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-bank-marketing-all-features/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-bank-marketing-all-features.yml)
[![auto-ml-classification-credit-card-fraud](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-credit-card-fraud/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-credit-card-fraud.yml)
[![auto-ml-classification-text-dnn](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-text-dnn/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-text-dnn.yml)
[![auto-ml-continuous-retraining](https://github.com/Azure/azureml-examples/workflows/auto-ml-continuous-retraining/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-continuous-retraining.yml)
[![auto-ml-forecasting-backtest-many-models](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-backtest-many-models/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-backtest-many-models.yml)
[![auto-ml-forecasting-backtest-single-model](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-backtest-single-model/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-backtest-single-model.yml)
[![auto-ml-forecasting-bike-share](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-bike-share/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-bike-share.yml)
[![auto-ml-forecasting-data-preparation](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-data-preparation/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-data-preparation.yml)
[![auto-ml-forecasting-demand-tcn](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-demand-tcn/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-demand-tcn.yml)
[![auto-ml-forecasting-energy-demand](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-energy-demand/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-energy-demand.yml)
[![auto-ml-forecasting-function](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-function/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-function.yml)
[![auto-ml-forecasting-github-dau](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-github-dau/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-github-dau.yml)
[![auto-ml-forecasting-hierarchical-timeseries](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-hierarchical-timeseries/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-hierarchical-timeseries.yml)
[![auto-ml-forecasting-many-models](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-many-models/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-many-models.yml)
[![auto-ml-forecasting-orange-juice-sales](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-orange-juice-sales/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-orange-juice-sales.yml)
[![auto-ml-forecasting-pipelines](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-pipelines/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-pipelines.yml)
[![auto-ml-forecasting-univariate-recipe-experiment-settings](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-univariate-recipe-experiment-settings/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-univariate-recipe-experiment-settings.yml)
[![auto-ml-forecasting-univariate-recipe-run-experiment](https://github.com/Azure/azureml-examples/workflows/auto-ml-forecasting-univariate-recipe-run-experiment/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-forecasting-univariate-recipe-run-experiment.yml)
[![auto-ml-image-classification-multiclass-batch-scoring](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-classification-multiclass-batch-scoring/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-classification-multiclass-batch-scoring.yml)
[![auto-ml-image-classification-multiclass](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-classification-multiclass/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-classification-multiclass.yml)
[![auto-ml-image-classification-multilabel](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-classification-multilabel/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-classification-multilabel.yml)
[![auto-ml-image-instance-segmentation](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-instance-segmentation/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-instance-segmentation.yml)
[![auto-ml-image-object-detection](https://github.com/Azure/azureml-examples/workflows/auto-ml-image-object-detection/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-image-object-detection.yml)
[![auto-ml-classification-credit-card-fraud-local](https://github.com/Azure/azureml-examples/workflows/auto-ml-classification-credit-card-fraud-local/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-classification-credit-card-fraud-local.yml)
[![binary-classification-metric-and-confidence-interval](https://github.com/Azure/azureml-examples/workflows/binary-classification-metric-and-confidence-interval/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-binary-classification-metric-and-confidence-interval.yml)
[![auto-ml-regression-explanation-featurization](https://github.com/Azure/azureml-examples/workflows/auto-ml-regression-explanation-featurization/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-regression-explanation-featurization.yml)
[![auto-ml-regression](https://github.com/Azure/azureml-examples/workflows/auto-ml-regression/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-auto-ml-regression.yml)|[automl-nlp-text-classification-multiclass.ipynb](tutorials/automl-with-azureml/automl-nlp-multiclass/automl-nlp-text-classification-multiclass.ipynb)
[automl-nlp-text-classification-multilabel.ipynb](tutorials/automl-with-azureml/automl-nlp-multilabel/automl-nlp-text-classification-multilabel.ipynb)
[automl-nlp-ner.ipynb](tutorials/automl-with-azureml/automl-nlp-ner/automl-nlp-ner.ipynb)
[auto-ml-classification-bank-marketing-all-features.ipynb](tutorials/automl-with-azureml/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb)
[auto-ml-classification-credit-card-fraud.ipynb](tutorials/automl-with-azureml/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb)
[auto-ml-classification-text-dnn.ipynb](tutorials/automl-with-azureml/classification-text-dnn/auto-ml-classification-text-dnn.ipynb)
[auto-ml-continuous-retraining.ipynb](tutorials/automl-with-azureml/continuous-retraining/auto-ml-continuous-retraining.ipynb)
[auto-ml-forecasting-backtest-many-models.ipynb](tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb)
[auto-ml-forecasting-backtest-single-model.ipynb](tutorials/automl-with-azureml/forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb)
[auto-ml-forecasting-bike-share.ipynb](tutorials/automl-with-azureml/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb)
[auto-ml-forecasting-data-preparation.ipynb](tutorials/automl-with-azureml/forecasting-data-preparation/auto-ml-forecasting-data-preparation.ipynb)
[auto-ml-forecasting-demand-tcn.ipynb](tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/auto-ml-forecasting-demand-tcn.ipynb)
[auto-ml-forecasting-energy-demand.ipynb](tutorials/automl-with-azureml/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb)
[auto-ml-forecasting-function.ipynb](tutorials/automl-with-azureml/forecasting-forecast-function/auto-ml-forecasting-function.ipynb)
[auto-ml-forecasting-github-dau.ipynb](tutorials/automl-with-azureml/forecasting-github-dau/auto-ml-forecasting-github-dau.ipynb)
[auto-ml-forecasting-hierarchical-timeseries.ipynb](tutorials/automl-with-azureml/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.ipynb)
[auto-ml-forecasting-many-models.ipynb](tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb)
[auto-ml-forecasting-orange-juice-sales.ipynb](tutorials/automl-with-azureml/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb)
[auto-ml-forecasting-pipelines.ipynb](tutorials/automl-with-azureml/forecasting-pipelines/auto-ml-forecasting-pipelines.ipynb)
[auto-ml-forecasting-univariate-recipe-experiment-settings.ipynb](tutorials/automl-with-azureml/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-experiment-settings.ipynb)
[auto-ml-forecasting-univariate-recipe-run-experiment.ipynb](tutorials/automl-with-azureml/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-run-experiment.ipynb)
[auto-ml-image-classification-multiclass-batch-scoring.ipynb](tutorials/automl-with-azureml/image-classification-multiclass-batch-scoring/auto-ml-image-classification-multiclass-batch-scoring.ipynb)
[auto-ml-image-classification-multiclass.ipynb](tutorials/automl-with-azureml/image-classification-multiclass/auto-ml-image-classification-multiclass.ipynb)
[auto-ml-image-classification-multilabel.ipynb](tutorials/automl-with-azureml/image-classification-multilabel/auto-ml-image-classification-multilabel.ipynb)
[auto-ml-image-instance-segmentation.ipynb](tutorials/automl-with-azureml/image-instance-segmentation/auto-ml-image-instance-segmentation.ipynb)
[auto-ml-image-object-detection.ipynb](tutorials/automl-with-azureml/image-object-detection/auto-ml-image-object-detection.ipynb)
[auto-ml-classification-credit-card-fraud-local.ipynb](tutorials/automl-with-azureml/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb)
[binary-classification-metric-and-confidence-interval.ipynb](tutorials/automl-with-azureml/metrics/binary-classification-metric-and-confidence-interval.ipynb)
[auto-ml-regression-explanation-featurization.ipynb](tutorials/automl-with-azureml/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb)
[auto-ml-regression.ipynb](tutorials/automl-with-azureml/regression/auto-ml-regression.ipynb)|Tutorials showing how to build high quality machine learning models using Azure Automated Machine Learning. [automl-with-databricks](tutorials/automl-with-databricks)|[![automl-databricks-local-01](https://github.com/Azure/azureml-examples/workflows/automl-databricks-local-01/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-databricks-local-01.yml)
[![automl-databricks-local-with-deployment](https://github.com/Azure/azureml-examples/workflows/automl-databricks-local-with-deployment/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-automl-databricks-local-with-deployment.yml)|[automl-databricks-local-01.ipynb](tutorials/automl-with-databricks/automl-databricks-local-01.ipynb)
[automl-databricks-local-with-deployment.ipynb](tutorials/automl-with-databricks/automl-databricks-local-with-deployment.ipynb)|*no description* [dataset-uploads](tutorials/dataset-uploads)|[![dataset-uploads](https://github.com/Azure/azureml-examples/workflows/python-sdk-tutorial-dataset-uploads/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-dataset-uploads.yml)|[upload_dataframe_register_as_dataset.ipynb](tutorials/dataset-uploads/upload_dataframe_register_as_dataset.ipynb)
[upload_directory_create_file_dataset.ipynb](tutorials/dataset-uploads/upload_directory_create_file_dataset.ipynb)|*no description* [deploy-local](tutorials/deploy-local)|[![deploy-local](https://github.com/Azure/azureml-examples/workflows/python-sdk-tutorial-deploy-local/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/python-sdk-tutorial-deploy-local.yml)|[1.deploy-local.ipynb](tutorials/deploy-local/1.deploy-local.ipynb)
[2.deploy-local-cli.ipynb](tutorials/deploy-local/2.deploy-local-cli.ipynb)|*no description* diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/auto-ml-forecasting-data-preparation.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/auto-ml-forecasting-data-preparation.ipynb new file mode 100644 index 0000000000..488736ec40 --- /dev/null +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/auto-ml-forecasting-data-preparation.ipynb @@ -0,0 +1,983 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Machine Learning\n", + "**Data Preparation for Demand Forecasting Notebooks**\n", + "\n", + "## Contents\n", + "1. [Introduction](#Introduction)\n", + "1. [Setup](#Setup)\n", + "1. [Data Pre-processing](#DataWork)\n", + "1. [Dealing with Duplicates](#DealWithDuplicates)\n", + "1. [Data Partitioning](#DataPartition)\n", + "1. [Data Upload](#DataUpload)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Introduction\n", + "\n", + "The objective of this notebook is to illustrate how to pre-process the raw data and register partitioned datasets to be used in demand forecasting notebooks:
  1. Demand Forecasting Using TCN ([link placeholder]())<br>
  2. Demand Forecasting Using Many Models ([link placeholder]())
\n", + "For illustration purposes we use the UCI electricity data ([link](https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#)) which contains electricity consumption data for 370 consumers measured at 15 minute intervals. In this notebook, we will show how to ingest the data from the original source, aggregate to an hourly frequency, select a subsample of unique time series, determine the approriate way to partition the data, and finally, register the datasets to be used on the aforementoned notebooks.\n", + "\n", + "Make sure you have executed the [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "import os\n", + "import random\n", + "\n", + "import azureml.core\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from azureml.core.workspace import Workspace\n", + "from matplotlib import pyplot as plt\n", + "\n", + "random.seed(12345)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Accessing the Azure ML workspace requires authentication with Azure. The default authentication is an interactive authentication using the default tenant. Executing the `ws = Workspace.from_config()` line in the cell below will prompt for authentication the first time that it is run.\n", + "\n", + "If you have multiple Azure tenants, you can specify the tenant by replacing the ws = Workspace.from_config() line in the cell below with the following:\n", + "```\n", + "from azureml.core.authentication import InteractiveLoginAuthentication\n", + "auth = InteractiveLoginAuthentication(tenant_id = 'mytenantid')\n", + "ws = Workspace.from_config(auth = auth)\n", + "```\n", + "If you need to run in an environment where interactive login is not possible, you can use Service Principal authentication by replacing the ws = Workspace.from_config() line in the cell below with the following:\n", + "```\n", + "from azureml.core.authentication import ServicePrincipalAuthentication\n", + "auth = ServicePrincipalAuthentication('mytenantid', 'myappid', 'mypassword')\n", + "ws = Workspace.from_config(auth = auth)\n", + "```\n", + "For more details, see [aka.ms/aml-notebook-auth](aka.ms/aml-notebook-auth)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ws = Workspace.from_config()\n", + "datastore = ws.get_default_datastore()\n", + "\n", + "output = {}\n", + "output[\"Subscription ID\"] = ws.subscription_id\n", + "output[\"Workspace\"] = ws.name\n", + "output[\"Resource Group\"] = ws.resource_group\n", + "output[\"Location\"] = ws.location\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + "outputDf = pd.DataFrame(data=output, index=[\"\"])\n", + "outputDf.T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Data work\n", + "We start with declaring some of the parameters that will be used in this notebook.\n", + "\n", + "- `IS_ORIGINAL_DATASET_DOWNLOADED` is a flag for wether we want to download the original data from the source. The flag is here to reduce the download time since the original dataset is larger than 1 GB.\n", + "- `IS_MODIFIED_DATASET_UPLOADED` is a flag for whether the datasets are uploaded to the Datastore. 
We use it to prevent unintentional uploads of the same datasets.\n", + "- `DOES_PARTITION_INCLUDE_VALIDATION_SET` is a placeholder for determining whether the partitioned data should include the validation set. The value True/False will be determined later in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IS_ORIGINAL_DATASET_DOWNLOADED = False\n", + "IS_MODIFIED_DATASET_UPLOADED = False\n", + "DOES_PARTITION_INCLUDE_VALIDATION_SET = (\n", + "    None  # placeholder for the parameter value we will determine later.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we specify parameters specific to the data we will work with.\n", + "\n", + "- **Target column** is what we want to forecast. In our case it is electricity consumption per customer measured in kilowatt hours (kWh).\n", + "- **Time column** is the time axis along which to predict.\n", + "- **Time series identifier columns** are the columns whose values identify unique time series; they are listed in `time_series_id_column_names`. In our case all unique time series are identified by a single column, `customer_id`. However, it is quite common to have multiple columns identifying unique time series. See the AutoML forecasting documentation for a more detailed explanation of this topic." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target_column_name = \"usage\"\n", + "time_column_name = \"date\"\n", + "SERIES_ID = \"customer_id\"\n", + "time_series_id_column_names = [SERIES_ID]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following block of code we will download the data from the original source to the `data` folder, load the data, and print the first few rows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not IS_ORIGINAL_DATASET_DOWNLOADED:\n", + "    print(\"Downloading data from the source ...\\n---\")\n", + "    # Download original data\n", + "    from io import BytesIO\n", + "    from urllib.request import urlopen\n", + "    from zipfile import ZipFile\n", + "\n", + "    zipurl = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip\"\n", + "\n", + "    with urlopen(zipurl) as zipresp:\n", + "        with ZipFile(BytesIO(zipresp.read())) as zfile:\n", + "            zfile.extractall(\"data\")\n", + "    IS_ORIGINAL_DATASET_DOWNLOADED = True\n", + "\n", + "DATA_LOCATION = os.path.join(os.getcwd(), \"data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Printing the first few rows of the downloaded data ...\\n---\")\n", + "raw_df = pd.read_table(\"data/LD2011_2014.txt\", sep=\";\", low_memory=False)\n", + "print(raw_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The downloaded data is in \"wide\" format, meaning each column name that starts with \"MT_xxx\" represents one unique time series. The first column \"Unnamed: 0\" is actually the time stamp at which the observation for every time series takes place. Let's rename this column to something more meaningful. We will call it `date`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_df.rename(columns={raw_df.columns[0]: \"date\"}, inplace=True)\n", + "raw_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The \"wide\" data format is not usable in AutoML, which is designed to accept data in \"long\" format, the same format that is accepted by typical scikit-learn machine learning models. To transform the data from wide to long format, we will take each unique time series (date, MT_xxx) and stack them vertically. The end result will be a data frame containing 3 columns: (i) the `date` column, (ii) the `customer_id` column, which is the identifier of the time series and is derived from the column name in the wide format, and (iii) `usage`, which is the target variable we are trying to model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Converting data from wide to long format. This may take a few minutes ...\\n---\")\n", + "raw_df = pd.melt(raw_df, id_vars=\"date\", var_name=\"customer_id\", value_name=\"usage\")\n", + "raw_df[time_column_name] = pd.to_datetime(raw_df[time_column_name])\n", + "raw_df.to_csv(\"data/LD2011_2014_long_format.csv\", index=False)\n", + "\n", + "print(\"The first few rows of the data in the long format ...\\n---\")\n", + "raw_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nseries = raw_df.groupby(time_series_id_column_names).ngroups\n", + "print(\"Data contains {0} individual time-series.\".format(nseries))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data tracks customers' electricity consumption every 15 minutes and is measured in kilowatts (kW). Let's assume the business requirement is to generate a 24-hour forecast in kilowatt hours (kWh). Such a forecast at 15-minute frequency results in a forecast horizon of 96 steps ahead (there are 96 15-minute intervals in a 24-hour period). Moreover, if the requirement is to generate a 24-hour-ahead forecast, it makes more sense to aggregate the data measured at 15-minute intervals to an hourly frequency. This will reduce the forecast horizon by a factor of 4, and a shorter forecast horizon usually results in a higher probability of achieving better forecast accuracy.\n", + "\n", + "In the next block of code we will create a `datetime` column which will identify the date and the hour of the day each observation belongs to. We also convert the target variable from kW to kWh, where $kWh = \\frac{1}{4} \\times kW$. After the conversion is complete, the hourly data will be stored in the `raw_hourly_df` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output of the previous command shows that the target variable `usage` is an object. We need to transform it into a float in order to convert kW to kWh. The next command does exactly this. Because the original data uses the European-style format with decimals separated by commas, we replace commas with periods before declaring the target variable as a float." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_df[target_column_name] = (\n", + " raw_df[target_column_name].astype(str).apply(lambda x: float(x.replace(\",\", \".\")))\n", + ")\n", + "raw_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# aggregate data to hourly. Here, the hourly column is called \"datetime\"\n", + "new_time_column_name = \"datetime\"\n", + "\n", + "raw_df[new_time_column_name] = raw_df[time_column_name].dt.to_period(\"H\")\n", + "raw_df[target_column_name] = raw_df[target_column_name] / 4 # convert to kWh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert to hourly consumption by adding kWh for every 15 min interval\n", + "raw_hourly_series = raw_df.groupby([SERIES_ID, new_time_column_name])[\n", + " target_column_name\n", + "].sum()\n", + "raw_hourly_df = pd.DataFrame(raw_hourly_series)\n", + "raw_hourly_df.reset_index(drop=False, inplace=True)\n", + "print(raw_hourly_df.head())\n", + "\n", + "del raw_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert time column to the datetime format\n", + "raw_hourly_df[new_time_column_name] = pd.to_datetime(\n", + " raw_hourly_df[new_time_column_name].astype(str)\n", + ")\n", + "raw_hourly_df.to_csv(\n", + " os.path.join(DATA_LOCATION, \"hourly_data_long_format.csv\"), index=False\n", + ")\n", + "raw_hourly_df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's visualize a sample of 50 randomly selected series. The plots will be stored in the `output_folder`." 
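The plotting loop below imports `_draw_one_plot` from `scripts.helper_scripts`, a repository helper that is not part of this diff. For readers following along without the repository, here is a minimal, hypothetical sketch of what such a helper might do (the actual implementation in `scripts/helper_scripts.py` may differ): it draws one series' target values against time and appends the figure as a page to an open multi-page PDF.

```python
# Hypothetical sketch of a per-series plotting helper; the real
# scripts/helper_scripts.py in the repository may differ.
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


def draw_one_plot_sketch(df, time_col, target_col, id_cols, pdf):
    """Plot one time series and append the figure as a page to an open PdfPages."""
    series_id = ", ".join(str(df[c].iloc[0]) for c in id_cols)
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(df[time_col], df[target_col])
    ax.set_title(f"Series: {series_id}")
    ax.set_xlabel(time_col)
    ax.set_ylabel(target_col)
    pdf.savefig(fig)  # each call appends one page to the PDF
    plt.close(fig)


# Usage sketch:
# pdf = PdfPages("plots.pdf")
# draw_one_plot_sketch(df, "datetime", "usage", ["customer_id"], pdf)
# pdf.close()
```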
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_grains = list(\n", + " pd.unique(raw_hourly_df[time_series_id_column_names].values.ravel(\"K\"))\n", + ")\n", + "grains_to_plot = random.sample(all_grains, k=50)\n", + "print(f\"The following grains will be selected for plotting: \\n{grains_to_plot}\\n---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_subset = raw_hourly_df[raw_hourly_df[SERIES_ID].isin(grains_to_plot)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create an output folder\n", + "OUTPUT_DIR = os.path.join(os.getcwd(), \"output_folder\")\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.helper_scripts import _draw_one_plot\n", + "from matplotlib.backends.backend_pdf import PdfPages\n", + "\n", + "plot_filename = \"raw_ts_plots.pdf\"\n", + "\n", + "pdf = PdfPages(os.path.join(OUTPUT_DIR, plot_filename))\n", + "for grain, one_forecast in data_subset.groupby(SERIES_ID):\n", + " one_forecast[new_time_column_name] = pd.to_datetime(\n", + " one_forecast[new_time_column_name]\n", + " )\n", + " one_forecast.sort_values(new_time_column_name, inplace=True)\n", + " _draw_one_plot(\n", + " one_forecast,\n", + " new_time_column_name,\n", + " target_column_name,\n", + " time_series_id_column_names,\n", + " pdf,\n", + " plot_predictions=False,\n", + " )\n", + "pdf.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import IFrame\n", + "\n", + "IFrame(\"./output_folder/raw_ts_plots.pdf\", width=800, height=300)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Close examination of the consumption plots per customer shows there are quite a few customers that have no usage data prior to January 1, 2012. Some customers do not have the data until January of 2013 or 2014. It is reasonable at this point to drop all observations prior to January 1, 2012. We will call this object `clean_df`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# drop grains that have no usage as of Jan 1, 2012\n", + "tmp_df = raw_hourly_df[raw_hourly_df[new_time_column_name] == \"2012-01-01 01:00:00\"]\n", + "grains_to_drop = list(tmp_df[tmp_df[target_column_name] == 0][SERIES_ID])\n", + "print(f\"Number of grains to be dropped: {len(grains_to_drop)}\")\n", + "del tmp_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clean_df = raw_hourly_df[~raw_hourly_df[SERIES_ID].isin(grains_to_drop)]\n", + "\n", + "# drop observations prior to 1/1/2012 since they are zero for all grains\n", + "clean_df = clean_df[clean_df[new_time_column_name] > \"2011-12-31 23:00:00\"]\n", + "\n", + "del raw_hourly_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save training runtime, we will use a small subset of 10 unique time series from the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_grains = list(pd.unique(clean_df[time_series_id_column_names].values.ravel(\"K\")))\n", + "selected_grains = random.sample(all_grains, k=10)\n", + "print(f\"The following grains will be selected: {selected_grains}\\n---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_subset = clean_df[clean_df[SERIES_ID].isin(selected_grains)]\n", + "nseries = data_subset.groupby(time_series_id_column_names).ngroups\n", + "print(\"Data subset contains {0} individual time-series.\\n---\".format(nseries))\n", + "data_subset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del all_grains, selected_grains, clean_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Full dataset\\n---\")\n", + "for grain, tmp_df in data_subset.groupby(time_series_id_column_names):\n", + "    print(\n", + "        \"Grain:{}.\\\n", + "        Min date: {}\\\n", + "        Max date: {}\\\n", + "        N: {}\".format(\n", + "            grain,\n", + "            tmp_df[new_time_column_name].min(),\n", + "            tmp_df[new_time_column_name].max(),\n", + "            tmp_df.shape[0],\n", + "        )\n", + "    )\n", + "del tmp_df, grain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_subset.to_csv(os.path.join(OUTPUT_DIR, \"small_data.csv\"), index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Dealing with Duplicates\n", + "\n", + "In this section we will check for duplicate values in the data, i.e., cases where several observations are associated with the same time stamp for at least one unique time series. For example, if duplicate values were present in our data, they would look like the following:\n", + "\n", + "| customer_id | datetime | usage |\n", + "|:-- | :-- |:--: |\n", + "| ... | ... | ... |\n", + "|MT_001 | 2012-01-01 15:00 | ... |\n", + "|MT_001 | 2012-01-01 15:00 | ... |\n", + "| ... | ... | ... |\n", + "\n", + "In this example, there are 2 observations associated with the January 1, 2012 3:00 PM time stamp for the customer ID `MT_001`. AutoML will throw a user error if such a scenario is encountered because it does not know which value to use. The following block of code checks the total number of duplicates in the data and gives us the breakdown per time series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "duplicate_observations = data_subset.duplicated(\n", + "    subset=[new_time_column_name, SERIES_ID], keep=False\n", + ").sum()\n", + "print(f\"---\\nTotal duplicates: {duplicate_observations}\\n---\")\n", + "for grain, tmp_df in data_subset.groupby(SERIES_ID):\n", + "    print(\n", + "        f\"Grain: {grain}. Number of duplicates: {tmp_df.duplicated(subset=[new_time_column_name, SERIES_ID], keep=False).sum()}\"\n", + "    )\n", + "del tmp_df, grain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we remove duplicates from the data if they are present." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if duplicate_observations > 0:\n", + "    print(\n", + "        f\"Removing duplicate observations\\n---\\nOriginal size: {data_subset.shape}\\n---\"\n", + "    )\n", + "    data_subset.drop_duplicates(\n", + "        subset=[new_time_column_name, SERIES_ID], ignore_index=True, inplace=True\n", + "    )\n", + "    print(f\"Cleaned size: {data_subset.shape}\\n---\")\n", + "\n", + "    for grain, tmp_df in data_subset.groupby(SERIES_ID):\n", + "        print(\n", + "            f\"Grain: {grain}. Number of duplicates: {tmp_df.duplicated(subset=[new_time_column_name], keep=False).sum()}\"\n", + "        )\n", + "    del tmp_df, grain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Data Partitioning\n", + "The objective of this section is to determine whether you want to use the [many models solution accelerator](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models) or a [deep learning model](https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-deep-learning). The many models approach allows users to train and manage models for millions of time series in parallel and may be an appropriate modelling choice when the time series in your dataset exhibit heterogeneous behavior. During the model selection stage AutoML searches for the best non-DNN model or a combination of models (ensemble) for each time series or group of time series. Please refer to this [link](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-forecast#many-models) for the workflow diagram of this framework. When using the many models solution accelerator you do not need a validation set because AutoML uses [rolling origin cross validation](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-forecast#many-models) on the training data for model selection. As a result, the data will need to be partitioned into train, test and inference sets.\n", + "\n", + "The deep learning approach allows us to train one model for all time series in the dataset because it can learn complex patterns. On average, the TCNForecaster requires less frequent re-training than the many-models approach. Because of this, the user is expected to provide a validation set, which is used to search for the best architecture. Thus, the data will need to be partitioned into train, test, validation and inference sets.\n", + "\n", + "The difference between the *test* and *inference* sets is the presence of the target column. The test set contains the target column and will be used to evaluate model performance using a [rolling forecast](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-auto-train-forecast-v1#evaluating-model-accuracy-with-a-rolling-forecast). On the other hand, the target column is not present in the inference set to illustrate how to generate an actual forecast.\n", + "\n", + "Before making this decision, let's visualize the small subset of data we selected." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_filename = \"ts_plots_small_data.pdf\"\n", + "\n", + "pdf = PdfPages(os.path.join(OUTPUT_DIR, plot_filename))\n", + "for grain, one_forecast in data_subset.groupby(SERIES_ID):\n", + "    one_forecast[new_time_column_name] = pd.to_datetime(\n", + "        one_forecast[new_time_column_name]\n", + "    )\n", + "    one_forecast.sort_values(new_time_column_name, inplace=True)\n", + "    _draw_one_plot(\n", + "        one_forecast,\n", + "        new_time_column_name,\n", + "        target_column_name,\n", + "        time_series_id_column_names,\n", + "        pdf,\n", + "        plot_predictions=False,\n", + "    )\n", + "pdf.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import IFrame\n", + "\n", + "IFrame(\"./output_folder/ts_plots_small_data.pdf\", width=800, height=300)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One way to determine the modelling framework is by performing a visual examination of the raw time series plots. If all time series exhibit similar behavior patterns, a deep learning model can be an excellent choice. If, on the other hand, individual time series show heterogeneous behavior, it is advised to run the many models accelerator, which estimates one model per time series as opposed to one model for all time series.\n", + "\n", + "In our case, it seems like a deep learning model could be a good modelling choice since the time series look fairly similar. As a result, we set the `DOES_PARTITION_INCLUDE_VALIDATION_SET` parameter to True. Please note that to explore the best option, you can still partition the data to run the low capacity models. To do so, set this parameter to False." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DOES_PARTITION_INCLUDE_VALIDATION_SET = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate train/valid/test/inference sets\n", + "\n", + "Since deep learning models are considered \"high capacity\" models, they generally do not require frequent re-training. As a result, we use 2 months of data each for the validation and test sets. The choice of 2 months is fairly arbitrary and can be modified to suit your needs. We use 2 months of validation and test set data to reflect the infrequent re-training of the model given that the data frequency is hourly. Thus, there will be more than 1200 observations in the validation and test sets per time series. This will give us enough data points to generate conclusions about the model's performance.\n", + "\n", + "This is in contrast to the ML models that require frequent re-training and, as a result, require much shorter test sets to have a reasonable understanding of the model accuracy.\n", + "\n", + "**Note:** Once the backtesting functionality is available, replace the statement regarding the shorter test set with the need for backtesting given the relatively frequent need to re-train these models compared to the DNNs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_test_periods = 60 * 24\n", + "n_valid_periods = 60 * 24 # applicable only to single/TCN model\n", + "n_inference_periods = 24\n", + "\n", + "\n", + "def split_last_n_by_series_id(df, n, time_column_name):\n", + " \"\"\"Group df by series identifiers and split on last n rows for each group.\"\"\"\n", + " df_grouped = df.sort_values(time_column_name).groupby( # Sort by ascending time\n", + " time_series_id_column_names, group_keys=False\n", + " )\n", + " df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])\n", + " df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])\n", + " return df_head, df_tail\n", + "\n", + "\n", + "train_valid_test, inference = split_last_n_by_series_id(\n", + " data_subset, n_inference_periods, time_column_name=new_time_column_name\n", + ")\n", + "\n", + "if DOES_PARTITION_INCLUDE_VALIDATION_SET:\n", + " train_valid, test = split_last_n_by_series_id(\n", + " train_valid_test, n_test_periods, new_time_column_name\n", + " )\n", + " train, valid = split_last_n_by_series_id(\n", + " train_valid, n_valid_periods, new_time_column_name\n", + " )\n", + "else:\n", + " train, test = split_last_n_by_series_id(\n", + " train_valid_test, n_test_periods, new_time_column_name\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We drop the target column from the inference dataset to reflect the fact that the future is unknown and the forecast is our best guess about it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inference.drop(columns=[target_column_name], inplace=True)\n", + "inference.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will examine the start and end dates as well as the number of observations per time series in each of the generated datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Full dataset\\n---\")\n", + "for grain, tmp_df in data_subset.groupby(time_series_id_column_names):\n", + " print(\n", + " \"Grain:{}.\\\n", + " Min date: {}\\\n", + " Max date: {}\\\n", + " N: {}\".format(\n", + " grain, tmp_df[\"datetime\"].min(), tmp_df[\"datetime\"].max(), tmp_df.shape[0]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Train dataset\\n---\")\n", + "for grain, tmp_df in train.groupby(time_series_id_column_names):\n", + " print(\n", + " \"Grain:{}.\\\n", + " Min date: {}\\\n", + " Max date: {}\\\n", + " N: {}\".format(\n", + " grain, tmp_df[\"datetime\"].min(), tmp_df[\"datetime\"].max(), tmp_df.shape[0]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if DOES_PARTITION_INCLUDE_VALIDATION_SET:\n", + " print(\"Valid dataset\\n---\")\n", + " for grain, tmp_df in valid.groupby(time_series_id_column_names):\n", + " print(\n", + " \"Grain:{}.\\\n", + " Min date: {}\\\n", + " Max date: {}\\\n", + " N: {}\".format(\n", + " grain,\n", + " tmp_df[\"datetime\"].min(),\n", + " tmp_df[\"datetime\"].max(),\n", + " tmp_df.shape[0],\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Test dataset\\n---\")\n", + "for grain, tmp_df in test.groupby(time_series_id_column_names):\n", + " print(\n", + " \"Grain:{}.\\\n", + " Min date: {}\\\n", + " Max date: {}\\\n", + " N: {}\".format(\n", + " grain, tmp_df[\"datetime\"].min(), tmp_df[\"datetime\"].max(), tmp_df.shape[0]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference dataset\\n---\")\n", + "for grain, tmp_df in inference.groupby(time_series_id_column_names):\n", + " print(\n", + " \"Grain:{}.\\\n", + " Min date: {}\\\n", + " Max date: {}\\\n", + " N: {}\".format(\n", + " grain, tmp_df[\"datetime\"].min(), tmp_df[\"datetime\"].max(), tmp_df.shape[0]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Upload data to datastore\n", + "The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with a storage account, which contains the default datastore. We will use it to upload the train and test data and create [tabular datasets](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training and testing. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into a tabular representation.\n",
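+ "\n", + "As a quick illustration of the lazy semantics (a sketch; `some_dataset` stands in for any registered tabular dataset):\n", + "```\n", + "subset = some_dataset.take(1000)  # lazy: records the operation, reads no data yet\n", + "subset_df = subset.to_pandas_dataframe()  # materializes only the first 1,000 rows\n", + "```"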
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATASET_PREFIX_NAME = (\n", + " \"uci_electro_small_tcn\"\n", + " if DOES_PARTITION_INCLUDE_VALIDATION_SET\n", + " else \"uci_electro_small\"\n", + ")\n", + "print(f\"Dataset prefix name: {DATASET_PREFIX_NAME}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IS_MODIFIED_DATASET_UPLOADED" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from azureml.data.dataset_factory import TabularDatasetFactory\n", + "from azureml.core.dataset import Dataset\n", + "\n", + "if not IS_MODIFIED_DATASET_UPLOADED:\n", + " print(\"---\\nUploading data ...\\n---\")\n", + "\n", + " train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n", + " train, target=(datastore, \"dataset/\"), name=f\"{DATASET_PREFIX_NAME}_train\"\n", + " )\n", + "\n", + " if DOES_PARTITION_INCLUDE_VALIDATION_SET:\n", + " valid_dataset = TabularDatasetFactory.register_pandas_dataframe(\n", + " valid, target=(datastore, \"dataset/\"), name=f\"{DATASET_PREFIX_NAME}_valid\"\n", + " )\n", + "\n", + " test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n", + " test, target=(datastore, \"dataset/\"), name=f\"{DATASET_PREFIX_NAME}_test\"\n", + " )\n", + "\n", + " inference_dataset = TabularDatasetFactory.register_pandas_dataframe(\n", + " inference,\n", + " target=(datastore, \"dataset/\"),\n", + " name=f\"{DATASET_PREFIX_NAME}_inference\",\n", + " )\n", + "else:\n", + " print(\"Using uploaded data ...\\n---\")\n", + "\n", + " target_path_train = f\"{DATASET_PREFIX_NAME}_train\"\n", + " target_path_valid = f\"{DATASET_PREFIX_NAME}_valid\"\n", + " target_path_test = f\"{DATASET_PREFIX_NAME}_test\"\n", + " target_path_inference = f\"{DATASET_PREFIX_NAME}_inference\"\n", + "\n", + " train_dataset = Dataset.get_by_name(ws, name=target_path_train)\n", + " if DOES_PARTITION_INCLUDE_VALIDATION_SET:\n", + " valid_dataset = Dataset.get_by_name(ws, name=target_path_valid)\n", + " test_dataset = Dataset.get_by_name(ws, name=target_path_test)\n", + " inference_dataset = Dataset.get_by_name(ws, name=target_path_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optional" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# delete downloaded data files to save space\n", + "import shutil\n", + "\n", + "shutil.rmtree(\"data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "authors": [ + { + "name": "jialiu" + } + ], + "category": "tutorial", + "celltoolbar": "Raw Cell Format", + "compute": [ + "Remote" + ], + "datasets": [ + "UCI Electricity" + ], + "deployment": [ + "Azure Container Instance" + ], + "exclude_from_index": false, + "framework": [ + "Azure ML AutoML" + ], + "friendly_name": "Forecasting data preparation", + "index_order": 1, + "kernelspec": { + "display_name": "Python 3.8 - AzureML", + "language": "python", + "name": "python38-azureml" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "tags": [ + "None" + ], + "task": "Forecasting",
+ "vscode": { + "interpreter": { + "hash": "6bd77c88278e012ef31757c15997a7bea8c943977c43d6909403c00ae11d43ca" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/scripts/helper_scripts.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/scripts/helper_scripts.py new file mode 100644 index 0000000000..ffb82cd26d --- /dev/null +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-data-preparation/scripts/helper_scripts.py @@ -0,0 +1,60 @@ +import pandas as pd +import numpy as np + +from typing import Any, Dict, Optional, List + + +from azureml.automl.core.shared import constants +from azureml.automl.core.shared.types import GrainType +from azureml.automl.runtime.shared.score import scoring + +from matplotlib import pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages + + +def _format_grain_name(grain: GrainType) -> str: + """ + Convert grain name to string. + + :param grain: the grain name. + :return: the string representation of the given grain. + """ + if not isinstance(grain, tuple) and not isinstance(grain, list): + return str(grain) + grain = list(map(str, grain)) + return "|".join(grain) + + +def _draw_one_plot( + df: pd.DataFrame, + time_column_name: str, + target_column_name: str, + grain_column_names: List[str], + pdf: PdfPages, + plot_predictions=False, +) -> None: + """ + Draw the single plot. + + :param df: The data frame with the data to build plot. + :param time_column_name: The name of a time column. + :param grain_column_names: The name of grain columns. + :param pdf: The pdf backend used to render the plot. + """ + ACTUALS = "actual_level" if plot_predictions else target_column_name + PREDICTIONS = "predicted_level" + plot_columns = [ACTUALS, PREDICTIONS] if plot_predictions else [target_column_name] + + if isinstance(grain_column_names, str): + grain_column_names = [grain_column_names] + + fig, _ = plt.subplots(figsize=(20, 10)) + df = df.set_index(time_column_name) + plt.plot(df[plot_columns]) + plt.xticks(rotation=45) + if grain_column_names: + grain_name = [df[grain].iloc[0] for grain in grain_column_names] + plt.title(f"Time series ID: {_format_grain_name(grain_name)}") + plt.legend(plot_columns) + plt.close(fig) + pdf.savefig(fig) diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/auto-ml-forecasting-demand-tcn.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/auto-ml-forecasting-demand-tcn.ipynb new file mode 100644 index 0000000000..b2117d1cd2 --- /dev/null +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/auto-ml-forecasting-demand-tcn.ipynb @@ -0,0 +1,1509 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Machine Learning\n", + "**Demand Forecasting Using TCN**\n", + "\n", + "## Contents\n", + "1. [Introduction](#Introduction)\n", + "1. [Setup](#Setup)\n", + "1. [Data](#Data)\n", + "1. [Train TCN](#TrainTCN)\n", + "1. [Train Baseline](#TrainBaseline)\n", + "1. [Test Set Inference](#TestSetInference)\n", + "1. [Test Set Evaluation](#TestSetEvaluation)\n", + "1. [Generate Forecast](#GenerateForecast)\n", + "1. 
[Schedule Inference Pipelines](#ScheduleInference)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## 1. Introduction\n", + "\n", + "The objective of this notebook is to illustrate how to use the AutoML deep learning model, the temporal convolutional network (TCN), for demand forecasting tasks. It walks you through all stages of the model evaluation and production process, starting with data ingestion and concluding with scheduling inference runs. For more information on the TCN model in AutoML, refer to this [publication](https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-deep-learning).\n", + "\n", + "We use a subset of UCI electricity data ([link](https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#)) with the objective of predicting electricity demand per consumer 24 hours ahead. The data was preprocessed using the [data prep notebook link placeholder]() notebook. Please refer to it for an illustration of how to download the data from the source, aggregate it to an hourly frequency, convert it from wide to long format and upload it to the Datastore. Here, we will work with the already uploaded data. \n", + "\n", + "A problem statement such as generating accurate forecasts 24 hours ahead sounds like a relatively straightforward task. However, there are quite a few steps a user needs to take before the model is put in production. A user needs to prepare the data, partition it into appropriate sets, select the best model, evaluate it against a baseline, and monitor the model in real life to collect enough observations on how it would perform had it been put in production. Some of these steps are time-consuming, and some require expertise in writing code. The steps shown in this notebook follow a typical thought process one goes through before the model is put in production.\n", + "\n", + "Make sure you have executed the [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "import os\n", + "\n", + "from matplotlib import pyplot as plt\n", + "import pandas as pd\n", + "\n", + "import azureml.core\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.core.workspace import Workspace\n", + "from azureml.train.automl import AutoMLConfig" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This sample notebook may use features that are not available in previous versions of the Azure ML SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n", + "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Accessing the Azure ML workspace requires authentication with Azure.\n", + "\n", + "The default authentication is an interactive authentication using the default tenant. 
Executing the ws = Workspace.from_config() line in the cell below will prompt for authentication the first time it is run.\n", + "\n", + "If you have multiple Azure tenants, you can specify the tenant by replacing the ws = Workspace.from_config() line in the cell below with the following:\n", + "```\n", + "from azureml.core.authentication import InteractiveLoginAuthentication\n", + "auth = InteractiveLoginAuthentication(tenant_id = 'mytenantid')\n", + "ws = Workspace.from_config(auth = auth)\n", + "```\n", + "If you need to run in an environment where interactive login is not possible, you can use Service Principal authentication by replacing the ws = Workspace.from_config() line in the cell below with the following:\n", + "```\n", + "from azureml.core.authentication import ServicePrincipalAuthentication\n", + "auth = ServicePrincipalAuthentication('mytenantid', 'myappid', 'mypassword')\n", + "ws = Workspace.from_config(auth = auth)\n", + "```\n", + "For more details, see [this link](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import uuid\n", + "\n", + "ws = Workspace.from_config()\n", + "datastore = ws.get_default_datastore()\n", + "\n", + "# Choose a name for the run history container in the workspace.\n", + "experiment_name = \"forecasting-pipeline-tcn-\" + datetime.datetime.now().strftime(\n", + " \"%Y%m%d\"\n", + ")\n", + "experiment = Experiment(ws, experiment_name)\n", + "\n", + "output = {}\n", + "output[\"Subscription ID\"] = ws.subscription_id\n", + "output[\"Workspace\"] = ws.name\n", + "output[\"Resource Group\"] = ws.resource_group\n", + "output[\"Location\"] = ws.location\n", + "output[\"Run History Name\"] = experiment_name\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + "outputDf = pd.DataFrame(data=output, index=[\"\"])\n", + "outputDf.T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1. Compute \n", + "\n", + "#### Create or Attach existing AmlCompute\n", + "\n", + "You will need to create a compute target for your AutoML run. In this tutorial, you will create AmlCompute as your training compute resource.\n", + "\n", + "> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n", + "\n", + "\n", + "To run deep learning models, we recommend using GPU compute. Here, we use a 12-node cluster of the `Standard_NC8as_T4_v3` [series](https://learn.microsoft.com/en-us/azure/virtual-machines/nct4-v3-series) for illustration purposes. You will need to adjust the compute type and the number of nodes based on your needs, which can be driven by the speed required for model selection, the data size, etc. \n", + "\n", + "#### Creation of AmlCompute takes approximately 5 minutes. \n", + "If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n", + "As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# Choose a name for your GPU cluster\n", + "amlcompute_cluster_name = \"demand-fcst-gpu-cluster\"\n", + "\n", + "# Verify that cluster does not exist already\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n", + " print(\"Found existing cluster, use it.\")\n", + "except ComputeTargetException:\n", + " compute_config = AmlCompute.provisioning_configuration(\n", + " vm_size=\"Standard_NC8as_T4_v3\", max_nodes=12, vm_priority=\"lowpriority\"\n", + " )\n", + " compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n", + "compute_target.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Data\n", + "If you ran the data preparation notebook [link placeholder]() and want to use the registered data, skip section 3.1 and, instead, uncomment and execute the code in section 3.2. If, on the other hand, you did not run the notebook and want to use the data that we pre-processed and saved in the public blob, execute the code in section 3.1." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 Loading and registering the data from public blob store\n", + "\n", + "Run the code in this section only if you want to use the data that is already available in the blobstore. If you want to use your own data that is already registered in your workspace, skip this section and proceed to run the commented-out code in section 3.2." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following code registers a datastore `automl_fcst_tcn` in your workspace and links the data from the container `automl-sample-notebook-data`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Datastore\n", + "\n", + "# Please change the following to point to your own blob container and pass in account_key\n", + "blob_datastore_name = \"automl_fcst_tcn\"\n", + "container_name = \"automl-sample-notebook-data\"\n", + "account_name = \"automlsamplenotebookdata\"\n", + "\n", + "print(f'Creating datastore \"{blob_datastore_name}\" in your workspace ...\\n---')\n", + "demand_tcn_datastore = Datastore.register_azure_blob_container(\n", + " workspace=ws,\n", + " datastore_name=blob_datastore_name,\n", + " container_name=container_name,\n", + " account_name=account_name,\n", + " create_if_not_exists=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following code registers datasets from the `automl-sample-notebook-data` container in the datastore we just created. Once the datasets are registered, we will be able to use them in our experiments."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Dataset\n", + "\n", + "print(\"Registering datasets in your workspace ...\\n---\")\n", + "\n", + "FOLDER_PREFIX_NAME = \"uci_electro_small_public_tcn\"\n", + "\n", + "target_path_train = f\"{FOLDER_PREFIX_NAME}_train\"\n", + "target_path_valid = f\"{FOLDER_PREFIX_NAME}_valid\"\n", + "target_path_test = f\"{FOLDER_PREFIX_NAME}_test\"\n", + "target_path_inference = f\"{FOLDER_PREFIX_NAME}_infer\"\n", + "\n", + "train_dataset = Dataset.Tabular.from_delimited_files(\n", + " path=demand_tcn_datastore.path(target_path_train + \"/\"),\n", + " validate=False,\n", + " infer_column_types=True,\n", + ").register(workspace=ws, name=target_path_train, create_new_version=True)\n", + "\n", + "valid_dataset = Dataset.Tabular.from_delimited_files(\n", + " path=demand_tcn_datastore.path(target_path_valid + \"/\"),\n", + " validate=False,\n", + " infer_column_types=True,\n", + ").register(workspace=ws, name=target_path_valid, create_new_version=True)\n", + "\n", + "test_dataset = Dataset.Tabular.from_delimited_files(\n", + " path=demand_tcn_datastore.path(target_path_test + \"/\"),\n", + " validate=False,\n", + " infer_column_types=True,\n", + ").register(workspace=ws, name=target_path_test, create_new_version=True)\n", + "\n", + "inference_dataset = Dataset.Tabular.from_delimited_files(\n", + " path=demand_tcn_datastore.path(target_path_inference + \"/\"),\n", + " validate=False,\n", + " infer_column_types=True,\n", + ").register(workspace=ws, name=target_path_inference, create_new_version=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Using data that is registered in your workspace\n", + "\n", + "If you ran the [data prep notebook link placeholder]() notebook, the partitioned data is already uploaded and registered in your workspace. Uncomment the following code, change the `DATASET_PREFIX_NAME` to match the value in the data preparation notebook, and run the code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from azureml.data.dataset_factory import TabularDatasetFactory\n", + "# from azureml.core.dataset import Dataset\n", + "\n", + "# DATASET_PREFIX_NAME = \"uci_electro_small_tcn\"\n", + "# print(f'Dataset prefix name: {DATASET_PREFIX_NAME}\\n---\\nLoading train, validation, test and inference sets ...\\n---')\n", + "\n", + "# target_path_train = f\"{DATASET_PREFIX_NAME}_train\"\n", + "# target_path_valid = f\"{DATASET_PREFIX_NAME}_valid\"\n", + "# target_path_test = f\"{DATASET_PREFIX_NAME}_test\"\n", + "# target_path_inference = f\"{DATASET_PREFIX_NAME}_inference\"\n", + "\n", + "# train_dataset = Dataset.get_by_name(ws, name=target_path_train)\n", + "# valid_dataset = Dataset.get_by_name(ws, name=target_path_valid)\n", + "# test_dataset = Dataset.get_by_name(ws, name=target_path_test)\n", + "# inference_dataset = Dataset.get_by_name(ws, name=target_path_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.3 Test and inference sets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we have *test* and *inference* sets. The difference between the two is the presence of the target column. 
The test set contains the target column and is used to evaluate model performance using a [rolling forecast](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-auto-train-forecast-v1#evaluating-model-accuracy-with-a-rolling-forecast). On the other hand, the target column is not present in the inference set, to illustrate how to generate an actual forecast." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The first few rows of the test set ...\\n---\")\n", + "print(test_dataset.to_pandas_dataframe().head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The first few rows of the inference set ...\\n---\")\n", + "print(inference_dataset.to_pandas_dataframe().head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's set up what we know about the dataset.\n", + "\n", + "- **Target column** is what we want to forecast. In our case it is electricity consumption per customer measured in kilowatt hours (kWh).\n", + "- **Time column** is the time axis along which to predict.\n", + "- **Time series identifier columns** are identified by the values of the columns listed in `time_series_id_column_names`. In our case all unique time series are identified by a single column, `customer_id`. However, it is quite common to have multiple columns identifying unique time series. See the [link](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-forecast#configuration-settings) for a more detailed explanation on this topic." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target_column_name = \"usage\"\n", + "time_column_name = \"datetime\"\n", + "GRAIN_COL = \"customer_id\"\n", + "time_series_id_column_names = [GRAIN_COL]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we download training data from the Datastore to make sure it looks as expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = train_dataset.to_pandas_dataframe()\n", + "\n", + "nseries = train_df.groupby(time_series_id_column_names).ngroups\n", + "print(\n", + " f\"Data contains {nseries} individual time-series:\\n{list(train_df[GRAIN_COL].unique())}\\n---\"\n", + ")\n", + "print(\"Printing the first few rows of the training data ...\\n---\")\n", + "train_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Train TCN\n", + "\n", + "In this section we will train and select the best TCN model as well as the baseline model. The baseline model will be used as a reference point to understand the TCN's accuracy. The goal of forecasting is to have the most accurate predictions as measured by some accuracy metric, but what is considered an accurate prediction is fairly subjective. Take, for example, the MAPE (mean absolute percentage error) metric: a perfect forecast results in a MAPE of zero, which is virtually never achievable with real business data; for instance, forecasting 90 when the actual is 100 already contributes a 10% absolute percentage error. For this reason it is imperative to have a baseline model to compare TCN results against. Doing this adds objectivity to the model acceptance criteria. \n", + "\n", + "The baseline model can be the model that is currently in production. Oftentimes, the baseline is set to be a Naive forecast, which we will use in this notebook. 
The choice of the baseline is also specific to the data. For example, if there is a clear trend in the data one may not want to use a Naive model. Instead, one can use an ARIMA model. Please see this [document](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-configure-auto-train-v1#supported-models) for a list of AutoML models one can choose from as a baseline model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following 2 parameters allow us to re-use training runs for the TCN and baseline models, respectively. This can be helpful if you need to experiment with the post-training steps, avoiding the need to train a new model, which can be computationally expensive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IS_TCN_MODEL_TRAINED = False\n", + "IS_BASE_MODEL_TRAINED = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1 Train AutoML model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.1.1 Set up training parameters\n", + "We need to provide the `ForecastingParameters` and `AutoMLConfig` objects. For the forecasting task we also need to define several settings, including the name of the time column, the maximum forecast horizon, and the partition column name(s).\n", + "\n", + "#### Forecasting Parameters\n", + "To define forecasting parameters for your experiment training, you can leverage the `ForecastingParameters` class. The table below details the forecasting parameters we will be passing into our experiment.\n", + "\n", + "\n", + "|Property|Description|\n", + "|-|-|\n", + "|**time_column_name**|The name of the time column in the data.|\n", + "|**forecast_horizon**|The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly).|\n", + "|**time_series_id_column_names**|The column names used to uniquely identify the time series in data that has multiple rows with the same timestamp. If the time series identifiers are not defined, the data set is assumed to be one time series.|\n", + "|**freq**|Forecast frequency. This optional parameter represents the period for which the forecast is desired, for example, daily, weekly, yearly, etc. Use this parameter for the correction of time series containing irregular data points or for padding of short time series. The frequency needs to be a pandas offset alias. Please refer to [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects) for more information.|\n", + "\n", + "\n", + "#### AutoMLConfig arguments\n", + "|Property|Description|\n", + "|-|-|\n", + "| **task** | forecasting |\n", + "| **primary_metric** | This is the metric that you want to optimize. We recommend using either the normalized root mean squared error or the normalized mean absolute error as the primary metric because they measure forecast accuracy. See the [link](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-automl-forecasting-faq#how-do-i-choose-the-primary-metric) for a more detailed discussion on this topic. |\n", + "| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. 
When setting this criterion, we advise taking the desired number of iterations into account and setting the experiment timeout such that the desired number of iterations can complete.|\n", + "| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. For TCN models we recommend at least 50 iterations to choose the best architecture. For our experiment we set the number of iterations to 100; however, because the experiment timeout is 1 hour on the specified compute cluster, we will not obtain 100 completed iterations. |\n", + "| **label_column_name** | The name of the target column we are trying to predict. |\n", + "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n", + "| **max_concurrent_iterations** | Number of TCN models that are estimated simultaneously. This number should be set to the number of nodes in your cluster. In our case, we have a 12-node cluster and set this value to 12. |\n", + "| **enable_dnn** | Enable Forecasting DNNs. The default value is `False`. |\n", + "| **allowed_models** | List of models we want to consider. Since we are only interested in the deep learning models, we list only the `TCNForecaster`.|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.automl.core.forecasting_parameters import ForecastingParameters\n", + "\n", + "forecast_horizon = 24\n", + "\n", + "forecasting_parameters = ForecastingParameters(\n", + " time_column_name=time_column_name,\n", + " forecast_horizon=forecast_horizon,\n", + " time_series_id_column_names=time_series_id_column_names,\n", + " freq=\"H\",\n", + ")\n", + "\n", + "automl_config = AutoMLConfig(\n", + " task=\"forecasting\",\n", + " debug_log=\"tcn_main.log\",\n", + " primary_metric=\"normalized_root_mean_squared_error\",\n", + " experiment_timeout_hours=1,\n", + " iterations=100,\n", + " training_data=train_dataset,\n", + " validation_data=valid_dataset,\n", + " label_column_name=target_column_name,\n", + " compute_target=compute_target,\n", + " enable_early_stopping=True,\n", + " verbosity=logging.INFO,\n", + " max_concurrent_iterations=12,\n", + " max_cores_per_iteration=-1,\n", + " enable_dnn=True,\n", + " allowed_models=[\"TCNForecaster\"],\n", + " forecasting_parameters=forecasting_parameters,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2 Construct pipeline steps\n", + "\n", + "The objective of the next block of code is to create an AzureML pipeline step that encapsulates the AutoML run. For more details see this [link](https://learn.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.automlstep?view=azure-ml-py). You do not have to change anything here, so run it as is."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.pipeline.core import PipelineData, TrainingOutput\n", + "from azureml.pipeline.steps import AutoMLStep, PythonScriptStep\n", + "from azureml.pipeline.core import Pipeline, PipelineParameter\n", + "\n", + "metrics_output_name = \"metrics_output\"\n", + "best_model_output_name = \"best_model_output\"\n", + "model_file_name = \"model_file\"\n", + "metrics_data_name = \"metrics_data\"\n", + "\n", + "metrics_data = PipelineData(\n", + " name=metrics_data_name,\n", + " datastore=datastore,\n", + " pipeline_output_name=metrics_output_name,\n", + " training_output=TrainingOutput(type=\"Metrics\"),\n", + ")\n", + "model_data = PipelineData(\n", + " name=model_file_name,\n", + " datastore=datastore,\n", + " pipeline_output_name=best_model_output_name,\n", + " training_output=TrainingOutput(type=\"Model\"),\n", + ")\n", + "\n", + "automl_step = AutoMLStep(\n", + " name=\"automl_module\",\n", + " automl_config=automl_config,\n", + " outputs=[metrics_data, model_data],\n", + " allow_reuse=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3 Register model step\n", + "\n", + "#### 4.3.1 Run Configuration and Environment\n", + "To have a pipeline step run, we first need an environment in which to run the jobs. The environment can be built using the following code, and you do not have to change anything here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import CondaDependencies, RunConfiguration\n", + "\n", + "# create a new RunConfig object\n", + "conda_run_config = RunConfiguration(framework=\"python\")\n", + "\n", + "# Set compute target to AmlCompute\n", + "conda_run_config.target = compute_target\n", + "\n", + "conda_run_config.docker.use_docker = True\n", + "\n", + "cd = CondaDependencies.create(\n", + " pip_packages=[\n", + " \"azureml-sdk[automl]\",\n", + " \"applicationinsights\",\n", + " \"azureml-opendatasets\",\n", + " \"azureml-defaults\",\n", + " ],\n", + " conda_packages=[\"numpy==1.19.5\"],\n", + " pin_sdk_version=False,\n", + ")\n", + "conda_run_config.environment.python.conda_dependencies = cd\n", + "\n", + "print(\"run config is ready\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.3.2 Step to register the model.\n", + "The following code generates a step that registers the model from the previous step in your workspace. Once you have the environment created, you are ready to define your pipeline's steps. There are many built-in steps available via the Azure Machine Learning SDK, as you can see on the [reference documentation](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps?view=azure-ml-py) for the `azureml.pipeline.steps` package. The most flexible class is *PythonScriptStep*, which runs a Python script.\n", + "\n", + "The argument values specify the inputs and outputs of the step. In the example below, the data is the `uci_electro_small_train` dataset. The script `register_model.py` registers the best model from the training run in the workspace. 
This step will run on the machine defined by the `compute_target` parameter using the configuration `conda_run_config`.\n", + "\n", + "Please note that in order to use the already completed training pipeline for the TCN to generate predictions, we need to know the model name under which the best model is registered in the workspace. To avoid confusion and registering a different model with the same name, we give the model a unique name that is based on date and time of the day. As a result, you will need to record the model name that is printed in the console when the `IS_TCN_MODEL_TRAINED` parameter is `False` in order to use it later." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The model name with which to register the trained model in the workspace.\n", + "if not IS_TCN_MODEL_TRAINED:\n", + " model_name_str = \"uci-small-tcn-model-\" + datetime.datetime.now().strftime(\n", + " \"%Y%m%d%H%M\"\n", + " )\n", + " model_name = PipelineParameter(\"model_name\", default_value=model_name_str)\n", + " print(\n", + " f\"A new model will be registered under the name: {model_name_str}\\n\\\n", + " Please record this in order to use in the subsequent inference runs.\\n---\"\n", + " )\n", + "else:\n", + " model_name_str = (\n", + " \"\"\n", + " )\n", + " model_name = PipelineParameter(\"model_name\", default_value=model_name_str)\n", + " print(f\"Using trained model name: {model_name_str}\\n---\")\n", + "\n", + "register_model_step = PythonScriptStep(\n", + " script_name=\"register_model.py\",\n", + " name=\"register_model\",\n", + " source_directory=\"scripts\",\n", + " allow_reuse=False,\n", + " arguments=[\n", + " \"--model_name\",\n", + " model_name,\n", + " \"--model_path\",\n", + " model_data,\n", + " \"--ds_name\",\n", + " target_path_train,\n", + " ],\n", + " inputs=[model_data],\n", + " compute_target=compute_target,\n", + " runconfig=conda_run_config,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Model name: {model_name_str}\\n---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.3.3 Build pipeline\n", + "\n", + "Next, we build the pipeline and kick off a run which will select the best TCN model and register it in the workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_pipeline = Pipeline(\n", + " description=\"training_pipeline_tcn\",\n", + " workspace=ws,\n", + " steps=[automl_step, register_model_step],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Is the TCN model trained? {IS_TCN_MODEL_TRAINED}\\n---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if not IS_TCN_MODEL_TRAINED:\n", + " print(\"Training new AutoML model ...\\n---\")\n", + " training_pipelin_run = experiment.submit(training_pipeline)\n", + " training_pipelin_run.wait_for_completion(show_output=False)\n", + " IS_TCN_MODEL_TRAINED = True\n", + "else:\n", + " from azureml.train.automl.run import AutoMLRun\n", + " from azureml.pipeline.core.run import PipelineRun\n", + "\n", + " PIPELINE_RUN_ID = (\n", + " \"\"\n", + " )\n", + " training_pipelin_run = PipelineRun(experiment=experiment, run_id=PIPELINE_RUN_ID)\n", + " print(f\"Using previously trained model. 
Pipeline run ID: {PIPELINE_RUN_ID}\\n---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Train the baseline model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the Naive model as our baseline. To train it, we kick off another AutoML experiment with the following settings. Please note that we disabled DNN (`enable_dnn=False`), added the Naive model to the allowed models list, and set the number of iterations to 1, since we are only interested in training one specific model. To reduce the training time for non-deep learning models, we set the number of cross-validation folds to 2. Read the following [document](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-auto-train-forecast-v1#training-and-validation-data) for more information on this topic. Note that we did not use the cross-validation settings for the TCN model training because such models typically require validation sets that are substantially longer than a small multiple of the forecast horizon.\n", + "\n", + "The only `AutoMLConfig` settings you might consider changing are the `experiment_timeout_hours` and `allowed_models`. You might want to increase the experiment timeout if your data has lots of unique time series. The allowed model list can be modified to reflect a different choice of the baseline model and can be selected from the supported [forecasting models](https://learn.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.constants.supportedmodels.forecasting) and [regression models](https://learn.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.constants.supportedmodels.regression)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_config = AutoMLConfig(\n", + " task=\"forecasting\",\n", + " debug_log=\"baseline.log\",\n", + " primary_metric=\"normalized_root_mean_squared_error\",\n", + " experiment_timeout_hours=1,\n", + " iterations=1,\n", + " training_data=train_dataset,\n", + " label_column_name=target_column_name,\n", + " compute_target=compute_target,\n", + " enable_early_stopping=True,\n", + " n_cross_validations=2,\n", + " verbosity=logging.INFO,\n", + " max_cores_per_iteration=-1,\n", + " enable_dnn=False,\n", + " allowed_models=[\"Naive\"],\n", + " forecasting_parameters=forecasting_parameters,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if not IS_BASE_MODEL_TRAINED:\n", + " remote_base_run = experiment.submit(automl_config, show_output=False)\n", + " remote_base_run.wait_for_completion(show_output=False)\n", + " IS_BASE_MODEL_TRAINED = True\n", + "else:\n", + " BASE_RUN_ID = \"\"\n", + " # during the initial training run, copy-paste the run id to be utilized later if needed.\n", + " remote_base_run = AutoMLRun(experiment=experiment, run_id=BASE_RUN_ID)\n", + " print(f\"Using previously trained model. Run ID: {BASE_RUN_ID}\\n---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Test set inference" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.1 Inference the best TCN model\n", + "\n", + "We create an output folder which will be used to save the output of our experiments."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create an output folder\n", + "OUTPUT_DIR = os.path.join(os.getcwd(), \"forecast_output\")\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.1.1 Get Inference Pipeline Environment\n", + "To trigger an inference pipeline run, we first need a running environment that contains all the appropriate packages for unpickling the model. This environment can either be accessed from the training run or built from the `yml` file that comes with the model. In this section we use the environment from the training run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "from scripts.register_model import get_best_automl_run\n", + "\n", + "best_run = get_best_automl_run(training_pipelin_run)\n", + "inference_env = best_run.get_environment()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "\n", + "run_config = RunConfiguration()\n", + "run_config.environment = inference_env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.1.2 Build and submit the inference pipeline\n", + "\n", + "The inference pipeline will create two different outputs: 1) a tabular dataset that contains the predictions, and 2) an `OutputFileDatasetConfig` that can be used by subsequent pipeline steps. The `inference_script_tcn.py` script performs a rolling evaluation on the test set with a step size of 24 hours. This generates the same results as if the model had been put in production and generated a 24-hour forecast once a day using the most recent data context. This is an efficient way of testing historical model performance at a tiny fraction of the compute and time costs. For more detailed information on rolling evaluation, see this [document](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-auto-train-forecast-v1#evaluating-model-accuracy-with-a-rolling-forecast).\n", + "\n", + "The `PythonScriptStep` uses the following arguments:
    \n", + "
  • `model_name`: the name under which the model was registered in the workspace.
  • \n", + "
  • `ouput_dataset_name`: name of the dataset under which the test set predictions will registered in the datastore
  • \n", + "
  • `test_dataset_name`: name of the test dataset
  • \n", + "
  • `target_column_name`: target column name
  • \n", + "
  • `time_column_name`: time column name
  • \n", + "
  • `output_path`: output data that is obtained from prediction step.
  • \n", + "
  • `run_rolling_evaluation`: Boolean parameter. Set to True to run rolling evaluation. Otherwise, runs a recursive forecast for the entire test set. If the length of the test set is larger than the forecast horizon the model was trained for, the recursive forecast method recursively applies the regular forecaster to generate context so that we can forecast further into the future. For more information, see the \"Recursive forecasting\" section in [this notebook](https://github.com/Azure/azureml-examples/blob/main/v1/python-sdk/tutorials/automl-with-azureml/forecasting-forecast-function/auto-ml-forecasting-function.ipynb).
  • \n", + "
  • `rolling_evaluation_step_size`: Optional parameter that instructs the rolling forecaster how many periods to step forward. See this [document](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-auto-train-forecast-v1#evaluating-model-accuracy-with-a-rolling-forecast) for a more detailed explanation.
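\n", + "\n", + "To build intuition for the `run_rolling_evaluation` flag, the following toy sketch contrasts the two modes with a made-up `naive_forecast` function standing in for the trained model (the actual logic lives in `inference_script_tcn.py`):\n", + "```\n", + "import numpy as np\n", + "\n", + "rng = np.random.default_rng(0)\n", + "series = rng.normal(size=48).cumsum()  # toy hourly series\n", + "train_end, horizon = 24, 6  # made-up values for illustration\n", + "\n", + "def naive_forecast(context, horizon):\n", + "    # made-up stand-in for the trained model: repeat the last observed value\n", + "    return np.repeat(context[-1], horizon)\n", + "\n", + "# rolling evaluation: re-anchor the forecast origin on actuals at every step\n", + "rolling = np.concatenate(\n", + "    [\n", + "        naive_forecast(series[: train_end + t], horizon)\n", + "        for t in range(0, len(series) - train_end, horizon)\n", + "    ]\n", + ")\n", + "\n", + "# recursive forecast: feed predictions back in as context\n", + "context = series[:train_end]\n", + "recursive = []\n", + "for _ in range(0, len(series) - train_end, horizon):\n", + "    pred = naive_forecast(context, horizon)\n", + "    recursive.append(pred)\n", + "    context = np.concatenate([context, pred])\n", + "recursive = np.concatenate(recursive)\n", + "```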
\n", + " \n", + "The only parameters you may consider changing in this sections are the `run_rolling_evaluation` and the `rolling_evaluation_step_size`. The rest should be left as is." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.data import OutputFileDatasetConfig\n", + "\n", + "output_data = OutputFileDatasetConfig(name=\"prediction_result\")\n", + "\n", + "output_ds_name = \"uci-tcn-test-output\"\n", + "\n", + "test_step = PythonScriptStep(\n", + " name=\"infer-results\",\n", + " source_directory=\"scripts\",\n", + " script_name=\"inference_script_tcn.py\",\n", + " arguments=[\n", + " \"--model_name\",\n", + " model_name_str,\n", + " \"--ouput_dataset_name\",\n", + " output_ds_name,\n", + " \"--test_dataset_name\",\n", + " test_dataset.name,\n", + " \"--target_column_name\",\n", + " target_column_name,\n", + " \"--time_column_name\",\n", + " time_column_name,\n", + " \"--output_path\",\n", + " output_data,\n", + " \"--run_rolling_evaluation\",\n", + " True,\n", + " \"--rolling_evaluation_step_size\",\n", + " forecast_horizon,\n", + " ],\n", + " compute_target=compute_target,\n", + " allow_reuse=False,\n", + " runconfig=run_config,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_pipeline = Pipeline(ws, [test_step])\n", + "test_run = experiment.submit(test_pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "test_run.wait_for_completion(show_output=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.1.3 Get the predicted data\n", + "\n", + "We download the test set predictions and actuals, and store them in the `backtest_tcn` object, which in turn is saved to the output folder. We chose the name *backtest* to reflect the fact that the rolling evaluation on the test set is equivalent to monitoring model's performance in real life had it been put in production for the duration of the test set and generated forecast every 24 hours using the most recent data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Dataset\n", + "\n", + "backtest_tcn = Dataset.get_by_name(ws, output_ds_name)\n", + "backtest_tcn = backtest_tcn.to_pandas_dataframe()\n", + "backtest_tcn.to_csv(os.path.join(OUTPUT_DIR, \"predictions-tcn.csv\"), index=False)\n", + "backtest_tcn.tail(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.2 Inference the baseline model\n", + "\n", + "Next, we perform a rolling evaluation on the test set for the baseline model. To do this, we use the `run_remote_inference` method which downloads the pickle file of the model into the temporary folder `forecast_naive` and copies the `inference_script_naive.py` file to it. This folder is then uploaded on the compute cluster where inference is performed. The `inference_script_naive.py` script performs a rolling evaluation on the test set, similarly to what we have done for the TCN model. Upon completion of this step, we delete the newly created `forecast_naive` folder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_experiment_base = Experiment(ws, experiment_name + \"_inference_base\")\n", + "\n", + "baseline_run = remote_base_run.get_best_child()\n", + "baseline_model_name = baseline_run.properties[\"model_name\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import shutil\n", + "from scripts.helper_scripts import run_remote_inference\n", + "\n", + "if True:\n", + " remote_base_run_test = run_remote_inference(\n", + " test_experiment=test_experiment_base,\n", + " compute_target=compute_target,\n", + " train_run=baseline_run,\n", + " test_dataset=test_dataset,\n", + " target_column_name=target_column_name,\n", + " rolling_evaluation_step_size=forecast_horizon,\n", + " inference_folder=\"./forecast_naive\",\n", + " )\n", + " remote_base_run_test.wait_for_completion(show_output=False)\n", + "\n", + " # download the forecast file to the local machine\n", + " remote_base_run_test.download_file(\n", + " \"outputs/predictions.csv\", os.path.join(OUTPUT_DIR, \"predictions-base.csv\")\n", + " )\n", + "\n", + " # delete downloaded scripts\n", + " shutil.rmtree(\"./forecast_naive\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Test set model evaluation\n", + "\n", + "In this section we will evaluate the test set performance for the best TCN model and compare it with the baseline. We will generate time series plots of forecasts and actuals, calculate accuracy metrics and plot the evolution of metrics for each model over time. All output from this section will be stored in the `forecast_output` folder and can be referenced any time you need it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.1 Load test set results\n", + "\n", + "Here, we will import test set results for the TCN and the baseline experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "backtest_tcn = pd.read_csv(\n", + " os.path.join(OUTPUT_DIR, \"predictions-tcn.csv\"), parse_dates=[time_column_name]\n", + ")\n", + "backtest_base = pd.read_csv(\n", + " os.path.join(OUTPUT_DIR, \"predictions-base.csv\"), parse_dates=[time_column_name]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we combine outputs into a single dataframe which will be used for plotting and scoring." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "backtest = backtest_tcn.merge(\n", + " backtest_base.drop(target_column_name, axis=1),\n", + " on=[\"customer_id\", \"datetime\"],\n", + " how=\"inner\",\n", + " suffixes=[\"\", \"_base\"],\n", + ")\n", + "backtest.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\n", + " f\"N model: {backtest_tcn.shape[0]}. N baseline: {backtest_base.shape[0]}. N merged: {backtest.shape[0]}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data we are working with has an hourly frequency, and we plan to generate forecasts every 24 hours. To mimic how the model would be scored in production, where forecasts are generated and performance is monitored every 24 hours, we compute daily accuracy metrics on the test set. 
To do this, we create a date column (\"ymd\"). If you want to score the output at any other frequency, say, weekly, just change the frequency parameter to the desired frequency." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PERIOD_COLUMN = \"ymd\"\n", + "\n", + "backtest[PERIOD_COLUMN] = backtest[time_column_name].dt.to_period(\n", + " \"D\"\n", + ") # year-month-day to be used for daily metrics computation\n", + "backtest.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.2 Generate time series plots\n", + "\n", + "Here, we generate forecast versus actuals plots on the test set for both the best TCN model and the baseline. Since we use rolling evaluation with a step size of 24 hours, this mimics the behavior of putting both models in production and monitoring them for the duration of the test set. This step allows users to make informed decisions about model performance and avoids many of the costs associated with putting the model in production and monitoring its performance in real life. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.helper_scripts import _draw_one_plot\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib.backends.backend_pdf import PdfPages\n", + "\n", + "plot_filename = \"forecast_vs_actual.pdf\"\n", + "\n", + "pdf = PdfPages(os.path.join(os.getcwd(), OUTPUT_DIR, plot_filename))\n", + "for _, one_forecast in backtest.groupby(GRAIN_COL):\n", + " one_forecast[time_column_name] = pd.to_datetime(one_forecast[time_column_name])\n", + " one_forecast.sort_values(time_column_name, inplace=True)\n", + " _draw_one_plot(\n", + " one_forecast,\n", + " time_column_name,\n", + " target_column_name,\n", + " [GRAIN_COL],\n", + " [target_column_name, \"predicted\", \"predicted_base\"],\n", + " pdf,\n", + " plot_predictions=True,\n", + " )\n", + "pdf.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.3 Calculate metrics\n", + "Here, we will calculate the metric of interest for each day. For illustration purposes we use root mean squared error (RMSE) as the metric of choice; however, the `compute_all_metrics` method calculates all primary and secondary metrics for AutoML runs. Please refer to the *Regression/forecasting metrics* section in this [document](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#regressionforecasting-metrics) for the list of available metrics. We will calculate the distribution of this metric for each time series in our dataset. Looking at the descriptive stats of such metrics can be more informative than calculating a single metric, such as the mean, for each time series. As an example, we look at the RMSE metric, but you can choose any other computed metric.\n",
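+ "\n", + "For reference, the daily RMSE reported by the helper can also be computed directly with pandas. This is a sketch, assuming the merged `backtest` frame built above and the `predicted` column produced by the inference step:\n", + "```\n", + "sq_err = (backtest[target_column_name] - backtest[\"predicted\"]) ** 2\n", + "rmse_per_day = sq_err.groupby([backtest[GRAIN_COL], backtest[PERIOD_COLUMN]]).mean().pow(0.5)\n", + "rmse_per_day.groupby(level=0).describe()\n", + "```"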
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scripts.helper_scripts import compute_all_metrics\n",
+    "\n",
+    "DESIRED_METRIC_NAME = \"root_mean_squared_error\"\n",
+    "\n",
+    "metrics_per_grain_day = compute_all_metrics(\n",
+    "    fcst_df=backtest,\n",
+    "    actual_col=target_column_name,\n",
+    "    fcst_col=\"predicted\",\n",
+    "    ts_id_colnames=[GRAIN_COL, PERIOD_COLUMN],\n",
+    ")\n",
+    "metrics_per_grain_day = metrics_per_grain_day.query(\n",
+    "    f'metric_name == \"{DESIRED_METRIC_NAME}\"'\n",
+    ")\n",
+    "metrics_per_grain_day[[GRAIN_COL, PERIOD_COLUMN]] = metrics_per_grain_day[\n",
+    "    \"time_series_id\"\n",
+    "].str.split(\"|\", n=1, expand=True)\n",
+    "metrics_per_grain_day.to_csv(\n",
+    "    os.path.join(OUTPUT_DIR, \"metrics-automl.csv\"), index=False\n",
+    ")\n",
+    "metrics_per_grain_day.groupby(GRAIN_COL)[\"metric\"].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Uncomment the following line to see all metrics we computed\n",
+    "# print(f'List of available metrics: {metrics_per_grain_day[\"metric_name\"].unique()}\\n---')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# baseline metrics\n",
+    "metrics_per_grain_day_base = compute_all_metrics(\n",
+    "    fcst_df=backtest,\n",
+    "    actual_col=target_column_name,\n",
+    "    fcst_col=\"predicted_base\",\n",
+    "    ts_id_colnames=[GRAIN_COL, PERIOD_COLUMN],\n",
+    ")\n",
+    "metrics_per_grain_day_base = metrics_per_grain_day_base.query(\n",
+    "    f'metric_name == \"{DESIRED_METRIC_NAME}\"'\n",
+    ")\n",
+    "metrics_per_grain_day_base[[GRAIN_COL, PERIOD_COLUMN]] = metrics_per_grain_day_base[\n",
+    "    \"time_series_id\"\n",
+    "].str.split(\"|\", n=1, expand=True)\n",
+    "metrics_per_grain_day_base.to_csv(\n",
+    "    os.path.join(OUTPUT_DIR, \"metrics-base.csv\"), index=False\n",
+    ")\n",
+    "metrics_per_grain_day_base.groupby(GRAIN_COL)[\"metric\"].describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7.4 Visualize metrics\n",
+    "\n",
+    "In this section we plot the metric evolution over time for the TCN and the baseline models.\n",
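+    "\n",
+    "Beyond the plots, a quick aggregate comparison can be useful. A minimal sketch using the merged `metrics_df` built in the next cell (the `metric` and `metric_base` columns come from the merge suffixes):\n",
+    "\n",
+    "```python\n",
+    "# Average daily RMSE per time series for the TCN vs. the naive baseline\n",
+    "metrics_df.groupby(GRAIN_COL)[[\"metric\", \"metric_base\"]].mean()\n",
+    "```"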
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metrics_df = metrics_per_grain_day.drop(\"time_series_id\", axis=1).merge(\n",
+    "    metrics_per_grain_day_base.drop(\"time_series_id\", axis=1),\n",
+    "    on=[\"metric_name\", GRAIN_COL, PERIOD_COLUMN],\n",
+    "    how=\"inner\",\n",
+    "    suffixes=[\"\", \"_base\"],\n",
+    ")\n",
+    "metrics_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grain = [GRAIN_COL]\n",
+    "plot_filename = \"metrics_plot.pdf\"\n",
+    "\n",
+    "pdf = PdfPages(os.path.join(os.getcwd(), OUTPUT_DIR, plot_filename))\n",
+    "for _, one_forecast in metrics_df.groupby(grain):\n",
+    "    one_forecast[PERIOD_COLUMN] = pd.to_datetime(one_forecast[PERIOD_COLUMN])\n",
+    "    one_forecast.sort_values(PERIOD_COLUMN, inplace=True)\n",
+    "    _draw_one_plot(\n",
+    "        one_forecast,\n",
+    "        PERIOD_COLUMN,\n",
+    "        target_column_name,\n",
+    "        grain,\n",
+    "        [\"metric\", \"metric_base\"],\n",
+    "        pdf,\n",
+    "        plot_predictions=True,\n",
+    "    )\n",
+    "pdf.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import IFrame\n",
+    "\n",
+    "IFrame(os.path.join(\"./forecast_output/metrics_plot.pdf\"), width=800, height=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Inference TCN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this step, we generate an actual forecast by providing an inference set that does not contain actual target values; this illustrates how production forecasts are generated in real life. The code in this section is nearly identical to that in section 6.1, with one exception: we set the `run_rolling_evaluation` argument to `False`.\n",
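+    "\n",
+    "Under the hood, the inference script switches between two methods of the fitted model. A rough sketch of the distinction, with method names as used in `scripts/inference_script_tcn.py` (`fitted_model` here stands for the loaded model object):\n",
+    "\n",
+    "```python\n",
+    "# Rolling evaluation: step through the test set, refreshing the context every `step` rows\n",
+    "df_all = fitted_model.rolling_forecast(X_test, y_test, step=24, ignore_data_errors=True)\n",
+    "\n",
+    "# Plain forecast: predict the horizon once, using any known target values as context\n",
+    "y_pred, df_all = fitted_model.forecast(X_test, y_test)\n",
+    "```"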
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 8.1 Build and submit the inference pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from azureml.core.runconfig import RunConfiguration\n",
+    "\n",
+    "run_config = RunConfiguration()\n",
+    "run_config.environment = inference_env"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_data = OutputFileDatasetConfig(name=\"prediction_result\")\n",
+    "\n",
+    "output_ds_name = \"uci-tcn-inference-output\"\n",
+    "\n",
+    "inference_step = PythonScriptStep(\n",
+    "    name=\"infer-results\",\n",
+    "    source_directory=\"scripts\",\n",
+    "    script_name=\"inference_script_tcn.py\",\n",
+    "    arguments=[\n",
+    "        \"--model_name\",\n",
+    "        model_name_str,\n",
+    "        \"--ouput_dataset_name\",\n",
+    "        output_ds_name,\n",
+    "        \"--test_dataset_name\",\n",
+    "        inference_dataset.name,\n",
+    "        \"--target_column_name\",\n",
+    "        target_column_name,\n",
+    "        \"--time_column_name\",\n",
+    "        time_column_name,\n",
+    "        \"--output_path\",\n",
+    "        output_data,\n",
+    "        \"--run_rolling_evaluation\",\n",
+    "        False,\n",
+    "    ],\n",
+    "    compute_target=compute_target,\n",
+    "    allow_reuse=False,\n",
+    "    runconfig=run_config,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "inference_pipeline = Pipeline(ws, [inference_step])\n",
+    "inference_run = experiment.submit(inference_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "inference_run.wait_for_completion(show_output=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 8.2 Get the predicted data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azureml.core import Dataset\n",
+    "\n",
+    "inference_ds = Dataset.get_by_name(ws, output_ds_name)\n",
+    "inference_df = inference_ds.to_pandas_dataframe()\n",
+    "inference_df.tail(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Schedule Pipeline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This section shows how to schedule a pipeline to generate predictions periodically. For more information about pipeline schedules and pipeline endpoints, please see this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-schedule-for-a-published-pipeline.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inference_published_pipeline = inference_pipeline.publish(\n",
+    "    name=\"UCI Inference\", description=\"UCI Inference\"\n",
+    ")\n",
+    "print(\"Newly published pipeline id: {}\".format(inference_published_pipeline.id))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If `inference_dataset` is going to refresh every 24 hours and we want to predict every 24 hours (the forecast horizon), we can schedule our pipeline to run every day at 11 pm to get daily inference results. You can refresh your test dataset (a newer version will be created) periodically when new data is available (i.e. 
the target column in the test dataset has values at the beginning, serving as context data, followed by NaNs for the periods to be predicted). The inference pipeline will pick up the context to further improve forecast accuracy. See the *Forecasting away from training data* section in this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule\n",
+    "\n",
+    "recurrence = ScheduleRecurrence(\n",
+    "    frequency=\"Day\", interval=1, hours=[23], minutes=[0]  # Runs every day at 11:00 pm\n",
+    ")\n",
+    "\n",
+    "schedule = Schedule.create(\n",
+    "    workspace=ws,\n",
+    "    name=\"tcn_inference_schedule\",\n",
+    "    pipeline_id=inference_published_pipeline.id,\n",
+    "    experiment_name=\"schedule-run-tcn-uci-electro\",\n",
+    "    recurrence=recurrence,\n",
+    "    wait_for_provisioning=True,\n",
+    "    description=\"Schedule Run\",\n",
+    ")\n",
+    "\n",
+    "# You may want to make sure that the schedule is provisioned properly\n",
+    "# before making any further changes to it\n",
+    "\n",
+    "print(\"Created schedule with id: {}\\n---\".format(schedule.id))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 9.1 [Optional] Disable schedule"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schedule.disable()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "authors": [
+   {
+    "name": "jialiu"
+   }
+  ],
+  "category": "tutorial",
+  "celltoolbar": "Raw Cell Format",
+  "compute": [
+   "Remote"
+  ],
+  "datasets": [
+   "Orange Juice Sales"
+  ],
+  "deployment": [
+   "Azure Container Instance"
+  ],
+  "exclude_from_index": false,
+  "framework": [
+   "Azure ML AutoML"
+  ],
+  "friendly_name": "Forecasting orange juice sales with deployment",
+  "index_order": 1,
+  "kernelspec": {
+   "display_name": "Python 3.8 - AzureML",
+   "language": "python",
+   "name": "python38-azureml"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "tags": [
+   "None"
+  ],
+  "task": "Forecasting",
+  "vscode": {
+   "interpreter": {
+    "hash": "6bd77c88278e012ef31757c15997a7bea8c943977c43d6909403c00ae11d43ca"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/helper_scripts.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/helper_scripts.py
new file mode 100644
index 0000000000..8b80055820
--- /dev/null
+++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/helper_scripts.py
@@ -0,0 +1,233 @@
+from typing import Any, Dict, Optional, List
+
+import os
+import re
+import shutil
+
+import pandas as pd
+
+from matplotlib import pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+
+from azureml.core import ScriptRunConfig
+from azureml.automl.core.shared import constants
+from azureml.automl.core.shared.types import GrainType
+from azureml.automl.runtime.shared.score 
import scoring
+
+GRAIN = "time_series_id"
+BACKTEST_ITER = "backtest_iteration"
+ACTUALS = "actual_level"
+PREDICTIONS = "predicted_level"
+ALL_GRAINS = "all_sets"
+
+FORECASTS_FILE = "forecast.csv"
+SCORES_FILE = "scores.csv"
+SCORES_FILE_GRAIN = "scores_per_grain.csv"
+PLOTS_FILE = "plots_fcst_vs_actual.pdf"
+PLOTS_FILE_GRAIN = "plots_fcst_vs_actual_per_grain.pdf"
+RE_INVALID_SYMBOLS = re.compile("[: ]")
+
+
+def _compute_metrics(
+    df: pd.DataFrame, metrics: List[str], actual_col: str, fcst_col: str
+):
+    """
+    Compute metrics for one data frame.
+
+    :param df: The data frame which contains the actual and forecast columns.
+    :param metrics: The list of metric names to compute.
+    :param actual_col: The name of the column with actual values.
+    :param fcst_col: The name of the column with forecast values.
+    :return: The data frame with two columns - metric_name and metric.
+    """
+    scores = scoring.score_regression(
+        y_test=df[actual_col], y_pred=df[fcst_col], metrics=metrics
+    )
+    metrics_df = pd.DataFrame(list(scores.items()), columns=["metric_name", "metric"])
+    metrics_df.sort_values(["metric_name"], inplace=True)
+    metrics_df.reset_index(drop=True, inplace=True)
+    return metrics_df
+
+
+def _format_grain_name(grain: GrainType) -> str:
+    """
+    Convert a grain name to a string.
+
+    :param grain: the grain name.
+    :return: the string representation of the given grain.
+    """
+    if not isinstance(grain, tuple) and not isinstance(grain, list):
+        return str(grain)
+    grain = list(map(str, grain))
+    return "|".join(grain)
+
+
+def compute_all_metrics(
+    fcst_df: pd.DataFrame,
+    actual_col: str,
+    fcst_col: str,
+    ts_id_colnames: List[str],
+    metric_names: Optional[List[str]] = None,
+):
+    """
+    Calculate metrics per grain.
+
+    :param fcst_df: The forecast data frame; must contain the actual and forecast columns.
+    :param actual_col: The name of the column with actual values.
+    :param fcst_col: The name of the column with forecast values.
+    :param ts_id_colnames: The list of grain column names; may be empty or None.
+    :param metric_names: (optional) the list of metric names to return.
+    :return: a data frame of metrics per grain, plus an overall "all_sets" entry.
+    """
+    if not metric_names:
+        metric_names = list(constants.Metric.SCALAR_REGRESSION_SET)
+
+    if ts_id_colnames is None:
+        ts_id_colnames = []
+
+    metrics_list = []
+    if ts_id_colnames:
+        for grain, df in fcst_df.groupby(ts_id_colnames):
+            one_grain_metrics_df = _compute_metrics(
+                df, metric_names, actual_col, fcst_col
+            )
+            one_grain_metrics_df[GRAIN] = _format_grain_name(grain)
+            metrics_list.append(one_grain_metrics_df)
+
+    # overall metrics
+    one_grain_metrics_df = _compute_metrics(fcst_df, metric_names, actual_col, fcst_col)
+    one_grain_metrics_df[GRAIN] = ALL_GRAINS
+    metrics_list.append(one_grain_metrics_df)
+
+    # collect into a data frame
+    return pd.concat(metrics_list)
+
+
+def _draw_one_plot(
+    df: pd.DataFrame,
+    time_column_name: str,
+    target_column_name: str,
+    grain_column_names: List[str],
+    columns_to_plot: List[str],
+    pdf: PdfPages,
+    plot_predictions=False,
+) -> None:
+    """
+    Draw a single plot.
+
+    :param df: The data frame with the data to build the plot.
+    :param time_column_name: The name of the time column.
+    :param grain_column_names: The names of the grain columns.
+    :param pdf: The pdf backend used to render the plot.
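+    :param target_column_name: The name of the target column (not used by the plotting logic itself).
+    :param columns_to_plot: The list of columns to plot against the shared time axis.
+    :param plot_predictions: Kept for API compatibility; the columns in columns_to_plot are always plotted.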
+ """ + if isinstance(grain_column_names, str): + grain_column_names = [grain_column_names] + fig, _ = plt.subplots(figsize=(20, 10)) + df = df.set_index(time_column_name) + plt.plot(df[columns_to_plot]) + plt.xticks(rotation=45) + if grain_column_names: + grain_name = [df[grain].iloc[0] for grain in grain_column_names] + plt.title(f"Time series ID: {_format_grain_name(grain_name)}") + plt.legend(columns_to_plot) + plt.close(fig) + pdf.savefig(fig) + + +def calculate_scores_and_build_plots( + input_dir: str, output_dir: str, automl_settings: Dict[str, Any] +): + os.makedirs(output_dir, exist_ok=True) + grains = automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES) + time_column_name = automl_settings.get(constants.TimeSeries.TIME_COLUMN_NAME) + if grains is None: + grains = [] + if isinstance(grains, str): + grains = [grains] + while BACKTEST_ITER in grains: + grains.remove(BACKTEST_ITER) + + dfs = [] + for fle in os.listdir(input_dir): + file_path = os.path.join(input_dir, fle) + if os.path.isfile(file_path) and file_path.endswith(".csv"): + df_iter = pd.read_csv(file_path, parse_dates=[time_column_name]) + for _, iteration in df_iter.groupby(BACKTEST_ITER): + dfs.append(iteration) + forecast_df = pd.concat(dfs, sort=False, ignore_index=True) + # == Per grain-iteration analysis + # To make sure plots are in order, sort the predictions by grain and iteration. + ts_index = grains + [BACKTEST_ITER] + forecast_df.sort_values(by=ts_index, inplace=True) + pdf = PdfPages(os.path.join(output_dir, PLOTS_FILE)) + for _, one_forecast in forecast_df.groupby(ts_index): + _draw_one_plot(one_forecast, time_column_name, grains, pdf) + pdf.close() + forecast_df.to_csv(os.path.join(output_dir, FORECASTS_FILE), index=False) + metrics = compute_all_metrics(forecast_df, grains + [BACKTEST_ITER]) + metrics.to_csv(os.path.join(output_dir, SCORES_FILE), index=False) + + # == Per grain analysis + pdf = PdfPages(os.path.join(output_dir, PLOTS_FILE_GRAIN)) + for _, one_forecast in forecast_df.groupby(grains): + _draw_one_plot(one_forecast, time_column_name, grains, pdf) + pdf.close() + metrics = compute_all_metrics(forecast_df, grains) + metrics.to_csv(os.path.join(output_dir, SCORES_FILE_GRAIN), index=False) + + +def run_remote_inference( + test_experiment, + compute_target, + train_run, + test_dataset, + target_column_name, + rolling_evaluation_step_size=1, + inference_folder="./forecast", +): + # Create local directory to copy the model.pkl and inference_script_naive.py files into. + # These files will be uploaded to and executed on the compute instance. 
+    os.makedirs(inference_folder, exist_ok=True)
+    shutil.copy("scripts/inference_script_naive.py", inference_folder)
+
+    # Find the extension of the model file (.pkl or .pt)
+    ls = train_run.get_file_names()  # list artifacts
+    regex = re.compile("outputs/model[.](pt|pkl)")
+    model_path = None
+    for v in ls:
+        matcher = regex.match(v)
+        if matcher:
+            model_path = matcher[0]
+            break
+    model_name = os.path.split(model_path)[-1]
+
+    train_run.download_file(model_path, os.path.join(inference_folder, model_name))
+
+    inference_env = train_run.get_environment()
+    print("Finished getting training environment ...\n---")
+
+    config = ScriptRunConfig(
+        source_directory=inference_folder,
+        script="inference_script_naive.py",
+        arguments=[
+            "--target_column_name",
+            target_column_name,
+            "--test_dataset",
+            test_dataset.as_named_input(test_dataset.name),
+            "--rolling_evaluation_step_size",
+            rolling_evaluation_step_size,
+        ],
+        compute_target=compute_target,
+        environment=inference_env,
+    )
+
+    print("Submitting experiment ...\n---")
+    run = test_experiment.submit(
+        config,
+        tags={
+            "training_run_id": train_run.id,
+            "run_algorithm": train_run.properties["run_algorithm"],
+            "valid_score": train_run.properties["score"],
+            "primary_metric": train_run.properties["primary_metric"],
+        },
+    )
+
+    run.log("run_algorithm", run.tags["run_algorithm"])
+    return run
diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/inference_script_naive.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/inference_script_naive.py
new file mode 100644
index 0000000000..fb30a77195
--- /dev/null
+++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/inference_script_naive.py
@@ -0,0 +1,123 @@
+"""
+This is the script that is executed on the remote compute. It relies on the
+model file (model.pkl or model.pt) which is uploaded along with this script.
+"""
+
+import os
+import argparse
+
+import numpy as np
+
+from azureml.core import Dataset, Run
+from sklearn.externals import joblib
+
+try:
+    import torch
+
+    _torch_present = True
+except ImportError:
+    _torch_present = False
+
+
+def map_location_cuda(storage, loc):
+    return storage.cuda()
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--target_column_name",
+        type=str,
+        dest="target_column_name",
+        help="Target Column Name",
+    )
+    parser.add_argument(
+        "--test_dataset", type=str, dest="test_dataset", help="Test Dataset"
+    )
+    parser.add_argument(
+        "--rolling_evaluation_step_size",
+        type=int,
+        default=1,
+        dest="rolling_evaluation_step_size",
+        help="Rolling evaluation step size (optional).",
+    )
+
+    args, unknown = parser.parse_known_args()
+
+    return args
+
+
+def load_model():
+    list_artifacts = os.listdir(".")
+    print("All artifacts ...\n---")
+    print(list_artifacts)
+    print("---")
+
+    if "model.pt" in list_artifacts:
+        assert _torch_present, "Loading DNN models requires torch to be present."
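+        # Map the serialized tensors onto the GPU when one is available; otherwise load on CPU.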
+        if torch.cuda.is_available():
+            map_location = map_location_cuda
+        else:
+            map_location = "cpu"
+        with open("model.pt", "rb") as fh:
+            fitted_model = torch.load(fh, map_location=map_location)
+    else:
+        fitted_model = joblib.load("model.pkl")
+    return fitted_model
+
+
+def get_data(run, test_dataset_id):
+    ws = run.experiment.workspace
+
+    # get the input dataset by id
+    test_dataset = Dataset.get_by_id(ws, id=test_dataset_id)
+
+    test_df = test_dataset.to_pandas_dataframe().reset_index(drop=True)
+    return test_df
+
+
+if __name__ == "__main__":
+    run = Run.get_context()
+    args = get_args()
+    target_column_name = args.target_column_name
+    test_dataset_id = args.test_dataset
+    rolling_evaluation_step_size = args.rolling_evaluation_step_size
+    predicted_column_name = "predicted"
+
+    print(f"Target column name: {target_column_name}\n---")
+    print(f"Test dataset: {test_dataset_id}\n---")
+    print(f"Rolling evaluation step size: {rolling_evaluation_step_size}\n---")
+
+    # Load model
+    fitted_model = load_model()
+
+    # Get data
+    test_df = get_data(run, test_dataset_id)
+
+    if target_column_name in test_df:
+        y_test = test_df.pop(target_column_name).values
+        print(
+            "Target column is present in the test dataset ...\n---\nFirst few rows of the test dataset after removing target column ...\n---"
+        )
+        print(test_df.head())
+        print("---")
+    else:
+        y_test = np.full(test_df.shape[0], np.nan)
+
+    print("Rolling evaluation ...\n---")
+    df_all = fitted_model.rolling_forecast(
+        test_df, y_test, step=rolling_evaluation_step_size, ignore_data_errors=True
+    )
+
+    assign_dict = {
+        fitted_model.forecast_origin_column_name: "forecast_origin",
+        fitted_model.forecast_column_name: "predicted",
+        fitted_model.actual_column_name: target_column_name,
+    }
+    df_all.rename(columns=assign_dict, inplace=True)
+
+    # Make sure the outputs folder exists, then write the predictions locally.
+    os.makedirs("outputs", exist_ok=True)
+    file_name = "outputs/predictions.csv"
+    df_all.to_csv(file_name, header=True, index=False)
+
+    # Upload the predictions into the run artifacts
+    run.upload_file(name=file_name, path_or_stream=file_name)
diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/inference_script_tcn.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/inference_script_tcn.py
new file mode 100644
index 0000000000..42d8a6c21d
--- /dev/null
+++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/inference_script_tcn.py
@@ -0,0 +1,293 @@
+"""
+This is the script that is executed on the remote compute. It can execute the forecast
+and rolling_forecast methods. A set of required parameters determines which
+type of forecast will be executed.
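+
+Expected arguments include --model_name, --test_dataset_name, --target_column_name,
+--time_column_name, --output_path, --ouput_dataset_name, --run_rolling_evaluation
+and --rolling_evaluation_step_size; see get_args() below.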
+""" + +import argparse +from datetime import datetime +import os +import uuid +import numpy as np +import pandas as pd + +from pandas.tseries.frequencies import to_offset +from sklearn.externals import joblib +from sklearn.metrics import mean_absolute_error, mean_squared_error + +from azureml.data.dataset_factory import TabularDatasetFactory +from azureml.automl.runtime.shared.score import scoring, constants as metrics_constants +import azureml.automl.core.shared.constants as constants +from azureml.core import Run, Dataset, Model + +try: + import torch + + _torch_present = True +except ImportError: + _torch_present = False + + +def map_location_cuda(storage, loc): + return storage.cuda() + + +def get_model(model_path, model_file_name): + # _, ext = os.path.splitext(model_path) + # Here we check if the file name is included in the model path + if not model_path.endswith(model_file_name): + model_full_path = os.path.join(model_path, model_file_name) + else: + model_full_path = model_path + print(f"(Model full path: {model_full_path}\n---") + + if model_file_name.endswith("pt"): + # Load the fc-tcn torch model. + assert _torch_present, "Loading DNN models needs torch to be presented." + if torch.cuda.is_available(): + map_location = map_location_cuda + else: + map_location = "cpu" + with open(model_full_path, "rb") as fh: + fitted_model = torch.load(fh, map_location=map_location) + else: + # Load the sklearn pipeline. + fitted_model = joblib.load(model_full_path) + return fitted_model + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", type=str, dest="model_name", help="Model to be loaded" + ) + parser.add_argument( + "--ouput_dataset_name", + type=str, + dest="ouput_dataset_name", + default="results", + help="Dataset name of the final output", + ) + parser.add_argument( + "--target_column_name", + type=str, + dest="target_column_name", + help="The target column name.", + ) + parser.add_argument( + "--time_column_name", + type=str, + dest="time_column_name", + help="The time column name.", + ) + parser.add_argument( + "--test_dataset_name", + type=str, + dest="test_dataset_name", + default="results", + help="Dataset name of the final output", + ) + parser.add_argument( + "--output_path", + type=str, + dest="output_path", + default="results", + help="The output path", + ) + parser.add_argument( + "--run_rolling_evaluation", + type=bool, + default=False, + dest="run_rolling_evaluation", + help="Run rolling evaluation?", + ) + parser.add_argument( + "--rolling_evaluation_step_size", + type=int, + default=1, + dest="rolling_evaluation_step_size", + help="Rolling forecast step size (optional).", + ) + # args = parser.parse_args() + args, unknown = parser.parse_known_args() + + return args + + +def get_data(run, fitted_model, target_column_name, test_dataset_name): + # get input dataset by name + test_dataset = Dataset.get_by_name(run.experiment.workspace, test_dataset_name) + test_df = test_dataset.to_pandas_dataframe() + if target_column_name in test_df: + y_test = test_df.pop(target_column_name).values + print( + "Target column is present in the test dataset ...\n---\nFirst few rows of the test dataset after remving target column ...\n---" + ) + print(test_df.head()) + print("---") + else: + y_test = np.full(test_df.shape[0], np.nan) + + return test_df, y_test + + +def get_model_filename(run, model_name, model_path): + model = Model(run.experiment.workspace, model_name) + if "model_file_name" in model.tags: + return model.tags["model_file_name"] + 
+    is_pkl = True
+    if model.tags.get("algorithm") == "TCNForecaster" or os.path.exists(
+        os.path.join(model_path, "model.pt")
+    ):
+        is_pkl = False
+    return "model.pkl" if is_pkl else "model.pt"
+
+
+def infer_forecasting_dataset_tcn(
+    X_test,
+    y_test,
+    model,
+    output_path,
+    target_column_name,
+    time_column_name,
+    run_rolling_evaluation=False,
+    rolling_evaluation_step_size=1,
+    output_dataset_name="results",
+):
+    """
+    Run inference on the test set. If the target column is present and does
+    not contain NaNs in the latest time period, we drop the target, generate the forecast,
+    and append the actuals to the forecast data frame. Otherwise, we assume the target column
+    contains the context and is an input to the forecast or rolling_forecast method.
+    """
+    # If the target contains any NaNs for the most recent observations, run with y_test.
+    # Otherwise, omit y_test from the forecast() call.
+    last_obs_index = X_test[
+        X_test[time_column_name] == X_test[time_column_name].max()
+    ].index
+
+    if run_rolling_evaluation:
+        df_all = _rolling_evaluation_tcn(
+            X_test,
+            y_test,
+            model,
+            target_column_name,
+            time_column_name,
+            rolling_evaluation_step_size,
+        )
+    elif np.isnan(y_test[last_obs_index]).any():
+        print("Generating recursive forecast ...\n---")
+        y_pred, df_all = model.forecast(X_test, y_test)
+    else:
+        print("Generating recursive forecast ...\n---")
+        _, df_all = model.forecast(X_test)
+        df_all[target_column_name] = y_test
+
+    run = Run.get_context()
+
+    registered_train = TabularDatasetFactory.register_pandas_dataframe(
+        df_all,
+        target=(
+            run.experiment.workspace.get_default_datastore(),
+            datetime.now().strftime("%Y-%m-%d-") + str(uuid.uuid4())[:6],
+        ),
+        name=output_dataset_name,
+    )
+    df_all.to_csv(os.path.join(output_path, output_dataset_name + ".csv"), index=False)
+
+
+def _rolling_evaluation_tcn(
+    X_test,
+    y_test,
+    model,
+    target_column_name,
+    time_column_name,
+    rolling_evaluation_step_size,
+):
+    # If the target contains any NaNs for the most recent observations, run the
+    # rolling forecast with ignore_data_errors=True; otherwise run it strictly.
+    last_obs_index = X_test[
+        X_test[time_column_name] == X_test[time_column_name].max()
+    ].index
+    last_obs_nans = np.isnan(
+        y_test[last_obs_index]
+    ).any()  # do the last time stamp observations contain NaNs?
+    y_all_nans = np.isnan(y_test).all()  # all NaNs?
+
+    if y_all_nans:
+        print(
+            "Rolling evaluation is desired, yet the target column does not contain "
+            "any values for this operation to be performed.\nGenerating recursive forecast instead ...\n---"
+        )
+        y_pred, df_all = model.forecast(X_test)
+    elif last_obs_nans and not y_all_nans:
+        print(
+            "Rolling evaluation. Test set target column contains NaNs. 
Setting ignore_data_errors=True ...\n---" + ) + df_all = model.rolling_forecast( + X_test, y_test, step=rolling_evaluation_step_size, ignore_data_errors=True + ) + else: + print("Rolling evaluation ...\n---") + df_all = model.rolling_forecast( + X_test, y_test, step=rolling_evaluation_step_size, ignore_data_errors=False + ) + # df_all[target_column_name] = y_test + + # for non-recursive forecasts change columns names + if not y_all_nans: + assign_dict = { + model.forecast_origin_column_name: "forecast_origin", + model.forecast_column_name: "predicted", + model.actual_column_name: target_column_name, + } + df_all.rename(columns=assign_dict, inplace=True) + return df_all + + +if __name__ == "__main__": + run = Run.get_context() + args = get_args() + model_name = args.model_name + ouput_dataset_name = args.ouput_dataset_name + test_dataset_name = args.test_dataset_name + target_column_name = args.target_column_name + time_column_name = args.time_column_name + run_rolling_evaluation = args.run_rolling_evaluation + if args.rolling_evaluation_step_size is not None: + rolling_evaluation_step_size = args.rolling_evaluation_step_size + else: + rolling_evaluation_step_size = 1 + + print("args passed are: ") + + print(f"Model name: {model_name}\n---") + print(f"Test dataset name: {test_dataset_name}\n---") + print(f"Output dataset name: {ouput_dataset_name}\n---") + print(f"Target column name: {target_column_name}\n---") + print(f"Time column name: {time_column_name}\n---") + print(f"Rolling evaluation?: {run_rolling_evaluation}\n---") + if run_rolling_evaluation: + print(f"Rolling evaluation step size: {rolling_evaluation_step_size}\n---") + + model_path = Model.get_model_path(model_name) + model_file_name = get_model_filename(run, model_name, model_path) + + print(f"Model path: {model_path}\n---") + print(f"Model file name: {model_file_name}\n---") + + fitted_model = get_model(model_path, model_file_name) + + X_test_df, y_test = get_data( + run, fitted_model, target_column_name, test_dataset_name + ) + + infer_forecasting_dataset_tcn( + X_test_df, + y_test, + fitted_model, + args.output_path, + target_column_name, + time_column_name, + run_rolling_evaluation, + rolling_evaluation_step_size, + ouput_dataset_name, + ) diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/register_model.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/register_model.py new file mode 100644 index 0000000000..6f3089c2dd --- /dev/null +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-demand-forecasting-tcn/scripts/register_model.py @@ -0,0 +1,64 @@ +import argparse +import os +import uuid +import shutil +from azureml.core.model import Model, Dataset +from azureml.core.run import Run, _OfflineRun +from azureml.core import Workspace +import azureml.automl.core.shared.constants as constants +from azureml.train.automl.run import AutoMLRun + + +def get_best_automl_run(pipeline_run): + all_children = [c for c in pipeline_run.get_children()] + automl_step = [ + c for c in all_children if c.properties.get("runTemplate") == "AutoML" + ] + for c in all_children: + print(c, c.properties) + automlrun = AutoMLRun(pipeline_run.experiment, automl_step[0].id) + best = automlrun.get_best_child() + return best + + +def get_model_path(model_artifact_path): + return model_artifact_path.split("/")[1] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_name") + parser.add_argument("--model_path") 
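+    # Name of the registered training dataset; it is linked to the model at registration time.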
+ parser.add_argument("--ds_name") + args = parser.parse_args() + + print("Argument 1(model_name): %s" % args.model_name) + print("Argument 2(model_path): %s" % args.model_path) + print("Argument 3(ds_name): %s" % args.ds_name) + + run = Run.get_context() + ws = None + if type(run) == _OfflineRun: + ws = Workspace.from_config() + else: + ws = run.experiment.workspace + + train_ds = Dataset.get_by_name(ws, args.ds_name) + datasets = [(Dataset.Scenario.TRAINING, train_ds)] + new_dir = str(uuid.uuid4()) + os.makedirs(new_dir) + + # Register model with training dataset + best_run = get_best_automl_run(run.parent) + model_artifact_path = best_run.properties[constants.PROPERTY_KEY_OF_MODEL_PATH] + algo = best_run.properties.get("run_algorithm") + model_artifact_dir = model_artifact_path.split("/")[0] + model_file_name = model_artifact_path.split("/")[1] + model = best_run.register_model( + args.model_name, + model_path=model_artifact_dir, + datasets=datasets, + tags={"algorithm": algo, "model_file_name": model_file_name}, + ) + + print("Registered version {0} of model {1}".format(model.version, model.name))