Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/NotebookVM/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.png)

# Automated Machine Learning
**Sales Forecasting**



## Setup


In [1]:
import azureml.core
import pandas as pd
import numpy as np
import logging
import warnings

from pandas.tseries.frequencies import to_offset

# Squash warning messages for cleaner output in the notebook
warnings.showwarning = lambda *args, **kwargs: None

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
ws = Workspace.from_config()

# choose a name for the run history container in the workspace
experiment_name = 'automl-saleforecasting'

experiment = Experiment(ws, experiment_name)

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Run History Name'] = experiment_name
pd.set_option('display.max_colwidth', -1)
outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Unnamed: 0,Unnamed: 1
SDK version,1.0.72
Subscription ID,d6697b5a-c497-42c0-9d09-749855c5c4bd
Workspace,capstone-ml
Resource Group,capstone
Location,westus2
Run History Name,automl-saleforecasting


## Data

In [3]:
from azureml.core import Dataset, Run
cluster13eggs= Dataset.get_by_name(workspace=ws, name='cluster13eggs')
data= cluster13eggs.to_pandas_dataframe()
data.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,city,state,holiday_type,transferred,class,dcoilwtico
0,625,2013-01-02,1,127547,14.0,True,Quito,Pichincha,Work Day,True,2502,93.14
1,638,2013-01-02,1,158680,5.0,True,Quito,Pichincha,Work Day,True,2502,93.14
2,678,2013-01-02,1,208384,32.0,True,Quito,Pichincha,Work Day,True,2502,93.14
3,679,2013-01-02,1,208386,7.0,True,Quito,Pichincha,Work Day,True,2502,93.14
4,718,2013-01-02,1,227111,9.0,True,Quito,Pichincha,Work Day,True,2502,93.14


In [4]:
data = data.drop(columns=['id','item_nbr'])

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155689 entries, 0 to 155688
Data columns (total 10 columns):
date            155689 non-null datetime64[ns]
store_nbr       155689 non-null int64
unit_sales      155689 non-null float64
onpromotion     155689 non-null bool
city            155689 non-null object
state           155689 non-null object
holiday_type    155689 non-null object
transferred     155689 non-null bool
class           155689 non-null int64
dcoilwtico      155689 non-null float64
dtypes: bool(2), datetime64[ns](1), float64(2), int64(2), object(3)
memory usage: 9.8+ MB


In [6]:
data = data.groupby(['date','store_nbr'] , as_index=False).agg(
{
    'unit_sales':sum,
    'onpromotion':'first',
    'city': 'first',
    'state': 'first',
    'holiday_type':'first',
    'transferred':'first',
    'class':'first',
    'dcoilwtico':'first'
}
)
data.head()

Unnamed: 0,date,store_nbr,unit_sales,onpromotion,city,state,holiday_type,transferred,class,dcoilwtico
0,2013-01-02,1,246.0,True,Quito,Pichincha,Work Day,True,2502,93.14
1,2013-01-02,2,307.0,True,Quito,Pichincha,Work Day,True,2502,93.14
2,2013-01-02,6,360.0,True,Quito,Pichincha,Work Day,True,2502,93.14
3,2013-01-03,1,203.0,True,Quito,Pichincha,Work Day,True,2502,92.97
4,2013-01-03,2,186.0,True,Quito,Pichincha,Work Day,True,2502,92.97


In [7]:
data['date'] = pd.to_datetime(data['date'])

## Split the data

splitting on time.

In [8]:
target_column_name = 'unit_sales'
time_column_name = 'date'
grain_column_names = ['store_nbr']
label =  "unit_sales"

In [9]:
train = data[data[time_column_name] < '2017-01-01']
test = data[data[time_column_name] >= '2017-01-01']

X_train = train.copy()
y_train = X_train.pop(target_column_name).values

X_test = test.copy()
y_test = X_test.pop(target_column_name).values

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5301, 9)
(5301,)
(904, 9)
(904,)


# set up experiment

In [13]:
time_series_settings = {
    "time_column_name": "date",
    "grain_column_names":['store_nbr'],
    "max_horizon": 50,
    "target_lags": 2,
    "target_rolling_window_size": 10,
    "preprocess": True,
}

In [14]:
automl_config = AutoMLConfig(task='forecasting',
                             primary_metric='normalized_root_mean_squared_error',
                             experiment_timeout_minutes=15,
                             enable_early_stopping=True,
                             training_data=train,
                             label_column_name=label,
                             n_cross_validations=5,
                             enable_ensembling=False,
                             verbosity=logging.INFO,
                             **time_series_settings)

# run model and get best model

In [None]:
local_run = experiment.submit(automl_config, show_output=True)


Running on local machine
Parent Run ID: AutoML_73ea83a8-ac93-4817-a3e0-39261666699f
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization.

### view local run summary

In [15]:
local_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-saleforecasting,AutoML_0ace7e86-b189-45bc-92e1-b9b581c0bbab,automl,Running,Link to Azure Machine Learning studio,Link to Documentation


## get best model

In [49]:
best_run, fitted_model = local_run.get_output()
fitted_model

ForecastingPipelineWrapper(pipeline=Pipeline(memory=None,
     steps=[('timeseriestransformer', TimeSeriesTransformer(logger=None,
           pipeline_type=<TimeSeriesPipelineType.FULL: 1>)), ('stackensembleregressor', StackEnsembleRegressor(base_learners=[('26', Pipeline(memory=None,
     steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('extratreesregressor'...   random_state=None, selection='cyclic', tol=0.0001, verbose=0),
            training_cv_folds=5))]),
              stddev=None)

# monitoring runs

In [17]:
from azureml.widgets import RunDetails
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…