In [2]:
import sys, os
sys.path.append("../../")
import pandas as pd
import category_encoders as ce
from tempfile import TemporaryDirectory
import uuid
import logging

import reco_utils.recommender.lightgbm.lightgbm_utils as lgb_utils
import reco_utils.dataset.criteo as criteo

import azureml.core

from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
import azureml.dataprep as dprep

# Report which Azure ML SDK build this kernel is running against.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

# Name echoed back to the reader of the notebook.
username = "nikhilj"
username_message = "Your username is {0}".format(username)
print(username_message)

SDK version: 1.0.18
Your username is nikhilj


In [3]:
# Which Criteo download to request from the loader.
SIZE = "sample"

# Column layout passed to the loader: one label column, then the numeric
# columns I1..I13, then the categorical columns C1..C26.
label_col = "Label"
nume_cols = ["I" + str(col_no) for col_no in range(1, 14)]
cate_cols = ["C" + str(col_no) for col_no in range(1, 27)]
header = [label_col, *nume_cols, *cate_cols]

# The download cache lives in a temporary directory that is removed as soon
# as the frame has been loaded.
with TemporaryDirectory() as cache_dir:
    all_data = criteo.load_pandas_df(
        size=SIZE,
        local_cache_path=cache_dir,
        header=header,
    )

8.79MB [00:05, 1.73MB/s]                            


In [4]:
# Preview the first rows of the loaded frame to sanity-check the columns.
all_data.head()

Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [5]:
# Split the data into three contiguous sets: 80% train, 10% validation, 10% test.
#
# The cut points are computed with integer arithmetic and applied by position
# (.iloc), so every row lands in exactly one set regardless of the row count
# or of the index labels. Float label bounds with .loc (e.g. 0.8*length-1)
# skip rows whenever the length is not a multiple of 10 and only work on a
# default 0..n-1 index.
length = len(all_data)
train_end = length * 8 // 10
valid_end = length * 9 // 10

train_data = all_data.iloc[:train_end]
valid_data = all_data.iloc[train_end:valid_end]
test_data = all_data.iloc[valid_end:]

# Guard against silently losing or duplicating rows in the split.
assert len(train_data) + len(valid_data) + len(test_data) == length

In [6]:
# Ordinal encoder configured for the categorical columns only (cols=cate_cols);
# it is fitted further down on the training split.
ord_encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)

def encode_csv(df, encoder, label_col, typ='fit'):
    """Encode a data frame and separate the label column from the features.

    Args:
        df: Frame holding both the feature columns and the label column.
        encoder: Object exposing ``fit_transform`` and ``transform``.
        label_col: Name of the label column to split off.
        typ: ``'fit'`` fits the encoder on ``df`` while transforming it; any
            other value applies the already-fitted ``transform`` only.

    Returns:
        A ``(features, labels)`` tuple: the encoded frame with the label
        column removed, and the label column's ``.values``.
    """
    apply_encoder = encoder.fit_transform if typ == 'fit' else encoder.transform
    encoded = apply_encoder(df)
    # pop() returns the label column and drops it from the encoded frame.
    labels = encoded.pop(label_col).values
    return encoded, labels

# Fit the encoder on the training split, then apply the fitted encoder
# unchanged to the validation and test splits.
train_x, train_y = encode_csv(train_data, ord_encoder, label_col)
valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, typ='transform')
test_x, test_y = encode_csv(test_data, ord_encoder, label_col, typ='transform')

# Report the feature / label shapes of each split.
shape_report = 'Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\n'
split_shapes = {
    'trn_x_shape': train_x.shape,
    'trn_y_shape': train_y.shape,
    'vld_x_shape': valid_x.shape,
    'vld_y_shape': valid_y.shape,
    'tst_x_shape': test_x.shape,
    'tst_y_shape': test_y.shape,
}
print(shape_report.format(**split_shapes))

Train Data Shape: X: (80000, 39); Y: (80000,).
Valid Data Shape: X: (10000, 39); Y: (10000,).
Test Data Shape: X: (10000, 39); Y: (10000,).



In [7]:
# Azure ML workspace coordinates. Each value can be overridden through an
# environment variable, so the notebook can target another subscription or
# workspace without editing this cell; the literals are the fallback defaults.
subscription_id = os.environ.get("SUBSCRIPTION_ID", "e91107f8-bd8c-4517-8c8b-14879a7dbec4")  # you should be owner or contributor
resource_group = os.environ.get("RESOURCE_GROUP", "nikhilj")  # you should be owner or contributor
workspace_name = os.environ.get("WORKSPACE_NAME", "nikhilj-aml-ws")  # your workspace name
workspace_region = os.environ.get("WORKSPACE_REGION", "westus2")  # your region

In [8]:
# Workspace settings come from the configuration cell above. exist_ok=True is
# passed so the call is allowed to succeed when the workspace already exists.
workspace_settings = dict(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
    location=workspace_region,
    exist_ok=True,
)
ws = Workspace.create(**workspace_settings)

# Display the workspace details as the cell output.
ws.get_details()

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


{'id': '/subscriptions/e91107f8-bd8c-4517-8c8b-14879a7dbec4/resourceGroups/nikhilj/providers/Microsoft.MachineLearningServices/workspaces/nikhilj-aml-ws',
 'name': 'nikhilj-aml-ws',
 'location': 'westus2',
 'type': 'Microsoft.MachineLearningServices/workspaces',
 'workspaceid': '7ebdf7b2-80a7-48cc-8150-c0509730255a',
 'description': '',
 'friendlyName': 'nikhilj-aml-ws',
 'creationTime': '2019-03-22T23:19:17.3297990+00:00',
 'containerRegistry': '/subscriptions/e91107f8-bd8c-4517-8c8b-14879a7dbec4/resourcegroups/nikhilj/providers/microsoft.containerregistry/registries/nikhiljaacroorbxmro',
 'keyVault': '/subscriptions/e91107f8-bd8c-4517-8c8b-14879a7dbec4/resourcegroups/nikhilj/providers/microsoft.keyvault/vaults/nikhiljakeyvaultqpzgqmmc',
 'applicationInsights': '/subscriptions/e91107f8-bd8c-4517-8c8b-14879a7dbec4/resourcegroups/nikhilj/providers/microsoft.insights/components/nikhiljainsightsxtpjewom',
 'identityPrincipalId': '116ed18d-1033-43bf-9537-f14cab4bf153',
 'identityTenantId':

## Configure Automated ML

The table below describes the `AutoMLConfig` parameters you can set for this experiment.

|Property|Description|
|-|-|
|**task**|classification or regression|
|**primary_metric** (classification)|The metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|
|**primary_metric** (regression)|The metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|
|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|
|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|
|**n_cross_validations**|Number of cross validation splits.|
|**spark_context**|Spark context object. On Databricks, pass `spark_context=sc`.|
|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be <= number of worker nodes in your Azure Databricks cluster.|
|**X**|(sparse) array-like, shape = [n_samples, n_features]|
|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]<br>Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers.|
|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|
|**preprocess**|Set this to True to enable pre-processing of the data, e.g. converting strings to numeric values using one-hot encoding.|
|**exit_score**|Target score for the experiment, measured on the primary metric. E.g. `exit_score=0.995` ends the experiment once that score is reached.|

In [10]:
# Choose a name for the experiment and specify the project folder.
experiment_name = 'automl-criteo'
project_folder = './sample_projects/automl-criteo'

experiment = Experiment(ws, experiment_name)

# Summary of the run context, rendered below as a one-column table.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace Name': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
    'Experiment Name': experiment.name,
}

# Show full cell contents instead of truncating long values. None is the
# supported "no limit" value on pandas >= 1.0, where -1 is deprecated; older
# pandas releases only accept an int here, so fall back to -1 for them.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

pd.DataFrame(data=output, index=['']).T

Unnamed: 0,Unnamed: 1
SDK version,1.0.18
Subscription ID,e91107f8-bd8c-4517-8c8b-14879a7dbec4
Workspace Name,nikhilj-aml-ws
Resource Group,nikhilj
Location,westus2
Project Directory,./sample_projects/automl-criteo
Experiment Name,automl-criteo


In [12]:
# Settings for the Automated ML run, collected in one place and unpacked into
# AutoMLConfig below.
# NOTE(review): the debug log file name mentions "regression" although the
# task is classification - confirm whether that is intentional.
automl_settings = {
    'task': 'classification',
    'debug_log': 'automl_errors_regression.log',
    'primary_metric': 'AUC_weighted',
    # Some runs may take 10+ minutes, hence the per-iteration limit for the workshop.
    'iteration_timeout_minutes': 60,
    # You may change this to a higher number and see what happens.
    'iterations': 30,
    # 'validation_size': 0.20,  # for large datasets only and not needed for workshop
    'verbosity': logging.INFO,
    # Change this based on the number of worker nodes.
    'max_concurrent_iterations': 1,
    # 'spark_context': sc,  # Databricks/Spark related
    # Only needed for small datasets and if validation size is not set.
    'n_cross_validations': 3,
    'X': train_x,
    'y': train_y,
    # Let AutoML pre-process the data.
    'preprocess': True,
    'path': project_folder,
}

automl_config = AutoMLConfig(**automl_settings)

In [None]:
# Submit the AutoML configuration as a run of the experiment and keep the
# returned run object for the cells that follow.
local_run = experiment.submit(automl_config, show_output = False) # for runs with many iterations keep show_output=False and follow progress through the portal link in the next cell

Something else went wrong, sorry.
Traceback (most recent call last):
  File "/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/train/automl/_vendor/automl/client/core/common/limit_function_call_spawn.py", line 58, in execute
    **kwargs)
  File "/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/train/automl/_vendor/automl/client/core/common/spawn_client.py", line 91, in run_in_proc
    process.pid, errorcode, errorname))
automl.client.core.common.limit_function_call_exceptions.SubprocessException: Subprocess (pid 43848) killed by unhandled signal 6 (SIGABRT)
ERROR:adal-python:14a1288b-8d85-4f94-baa0-2574953ac5f8 - OAuth2Client:Get Token request failed
Traceback (most recent call last):
  File "/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/urllib3/connection.py", line 159, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/urllib3/util/connection.py", line 57, 

Performing interactive authentication. Please follow the instructions on the terminal.


In [None]:
# Link to this run in the Azure Portal.
portal_link = "<a href={} target='_blank'>Your experiment in Azure Portal: {}</a>".format(local_run.get_portal_url(), local_run.id)
try:
    # displayHTML is a builtin of Databricks notebooks only.
    displayHTML(portal_link)
except NameError:
    # Plain Jupyter kernel: displayHTML is not defined, so print the run id
    # and portal URL as text instead of failing the cell.
    print("Your experiment in Azure Portal: {}\n{}".format(local_run.id, local_run.get_portal_url()))