In [1]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.16.0 to work with agogemls


In [2]:
model = ws.models['TitanicAutoML']
print(model.name, 'version', model.version)

TitanicAutoML version 4


In [3]:
from azureml.core import Datastore, Dataset
import os
import json
import pandas

# Set default data store
default_ds = ws.get_default_datastore()

# Load test data
test_df = pandas.read_csv('test.csv')
df =test_df.drop(['PassengerId'], axis=1)

# Create a folder
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

# Save samples as a separate files
print("Saving files...")
size=10
list_of_dfs = [df.loc[i:i+size-1,:] for i in range(0, len(df),size)]
for l in range(1,42):
    fname = str(l) + '.csv'
    list_of_dfs[l].to_csv(os.path.join(batch_folder, fname), index=False)
print("files saved!")

# Upload the files to the default datastore
print("Uploading files to datastore...")
default_ds.upload(src_dir="batch-data", target_path="titanic-batch-data", overwrite=True, show_progress=True)

Folder created!
Saving files...
files saved!
Uploading files to datastore...
Uploading an estimated of 41 files
Uploading batch-data/1.csv
Uploaded batch-data/1.csv, 1 files out of an estimated total of 41
Uploading batch-data/10.csv
Uploaded batch-data/10.csv, 2 files out of an estimated total of 41
Uploading batch-data/11.csv
Uploaded batch-data/11.csv, 3 files out of an estimated total of 41
Uploading batch-data/12.csv
Uploaded batch-data/12.csv, 4 files out of an estimated total of 41
Uploading batch-data/13.csv
Uploaded batch-data/13.csv, 5 files out of an estimated total of 41
Uploading batch-data/14.csv
Uploaded batch-data/14.csv, 6 files out of an estimated total of 41
Uploading batch-data/15.csv
Uploaded batch-data/15.csv, 7 files out of an estimated total of 41
Uploading batch-data/16.csv
Uploaded batch-data/16.csv, 8 files out of an estimated total of 41
Uploading batch-data/17.csv
Uploaded batch-data/17.csv, 9 files out of an estimated total of 41
Uploading batch-data/18.cs

$AZUREML_DATAREFERENCE_11318de6a2ea400a8f811f130f17b4d6

In [4]:
# Register a dataset for the input data
batch_data_set = Dataset.File.from_files(path=(default_ds, 'titanic-batch-data/'), validate=False)

batch_data_set = batch_data_set.register(workspace=ws, 
                                             name='titanic-batch-data',
                                             description='batch data',
                                             create_new_version=True)
print("Done!")

Done!


In [5]:
import os
# Create a folder for the experiment files
experiment_folder = 'batch_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

In [6]:
%%writefile $experiment_folder/batch_titanic.py
import json
import os
import time
import joblib
import pandas
from azureml.core.model import Model

# Called when the service is loaded
def init():
    global model
    # Get the path to the deployed model file and load it
    model_path = Model.get_model_path('TitanicAutoML')
    model = joblib.load(model_path)
    
    #Print statement for appinsights custom traces:
    print ("model initialized" + time.strftime("%H:%M:%S"))

# Called when a request is received
def run(mini_batch):
    # This runs for each batch
    resultList = []
        
    # process each file in the batch
    for f in mini_batch:
        # Read the comma-delimited data
        data = pandas.read_csv(f)
        predictions = model.predict(data)

        # Append prediction to results
        resultList.append("{}: {}".format(os.path.basename(f), predictions))
    return resultList

Overwriting batch_pipeline/batch_titanic.py


In [7]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.runconfig import CondaDependencies

# Add dependencies required by the model
# For parallel pipeline steps, you need azureml-core and azureml-dataprep[fuse]
cd = CondaDependencies.create(pip_packages=['azureml-defaults','azureml-core','azureml-dataprep[fuse]==2.3.4','azureml-dataset-runtime'
                                            ,'azureml-automl-core','azureml.automl.runtime','azureml-train-automl-client'
                                            ,'azureml-pipeline-core','azureml-telemetry','azureml-interpret','azureml-train-automl-runtime'
                                            ,'xgboost==0.90','joblib','sklearn','inference-schema'])

batch_env = Environment(name='batch_environment')
batch_env.python.conda_dependencies = cd
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


In [8]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.pipeline.core import PipelineData
from azureml.core.compute import ComputeTarget, AmlCompute

cluster_name = "d12compute"
inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)

output_dir = PipelineData(name='inferences', 
                          datastore=default_ds, 
                          output_path_on_compute='titanic/results')

parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="batch_titanic.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2)

parallelrun_step = ParallelRunStep(
    name='batch-score-titanic',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('titanic_batch_data')],
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

print('Steps defined')

Steps defined


In [9]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
pipeline_run = Experiment(ws, 'batch_prediction_pipeline').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

Created step batch-score-titanic [a9155fec][3c4cdfc6-6739-4235-afd2-5ae2e522e721], (This step will run and generate new outputs)
Submitted PipelineRun fe9eaf26-a992-473b-80a1-2e59dee2d96c
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/batch_prediction_pipeline/runs/fe9eaf26-a992-473b-80a1-2e59dee2d96c?wsid=/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourcegroups/agoge/workspaces/agogemls
PipelineRunId: fe9eaf26-a992-473b-80a1-2e59dee2d96c
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/batch_prediction_pipeline/runs/fe9eaf26-a992-473b-80a1-2e59dee2d96c?wsid=/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourcegroups/agoge/workspaces/agogemls
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 54f0a8db-b0fb-43c0-99be-243d9231be72
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/batch_prediction_pipeline/runs/54f0a8db-b0fb-43c0-99be-243d9231be72?wsid=/subscriptions/da21a094-26a3-4

Mounted titanic_batch_data to /mnt/batch/tasks/shared/LS_root/jobs/agogemls/azureml/54f0a8db-b0fb-43c0-99be-243d9231be72/mounts/workspaceblobstore/azureml/54f0a8db-b0fb-43c0-99be-243d9231be72/b3e899f2-09dc-4034-a6b6-84cb8caaefd9 as folder.
Exit __enter__ of DatasetContextManager
Entering Run History Context Manager.
Current directory:  /mnt/batch/tasks/shared/LS_root/jobs/agogemls/azureml/54f0a8db-b0fb-43c0-99be-243d9231be72/mounts/workspaceblobstore/azureml/54f0a8db-b0fb-43c0-99be-243d9231be72
Preparing to call script [ driver/amlbi_main.py ] with arguments: ['--client_sdk_version', '1.16.0', '--scoring_module_name', 'batch_titanic.py', '--mini_batch_size', '5', '--error_threshold', '10', '--output_action', 'append_row', '--logging_level', 'INFO', '--run_invocation_timeout', '60', '--run_max_try', '3', '--create_snapshot_at_runtime', 'True', '--output', '/mnt/batch/tasks/shared/LS_root/jobs/agogemls/azureml/54f0a8db-b0fb-43c0-99be-243d9231be72/mounts/workspaceblobstore/azureml/54f0a8d



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'fe9eaf26-a992-473b-80a1-2e59dee2d96c', 'status': 'Completed', 'startTimeUtc': '2020-10-26T05:56:35.450095Z', 'endTimeUtc': '2020-10-26T06:04:12.921643Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://agogemls1641700925.blob.core.windows.net/azureml/ExperimentRun/dcid.fe9eaf26-a992-473b-80a1-2e59dee2d96c/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=br1kTAsKii8iXH4%2B7i32yrH6sjkzJl1OJYIRI0jcdqM%3D&st=2020-10-26T05%3A49%3A13Z&se=2020-10-26T13%3A59%3A13Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://agogemls1641700925.blob.core.windows.net/azureml/ExperimentRun/dcid.fe9eaf26-a992-473b-80a1-2e59dee2d96c/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=sk4DESwtKTApJesfdaRdqH5rvc%2BmOcyhri%2FISQ8jY%2BE%3D&st=2020-10-26T05%3A49%3A13Z&s

'Finished'

In [11]:
import pandas as pd
import shutil

shutil.rmtree('titanic-results', ignore_errors=True)

prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='titanic-results')


for root, dirs, files in os.walk('titanic-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 10 results
df.head(10)

Unnamed: 0,File,Prediction
0,37.csv,[0 1 0 0 1 1 0 0 0 0]
1,38.csv,[0 0 1 1 0 1 0 0 0 0]
2,39.csv,[0 1 1 0 0 1 0 1 0 0]
3,4.csv,[0 0 0 1 1 0 0 0 1 1]
4,40.csv,[1 0 1 0 0 0 0 0 1 1]
5,28.csv,[0 1 1 1 1 0 0 0 0 0]
6,29.csv,[0 0 0 0 0 0 1 0 0 0]
7,3.csv,[0 0 1 1 1 1 0 0 0 0]
8,30.csv,[0 0 0 0 1 1 0 1 0 0]
9,31.csv,[0 0 0 0 1 1 0 0 0 0]
