In [None]:
import os
import json
import sys
from datetime import datetime
from azureml.core.compute import AmlCompute
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import CondaDependencies, RunConfiguration
from azureml.core import Workspace, Run, Experiment
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
from azureml.core import Experiment


In [None]:
# Load the pipeline settings from a JSON file; the path is relative, so it is
# resolved against the current working directory. The parsed settings are
# kept in `j` and read by the cells below.
pipeline_config = "pipeline_config.json"
with open(pipeline_config) as config_file:
    j = json.load(config_file)

In [None]:
# Service principal (SP) authentication.
# The credentials are passed positionally (tenant, client id, secret) rather
# than by keyword: SDK releases differ in the keyword names of the last two
# parameters (`username`/`password` vs.
# `service_principal_id`/`service_principal_password`), while the positional
# order is the same, so this form works with either signature.
sp_auth = ServicePrincipalAuthentication(
    j["sp_tenant"], j["sp_client"], j["sp_secret"]
)

# AML workspace, looked up by name within the configured subscription and
# resource group using the service principal above.
aml_ws = Workspace.get(
    name=j["aml_work_space"],
    auth=sp_auth,
    # The config value is converted to a string before it is passed on.
    subscription_id=str(j["subscription_id"]),
    resource_group=j["resource_group_name"],
)



In [None]:
def _register_blob_datastore(datastore_name, container_key):
    """Register one blob container of the configured storage account.

    Args:
        datastore_name: Name under which the datastore is registered in the
            workspace `aml_ws`.
        container_key: Key in the pipeline config `j` whose value is the
            blob container name.

    Returns:
        The result of `Datastore.register_azure_blob_container`.
    """
    return Datastore.register_azure_blob_container(
        aml_ws,
        datastore_name=datastore_name,
        container_name=j[container_key],
        account_name=j["blob_account"],
        account_key=j["blob_key"],
    )


# Pipeline inputs, models, and outputs: all three containers live in the
# same storage account and are registered in this order.
inputs_ds = _register_blob_datastore("inputs_ds", "data_blob_container")
inputs_dir = DataReference(datastore=inputs_ds, data_reference_name="inputs")

models_ds = _register_blob_datastore("models_ds", "models_blob_container")
models_dir = DataReference(datastore=models_ds, data_reference_name="models")

outputs_ds = _register_blob_datastore("outputs_ds", "preds_blob_container")
# Directory-type pipeline output on the outputs datastore (the step-building
# cell creates a fresh one per step under the same variable name).
outputs_dir = PipelineData(name="outputs", datastore=outputs_ds, is_directory=True)

In [None]:
# Run config: the package lists and the Python version for the step
# environment all come from the pipeline config.
environment_spec = {
    "pip_packages": j["pip_packages"],
    "conda_packages": j["conda_packages"],
    "python_version": j["python_version"],
}
conda_dependencies = CondaDependencies.create(**environment_spec)

run_config = RunConfiguration(conda_dependencies=conda_dependencies)

# Turn on Docker for the run environment.
docker_settings = run_config.environment.docker
docker_settings.enabled = True

In [None]:
# MovieLens dataset variant handled by this notebook run.
MOVIELENS_DATA_SIZE = '10m'

# MAX_ALL is the upper bound of the range that the step-building cell splits
# into chunks of NUM_PER_RUN. Any size other than '10m' gets the larger bound.
MAX_ALL = 72000 if MOVIELENS_DATA_SIZE == '10m' else 140000
NUM_PER_RUN = 10000

# AML compute target. Every dataset size uses the same, explicitly named
# cluster; a previous variant read the name from j["cluster_name"] instead.
# Original note for the non-'10m' case: "getting memory errors...".
compute_target = AmlCompute(aml_ws, "top10-mvl-d4v2")


In [None]:
# Create one pipeline step per chunk of NUM_PER_RUN, starting at 1 and
# continuing while the chunk start is below MAX_ALL. Each step runs the same
# script and is told its chunk through the CUR_MIN / CUR_MAX arguments.
#
# NOTE(review): the original notes here mention the 10m dataset and a copy of
# the reco_utils directory placed in this directory — presumably the scoring
# script needs it inside the source directory; confirm.

steps = []
CUR_MIN = 1
while CUR_MIN < MAX_ALL:
    CUR_MAX = CUR_MIN + NUM_PER_RUN

    # A separate output object is created for every step.
    outputs_dir = PipelineData(name="outputs", datastore=outputs_ds, is_directory=True)

    # Step name has the form "<min>_<max>_<dataset size>".
    cur_name = "_".join(str(part) for part in (CUR_MIN, CUR_MAX, MOVIELENS_DATA_SIZE))
    print(cur_name)

    # Positional arguments handed to the script, in this exact order.
    script_arguments = [
        CUR_MIN,
        CUR_MAX,
        inputs_dir,
        models_dir,
        outputs_dir,
        '10',
        MOVIELENS_DATA_SIZE,
    ]
    step = PythonScriptStep(
        name=cur_name,
        script_name=j["python_script_name"],
        arguments=script_arguments,
        inputs=[models_dir, inputs_dir],
        outputs=[outputs_dir],
        source_directory=j["python_script_directory"],
        compute_target=compute_target,
        runconfig=run_config,
        allow_reuse=False,
    )
    steps.append(step)

    # The next chunk starts where this one ended.
    CUR_MIN = CUR_MAX

# Keep CUR_MAX one chunk past the final CUR_MIN once the loop is done.
CUR_MAX = CUR_MIN + NUM_PER_RUN

In [None]:
# Combine all chunk steps into one pipeline in the workspace, then validate
# it; the validation result is shown as the cell output.
pipeline = Pipeline(aml_ws, steps)
pipeline.validate()

In [None]:
# Submit the pipeline under an experiment named after the dataset size.
exp_name = 'reco_score_{}'.format(MOVIELENS_DATA_SIZE)
print(exp_name)
experiment = Experiment(aml_ws, exp_name)
pipeline_run = experiment.submit(pipeline)

In [None]:
# Display the RunDetails widget for the submitted pipeline run.
from azureml.widgets import RunDetails

run_details_widget = RunDetails(pipeline_run)
run_details_widget.show()