In [None]:
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.model import Model
from azureml.core.webservice import Webservice
from azureml.core.image import ContainerImage
from azureml.core.webservice import AciWebservice
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import Pipeline, PipelineData, PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.core.authentication import AzureCliAuthentication
from azureml.pipeline.core import PipelineParameter
from azureml.core.authentication import InteractiveLoginAuthentication

import requests
import json
import os

# Report which Azure ML SDK version this kernel is running against
sdk_version = azureml.core.VERSION
print(sdk_version)

## Set Up workspace

In [None]:
# Authenticate with the credentials of the logged-in Azure CLI session.
cli_auth = AzureCliAuthentication()

# Load the workspace from the configuration JSON file in the current directory
# (subscription, resource group and workspace name). The CLI credentials are
# passed explicitly; previously `cli_auth` was created but never used.
ws = Workspace.from_config(path="./", auth=cli_auth)

# Print the workspace that was loaded
print(ws.name, ws.location, ws.resource_group, sep='\t')

In [None]:
# Experiment under which all pipeline runs of this notebook are grouped
experiment_name = "test_auto_deploy"
exp = Experiment(ws, experiment_name)

# Show the experiment name and the workspace it belongs to, one per line
print(exp.name)
print(exp.workspace.name)

In [None]:
# Look up the workspace datastore named "datastore_pipeline"; it is used for
# the pipeline's input reference and all intermediate pipeline data below.
pipeline_datastore = Datastore(workspace=ws, name="datastore_pipeline")
pipeline_datastore

## Attach computing resources

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException

# Cluster settings; each can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")
# Environment variables are always strings, so convert the node counts to int
# before handing them to the provisioning configuration.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


# Reuse the compute target if it already exists in the workspace, else create it
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print("found compute target: " + compute_name)
    else:
        # Previously this case passed silently; make the mismatch visible
        # without aborting the notebook.
        print("WARNING: compute target '" + compute_name
              + "' exists but is not an AmlCompute cluster")
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes,
    )
    # create the cluster
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=provisioning_config,
    )

    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    print(compute_target.get_status().serialize())

## Create Docker container

In [None]:
# Run configuration shared by every pipeline step: run on the compute cluster
# inside a Docker container, with Azure ML building the Python environment.
aml_run_config = RunConfiguration()

aml_run_config.target = compute_name
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:latest"
aml_run_config.environment.python.user_managed_dependencies = False

# Dependencies for the pipeline steps.
# The pip package 'sklearn' was removed from the list: it is a deprecated
# placeholder on PyPI whose installation fails, which breaks the environment
# build. scikit-learn itself is already installed through conda below.
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn', 'numpy', 'nltk'],
    pip_packages=['joblib', 'imblearn', 'azureml-sdk', 'pyyaml', 'GitPython',
                  'ansiblemetrics', 'pydriller', 'requests', 'datetime',
                  'iacminer', 'pygithub', 'azure-cli-core'],
    pin_sdk_version=False)

print(aml_run_config)

## On the first run, upload the JSON file with the repo URL

In [None]:
# First run only: uncomment the statements below to upload and register the input file.
# Upload the repo URL JSON file to the "input_data" folder of the pipeline datastore
#pipeline_datastore.upload_files(["./data/repo_url.json"], target_path="input_data", overwrite=True)

# Dataset object used for reading the uploaded file from the workspace
#repo_url = Dataset.File.from_files(pipeline_datastore.path( './input_data/repo_url.json'))

# Register the dataset in the workspace under the name "repo_url"
#repo_url = repo_url.register(workspace = ws, name = "repo_url")

## Create pipeline data, data references and parameters

In [None]:
# Input: the repo URL JSON file that already lives on the pipeline datastore
input_data = DataReference(
    datastore=pipeline_datastore,
    data_reference_name="repo_url",
    path_on_datastore="input_data/repo_url.json",
)

# Intermediate data handed from one step to the next, all stored on the
# pipeline datastore
github_data = PipelineData(name="github_data", datastore=pipeline_datastore)
repo_metrics = PipelineData(name="metrics_data", datastore=pipeline_datastore)
metrics_clean = PipelineData(name="metrics_clean", datastore=pipeline_datastore)
model_file = PipelineData(name="train_model", datastore=pipeline_datastore)
final_model = PipelineData(name="final_model", datastore=pipeline_datastore)
container_store = PipelineData(name="container_store", datastore=pipeline_datastore)

# Parameters that can be overridden per pipeline run: repository name and owner
repo_name_par = PipelineParameter(name="repo_name", default_value='molecule')
repo_owner_par = PipelineParameter(name="repo_owner", default_value='ansible-community')

## Create each pipeline step

In [None]:
# Step running github_cloner.py: reads the repo URL input and writes github_data
source_directory = "./scripts/github_cloner"

github_cloner_args = [
    "--github_list", input_data,
    "--repo_owner", repo_owner_par,
    "--repo_name", repo_name_par,
    "--raw_data", github_data,
]

github_cloner = PythonScriptStep(
    name="github_cloner",
    script_name="./github_cloner.py",
    source_directory=source_directory,
    arguments=github_cloner_args,
    inputs=[input_data],
    outputs=[github_data],
    compute_target=compute_name,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("Github cloner step created")

In [None]:
# Step running repo_miner.py: consumes github_data and produces repo_metrics
source_directory = "./scripts/repo_miner"

repo_miner_args = [
    "--repo_owner", repo_owner_par,
    "--repo_name", repo_name_par,
    "--raw_data", github_data,
    "--metrics_data", repo_metrics,
]

repo_miner = PythonScriptStep(
    name="repo_miner",
    script_name="./repo_miner.py",
    source_directory=source_directory,
    arguments=repo_miner_args,
    inputs=[github_data],
    outputs=[repo_metrics],
    compute_target=compute_name,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("Repo miner step created")

In [None]:
# Step running cleaning.py: consumes repo_metrics and produces metrics_clean
source_directory = "./scripts/cleaning"

metrics_cleaning_args = [
    "--repo_owner", repo_owner_par,
    "--repo_name", repo_name_par,
    "--metrics_data", repo_metrics,
    "--metrics_clean", metrics_clean,
]

metrics_cleaning = PythonScriptStep(
    name="metrics_cleaning",
    script_name="./cleaning.py",
    source_directory=source_directory,
    arguments=metrics_cleaning_args,
    inputs=[repo_metrics],
    outputs=[metrics_clean],
    compute_target=compute_name,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("Metrics cleaning step created")

In [None]:
# Step running train.py: consumes metrics_clean and produces model_file
source_directory = "./scripts/training"

train_model_args = [
    "--repo_owner", repo_owner_par,
    "--repo_name", repo_name_par,
    "--metrics_clean", metrics_clean,
    "--train_model", model_file,
]

train_model = PythonScriptStep(
    name="model_training",
    script_name="./train.py",
    source_directory=source_directory,
    arguments=train_model_args,
    inputs=[metrics_clean],
    outputs=[model_file],
    compute_target=compute_name,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("Model training step created")

In [None]:
# Step running evaluate.py: consumes model_file and produces final_model
source_directory = "./scripts/evaluate"

evaluate_model_args = [
    "--repo_owner", repo_owner_par,
    "--repo_name", repo_name_par,
    "--train_model", model_file,
    "--final_model", final_model,
]

evaluate_model = PythonScriptStep(
    name="model_evaluation",
    script_name="./evaluate.py",
    source_directory=source_directory,
    arguments=evaluate_model_args,
    inputs=[model_file],
    outputs=[final_model],
    compute_target=compute_name,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("Model evaluation step created")

In [None]:
# Make sure the steps run one after another.
# The former `container_image.run_after(evaluate_model)` call was dropped:
# no `container_image` step is defined in this notebook, so the call raised a
# NameError and prevented the ordering below from ever being applied.
evaluate_model.run_after(train_model)
train_model.run_after(metrics_cleaning)
metrics_cleaning.run_after(repo_miner)
repo_miner.run_after(github_cloner)

In [None]:
# Steps that make up the pipeline, listed in execution order
steps = [
    github_cloner,
    repo_miner,
    metrics_cleaning,
    train_model,
    evaluate_model,
]

## Submit pipeline and see results

In [None]:
# Assemble the pipeline in the workspace from the step list
pipeline1 = Pipeline(ws, steps)

# Check the pipeline definition before submitting it
pipeline1.validate()

# Submit the pipeline as a run of the experiment
pipeline_run1 = exp.submit(pipeline1, regenerate_output=False)

In [None]:
# Show the run-monitoring widget for the submitted pipeline run
run_widget = RunDetails(pipeline_run1)
run_widget.show()

In [None]:
# Block until the pipeline run finishes; the returned value is the cell output
pipeline_status = pipeline_run1.wait_for_completion()
pipeline_status

## Get info of pipeline steps

In [None]:
# For every step of the run, list each output port with the datastore and
# path its data reference points to.
for step_run in pipeline_run1.get_steps():
    print("Outputs of step " + step_run.name)

    for port_name, port in step_run.get_outputs().items():
        data_ref = port.get_port_data_reference()  # data reference of this output port
        print("\tname: " + port_name)
        print("\tdatastore: " + data_ref.datastore_name)
        print("\tpath on datastore: " + data_ref.path_on_datastore)

## Publish pipeline

In [None]:
# Publish the pipeline behind the submitted run so it can be triggered again later
pipeline_name = "deployment"
description = "Defect prediction pippeline: from github cloning to model evaluation and implementation"

published_pipeline = pipeline_run1.publish_pipeline(
    name=pipeline_name,
    description=description,
    version="1",
    continue_on_step_failure=True,
)
published_pipeline

In [None]:
# REST endpoint URL of the published pipeline
rest_endpoint = published_pipeline.endpoint

endpoint_message = "You can perform HTTP POST on URL {} to trigger this pipeline"
print(endpoint_message.format(rest_endpoint))

## All published pipelines


In [None]:
from azureml.pipeline.core import PublishedPipeline
# List the published pipelines of the workspace
all_pub_pipelines = PublishedPipeline.list(workspace=ws)
all_pub_pipelines

In [None]:
# Get the most recently created registered model
model_list = Model.list(ws)
if not model_list:
    # Fail with a clear message instead of an opaque StopIteration
    raise ValueError("No registered models found in workspace " + ws.name)

# Single pass over the list; the previous version recomputed the maximum
# creation time for every element (quadratic). Like before, the first model
# with the latest creation time wins on ties.
production_model = max(model_list, key=lambda registered: registered.created_time)

# Re-fetch the model by name from the workspace
model = Model(ws, name=production_model.name)
model

## Test pipeline

In [None]:
# Get an AAD token header via interactive login
auth = InteractiveLoginAuthentication()
aad_token = auth.get_authentication_header()

# Trigger the published pipeline over REST, overriding the pipeline parameters.
# Fixes two defects of the previous version:
#   * the JSON payload was closed after "ExperimentName", leaving the other
#     keys outside the dict (a SyntaxError);
#   * the repo_name / repo_owner values were swapped relative to the defaults
#     of the pipeline parameters (name 'molecule', owner 'ansible-community').
response = requests.post(
    rest_endpoint,
    headers=aad_token,
    json={
        "ExperimentName": experiment_name,
        "ParameterAssignments": {
            "repo_name": "molecule",
            "repo_owner": "ansible-community",
        },
        "RunSource": "SDK",
    },
)
# Surface HTTP errors here rather than as a confusing JSON decoding failure
response.raise_for_status()

# Decoded JSON body of the response
run_id = response.json()
run_id


## Pipeline with ACI webservice

In [None]:
# Pipeline data used as the output of the ACI web service step
aci_store = PipelineData(name="ACI_store", datastore=pipeline_datastore)

In [None]:
# Make sure the steps run one after another.
# The `aci_webservice.run_after(evaluate_model)` call was removed from this
# cell: `aci_webservice` is only created in a later cell, so on a fresh kernel
# the call raised a NameError. That step takes `final_model`, an output of
# `evaluate_model`, as its input, so the data dependency already places it
# after model evaluation.
evaluate_model.run_after(train_model)
train_model.run_after(metrics_cleaning)
metrics_cleaning.run_after(repo_miner)
repo_miner.run_after(github_cloner)

In [None]:
# Step running aci_webservice_new.py: consumes final_model and produces aci_store
source_directory = "./scripts/deployment"

aci_webservice_args = [
    "--repo_owner", repo_owner_par,
    "--repo_name", repo_name_par,
    "--final_model", final_model,
    "--aci_store", aci_store,
]

aci_webservice = PythonScriptStep(
    name="container_image",
    script_name="./aci_webservice_new.py",
    source_directory=source_directory,
    arguments=aci_webservice_args,
    inputs=[final_model],
    outputs=[aci_store],
    compute_target=compute_name,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("Container image step created")

In [None]:
# Steps of the extended pipeline, now ending with the ACI web service step
steps = [
    github_cloner,
    repo_miner,
    metrics_cleaning,
    train_model,
    evaluate_model,
    aci_webservice,
]

In [None]:
# Assemble the pipeline in the workspace from the extended step list
pipeline1 = Pipeline(ws, steps)

# Check the pipeline definition before submitting it
pipeline1.validate()

# Submit the pipeline as a run of the experiment
pipeline_run1 = exp.submit(pipeline1, regenerate_output=False)

In [None]:
# Show the run-monitoring widget for the submitted pipeline run
run_widget = RunDetails(pipeline_run1)
run_widget.show()

In [None]:
# Block until the pipeline run finishes; the returned value is the cell output
pipeline_status = pipeline_run1.wait_for_completion()
pipeline_status

## Output

In [None]:
# Call the web service, passing the input data
# NOTE(review): `service` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel. Presumably it is the web service deployed by the
# pipeline's last step and must be retrieved from the workspace first - TODO confirm.
response = service.run(input_data = "test")

# Get the predictions by parsing the response as JSON
predictions = json.loads(response)

In [None]:
# Print the scoring URI of the web service
# NOTE(review): relies on `service`, which is not defined in the visible notebook - TODO confirm where it comes from.
endpoint = service.scoring_uri
print(endpoint)