# Connect to the Existing Workspace

In [2]:
from azureml.core import Workspace
from azureml.core import Environment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Experiment
import os

# from azureml.core.authentication import InteractiveLoginAuthentication

# Build the workspace handle via Workspace.from_config(), then fetch the
# workspace's default datastore (used by the upload cells below).
ws = Workspace.from_config()
ds = ws.get_default_datastore()
print(ws.name, "loaded successfully")

aml-aks-poc loaded successfully


# Creating Training Folder

In [3]:
# Local directory that holds the training script; the estimator cell passes it
# as its source directory.
folder_training_script = './jobcode'
os.makedirs(name=folder_training_script, exist_ok=True)  # no error if it already exists

print('Done')

Done


# Upload data by using get_default_datastore()

In [33]:
# Copy the local ./jobsdata directory to the 'jobsdata' path on the default
# datastore, replacing any files that are already there.
ds.upload(
    src_dir='./jobsdata',
    target_path='jobsdata',
    overwrite=True,
    show_progress=True,
)

print('Uploaded Jobs Data')

Uploading an estimated of 2 files
Uploading ./jobsdata\jobs.csv
Uploaded ./jobsdata\jobs.csv, 1 files out of an estimated total of 2
Uploading ./jobsdata\jobs_old.csv
Uploaded ./jobsdata\jobs_old.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Uploaded Jobs Data


In [18]:
# Copy the local ./jobcode directory to the 'jobcode' path on the default datastore.
# NOTE(review): this cell appears before the %%writefile cell that creates
# train.py, so on a fresh Run All the folder may still be empty here - confirm
# the intended cell order.
ds.upload(
    src_dir='./jobcode',
    target_path='jobcode',
    overwrite=True,
    show_progress=True,
)




Uploading an estimated of 1 files
Uploading ./jobcode\train.py
Uploaded ./jobcode\train.py, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_8bc3699dab9c4c7a9f246e609a9a4640

# Create Compute Cluster

In [4]:
# Cluster name and autoscale limits; each can be overridden through an
# environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
# Values read from os.environ are always strings, so convert the node counts
# to int before they reach the provisioning configuration.
min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1))

# VM size (SKU) for the cluster nodes
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

provisioning_config = AmlCompute.provisioning_configuration(
    vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes)

# Creating the cluster
compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
# Block until provisioning has finished so the message below is accurate.
compute_target.wait_for_completion(show_output=True)

print('Compute target created')

Compute target created


# Create ML Model

In [54]:
%%writefile $folder_training_script/train.py

import argparse
import os
import numpy as np
import pandas as pd
import glob

from azureml.core import Run
# from utils import load_data

import joblib

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

# let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
parser.add_argument('--max-depth', type=int, dest='max_depth', default=5, help='max depth')
args = parser.parse_args()

###
data_folder = os.path.join(args.data_folder, 'jobsdata')
print('Data folder:', data_folder)

job_data = pd.read_csv(os.path.join(data_folder, 'jobs.csv'))

                        
X = job_data.drop(columns =["quality"])
y = job_data["quality"]

clf = DecisionTreeRegressor(random_state=0,max_depth = args.max_depth)
rmse= np.mean(np.sqrt(-cross_val_score(clf, X, y, scoring="neg_mean_squared_error", cv = 5)))
print('RMSE is', rmse)

# Get the experiment run context
run = Run.get_context()

run.log('max depth', np.float(args.max_depth))
run.log('rmse', np.float(rmse))

os.makedirs('outputs', exist_ok=True)

clf.fit(X,y)
# file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=clf, filename='outputs/job_model.pkl')

run.complete()

Overwriting ./jobcode/train.py


# Creating Environment

In [55]:
# Conda packages imported by train.py
job_packages = CondaDependencies.create(
    conda_packages=['scikit-learn', "numpy", "pandas", "joblib"]
)

job_env = Environment("job-experiment-env-5")
job_env.python.user_managed_dependencies = False  # Let Azure ML manage dependencies
job_env.docker.enabled = True  # Docker container

# Attach the package list to the environment
job_env.python.conda_dependencies = job_packages

print(job_env.name, 'defined.')



job-experiment-env-5 defined.


# Registering env to Azure ML Workspace

In [56]:
# Register the environment in the workspace; the estimator cell later fetches
# it by name with Environment.get().
job_env.register(workspace=ws)

{
    "assetId": "azureml://locations/eastus2/workspaces/6f7360e0-658a-4233-8141-019587c2881d/environments/job-experiment-env-5/versions/1",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20221010.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "job-experiment-en

# Creating Estimator

In [57]:
from azureml.train.estimator import Estimator

# Fetch the environment that was registered in the workspace earlier
registered_env = Environment.get(ws, 'job-experiment-env-5')

# Command-line arguments passed to train.py
script_params = {
    '--data-folder': ds.as_mount(),
    '--max-depth': 10,
}

# Estimator describing what to run (train.py from the local script folder),
# where to run it (the remote compute target) and in which environment.
estimator = Estimator(
    source_directory=folder_training_script,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    environment_definition=registered_env,
)



# Create Jobs (Experiment)

In [58]:
#Create an experiment
experiment = Experiment(workspace = ws, name = "job_expt")

print('Experiment created')

Experiment created


# Submit Experiment with Estimator Information

In [59]:
# Submit the estimator to the experiment, then display the returned run object
run = experiment.submit(config=estimator)
run



Experiment,Id,Type,Status,Details Page,Docs Page
job_expt,job_expt_1668246911_56739a2c,azureml.scriptrun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


# Register ML Model

In [61]:
# Block until the remote run has finished. train.py writes
# outputs/job_model.pkl as its last step, so registering while the run is
# still preparing or running would not find the file.
run.wait_for_completion(show_output=True)

# Register the pickled model from the run's outputs folder in the workspace
model = run.register_model(model_name='job_model',
                           model_path='outputs/job_model.pkl',
                           tags={'area': "jobs", 'type': "sklearn"},
                           description="salary prediction")

print(model.name, model.id, model.version, sep='\t')

job_model	job_model:1	1
