Before...
=========

We developed a model for digits classification in `MLOps/data_science/working/data_science_digits_model.ipynb`.

Now
===

We migrate to the cloud by adapting the previous notebook for **Azure Machine Learning**.

This notebook is easiest to run in **Azure Machine Learning Studio**.
Note that it is also possible to run it outside of AML Studio if you install the Azure ML Python SDK v2 with `pip install azure-ai-ml azure-identity` and configure your credentials in the **Authentication** section.

# Authentication

In [1]:
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential

# Credential object handed to MLClient when the workspace handle is created.
credential = DefaultAzureCredential()

In [2]:
# Coordinates of the target workspace: replace the placeholders with your own values.
workspace_settings = dict(
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<AZUREML_WORKSPACE_NAME>",
)

# Client handle used by the following cells to talk to the workspace.
ml_client = MLClient(credential=credential, **workspace_settings)

# Create compute cluster

In [3]:
from azure.ai.ml.entities import AmlCompute

# Name assigned to the compute cluster
cpu_compute_target = "cpu-cluster"

try:
    # Reuse the compute target when it already exists in the workspace
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    # NOTE(review): every failure of the lookup above lands here, not only
    # "cluster not found" - consider narrowing the exception type.
    print("Creating a new cpu compute target...")

    # Describe the Azure ML compute object with the intended parameters
    cpu_cluster = AmlCompute(
        name=cpu_compute_target,
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM family (alternatives: STANDARD_DS2_V2, STANDARD_DS3_V2)
        size="STANDARD_F4S_V2",
        # Minimum number of nodes kept running when no job is running
        min_instances=0,
        # Maximum number of nodes in the cluster
        max_instances=1,
        # How many seconds a node keeps running after its job has terminated
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )
    print(
        f"AMLCompute with name {cpu_cluster.name} will be created, with compute size {cpu_cluster.size}"
    )
    # begin_create_or_update only starts the long-running operation and
    # returns a poller; .result() waits for provisioning to finish and yields
    # the compute object, so cpu_cluster holds a compute in both branches.
    cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster).result()

You already have a cluster named cpu-cluster, we'll reuse it as is.


# Create virtual environment

In [4]:
import os

# Folder that receives the environment definition file
dependencies_dir = "./dependencies"

# exist_ok=True keeps the cell re-runnable when the folder is already there
os.makedirs(name=dependencies_dir, exist_ok=True)

In [5]:
%%writefile {dependencies_dir}/conda.yml
# Conda specification of the training environment. This cell writes it to
# conda.yml inside dependencies_dir.
name: model-env
channels:
  - conda-forge
dependencies:
  # Packages resolved by conda
  - python=3.8
  - numpy=1.21.2
  # tensorflow, tensorflow-estimator and keras are pinned to the same 2.6.0 release
  - tensorflow=2.6.0
  - tensorflow-estimator=2.6.0
  - keras=2.6.0
  - pip=21.2.4
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed through pip inside the conda environment
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    # mlflow and azureml-mlflow are used by the training script for run logging
    - mlflow== 1.26.1
    - azureml-mlflow==1.42.0
    - psutil>=5.8,<5.9
    - tqdm>=4.59,<4.60
    - ipykernel~=6.0
    # NOTE(review): matplotlib is the only entry without a version constraint
    - matplotlib


Overwriting ./dependencies/conda.yml


In [6]:
from azure.ai.ml.entities import Environment

custom_env_name = "cv_dl"

# Local description of the environment: a base image plus the conda.yml
# stored in dependencies_dir.
environment_spec = Environment(
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
    conda_file=os.path.join(dependencies_dir, "conda.yml"),
    name=custom_env_name,
    description="Custom environment for tutorial of MLOps",
    tags={"tensorflow": "2.6.0"},
)

# Register the description in the workspace; the returned object exposes the
# name and version that are reported below.
pipeline_job_env = ml_client.environments.create_or_update(environment_spec)

registered_name = pipeline_job_env.name
registered_version = pipeline_job_env.version
print(
    f"Environment with name {registered_name} is registered to workspace, the environment version is {registered_version}"
)

Environment with name cv_dl is registered to workspace, the environment version is 8


# Create training script

In [7]:
import os

# Folder that receives the training script
train_src_dir = "./working"

# exist_ok=True keeps the cell re-runnable when the folder is already there
os.makedirs(name=train_src_dir, exist_ok=True)

In [8]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import mlflow
import mlflow.keras

import tensorflow
import tensorflow.keras

from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Lambda, Dense, Flatten
from tensorflow.keras.layers import BatchNormalization, Convolution2D , MaxPooling2D
from tensorflow.keras.optimizers import Adam

from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def main():
    """Train the digits CNN on CSV pixel data and log the run with MLflow.

    Command-line arguments:
        --train: path to the training CSV; column 0 holds the digit label and
            the remaining columns hold the pixel values of a 28x28 image.
        --test: path to the test CSV, which holds pixel values only.
        --learning_rate: learning rate of the Adam optimizer (default 0.001).
        --registered_model_name: model name. It is parsed so the job command
            line stays valid, but this script does not use it.
    """

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", type=str, help="path to train data")
    parser.add_argument("--test", type=str, help="path to test data")
    parser.add_argument("--learning_rate", required=False, default=0.001, type=float)
    parser.add_argument("--registered_model_name", type=str, help="model name")
    args = parser.parse_args()

    # Start logging. The context manager closes the run on every exit path,
    # so a failing training does not leave the run open.
    with mlflow.start_run():
        # enable autologging
        mlflow.tensorflow.autolog()

        ###################
        #<prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        print("train data:", args.train)
        print("test data:", args.test)

        # load the training & test sets (read_csv consumes the header row)
        train = pd.read_csv(args.train)
        test = pd.read_csv(args.test)

        # Split the training frame into pixels and labels
        X_train = (train.iloc[:, 1:].values).astype('float32')  # all pixel values
        y_train = train.iloc[:, 0].values.astype('int32')  # only labels, i.e. target digits

        # NOTE: X_test is prepared here but not consumed by the training below.
        X_test = test.values.astype('float32')

        # Convert the datasets to (num_images, img_rows, img_cols, colour_channel_gray) format
        X_train = X_train.reshape(X_train.shape[0], 28, 28, 1)
        X_test = X_test.reshape(X_test.shape[0], 28, 28, 1)

        mlflow.log_metric("num_features", X_train.shape[1] * X_train.shape[2])

        # Global pixel statistics of the training set, captured by standardize()
        mean_px = X_train.mean().astype(np.float32)
        std_px = X_train.std().astype(np.float32)

        def standardize(x):
            """Centre and scale pixels with the training-set mean and std."""
            return (x - mean_px) / std_px

        # One-hot encode the labels: y_train becomes (num_images, num_classes)
        y_train = to_categorical(y_train)

        # Axis 0 is the sample axis; axis 1 is the class axis after one-hot encoding.
        mlflow.log_metric("num_samples", y_train.shape[0])
        ####################
        #</prepare the data>
        ####################

        ##################
        #<train the model>
        ##################
        # fix random seeds for reproducibility: NumPy and TensorFlow keep
        # separate random generators, so both are seeded
        seed = 43
        np.random.seed(seed)
        tensorflow.random.set_seed(seed)

        batch_size = 64
        gen = ImageDataGenerator()
        batches = gen.flow(X_train, y_train, batch_size=batch_size)

        def get_bn_model(learning_rate):
            """Build and compile the batch-normalised CNN classifier."""
            # NOTE(review): the inputs are channels-last (28, 28, 1), so
            # BatchNormalization(axis=1) normalises along the image-row axis
            # rather than the channel axis - confirm this is intended.
            model = Sequential([
                Lambda(standardize, input_shape=(28, 28, 1)),
                Convolution2D(32, (3, 3), activation='relu'),
                BatchNormalization(axis=1),
                Convolution2D(32, (3, 3), activation='relu'),
                MaxPooling2D(),
                BatchNormalization(axis=1),
                Convolution2D(64, (3, 3), activation='relu'),
                BatchNormalization(axis=1),
                Convolution2D(64, (3, 3), activation='relu'),
                MaxPooling2D(),
                Flatten(),
                BatchNormalization(),
                Dense(512, activation='relu'),
                BatchNormalization(),
                Dense(10, activation='softmax')
                ])
            # The learning rate is given to the optimizer at construction
            # instead of being patched onto it after compile().
            model.compile(
                Adam(learning_rate=learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy'],
            )
            return model

        model = get_bn_model(args.learning_rate)

        print(f"Training with data of shape {X_train.shape}")

        model.fit(batches, steps_per_epoch=len(batches), epochs=5)
        ###################
        #</train the model>
        ###################

if __name__ == "__main__":
    main()


Overwriting ./working/main.py


# Create Azure ML training job

In [9]:
from azure.ai.ml import Input, command

registered_model_name = "digits_model"

# The job reuses the names defined earlier in the notebook (train_src_dir,
# custom_env_name, cpu_compute_target) so that renaming the source folder,
# environment or cluster in one place cannot leave this cell out of sync.
job = command(
    inputs=dict(
        train=Input(type="uri_file", path="./input/train.csv"),
        test=Input(type="uri_file", path="./input/test.csv"),
        learning_rate=0.001,
        registered_model_name=registered_model_name,
    ),
    # location of source code (the folder main.py was written to)
    code=train_src_dir,
    # ${{inputs.<name>}} placeholders refer to the entries of `inputs` above
    command=(
        "python main.py"
        " --train ${{inputs.train}}"
        " --test ${{inputs.test}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --registered_model_name ${{inputs.registered_model_name}}"
    ),
    # "@latest" selects the most recent registered version of the environment
    environment=f"{custom_env_name}@latest",
    compute=cpu_compute_target,
    experiment_name="train_model_mnist",
    display_name="digits",
)

In [10]:
# Submit the job to the workspace and keep the returned handle
submitted_job = ml_client.create_or_update(job)

# Last expression of the cell: shows the job summary as the cell output
submitted_job

[32mUploading working (0.01 MBs): 100%|██████████| 5004/5004 [00:00<00:00, 93741.70it/s]
[39m



Experiment,Name,Type,Status,Details Page
train_model_mnist,purple_circle_pv1n6f4sqq,command,Starting,Link to Azure Machine Learning studio
