# P8.4 Training the pspnet model on a cluster

## Set up training workspace

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import azureml.core
from azureml.core import Workspace
ws = Workspace.from_config()
# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

#assign a name to the expriment
from azureml.core import Experiment
experiment_name = 'pspnet_semantic_segmentation'
exp = Experiment(workspace=ws, name=experiment_name)


#create a script folder
import os
script_folder = os.path.join(os.getcwd(), "segmentation_model")
os.makedirs(script_folder, exist_ok=True)
import shutil
shutil.copy('utils.py', script_folder)

from azureml.core.model import Model

Azure ML SDK Version:  1.39.0


## Create or Attach existing compute resource

In [2]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("found compute target: " + compute_name)
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = vm_size,
                                                                min_nodes = compute_min_nodes, 
                                                                max_nodes = compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    
    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
     # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target: cpu-cluster


## Create training script

In [3]:
%%writefile $script_folder/train.py


########################### getting parameters ###################
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
parser.add_argument('--batch_size', type=int, dest='batch_size', default=4, help='training batch size')
parser.add_argument('--epochs', type=int, dest='epochs', default=10, help='training epochs')
args = parser.parse_args()

########################### getting images / masks path s###################

from utils import img_paths

data_folder = os.path.join(args.data_folder, "citydata")
print('Data folder:', data_folder)

train_input_dir = os.path.join(data_folder, "leftImg8bit\\train")
train_mask_dir = os.path.join(data_folder, "gtFine\\train")

val_input_dir = os.path.join(data_folder, "leftImg8bit\\val")
val_mask_dir = os.path.join(data_folder, "gtFine\\val")

test_input_dir = os.path.join(data_folder, "leftImg8bit\\test")
test_mask_dir = os.path.join(data_folder, "gtFine\\test")

train_img_paths, train_mask_paths= img_paths(train_input_dir, train_mask_dir)
val_img_paths,val_mask_paths = img_paths(val_input_dir,val_mask_dir)
test_img_paths,test_mask_paths = img_paths(test_input_dir,test_mask_dir)

print("Number of training samples:", len(train_img_paths))
print("Number of validation samples:", len(val_img_paths))
print("Number of test samples:", len(test_img_paths))
########################## creating training & validation dataset ###############
from utils import get_classes
CLASSES = get_classes()
n_classes = len(CLASSES)+1
IMG_SIZE = (384,384)
BATCH_SIZE=args.batch_size

from utils import DataGenerator,get_training_augmentation,get_validation_augmentation,get_preprocessing, Dataloader
# Dataset for train images
train_dataloader = Dataloader(
    train_img_paths, 
    train_mask_paths,
    img_size=IMG_SIZE, 
    classes=CLASSES,
    augmentation=get_training_augmentation(),
    preprocessing=get_preprocessing(),
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Dataset for validation images
valid_dataloader = Dataloader(
    val_img_paths, 
    val_mask_paths,
    img_size=IMG_SIZE, 
    classes=CLASSES,
    augmentation=get_validation_augmentation(),
    preprocessing=get_preprocessing(),
    batch_size=1,
    shuffle=False
)

print('input images shape:', train_dataloader[0][0].shape)
print('input masks shape:', train_dataloader[0][1].shape)

################### Building and training the model ##########################
from azureml.core import Run
# get hold of the current run
run = Run.get_context()

import tensorflow as tf
import segmentation_models as sm
sm.set_framework('tf.keras')
from utils import get_backbone

# define optomizer
LR= 0.0001
optim = tf.keras.optimizers.Adam(LR)
metrics = [sm.metrics.IOUScore(threshold=0.5), sm.metrics.FScore(threshold=0.5)]
# dice coeff loss
total_loss = sm.losses.categorical_focal_dice_loss 

# define callbacks for learning rate scheduling and best checkpoints saving
os.makedirs('outputs', exist_ok=True)
checkpoint_filepath = 'outputs/pspnet_train.h5'
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(checkpoint_filepath, save_weights_only=True, save_best_only=True, mode='min'),
    tf.keras.callbacks.ReduceLROnPlateau(),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
]
BACKBONE=get_backbone()
model = sm.PSPNet(BACKBONE,encoder_weights='imagenet',encoder_freeze=False, classes=n_classes, activation='softmax')

# compile keras model with defined optimozer, loss and metrics
model.compile(optim, total_loss, metrics)

EPOCHS = args.epochs
# train the model
history = model.fit(
    train_dataloader, 
    epochs=EPOCHS, 
    callbacks=callbacks, 
    validation_data=valid_dataloader,
    shuffle=True,
)

# calculate accuracy on the validation data
iou_score = max(history.history['val_iou_score'])
print('Best val_iou_score: ', iou_score)

f1_score = max(history.history['val_f1-score'])
print('Best val_f1-score (dice coeff):', f1_score ) 

run.log('epochs', args.epochs)
run.log('iou_score', iou_score)
run.log('f1_score', f1_score)

#save the entiere model
model.load_weights(checkpoint_filepath)
model.save('outputs/pspnet_model')

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/calcp8/code/Users/lei.xiaofan/P8/segmentation_model/train.py


## Configure the training job

In [4]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

env=Environment.get(workspace=ws, name='train-P8-segmentation', version=3)
"""
# to install required packages
env = Environment('train-P8-segmentation')
cd = CondaDependencies.create(
    pip_packages=['azureml-dataset-runtime[pandas,fuse]','azureml-defaults',
                  'tensorflow==2.8.0',
                  'opencv-python-headless',
                  'segmentation_models',
                  'albumentations' ],
    conda_packages = ['pip','python==3.9.7','scikit-learn==1.0.2']
)

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = ws)
"""

"\n# to install required packages\nenv = Environment('train-P8-segmentation')\ncd = CondaDependencies.create(\n    pip_packages=['azureml-dataset-runtime[pandas,fuse]','azureml-defaults',\n                  'tensorflow==2.8.0',\n                  'opencv-python-headless',\n                  'segmentation_models',\n                  'albumentations' ],\n    conda_packages = ['pip','python==3.9.7','scikit-learn==1.0.2']\n)\n\nenv.python.conda_dependencies = cd\n\n# Register environment to re-use later\nenv.register(workspace = ws)\n"

In [9]:
from azureml.core.dataset import Dataset
dataset = Dataset.get_by_name(workspace=ws, name="cityscape_select")

from azureml.core import ScriptRunConfig
args = ['--data-folder', dataset.as_mount(), '--batch_size', 4, '--epochs',20]

src = ScriptRunConfig(source_directory=script_folder,
                      script='train.py', 
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)

## Submit the job to the cluster

In [10]:
run = exp.submit(config=src)
run

Experiment,Id,Type,Status,Details Page,Docs Page
pspnet_semantic_segmentation,pspnet_semantic_segmentation_1650105408_6d6dfcb5,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


## Display run results

In [8]:
print(run.get_metrics())

{'epochs': 20, 'iou_score': 0.5496424436569214, 'f1_score': 0.6332504153251648}


## ## Register model

In [10]:
# register model 
model = run.register_model(model_name='pspnet_model', model_path='outputs/pspnet_model',
                        tags={'Training context':'Script'},
                    properties={'iou_score': run.get_metrics()['iou_score'], 'f1_score': run.get_metrics()['f1_score']})
print('pspnet semantic segmentation model', model.name, model.id, model.version, sep='\t')


pspnet semantic segmentation model	pspnet_model	pspnet_model:1	1


In [8]:
import tensorflow as tf

In [2]:
model_path= Model.get_model_path(model_name='pspnet_model', version=3, _workspace=ws)


In [3]:
model_path

'azureml-models/pspnet_model/3/pspnet_train'

In [5]:
 import tensorflow as tf
 tf.keras.models.load_model(model_path)



<keras.engine.functional.Functional at 0x7faecf6e4df0>

In [4]:
Model.register(model_path=model_path,
                          model_name="pspnet_model",
                          tags={'data': "cityscape", 'type': "segmentation"},
                          description="Pspnet segmentation model",
                          workspace=ws)

Registering model pspnet_model


Model(workspace=Workspace.create(name='docs-ws', subscription_id='403c34e4-adde-4596-85b1-272a798d7ef2', resource_group='openclassrooms'), name=pspnet_model, id=pspnet_model:2, version=2, tags={'data': 'cityscape', 'type': 'segmentation'}, properties={})

In [6]:
model = Model(ws, 'pspnet_model',version=3)