In [None]:
from IPython.core.interactiveshell import InteractiveShell
# show the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all' # default is 'last_expr'

# pick up edits to imported modules without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import azure.batch
# display the installed azure-batch SDK version for reference
azure.batch.__version__

In [None]:
import os
import datetime

from azure.batch import BatchServiceClient
from azure.batch.models import *
from azure.common.credentials import ServicePrincipalCredentials

# Set up an instance of the batch processing API

We create one Azure Batch Pool for each instance of the batch processing API.

The limit for the number of Pools in our Batch account is 100.

## Step 1: Create an Azure Batch Pool

In [None]:
# MODIFY THESE CELLS FOR YOUR OWN NODEPOOL SETUP!

# POOL_ID should start with the name of the API instance this pool will be used for
POOL_ID = 'training_1'
# Fail fast in the notebook instead of at pool creation time: Azure Batch pool ids
# must be non-empty, at most 64 characters long, and may only contain
# alphanumeric characters, hyphens and underscores.
assert POOL_ID, 'pool_id must not be empty'
assert len(POOL_ID) <= 64, 'pool_id has more than 64 characters'
assert set(POOL_ID) <= set(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
), 'pool_id may only contain alphanumeric characters, hyphens and underscores'

# choose the account in East US or South Central US
BATCH_ACCOUNT_URL = 'https://zoobot.eastus.batch.azure.com'

In [None]:
# secrets read from environment variables from docker-compose setup

# using the "cameratrapsbatch" service principal (app)
# ensure this service principal has the relevant IAM access setup 
# on the BATCH_ACCOUNT_URL Azure batch resource

# authenticate with Batch account using the service principal "camera-trap-async-api" in our AAD
# NOTE(review): the comments above name two different service principals
# ("cameratrapsbatch" vs "camera-trap-async-api") - confirm which one these variables belong to
# each lookup raises KeyError if the environment variable is not set
APP_CLIENT_ID = os.environ['APP_CLIENT_ID']
APP_CLIENT_SECRET = os.environ['APP_CLIENT_SECRET']
APP_TENANT_ID = os.environ['APP_TENANT_ID']

# other configuration info

# Docker image in our custom Azure Registry
# credentials in azure portal
REGISTRY_SERVER = 'zoobot.azurecr.io'
# user name is the first label of the registry login server
# NOTE(review): presumably the registry's admin user name - confirm in the portal
REGISTRY_USERNAME = REGISTRY_SERVER.split('.')[0]
# raises KeyError if the environment variable is not set
REGISTRY_PASSWORD = os.environ['REGISTRY_PASSWORD']
# the docker image built for running the ML system application
REGISTRY_IMAGE_NAME = 'pytorch:1.10.1-gpu-py3'
# the pre-built image we made and pushed to our registry via instructions
# see https://github.com/zooniverse/bajor/tree/main/docs/zoobot_azure_batch#build-the-docker-image-for-azure-batch-node-pools
# format is 'login-server/repository:tag'
# built from REGISTRY_SERVER so the image name can never drift from the
# registry the pool authenticates against
CONTAINER_IMAGE_NAME = f'{REGISTRY_SERVER}/{REGISTRY_IMAGE_NAME}'

# storage setup - credentials in Azure portal
# the account key is a secret, so it is read from the environment (KeyError if unset)
STORAGE_ACCOUNT_KEY = os.environ['STORAGE_ACCOUNT_KEY']
STORAGE_ACCOUNT_NAME = 'kadeactivelearning'

# names of the blob containers supporting the API instances in the above storage account
# (these containers were created manually in the Azure portal)
# each *_MOUNT_POINT is the relative path the container is mounted at on the pool nodes
STORAGE_CONTAINER = 'training'
STORAGE_CONTAINER_MOUNT_POINT = 'training'
MODELS_STORAGE_CONTAINER = 'models'
MODELS_STORAGE_CONTAINER_MOUNT_POINT = 'models'

# Azure Batch node pool VM type
# https://docs.microsoft.com/en-us/azure/virtual-machines/ncv3-series
# check the VM size is available in our location with:
#   az batch location list-skus --location East-US --query name==Standard_NC6s_v3
POOL_VM_SIZE = 'Standard_NC6s_v3'  

# auto-scale formula - can be set manually in Azure portal
# last statement makes sure that nodes aren't removed until their tasks are finished
# docs: https://docs.microsoft.com/en-us/azure/batch/batch-automatic-scaling

# setup the pool autoscaling interval (determines when nodes scale up / down for pending jobs)
# lowest is 5 mins and this allows jobs to be processed in a quicker timeframe
POOL_SCALING_EVALATION_INTERVAL_MINS = 5
# MODIFY the "cappedPoolSize" if it should be other than 2 dedicated nodes
# NOTE: the '//' lines inside the string are formula comments and are sent to
# Azure Batch together with the statements
POOL_AUTO_SCALE_FORMULA = """
// In this formula, the pool size is adjusted based on the number of tasks in the queue. 
// Note that both comments and line breaks are acceptable in formula strings.

// Get pending tasks for the past 3 minutes.
$samples = $ActiveTasks.GetSamplePercent(TimeInterval_Minute * 3);

// If we have fewer than 70 percent data points, we use the last sample point, otherwise we use the maximum of last sample point and the history average.
$tasks = $samples < 70 ? max(0, $ActiveTasks.GetSample(1)) : 
max( $ActiveTasks.GetSample(1), avg($ActiveTasks.GetSample(TimeInterval_Minute * 3)));

 // If number of pending tasks is not 0, set targetVM to pending tasks, otherwise set to 0, since there is usually long intervals between job submissions.
$targetVMs = $tasks > 0 ? $tasks + $CurrentDedicatedNodes : 0;

// The pool size is capped at 2, if target VM value is more than that, set it to 2.
cappedPoolSize = 2;

// Currently can only use dedicated nodes due provisioning failure with Spot nodes :(
// $TargetDedicatedNodes = 0;
$TargetDedicatedNodes = max(0, min($targetVMs, cappedPoolSize));

// preferable use spot nodes for training work - these may get preempted (see above - provisioning failure in the region)
// $TargetLowPriorityNodes = max(0, min($targetVMs, cappedPoolSize));
$TargetLowPriorityNodes = 0;

// Set node deallocation mode - keep nodes active only until tasks finish
$NodeDeallocationOption = taskcompletion;
"""


In [None]:
def print_batch_exception(batch_exception):
    """
    Print a framed, human-readable summary of a Batch exception.

    The error message is printed when `batch_exception.error.message.value`
    is present; any entries in `batch_exception.error.values` follow it as
    tab-separated `key:value` lines after a blank line.
    """
    separator = '-------------------------------------------'
    print(separator)
    print('Exception encountered:')

    error = batch_exception.error
    message = error.message if error else None
    if message and message.value:
        print(message.value)
        details = error.values
        if details:
            # blank line between the message and the key/value details
            print()
            for detail in details:
                print(f'{detail.key}:\t{detail.value}')

    print(separator)

def _blob_container_mount(container_name, relative_mount_path):
    """
    Build the mount configuration for one blob container of STORAGE_ACCOUNT_NAME.

    Args:
        container_name: name of the blob storage container to mount.
        relative_mount_path: path the container is mounted at on the node VM.

    Returns:
        A MountConfiguration wrapping an AzureBlobFileSystemConfiguration.
    """
    return MountConfiguration(
        azure_blob_file_system_configuration=AzureBlobFileSystemConfiguration(
            account_name=STORAGE_ACCOUNT_NAME,
            container_name=container_name,
            # mount the container in the VM
            relative_mount_path=relative_mount_path,
            account_key=STORAGE_ACCOUNT_KEY,
            # see blob mount opts in https://github.com/Azure/azure-storage-fuse#mount-options
            # these can be tweaked to improve performance
            blobfuse_options='-o attr_timeout=240 -o entry_timeout=240 -o negative_timeout=120 -o allow_other'
        )
    )


def create_pool(batch_service_client, pool_id):
    """
    Create an auto-scaling pool named pool_id that runs the Docker image
    specified by the constants in the cells above.

    Args:
        batch_service_client: client whose `pool.add` is called to submit the pool.
        pool_id: used as both the id and the display name of the new pool.

    Errors raised by `batch_service_client.pool.add` are not handled here and
    propagate to the caller.
    """
    # we have to use VM images supporting GPU access *and* Docker
    # this VM image will run our custom container
    # find the VM refs and make sure they match the VirtualMachineConfiguration config below
    # https://docs.microsoft.com/en-us/cli/azure/batch/pool/supported-images?view=azure-cli-latest
    #     see notes at bottom on accepting the image agreement terms
    # az batch pool supported-images list --subscription Zooniverse-Primary --account-endpoint 'https://zoobot.eastus.batch.azure.com' --account-name zoobot
    image_ref = ImageReference(
        publisher='microsoft-azure-batch',
        offer='ubuntu-server-container',
        sku='20-04-lts',
        version='latest'  # URN: microsoft-azure-batch:ubuntu-server-container:20-04-lts:1.3.0
        # The Azure Batch container image only accepts 'latest' version
    )

    # specify a container registry from which to pull the custom container
    # see the `batch_service` folder on instructions for building the container image
    container_registry = ContainerRegistry(
        registry_server=REGISTRY_SERVER,
        user_name=REGISTRY_USERNAME,
        password=REGISTRY_PASSWORD
    )

    container_conf = ContainerConfiguration(
        container_image_names=[CONTAINER_IMAGE_NAME],
        container_registries=[container_registry]
    )

    # match the SKU for "publisher": "microsoft-azure-batch"
    # in ImageReference Above
    # https://docs.microsoft.com/en-us/cli/azure/batch/pool/supported-images?view=azure-cli-latest
    vm_config = VirtualMachineConfiguration(
        image_reference=image_ref,
        container_configuration=container_conf,
        node_agent_sku_id='batch.node.ubuntu 20.04'
    )

    # setup the blob storage container mounts to the node VMs
    training_mount = _blob_container_mount(STORAGE_CONTAINER, STORAGE_CONTAINER_MOUNT_POINT)
    models_mount = _blob_container_mount(MODELS_STORAGE_CONTAINER, MODELS_STORAGE_CONTAINER_MOUNT_POINT)

    new_pool = PoolAddParameter(
        # use the caller-supplied id (previously the global POOL_ID was used and
        # the pool_id argument was silently ignored)
        id=pool_id,
        display_name=pool_id,
        vm_size=POOL_VM_SIZE,
        enable_auto_scale=True,
        auto_scale_formula=POOL_AUTO_SCALE_FORMULA,
        auto_scale_evaluation_interval=datetime.timedelta(minutes=POOL_SCALING_EVALATION_INTERVAL_MINS),
        virtual_machine_configuration=vm_config,
        # default is 1; each task occupies the entire GPU so we can only run one task at a time on a node
        task_slots_per_node=1,
        mount_configuration=[training_mount, models_mount],
    )
    batch_service_client.pool.add(new_pool)


In [None]:
# sanity check that the service principal identifiers were picked up from the
# environment (one per line); the client secret is deliberately not printed
print(APP_CLIENT_ID, APP_TENANT_ID, sep='\n')

In [None]:
# authenticate as the service principal configured above, requesting
# credentials for the Azure Batch resource
credentials = ServicePrincipalCredentials(
    client_id=APP_CLIENT_ID,
    secret=APP_CLIENT_SECRET,
    tenant=APP_TENANT_ID,
    resource='https://batch.core.windows.net/'
)

# If the Batch account uses the Batch quota system, authenticating with shared key
# credentials instead of the service principal is also okay, see
# https://docs.microsoft.com/en-us/python/api/azure-batch/azure.batch.batch_auth.sharedkeycredentials?view=azure-python

# client used by the following cells to talk to the Batch account at BATCH_ACCOUNT_URL
batch_client = BatchServiceClient(credentials=credentials, batch_url=BATCH_ACCOUNT_URL)

In [None]:
%%time
# pool creation should run quickly

try:
    create_pool(batch_client, POOL_ID)
except BatchErrorException as e:
    # print the Batch error details, then re-raise so the cell still fails visibly
    print_batch_exception(e)
    raise

## Useful CLI commands for using Docker images with Batch

List all Batch supported images with their "capabilities" (e.g. "DockerCompatible", "NvidiaTeslaDriverInstalled"):
```
az batch pool supported-images list
```
with the pool information provided in additional parameters.

List all available images (every SKU and version) from a publisher:
```
az vm image list --all --publisher microsoft-dsvm
```

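You may need to accept the terms of an image. First list the publisher's images to find the URN of the image you want to use (the URN has the form `publisher:offer:sku:version`, matching the `ImageReference` fields used in `create_pool` above):
```
az vm image list --all --publisher <publisher>
```
Then accept the terms for that URN: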
```
az vm image terms accept --urn <corresponding-urn>
```