# Job Page Loading Helper

This notebook provides tooling to populate a jobs page from scratch using images from the [wandb dockerhub](https://hub.docker.com/u/wandb).

Tooling includes:
1. Create job from docker image
2. Rename job to user-friendly name
3. Add example runs that will auto-populate the run's `Clone from...` menu
4. Delete the dummy run used to create the job initially

Notes:
2. This notebook uses a [special branch of the SDK](https://github.com/wandb/wandb/tree/andrew/helpers) with helpful GQL mutations added.  Until that branch is merged into main, please install it before running this notebook.
2. Jobs prefixed with `gpu_` require a GPU to run and are added to a GPU queue by default.  Please make sure you have a GPU agent available to run these jobs, otherwise no runs will be populated.
3. You must have queues running to populate jobs!
4. The `sql_query` job currently does not work on M1.  This is due to upstream issues with emulation and lack of linux/arm64 support for the `connectorx` package.  The job should still work on an `amd64` machine.

## Settings

In [None]:
# Repo settings
# Entity and project the jobs are created in. They are the default `entity` /
# `project` for load_job and create_job, and the project scanned by
# delete_loader_runs and by the failed-run check at the end of the notebook.
JOB_REPO_ENTITY = 'launch-test'
JOB_REPO_PROJECT = 'jobs'

# Queue settings
# Jobs whose name starts with `gpu_` are queued on GPU_QUEUE_NAME; every other
# job is queued on CPU_QUEUE_NAME (see the "Deploy jobs" cell).
CPU_QUEUE_NAME = 'andrew-cpu'
GPU_QUEUE_NAME = 'andrew-gpu'

# Job/image settings
# Default image tag used by load_job when building the docker image name and
# the initial job artifact name.
DOCKER_IMAGE_TAG = '134fcaf3d4b1499e69b426fad803b7e2cca85ab5'
# Directory scanned for job names; example run configs are read from
# `<JOBS_DIR>/<jobname>/configs/*.yml`.
JOBS_DIR = 'jobs'

In [None]:
def get_env(envlist):
    """
    Parse a `KEY=VALUE` env list file into a dict.

    Blank lines and lines whose first non-whitespace character is `#` are
    skipped.  Only the first `=` on a line separates the key from the value,
    so values may themselves contain `=` (e.g. base64 tokens).

    Args:
        envlist: Path of the env list file to read.

    Returns:
        A dict mapping each key to its (string) value.

    Raises:
        ValueError: If a non-blank, non-comment line contains no `=`.
    """
    env = {}
    with open(envlist) as f:
        for lineno, line in enumerate(f.read().splitlines(), start=1):
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            # partition splits on the first `=` only, keeping the rest of the
            # line (including any further `=`) as the value.
            k, sep, v = line.partition('=')
            if not sep:
                raise ValueError(
                    f"{envlist}:{lineno}: expected KEY=VALUE, got {line!r}"
                )
            env[k] = v

    return env

import os

# Base environment passed to the containers started by create_job.  The env
# list is looked up under the current user's home directory rather than a
# hard-coded `/Users/<name>` path, so the notebook runs on any machine.
job_repo_base_env = get_env(os.path.expanduser("~/.wandb_launch/env.list"))

In [None]:
from functools import partial
from pathlib import Path

import platform
import click
import docker
import yaml

import wandb
from wandb.sdk.internal.internal_api import Api as InternalApi
from wandb.sdk.launch import launch_add


# Public API client; rename_job uses it to look up the job artifact.
api = wandb.Api()
# Internal API client; rename_job calls `update_artifact_collection` on it.
iapi = InternalApi()
# Run name given (via WANDB_NAME) to the throwaway run started by create_job;
# delete_loader_runs deletes every run with this name.
LOADER_STR = "__loader-delete-me__"


def load_job(jobname, queue_name, entity=JOB_REPO_ENTITY, project=JOB_REPO_PROJECT, tag=DOCKER_IMAGE_TAG):
    """
    Create one job from its docker image, rename it, and queue its example runs.

    Args:
        jobname: Key of the job in `registry.yaml`; also used to build the
            image name, artifact name and configs path.
        queue_name: Queue the example runs are added to.
        entity: Entity the job lives in.
        project: Project the job lives in.
        tag: Docker image tag to create the job from.
    """
    # Step 1: run the image once so the job artifact gets created.
    image = jobname2img(jobname, tag)
    wandb.termlog(f"Creating job: {entity}/{project}/{image}")
    create_job(image, entity, project)

    # Step 2: swap the auto-generated artifact name for the registry's
    # display name and description.
    entry = get_registry()[jobname]
    ui_name, ui_desc = entry['name'], entry['desc']
    default_path = artname2artpath(jobname2artname(jobname, tag), entity, project, tag='latest')
    wandb.termlog(f"Renaming job to: {ui_name}")
    rename_job(default_path, ui_name, ui_desc)

    # Step 3: queue the example runs against the renamed job.
    renamed_path = artname2artpath(ui_name, entity, project, tag='latest')
    wandb.termlog("Adding new example runs...")
    add_example_runs(renamed_path, jobname, entity, project, queue_name)


def create_job(img, entity=JOB_REPO_ENTITY, project=JOB_REPO_PROJECT, env=job_repo_base_env):
    """
    Create a job by running the docker image.
    The run will show as failed because there is no config, but that's ok.  It will get deleted later.

    Args:
        img: Docker image reference to run.
        entity: Value for the container's WANDB_ENTITY.
        project: Value for the container's WANDB_PROJECT.
        env: Base environment for the container.  It is copied, never
            mutated, so the shared default dict stays unchanged between calls.
    """
    # Build a fresh dict instead of writing into `env`: the default is a
    # module-level dict shared by every call.
    container_env = {
        **env,
        "WANDB_ENTITY": entity,
        "WANDB_PROJECT": project,
        "WANDB_NAME": LOADER_STR,
        "WANDB_DOCKER": img,
    }

    run_kwargs = {
        "environment": container_env,
        "detach": True,
        "auto_remove": True,
        "network_mode": 'host',
    }

    # NOTE(review): with `emulation = True` no explicit platform is requested;
    # `linux/amd64` is only forced when the flag is False.  That reads as
    # inverted relative to the flag's name -- behaviour kept as-is, confirm intent.
    emulation = True
    if not emulation:
        run_kwargs["platform"] = 'linux/amd64'

    client = docker.from_env()
    container = client.containers.run(img, **run_kwargs)

    # Stream the container's output to the notebook as it arrives.
    output = container.attach(stdout=True, stream=True, logs=True)
    for chunk in output:
        # errors='replace' keeps streaming even if a chunk boundary splits a
        # multi-byte character, instead of raising UnicodeDecodeError.
        click.echo(chunk.decode('utf-8', errors='replace'), nl=False)

                    
                    
def rename_job(job_path, new_name, new_desc):
    """
    Give the job at `job_path` the pretty name and description defined in `registry.yaml`.

    Looks up the artifact, reads its artifact-sequence id, and passes that id
    with the new name and description to `iapi.update_artifact_collection`.
    """
    sequence_id = api.artifact(job_path)._attrs['artifactSequence']['id']
    iapi.update_artifact_collection(sequence_id, new_name, new_desc)


def add_example_runs(job_art_path, jobname, entity, project, queue_name):
    """
    Queue one example run per config file so users can `Clone from...` them in the UI.

    Every `*.yml` file under `<JOBS_DIR>/<jobname>/configs` is loaded; its
    `config` entry becomes the run config override and its `run_name` entry
    becomes the run name.
    """
    configs_dir = Path(f'{JOBS_DIR}/{jobname}/configs')

    for config_file in configs_dir.glob('*.yml'):
        with config_file.open() as fh:
            example = yaml.safe_load(fh)

        launch_add.launch_add(
            job=job_art_path,
            project=project,
            entity=entity,
            queue_name=queue_name,
            config={"overrides": {"run_config": example['config']}},
            name=example['run_name'],
        )
        

def delete_loader_runs():
    """
    Remove every run named `LOADER_STR` from the job repo project.

    A fresh `wandb.Api()` client is created for the lookup.
    """
    client = wandb.Api()
    project_path = f"{JOB_REPO_ENTITY}/{JOB_REPO_PROJECT}"

    for run in client.runs(project_path):
        if run.name != LOADER_STR:
            continue
        run.delete()
    

def get_registry():
    """Load `registry.yaml` from the current working directory and return its parsed contents."""
    with open('registry.yaml') as registry_file:
        registry = yaml.safe_load(registry_file)
    return registry

def jobname2img(jobname, tag):
    """Return the docker image reference `wandb/job_<jobname>:<tag>`."""
    return "wandb/job_{}:{}".format(jobname, tag)

def get_jobnames(jobs_dir):
    """
    Return the names of the job directories directly under `jobs_dir`.

    Only visible subdirectories count as jobs: plain files (e.g. a README)
    and hidden entries (e.g. `.DS_Store`, `.ipynb_checkpoints`) are ignored.
    The full directory name is used, so a name containing a dot is not
    truncated.  The result is sorted so jobs load in a deterministic order.

    Args:
        jobs_dir: Directory that holds one subdirectory per job.

    Returns:
        A sorted list of job directory names.
    """
    return sorted(
        p.name
        for p in Path(jobs_dir).glob('*')
        if p.is_dir() and not p.name.startswith('.')
    )

def jobname2artname(jobname, tag):
    """Return the artifact name `job-wandb_job_<jobname>_<tag>` for a job and image tag."""
    return "job-wandb_job_{}_{}".format(jobname, tag)

def artname2artpath(artname, entity, project, tag="latest"):
    """Return the artifact path `<entity>/<project>/<artname>:<tag>`."""
    return "{}/{}/{}:{}".format(entity, project, artname, tag)

## Spin up helper resources

In [None]:
# !docker run -p 3307:3306 -d sakiladb/mysql:latest
# !docker build -t tritonserver-wandb jobs/deploy_to_nvidia_triton/server && \
#     docker run --rm --net=host -p 8000:8000 -v $HOME/.aws:/root/.aws:ro -d tritonserver-wandb

## Deploy jobs

In [None]:
# Load every job found under JOBS_DIR, then remove the throwaway loader runs.
jobnames = get_jobnames(JOBS_DIR)
is_m1 = (platform.system(), platform.machine()) == ("Darwin", "arm64")

for jobname in jobnames:
    # connectorx seems to cause issues with emulation on M1.
    if is_m1 and jobname == 'sql_query':
        continue

    # `gpu_`-prefixed jobs go to the GPU queue, everything else to the CPU queue.
    load_job(
        jobname,
        GPU_QUEUE_NAME if jobname.startswith('gpu_') else CPU_QUEUE_NAME,
    )

delete_loader_runs()

## Delete sagemaker endpoints that were spun up
- You may have to run this manually because the jobs above need to actually run before the endpoints are created

In [None]:
import boto3

sagemaker = boto3.client('sagemaker')

# `list_endpoints` returns results one page at a time, so a single call can
# miss endpoints.  Walk every page and collect the full list before deleting.
endpoints = []
for page in sagemaker.get_paginator('list_endpoints').paginate():
    endpoints.extend(page['Endpoints'])

# Best-effort cleanup: report a failed delete and carry on with the rest.
for endpoint in endpoints:
    endpoint_name = endpoint['EndpointName']
    try:
        sagemaker.delete_endpoint(EndpointName=endpoint_name)
    except Exception as e:
        print(f"Could not delete endpoint {endpoint_name}: {e}")

## Check to see if any setup runs failed

In [None]:
api = wandb.Api()  # you need to run this again to refresh the runs
for run in api.runs(f"{JOB_REPO_ENTITY}/{JOB_REPO_PROJECT}"):
    if run.state != 'failed':
        continue

    # Name of the first job artifact the run used.  Falls back to a
    # placeholder so a run without a job artifact neither raises NameError
    # nor reports the job found for a previous run.
    job = next(
        (art.name for art in run.used_artifacts() if art.type == 'job'),
        '<no job artifact>',
    )
    print(f"{job}::{run.name} || {run}")
        
        