In [1]:
# import tensorflow as tf

# model = tf.keras.applications.InceptionV3(
#     include_top=True,
#     weights="imagenet",
#     input_tensor=None,
#     input_shape=None,
#     pooling=None,
#     classes=1000,
#     classifier_activation="softmax",
# )

# model.save('inceptionv3')

# import wandb
# with wandb.init(project='trt-testing') as run:
#     art = wandb.Artifact('inceptionv3', 'inceptionv3')
#     art.add_dir('inceptionv3')
#     run.log_artifact(art)

In [3]:
from pathlib import Path
import yaml
import wandb
from wandb.sdk.internal.internal_api import Api as InternalApi
from wandb.sdk.launch import launch, launch_add
import re
import boto3
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import platform


def image_name_to_artifact_name(s, alias="latest"):
    """Turn a docker image reference into a job artifact name.

    Every ``:`` and ``/`` in ``s`` is replaced by ``_``; the result is
    prefixed with ``job-`` and suffixed with ``:<alias>``.
    """
    sanitized = re.sub(r"[:/]", "_", s)
    return "job-{}:{}".format(sanitized, alias)


def _traverse_dict(d, path=None):
    if path is None:
        path = []
    output = {}
    for k, v in d.items():
        new_path = path + [k]
        if isinstance(v, dict):
            if "name" in v and "desc" in v:
                output["/".join(new_path)] = {"name": v["name"], "desc": v["desc"]}
            else:
                output.update(_traverse_dict(v, new_path))
    return output


def get_registry(fname: str = "registry.yaml"):
    """Load the job registry YAML file and flatten it with ``_traverse_dict``.

    Args:
        fname: Path of the registry YAML file.

    Returns:
        A flat dict mapping ``/``-joined registry paths to
        ``{"name": ..., "desc": ...}`` entries. Empty when the file has no
        content.
    """
    with open(fname) as f:
        # safe_load returns None for an empty document; fall back to an empty
        # mapping so the traversal does not fail on None.items().
        registry = yaml.safe_load(f) or {}
    return _traverse_dict(registry)


api = wandb.Api()
iapi = InternalApi()
# True only on Apple Silicon Macs; used below to skip jobs that cannot run there.
is_m1 = platform.machine() == "arm64" and platform.system() == "Darwin"

entity = "megatruong"
project = "example-launch-jobs"
queue_name = "andrew-cpu"

# git commit sha (also used as the docker image tag for every job image)
tag = "5a0b4abc7b178cf28279c7f7a725d8c801792988"

# Host-side paths are resolved from the current user's home directory rather
# than hard-coded, so the notebook also runs for other users / machines.
_host_home = Path.home()
_host_aws_dir = _host_home / ".aws"

# Point to env vars and creds to be mounted
resource_args = {
    "local-container": {
        "net": "host",
        "env-file": str(_host_home / ".wandb_launch" / "env.list"),
        # AWS credentials are mounted read-only at two container-side homes.
        # NOTE(review): the container-side /home/andrewtruong target is kept
        # as-is; confirm which user the job images actually run as.
        "volume": [
            f"{_host_aws_dir}:/home/andrewtruong/.aws:ro",
            f"{_host_aws_dir}:/root/.aws:ro",
        ],
    }
}


# Only consumed by the commented-out thread-pool variant further down.
kwargs_list = []
registry = get_registry()


def _m1_skip_reason(job_name):
    """Return the warning text if ``job_name`` is skipped on M1, else ``None``."""
    if job_name == "sql_query":
        return "SQL Query Job cannot be run on M1 due to upstream connectorx issue."
    if job_name.startswith("gpu_"):
        return f"Job {job_name} requires an Nvidia GPU."
    if job_name == "openai_evals":
        return "OpenAI Evals are bugged"
    return None


for job_dir, metadata in registry.items():
    # The last path component is the job name; the rest is its directory.
    *dir_parts, job_name = job_dir.split("/")
    img = f"wandb/job_{job_name}:{tag}"

    # These skips are only applied when running on an Apple Silicon Mac.
    skip_reason = _m1_skip_reason(job_name) if is_m1 else None
    if skip_reason is not None:
        wandb.termwarn(skip_reason)
        continue

    # Create example runs for each job
    configs_path = Path("jobs", *dir_parts, job_name, "configs")
    # sorted(): glob order depends on the filesystem; sort for a reproducible
    # launch order.
    for path in sorted(configs_path.glob("*.yml")):
        with open(path) as f:
            # An empty YAML file loads as None; treat it as an empty config so
            # one bad example file does not abort the remaining launches.
            config = yaml.safe_load(f) or {}

        # Alternative: enqueue onto `queue_name` instead of running locally.
        # launch_add.launch_add(
        #     docker_image=img,
        #     name=config.get("run_name"),
        #     config={"overrides": {"run_config": config.get("config", {})}},
        #     queue_name=queue_name,
        #     entity=entity,
        #     project=project,
        # )

        # .get() mirrors the commented-out variants: a missing "run_name" or
        # "config" key no longer raises KeyError mid-loop.
        launch.run(
            iapi,
            docker_image=img,
            name=config.get("run_name"),
            config={"overrides": {"run_config": config.get("config", {})}},
            resource="local-container",
            resource_args=resource_args,
            entity=entity,
            project=project,
        )

#         kwargs = {
#             "entity": entity,
#             "project": project,
#             "name": config.get("run_name"),
#             "docker_image": img,
#             "config": {"overrides": {"run_config": config.get("config", {})}},
#             "resource": "local-container",
#             "resource_args": resource_args,
#         }
#         kwargs_list.append(kwargs)


# with ThreadPoolExecutor(6) as exc:
#     futures = {
#         exc.submit(launch.run, iapi, **kwargs): i
#         for i, kwargs in enumerate(kwargs_list)
#     }
#     for future in tqdm(as_completed(futures), total=len(futures)):
#         pass  # launch wont raise any errors; need to check later

# 4. Cleanup sagemaker resources

# NOTE(review): this deletes EVERY SageMaker endpoint visible to the active
# credentials/region, not only the ones created by the jobs above. Confirm the
# account is dedicated to testing before running this cell.
sagemaker = boto3.client("sagemaker")

# list_endpoints is paginated, so a single call only returns the first page.
# Collect every page up front so deletions cannot disturb the pagination.
endpoints = [
    endpoint
    for page in sagemaker.get_paginator("list_endpoints").paginate()
    for endpoint in page["Endpoints"]
]

for endpoint in endpoints:
    endpoint_name = endpoint["EndpointName"]
    try:
        sagemaker.delete_endpoint(EndpointName=endpoint_name)
    except Exception as e:
        # Best-effort cleanup: report the failure and move on to the next one.
        print(e)
        wandb.termerror(f"Problem deleting {endpoint_name}")
    else:
        wandb.termlog(f"Successfully deleted {endpoint_name}")


[34m[1mwandb[0m: [35mlaunch:[0m 🚀 Launching run into megatruong/example-launch-jobs
[34m[1mwandb[0m: [35mlaunch:[0m 🚀 Launching run into megatruong/example-launch-jobs
[34m[1mwandb[0m: [35mlaunch:[0m 🚀 Launching run into megatruong/example-launch-jobs
[34m[1mwandb[0m: [35mlaunch:[0m 🚀 Launching run into megatruong/example-launch-jobs
[34m[1mwandb[0m: [35mlaunch:[0m 🚀 Launching run into megatruong/example-launch-jobs
[34m[1mwandb[0m: [35mlaunch:[0m 🚀 Launching run into megatruong/example-launch-jobs


  0%|          | 0/9 [00:00<?, ?it/s]

[34m[1mwandb[0m: [35mlaunch:[0m Launching run in docker with command: docker run --rm -e WANDB_BASE_URL=https://api.wandb.ai -e WANDB_API_KEY -e WANDB_PROJECT=example-launch-jobs -e WANDB_ENTITY=megatruong -e WANDB_LAUNCH=True -e WANDB_RUN_ID=5knsvabf -e WANDB_DOCKER=wandb/job_deploy_to_sagemaker_endpoints:5a0b4abc7b178cf28279c7f7a725d8c801792988 -e WANDB_NAME='Deploy PyTorch Model' -e WANDB_CONFIG='{"artifact": "wandb-artifact://megatruong/ptl-testing2/model-vgw632i7:v0", "framework": "pytorch", "framework_version": "1.12", "python_version": "py38", "sagemaker_role": "arn:aws:iam::687678353814:role/sagemaker", "sagemaker_bucket": "sagemaker-us-west-2-687678353814", "instance_type": "ml.c5.xlarge", "instance_count": 1, "sagemaker_model_setup_kwargs": {}, "sagemaker_model_deployment_kwargs": {}}' -e WANDB_ARTIFACTS='{}' --net host --env-file /Users/andrewtruong/.wandb_launch/env.list --volume /Users/andrewtruong/.aws:/home/andrewtruong/.aws:ro --volume /Users/andrewtruong/.aws:/roo

-

wandb: Generated config at: overloaded_config.pbtxt
wandb: Waiting for W&B process to finish... (failed 1). Press Control-C to abort syncing.
wandb: \ 0.006 MB of 0.009 MB uploaded (0.000 MB deduped)

-

wandb: 🚀 View run Deploy Ensemble Model at: https://wandb.ai/megatruong/example-launch-jobs/runs/jgj7pb8m
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)
wandb: Find logs at: ./wandb/run-20230608_042418-jgj7pb8m/logs
Traceback (most recent call last):
  File "/launch/job.py", line 145, in <module>
    client.load_model(model_name, config=json.dumps(triton_configs))
  File "/usr/local/lib/python3.9/site-packages/tritonclient/http/_client.py", line 652, in load_model
    response = self._post(request_uri=request_uri,
  File "/usr/local/lib/python3.9/site-packages/tritonclient/http/_client.py", line 284, in _post
    response = self._client_stub.post(request_uri=request_uri,
  File "/usr/local/lib/python3.9/site-packages/geventhttpclient/client.py", line 272, in post
    return self.request(METHOD_POST, request_uri, body=body, headers=headers)
  File "/usr/local/lib/python3.9/site-packages/geventhttpclient/client.py", line 253, in request
    response 

--

wandb: Waiting for W&B process to finish... (failed 1). Press Control-C to abort syncing.
wandb: 🚀 View run Deploy TensorFlow Model at: https://wandb.ai/megatruong/example-launch-jobs/runs/i0cv77z8
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)
wandb: Find logs at: ./wandb/run-20230608_042418-i0cv77z8/logs
Traceback (most recent call last):
  File "/launch/job.py", line 145, in <module>
    client.load_model(model_name, config=json.dumps(triton_configs))
  File "/usr/local/lib/python3.9/site-packages/tritonclient/http/_client.py", line 652, in load_model
    response = self._post(request_uri=request_uri,
  File "/usr/local/lib/python3.9/site-packages/tritonclient/http/_client.py", line 284, in _post
    response = self._client_stub.post(request_uri=request_uri,
  File "/usr/local/lib/python3.9/site-packages/geventhttpclient/client.py", line 272, in post
    return self.request(METHOD_POST, request_uri, body=body, headers=headers)
  File "/usr/local

!

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
wandb: sagemaker job: Successfully deployed endpoint: tensorflow-inference-2023-06-08-04-24-51-369
The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
wandb: Waiting for W&B process to finish... (success).
wandb: | 0.004 MB of 0.007 MB uploaded (0.000 MB deduped)
wandb: Run summary:
wandb: sagemaker_endpoint tensorflow-inference...
wandb: 
wandb: 🚀 View run Deploy TensorFlow Model at: https://wandb.ai/megatruong/example-launch-jobs/runs/4yq91mhe
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)
wandb: Find logs at: ./wandb/run-20230608_042408-4yq91mhe/logs


---!

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
wandb: sagemaker job: Successfully deployed endpoint: pytorch-inference-2023-06-08-04-26-01-962
The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
wandb: Waiting for W&B process to finish... (success).
wandb: \ 0.004 MB of 0.004 MB uploaded (0.000 MB deduped)
wandb: Run summary:
wandb: sagemaker_endpoint pytorch-inference-20...
wandb: 
wandb: 🚀 View run Deploy PyTorch Model at: https://wandb.ai/megatruong/example-launch-jobs/runs/5knsvabf
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)
wandb: Find logs at: ./wandb/run-20230608_042408-5knsvabf/logs
[34m[1mwandb[0m: Successfully deleted pytorch-inference-2023-06-08-04-26-01-962
[34m[1mwandb[0m: Successfully deleted tensorflow-inference-2023-06-08-04-24-51-369
