In [None]:
!oc login --token=sha256~7gVymlwTvU58Ela9mbMpuWnx-2CGZ64z7cdIR-c9AnM --server=https://api.mini2.mydomain.com:6443

# Create custom image

In [None]:
# Create a build named custom-mnist-image in the huggingface namespace from the
# batch-job/custom-image directory of the rhods-notebooks repository.
!oc -n huggingface new-build --name custom-mnist-image --code https://github.com/thinkahead/rhods-notebooks --context-dir batch-job/custom-image

In [None]:
# Block until build custom-mnist-image-1 reports the "complete" condition,
# giving up after 600 seconds.
# NOTE(review): the "-1" suffix is hard-coded; presumably a repeated new-build
# would produce a different build number — confirm before re-running.
!oc wait --for=condition=complete build.build.openshift.io/custom-mnist-image-1 -n huggingface --timeout=600s

In [None]:
# Optional debugging commands: uncomment to list build-related resources or to
# read the logs of the first build.
#!oc get bc,build,is,pods -n huggingface
#!oc logs build/custom-mnist-image-1 -n huggingface
# Delete the build configs in the huggingface namespace labelled build=custom-mnist-image.
!oc delete bc -n huggingface --selector build=custom-mnist-image

# Create and Submit the Job

In [None]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.job.jobs import DDPJobDefinition

First, we begin by authenticating using the SDK.

In [None]:
# Create authentication object for oc user permissions.
# The token is read from the OPENSHIFT_TOKEN environment variable instead of
# being hardcoded, so the credential never ends up in the saved notebook.
# A missing variable raises KeyError here rather than failing later at login.
import os

auth = TokenAuthentication(
    token=os.environ["OPENSHIFT_TOKEN"],
    server="https://api.mini2.mydomain.com:6443",
    # NOTE(review): skip_tls=True presumably disables TLS certificate
    # verification — confirm this is acceptable for the target cluster.
    skip_tls=True,
)
auth.login()

Now that we are logged in, we can directly submit our batch job (model training on three workers with one GPU each) to MCAD via torchx.

In [None]:
import os

# S3 settings are copied from the notebook's own environment into the job's
# environment; a variable that is unset locally is passed through as None.
s3_env_names = ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_S3_ENDPOINT')
job_env = {env_name: os.environ.get(env_name) for env_name in s3_env_names}
job_env['OUTPUT_PATH'] = 'saved/mymodel.onnx'

# Image reference in the internal registry (huggingface/custom-mnist-image).
# Alternative image: quay.io/michaelclifford/mnist-test:latest
job_image = "image-registry.openshift-image-registry.svc:5000/huggingface/custom-mnist-image:latest"

jobdef = DDPJobDefinition(
    name="mnistjob",
    script="mnist_fashion.py",
    scheduler_args={"namespace": "huggingface"},
    j="3x1",  # presumably <nodes>x<processes per node> — TODO confirm
    gpu=1,
    cpu=1,
    memMB=8000,
    env=job_env,
    image=job_image,
)

# Submit the definition; the returned handle is used by the following cells.
job = jobdef.submit()

Now we can go ahead and look at the status and logs of our batch job.

In [None]:
# Query the status of the submitted job.
job.status()

In [None]:
# Print the logs returned for the submitted job.
print(job.logs())

Finally, we can remove the job once complete and release/terminate the associated resources, bringing everything back to the way it was before job submission.

In [None]:
# Cancel the submitted job.
job.cancel()

Also delete the custom image stream if it is no longer required.

In [None]:
# Delete the image streams in the huggingface namespace labelled build=custom-mnist-image.
!oc delete is -n huggingface --selector build=custom-mnist-image

In [None]:
# Log out through the SDK authentication object.
auth.logout()

# Fetch the Fashion-MNIST dataset

In [None]:
!pip install scikit-learn matplotlib

In [None]:
from sklearn.datasets import fetch_openml
import numpy as np
import json
import requests
# Download the Fashion-MNIST dataset from OpenML as a (features, labels) pair.
# X and y are reused by the inference and plotting cells below.
X, y = fetch_openml('Fashion-MNIST', return_X_y=True, parser='auto')

# Predictions from the ONNX model served using ModelMesh

In [None]:
model_name = "mymodel"

import requests
import json

# v2 inference REST endpoint for the model on the modelmesh-serving service.
# Original note: "underscore characters are removed"
# (NOTE(review): presumably refers to the served model name — confirm).
URL = f'http://modelmesh-serving.huggingface.svc.cluster.local:8008/v2/models/{model_name}/infer'
headers = {"content-type": "application/json"}

# Rows labelled 0..19 of X (label slicing is inclusive, so 20 rows), flattened
# into one list to go with the declared (20, 1, 28, 28) shape.
first_batch = X.loc[0:19].values.flatten().tolist()
payload = {
    "inputs": [
        {
            "name": "input_0",
            "shape": (20, 1, 28, 28),
            "datatype": "FP32",
            "data": first_batch,
        }
    ]
}

res = requests.post(URL, json=payload, headers=headers)
print(res)
# Uncomment to inspect the raw response body:
#print(res.text)

In [None]:
# Decode the REST response once, rebuild the first output tensor from its flat
# data and reported shape, then take the highest-scoring class index per row.
rest_output = res.json()['outputs'][0]
rest_scores = np.array(rest_output['data']).reshape(rest_output['shape'])
results = np.argmax(rest_scores, axis=1).tolist()

In [None]:
# `plt` was used below without ever being imported, which raises NameError on a
# fresh kernel; import it here so the cell runs under Restart & Run All.
import matplotlib.pyplot as plt

# classes of fashion mnist dataset
classes = ['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle Boot']
# plotting the results: one 28x28 image per subplot in a 2x10 grid, titled
# "<expected> (<predicted>)" in green when they match and red otherwise
fig = plt.figure(figsize=(25,4))
for imagenum in range(20):
    expected=int(y.iloc[imagenum])
    actual=results[imagenum]
    ax = fig.add_subplot(2, 10, imagenum+1, xticks=[], yticks=[])
    ax.imshow(np.array(X.iloc[imagenum, 0:]).reshape((28, 28)), cmap='gray')
    ax.set_title("{} ({})".format(classes[expected], classes[actual]),color=("green" if expected==actual else "red"))

# Submit a gRPC request to ModelMesh for a batch of samples

In [None]:
# Download the kfs_inference_v2.proto definition from the modelmesh-serving repository.
!wget https://raw.githubusercontent.com/kserve/modelmesh-serving/main/fvt/proto/kfs_inference_v2.proto
# Generate the Python message and gRPC modules from the proto file into the
# current directory; they are imported by a later cell.
!python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. ./kfs_inference_v2.proto

In [None]:
model_name = "mymodel"

# Rows labelled 0..4 of X (label slicing is inclusive, so 5 rows), flattened
# into one list to go with the declared (5, 1, 28, 28) shape.
grpc_pixels = X.loc[0:4].values.flatten().tolist()
payload = {
    "model_name": model_name,
    "inputs": [
        {
            "name": "input_0",
            "shape": (5, 1, 28, 28),
            "datatype": "FP32",
            "contents": {"fp32_contents": grpc_pixels},
        }
    ],
}

In [None]:
import grpc
import kfs_inference_v2_pb2, kfs_inference_v2_pb2_grpc

grpc_url = "modelmesh-serving.huggingface.svc.cluster.local:8033"

# Build the inference request from the payload prepared in the previous cell.
request = kfs_inference_v2_pb2.ModelInferRequest(model_name=model_name, inputs=payload["inputs"])

# Use the channel as a context manager so it is closed once the call returns;
# previously every run of this cell left a channel open. `response` remains
# available to the following cells.
with grpc.insecure_channel(grpc_url) as grpc_channel:
    grpc_stub = kfs_inference_v2_pb2_grpc.GRPCInferenceServiceStub(grpc_channel)
    response = grpc_stub.ModelInfer(request)

In [None]:
from google.protobuf.json_format import MessageToDict

# Show the container types of the two response fields used below.
print(type(response.outputs), type(response.raw_output_contents))

# Metadata of the first output as a plain dict; its 'shape' entry is read by
# the next cell.
first_output = response.outputs[0]
d = MessageToDict(first_output)
print(d)

# Raw payload of the first output as a bytes object.
binary_data = bytes(response.raw_output_contents[0])

In [None]:
import struct
import base64

FLOAT = 'f'

# Build a little-endian format string with one 'f' per float in the payload.
float_count = len(binary_data) // struct.calcsize(FLOAT)
fmt = '<' + FLOAT * float_count

# Restore the tensor layout from the shape in the output metadata, then take
# the highest-scoring class index for every row.
output_shape = [int(shapeval) for shapeval in d['shape']]
scores = np.array(struct.unpack(fmt, binary_data)).reshape(*output_shape)
numbers = [str(n) for n in np.argmax(scores, axis=1)]

print("Expected", y[0:5].values.tolist(), "Actual", numbers)