In [None]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.job.jobs import DDPJobDefinition

In [None]:
# Create authentication object for oc user permissions.
# The bearer token is read from the environment so that no credential is stored in
# (or committed with) the notebook. Export OPENSHIFT_TOKEN before starting the kernel;
# OPENSHIFT_API_SERVER optionally overrides the API endpoint used below.
import os

auth = TokenAuthentication(
    token=os.environ["OPENSHIFT_TOKEN"],  # KeyError here means the variable is not set
    server=os.environ.get("OPENSHIFT_API_SERVER", "https://api.mini2.mydomain.com:6443"),
    # NOTE(review): skip_tls=True is kept from the original; presumably it disables
    # TLS certificate verification -- confirm this is acceptable for your cluster.
    skip_tls=True,
)
auth.login()

Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).

In [None]:
# Describe the resources requested for the cluster first, then wrap that
# description in a Cluster object (reduce specs as desired).
cluster_config = ClusterConfiguration(
    name="mnisttest",
    namespace="batch-mnist",
    image="quay.io/thinkahead/base:ray2.1.0-py38-gpu-pytorch1.12.0cu117-20230419-1",
    # worker count range
    min_worker=2,
    max_worker=3,
    # CPU and memory bounds (min == max, i.e. a fixed request)
    min_cpus=8,
    max_cpus=8,
    min_memory=16,
    max_memory=16,
    gpu=1,
    instascale=False,  # left off: scaling is not needed here
)
cluster = Cluster(cluster_config)

Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper yaml onto the MCAD queue, and begin the process of obtaining our resource cluster.

In [None]:
# Bring up the cluster: submit the cluster definition created above
cluster.up()

Now, we want to check on the status of our resource cluster, and wait until it is finally ready for use.

In [None]:
# First status check, right after the cluster was requested with up()
cluster.status()

In [None]:
# Wait until the cluster is ready for use before continuing
cluster.wait_ready()

In [None]:
# Check the status again now that wait_ready() has returned
cluster.status()

Let's quickly verify that the specs of the cluster are as expected.

In [None]:
# Show the cluster details so they can be compared with the requested configuration
cluster.details()

Now that our resource cluster is ready, we can directly submit our batch job (model training on three workers with 1 gpu each) to the cluster via torchx.

In [None]:
import os

# Environment handed to the training job: the S3 settings are copied from this
# notebook's environment (os.environ.get yields None for any variable that is not
# set), and OUTPUT_PATH is a fixed value.
job_env = {
    var_name: os.environ.get(var_name)
    for var_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_S3_ENDPOINT')
}
job_env['OUTPUT_PATH'] = 'saved/cifar10.onnx'

# Define the distributed job that runs cifar10.py, then submit it to the cluster.
jobdef = DDPJobDefinition(
    name="cifar10",
    script="cifar10.py",
    env=job_env,
    #scheduler_args={"requirements": "requirements.txt"}
)
job = jobdef.submit(cluster)

Now we can go ahead and look at the status and logs of our batch job.

In [None]:
# Query the status of the job submitted above
job.status()

In [None]:
# Print the logs returned for the submitted job
print(job.logs())

Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up.

In [None]:
# Bring the cluster down once the job output is no longer needed
cluster.down()

In [None]:
# Log out of the session opened by auth.login() at the top of the notebook
auth.logout()

# Load the CIFAR10 dataset using torchvision

In [None]:
!pip install matplotlib

In [None]:
from torchvision.datasets import CIFAR10

# Build both CIFAR10 splits (train first, then test) rooted at the parent
# directory, with download=True for each.
trainX, testX = (
    CIFAR10("..", train=is_train, download=True)
    for is_train in (True, False)
)

Visualize first 25 images

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# label names of the images
label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# define rows and columns of figure
rows, columns = 5, 5
fig=plt.figure(figsize=(10, 10))
# visualize these first 25 images
for i in range(1, columns*rows +1):
    fig.add_subplot(rows, columns, i)
    img,label=testX[i-1]
    #plt.imshow(np.array(img).transpose(1,2,0).reshape(32,32,3))
    plt.imshow(img)
    plt.xticks([])
    plt.yticks([])
    plt.title("{}".format(label_names[label]))
plt.show()

Check the normalization

In [None]:
# Pick one test sample (index 99) as an (image, label) pair for the normalization check
img, label = testX[99]

In [None]:
# Display the selected sample image
plt.imshow(img)

In [None]:
# Turn the sample image into a PyTorch tensor with ToTensor(), plot the
# distribution of its pixel values, and report the per-channel mean and std.
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

img, label = testX[99]

# Transform pipeline consisting only of the image -> tensor conversion
transform = transforms.Compose([transforms.ToTensor()])
img_tr = transform(img)

# Histogram over every value of the tensor image (all channels flattened together)
img_np = img_tr.numpy()
plt.hist(img_np.ravel(), bins=50, density=True)
plt.xlabel("pixel values")
plt.ylabel("relative frequency")
plt.title("distribution of pixels")

# Reduce over dims 1 and 2, which leaves one value per entry of dim 0
mean = img_tr.mean(dim=(1, 2))
std = img_tr.std(dim=(1, 2))

print("mean and std before normalize:")
print("Mean of the image:", mean)
print("Std of the image:", std)

In [None]:
from torchvision import transforms

# Normalize the sample image with the per-channel mean/std computed in the previous
# cell, then plot the pixel distribution and report the statistics afterwards.
#
# The post-normalization statistics are stored under their own names (mean_norm,
# std_norm). Reusing `mean`/`std` would overwrite the inputs of Normalize, so
# re-running this cell would normalize with the already-normalized statistics.
transform_norm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# Normalized tensor image; transformed once and reused for both the plot and the stats
img_normalized = transform_norm(img)

# Histogram over every value of the normalized image
img_np = np.array(img_normalized)
plt.hist(img_np.ravel(), bins=50, density=True)
plt.xlabel("pixel values")
plt.ylabel("relative frequency")
plt.title("distribution of pixels")

# Per-channel statistics of the normalized image (reduce over dims 1 and 2)
mean_norm, std_norm = img_normalized.mean([1, 2]), img_normalized.std([1, 2])

print("Mean and Std of normalized image:")
print("Mean of the image:", mean_norm)
print("Std of the image:", std_norm)

# Here we find that after normalization the mean and std of the image are 0.0 and 1.0 respectively.
# This verifies that normalizing an image with its own mean and standard deviation yields a mean of 0 and a standard deviation of 1.

In [None]:
# Visualize the normalized image.
# The normalized tensor is converted to a numpy array and its axes are reordered
# with transpose(1, 2, 0), moving the leading axis to the end before plotting.
normalized_tensor = transform_norm(img)
img_normalized = normalized_tensor.numpy().transpose(1, 2, 0)

plt.imshow(img_normalized)
plt.xticks([])
plt.yticks([])

Copy `cifar10.onnx` from the Ray pod into this notebook's working directory before continuing; the cells below load the model from that file.

# Load the onnx model

In [None]:
# Base name of the exported model and the ONNX file name derived from it
model_name = "cifar10"  # torch.onnx.export with batch
model_file_name = f"{model_name}.onnx"

In [None]:
import onnx
import onnxruntime
import torch
import numpy as np

# Open the ONNX model with the CPU execution provider only
session = onnxruntime.InferenceSession(model_file_name, None, providers=['CPUExecutionProvider'])

# Describe the first input of the model
first_input = session.get_inputs()[0]
input_name, input_shape, input_type = first_input.name, first_input.shape, first_input.type
print("input name", input_name)
print("input shape", input_shape)
print("input type", input_type)

# List every output name, then describe the first output
model_outputs = session.get_outputs()
print([model_output.name for model_output in model_outputs])
first_output = model_outputs[0]
output_name, output_shape, output_type = first_output.name, first_output.shape, first_output.type
print("output name", output_name)
print("output shape", output_shape)
print("output type", output_type)

# Inferencing using onnx model

In [None]:
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# Preprocessing applied to every image before inference: tensor conversion followed
# by per-channel normalization with the constants below.
# NOTE: `transform` and `label_names` are reused by the REST/gRPC cells further down.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.49139968, 0.48215827, 0.44653124),
                         std=(0.24703233, 0.24348505, 0.26158768)),
])

label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# Grid size; the number of evaluated images is derived from it so that the
# accuracy denominator can never disagree with the number of images classified.
rows, columns = 5, 5
total_count = rows * columns
count = 0  # number of misclassified images

# The requested output names do not change between images, so build the list once
output_names = [model_output.name for model_output in session.get_outputs()]

fig = plt.figure(figsize=(10, 10))
for i in range(1, total_count + 1):
    fig.add_subplot(rows, columns, i)
    img, expected = testX[i - 1]

    # Single-image batch of shape (1, 3, 32, 32) fed to the model's input
    batch = np.array(transform(np.array(img))).reshape(1, 3, 32, 32)
    result = session.run(output_names, {input_name: batch})
    actual = np.argmax(result)

    plt.imshow(img)
    plt.xticks([])
    plt.yticks([])
    # Title reads "<expected> (<predicted>)": green when they agree, red otherwise
    plt.title("{} ({})".format(label_names[expected], label_names[actual]),
              color=("green" if expected == actual else "red"))
    if actual != expected:
        count += 1
print('Accuracy:', (1 - count / total_count))
plt.show()

# Fetch the CIFAR10 dataset from OpenML using scikit-learn (Optional)

In [None]:
!pip install scikit-learn matplotlib

In [None]:
from sklearn.datasets import fetch_openml
import numpy as np
import json
import requests

# Fetch the 'CIFAR_10' dataset; return_X_y=True yields a (data, target) pair
cifar_openml = fetch_openml('CIFAR_10', return_X_y=True, parser='auto')
X, y = cifar_openml

## Draw a few images

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# reshape and transpose the images
images = np.array(X[0:25]).reshape(25,3,32,32).transpose(0,2,3,1)
# take labels of the images 
labels = y
# label names of the images
label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# define rows and columns of figure
rows, columns = 5, 5
fig=plt.figure(figsize=(10, 10))
# visualize these first 25 images
for i in range(1, columns*rows +1):
    fig.add_subplot(rows, columns, i)
    plt.imshow(images[i-1].reshape(32,32,3))
    plt.xticks([])
    plt.yticks([])
    plt.title("{}".format(label_names[int(y.iloc[i-1])]))
plt.show()

# Copy onnx model to S3 bucket if not already copied

In [None]:
import os
import boto3

# S3 connection settings come from the environment
key_id = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.environ.get('AWS_S3_ENDPOINT')

# Only the client is needed for the upload. The name `session` is deliberately left
# untouched here: it holds the onnxruntime InferenceSession used by the inference
# cell, which would stop working if this cell rebound it to a boto3 object.
# NOTE(review): verify=False disables TLS certificate verification for the endpoint.
s3_client = boto3.client('s3', aws_access_key_id=key_id, aws_secret_access_key=secret_key,
                         endpoint_url=endpoint_url, verify=False)

# List the available buckets
buckets = s3_client.list_buckets()
for bucket in buckets['Buckets']:
    print(bucket['Name'])

# Upload into the last bucket listed (the same bucket the original loop variable
# ended on), failing with a clear message when there is no bucket at all.
if not buckets['Buckets']:
    raise RuntimeError("No S3 bucket is available to upload the model to")
bucket = buckets['Buckets'][-1]
s3_client.upload_file(model_file_name, bucket['Name'], model_file_name)

# Show the keys now present in that bucket
[item.get("Key") for item in s3_client.list_objects_v2(Bucket=bucket['Name']).get("Contents", [])]

# Convert the model from onnx to OpenVINO IR and copy to S3 bucket

In [None]:
!pip install openvino-dev
!mo --input_model cifar10.onnx

In [None]:
import os
import boto3

# S3 connection settings come from the environment
key_id = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.environ.get('AWS_S3_ENDPOINT')

# Only the client is needed for the upload. The name `session` is deliberately left
# untouched here: it holds the onnxruntime InferenceSession used by the inference
# cell, which would stop working if this cell rebound it to a boto3 object.
# NOTE(review): verify=False disables TLS certificate verification for the endpoint.
s3_client = boto3.client('s3', aws_access_key_id=key_id, aws_secret_access_key=secret_key,
                         endpoint_url=endpoint_url, verify=False)

# Pick the target bucket in this cell instead of relying on a `bucket` variable left
# over from the (optional) ONNX upload cell. The last bucket listed is used.
buckets = s3_client.list_buckets()
if not buckets['Buckets']:
    raise RuntimeError("No S3 bucket is available to upload the model to")
bucket = buckets['Buckets'][-1]

# Upload both IR files under a "<model_name>/" prefix
for extension in (".bin", ".xml"):
    ir_file_name = model_name + extension
    s3_client.upload_file(ir_file_name, bucket['Name'], model_name + "/" + ir_file_name)

# Show the keys now present in that bucket
[item.get("Key") for item in s3_client.list_objects_v2(Bucket=bucket['Name']).get("Contents", [])]

Deploy the model in your Data Science project using the RHODS UI, under "Models and model servers".

# Submit HTTP REST request to ModelMesh for a single sample

In [None]:
model_name = "cifar10"

import requests
import json

# Inference endpoint of the deployed model.
# NOTE(review): the original comment here read "underscore characters are removed",
# presumably referring to the deployed model name -- confirm against the deployment.
URL = 'http://modelmesh-serving.huggingface.svc.cluster.local:8008/v2/models/' + model_name + '/infer'
headers = {"content-type": "application/json"}

# Send the first 10 test images one at a time and compare prediction with label
for imagenum in range(10):
    img, label = testX[imagenum]
    arr = transform(img).reshape(1, 3, 32, 32)
    payload = {
        "inputs": [{"name": "input_0", "shape": (1, 3, 32, 32), "datatype": "FP32", "data": arr.tolist()}]
    }
    res = requests.post(URL, json=payload, headers=headers)
    # Fail with the HTTP error itself instead of a KeyError/JSON error further down
    res.raise_for_status()
    scores = res.json()['outputs'][0]['data']
    print("Expected", label_names[label], ", Actual", label_names[np.argmax(scores)])

# Submit HTTP REST request to ModelMesh for a batch of samples

In [None]:
# Build one batch from the first 10 test images and send it in a single request.
arr = np.array([np.array(transform(testX[i][0])) for i in range(0, 10)])
# Ground-truth labels of the same 10 samples (name kept because the next cell reads it)
actual = [testX[i][1] for i in range(0, 10)]
payload = {
    # The declared shape is taken from the array itself so the two cannot disagree
    "inputs": [{"name": "input_0", "shape": list(arr.shape), "datatype": "FP32", "data": arr.tolist()}]
}
res = requests.post(URL, json=payload, headers=headers)
# Fail here with the HTTP error rather than later while decoding the response
res.raise_for_status()

In [None]:
# Decode the batch response: reshape the flat data to the shape reported by the
# server and take the argmax along axis 1 (one predicted class per sample).
batch_output = res.json()['outputs'][0]
predicted = np.argmax(np.array(batch_output['data']).reshape(batch_output['shape']), axis=1).tolist()

# `actual` was filled with the ground-truth labels when the batch was built, so it
# is printed as the expected values; the model's predictions are printed second.
print("Expected", actual)
print("Actual  ", predicted)

# Submit gRPC request to ModelMesh for a single sample

In [None]:
!pip install grpcio grpcio-tools==1.46.0

In [None]:
# Download the kfs_inference_v2.proto definition from the main branch of the
# kserve/modelmesh-serving repository into the current directory.
!wget https://raw.githubusercontent.com/kserve/modelmesh-serving/main/fvt/proto/kfs_inference_v2.proto
# Run the protoc compiler from grpc_tools on it, writing the generated Python and
# gRPC Python modules to the current directory (imported later as
# kfs_inference_v2_pb2 and kfs_inference_v2_pb2_grpc).
!python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. ./kfs_inference_v2.proto

In [None]:
model_name = "cifar10"

# Use the first test sample for the gRPC request
img, label = testX[0]
print(img, label)
arr = transform(img).reshape(1, 3, 32, 32)

# Single input tensor: the values are sent flattened under "fp32_contents",
# with the intended shape given separately
grpc_input = {
    "name": "input_0",
    "shape": (1, 3, 32, 32),
    "datatype": "FP32",
    "contents": {"fp32_contents": arr.flatten().tolist()},
}
payload = {"model_name": model_name, "inputs": [grpc_input]}

In [None]:
import grpc
import kfs_inference_v2_pb2, kfs_inference_v2_pb2_grpc

grpc_url = "modelmesh-serving.huggingface.svc.cluster.local:8033"

# Build the inference request from the payload prepared in the previous cell
request = kfs_inference_v2_pb2.ModelInferRequest(model_name=model_name, inputs=payload["inputs"])

# Open the (insecure, plaintext) channel as a context manager so that it is closed
# once the call has returned instead of staying open for the life of the kernel.
with grpc.insecure_channel(grpc_url) as grpc_channel:
    grpc_stub = kfs_inference_v2_pb2_grpc.GRPCInferenceServiceStub(grpc_channel)
    response = grpc_stub.ModelInfer(request)

In [None]:
from google.protobuf.json_format import MessageToDict

# Show the container types of the two response fields being read
print(type(response.outputs), type(response.raw_output_contents))

# Metadata of the first output, converted to a plain dict for printing
d = MessageToDict(response.outputs[0])
print(d)

# Raw bytes of the first output; decoded into floats in the next cell
binary_data = bytes(response.raw_output_contents[0])

In [None]:
import struct
import base64

# Interpret the raw output bytes as consecutive little-endian 32-bit floats
FLOAT = 'f'
float_count = len(binary_data) // struct.calcsize(FLOAT)
fmt = f"<{float_count}{FLOAT}"
scores = np.array(struct.unpack(fmt, binary_data))

# The index of the largest value is the predicted class
print("Expected", label, "Actual", np.argmax(scores))