In this notebook, we will go over how to use the SDK to work interactively with a Ray cluster during development.

In [None]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication

In [None]:
# Create authentication object for user permissions
# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config
# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually
import os
from getpass import getpass

# Credentials must never be hardcoded in a notebook: they end up in version control
# and in shared copies. Any token that has been committed must be revoked.
# The token is read from the OPENSHIFT_TOKEN environment variable; if it is unset
# (or empty) the user is prompted for it without echoing the value.
auth = TokenAuthentication(
    token=os.environ.get("OPENSHIFT_TOKEN") or getpass("OpenShift API token: "),
    # The API server can be overridden with OPENSHIFT_SERVER; the default is the demo cluster.
    server=os.environ.get("OPENSHIFT_SERVER", "https://api.demo-01-rhsys.wzhlab.top:6443"),
    # NOTE(review): skip_tls=True presumably disables TLS verification of the API server;
    # acceptable for a demo cluster with self-signed certificates only - confirm before reuse.
    skip_tls=True,
)
auth.login()

Once again, let's start by running through the same cluster setup as before:

NOTE: The default images used by the CodeFlare SDK for creating a RayCluster resource depend on the installed Python version:

- For Python 3.9: 'quay.io/modh/ray:2.35.0-py39-cu121'
- For Python 3.11: 'quay.io/modh/ray:2.35.0-py311-cu121'

If you prefer to use a custom Ray image that better suits your needs, you can specify it in the image field to override the default.

In [None]:
# Create and configure our cluster object
# The SDK will try to find the name of your default local queue based on the annotation "kueue.x-k8s.io/default-queue": "true" unless you specify the local queue manually below

# This name is reused further down when generating the TLS certificate for the Ray client.
cluster_name = "llama-factory-test"

cluster_config = ClusterConfiguration(
    name=cluster_name,
    # Optional field: custom Ray image that overrides the SDK default
    image="quay.io/wangzheng422/qimgs:llama-factory-ray-20250103-v01",
    # Head node resources
    head_cpu_requests=1,
    head_cpu_limits=1,
    head_memory_requests=6,
    head_memory_limits=6,
    # For GPU enabled workloads set the head_extended_resource_requests and
    # worker_extended_resource_requests to a non-zero GPU count
    head_extended_resource_requests={'nvidia.com/gpu': 0},
    # Worker node resources
    num_workers=2,
    worker_cpu_requests='250m',
    worker_cpu_limits=1,
    worker_memory_requests=4,
    worker_memory_limits=6,
    worker_extended_resource_requests={'nvidia.com/gpu': 0},
    # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources
    write_to_file=False,
    # local_queue="local-queue-name" # Specify the local queue manually
)
cluster = Cluster(cluster_config)

In [None]:
# Bring up the cluster, then wait for it to become ready so that the
# following cells are not run against a cluster that is still starting.
cluster.up()
cluster.wait_ready()

In [None]:
# Show the details of the cluster we just brought up
cluster.details()

This time we will demonstrate another potential method of use: working with the Ray cluster interactively.

Using the SDK, we can get both the Ray cluster URI and dashboard URI:

In [None]:
# Ask the SDK for both endpoints; ray_cluster_uri is what the Ray client connects to below.
ray_dashboard_uri, ray_cluster_uri = (
    cluster.cluster_dashboard_uri(),
    cluster.cluster_uri(),
)
# Print the dashboard URI first, then the cluster URI, one per line.
for uri in (ray_dashboard_uri, ray_cluster_uri):
    print(uri)

Now we can connect directly to our Ray cluster via the Ray python client:

In [None]:
from codeflare_sdk import generate_cert

# Both helpers take the cluster name and the namespace the cluster was created in.
namespace = cluster.config.namespace

# Create required TLS cert ...
generate_cert.generate_tls_cert(cluster_name, namespace)
# ... and export the environment variables to enable TLS
generate_cert.export_env(cluster_name, namespace)

In [None]:
# Before proceeding make sure the cluster exists and the URI is not empty
assert ray_cluster_uri, "Ray cluster needs to be started and set before proceeding"

import ray

# Reset the Ray context in case there's already one (e.g. this cell is being re-run).
ray.shutdown()

# runtime_env lists additional libraries to install on the cluster for this session.
# It is left empty here, so nothing extra is installed
# (presumably the custom image already contains the training dependencies - confirm).
# Example with pinned packages:
# runtime_env = {"pip": ["transformers==4.41.2", "datasets==2.17.0", "accelerate==0.31.0", "scikit-learn==1.5.0"]}
runtime_env = {}
# Establish the connection to the Ray cluster.
# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines
# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb
ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

Now that we are connected, let's define a Ray actor that can report the IP address of the node it runs on and launch the LLaMA-Factory training command there:

In [None]:
import os
import subprocess
import re

@ray.remote(memory=4 * 1024 * 1024 * 1024)  # 4 GB memory limit
class NetworkCommandActor:
    """Ray actor that inspects its node's network and launches the training command.

    Both methods report failures by returning a descriptive string instead of
    raising, so the caller always receives a value from ``ray.get``.
    """

    def get_eth0_ip(self):
        """Return the first IPv4 address of the ``eth0`` interface.

        Returns:
            The dotted-quad address as a string, ``"IP address not found"`` when
            the ``ip`` output contains no IPv4 address, or a string starting with
            ``"Error getting IP address: "`` when the ``ip`` command fails or
            cannot be started.
        """
        try:
            result = subprocess.run(['ip', 'a', 'show', 'eth0'], capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing `ip` binary (FileNotFoundError), which
            # would otherwise escape as an unhandled exception.
            return f"Error getting IP address: {e}"

        ip_match = re.search(r'inet (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', result.stdout)
        if ip_match:
            return ip_match.group(1)
        return "IP address not found"

    def execute_command(self, ip_address, nnodes, node_rank):
        """Run the LLaMA-Factory training command for one node of a multi-node job.

        Args:
            ip_address: Value exported as ``MASTER_ADDR``.
            nnodes: Value exported as ``NNODES``.
            node_rank: Value exported as ``NODE_RANK``.

        Returns:
            ``"Command executed successfully"`` when the process exits with
            status 0, otherwise a string starting with
            ``"Error executing command: "``.
        """
        # Pass the settings through the environment rather than interpolating
        # them into a shell string, so no argument is ever parsed by a shell.
        env = dict(
            os.environ,
            FORCE_TORCHRUN="1",
            NNODES=str(nnodes),
            NODE_RANK=str(node_rank),
            MASTER_ADDR=str(ip_address),
            MASTER_PORT="29500",
        )
        command = ['llamafactory-cli', 'train', 'examples/train_full/llama3_full_sft.yaml']
        try:
            completed = subprocess.run(command, env=env)
        except OSError as e:
            return f"Error executing command: {e}"

        # A non-zero exit status does not raise by itself, so it has to be
        # checked explicitly; otherwise a failed run is reported as a success.
        if completed.returncode != 0:
            return f"Error executing command: exit status {completed.returncode}"
        return "Command executed successfully"

Once we want to test our code, we can create instances of the actor defined above on our Ray cluster and call their methods remotely:

In [None]:
# Create two instances of the actor defined in the cell above on the Ray cluster
import ipaddress

actor1 = NetworkCommandActor.remote()
actor2 = NetworkCommandActor.remote()

# NOTE(review): nothing here forces the two actors onto different nodes -
# the two printed IPs should differ; confirm placement before a real run.
ip1, ip2 = ray.get([actor1.get_eth0_ip.remote(), actor2.get_eth0_ip.remote()])

print(f"Actor 1 IP: {ip1}")
print(f"Actor 2 IP: {ip2}")

# get_eth0_ip reports failures as plain strings, so make sure we really got an
# address before handing it to the training processes as the rendezvous endpoint.
try:
    ipaddress.ip_address(ip1)
except ValueError as err:
    raise RuntimeError(f"Cannot use {ip1!r} as MASTER_ADDR: {err}") from err

# Define nnodes and node_rank
nnodes = 2  # Assuming there are 2 nodes
node_rank1 = 0  # Rank for actor 1
node_rank2 = 1  # Rank for actor 2

# Every node must point at the SAME master, i.e. the node that runs rank 0.
master_addr = ip1

# Launch both ranks before waiting on either of them: rank 0 blocks until
# rank 1 has joined, so waiting for rank 0 first would never finish.
future1 = actor1.execute_command.remote(master_addr, nnodes, node_rank1)
future2 = actor2.execute_command.remote(master_addr, nnodes, node_rank2)
result1, result2 = ray.get([future1, future2])

print(f"Actor 1 command result: {result1}")
print(f"Actor 2 command result: {result2}")

Once complete, we can bring our Ray cluster down and clean up:

In [None]:
# Bring the Ray cluster down
cluster.down()

In [None]:
# Log out the authentication object created at the top of the notebook
auth.logout()