In this notebook, we will go over how to use the SDK to work interactively with a Ray cluster during development.

In [17]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication

In [18]:
# Print the installed codeflare-sdk version so this run can be tied to a specific SDK release
import codeflare_sdk
print(codeflare_sdk.__version__)

0.23.1


In [19]:
# Create authentication object for user permissions
# If unused, the SDK will automatically check for the default kubeconfig, then the in-cluster config
# KubeConfigFileAuthentication can also be used to specify the kubeconfig path manually
import os
from getpass import getpass

# Never commit a bearer token to the notebook: take it from the OPENSHIFT_TOKEN
# environment variable, or prompt for it (input is not echoed) when the variable is unset.
# OPENSHIFT_API_SERVER optionally overrides the API endpoint.
auth = TokenAuthentication(
    token=os.environ.get("OPENSHIFT_TOKEN") or getpass("OpenShift API token: "),
    server=os.environ.get("OPENSHIFT_API_SERVER", "https://api.demo-01-rhsys.wzhlab.top:6443"),
    # NOTE(review): TLS verification is skipped, as in the original demo setup;
    # set this to False when the API server presents a trusted certificate.
    skip_tls=True,
)
auth.login()



'Logged into https://api.demo-01-rhsys.wzhlab.top:6443'

Once again, let's start by running through the same cluster setup as before:

NOTE: The default images used by the CodeFlare SDK for creating a RayCluster resource depend on the installed Python version:

- For Python 3.9: 'quay.io/modh/ray:2.35.0-py39-cu121'
- For Python 3.11: 'quay.io/modh/ray:2.35.0-py311-cu121'

If you prefer to use a custom Ray image that better suits your needs, you can specify it in the image field to override the default.

In [20]:
# Describe the Ray cluster we want, then wrap that description in a Cluster object.
# Unless local_queue is given explicitly, the SDK will try to find the name of your default
# local queue based on the annotation "kueue.x-k8s.io/default-queue": "true".
cluster_name = "llama-factory-test"
cluster_config = ClusterConfiguration(
    name=cluster_name,
    image="quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v06",  # optional: overrides the SDK default image
    # --- head node ---
    head_cpu_requests=1,
    head_cpu_limits=1,
    head_memory_requests=6,
    head_memory_limits=6,
    # For GPU enabled workloads raise the GPU counts in both *_extended_resource_requests
    head_extended_resource_requests={'nvidia.com/gpu':0},
    # --- worker nodes ---
    num_workers=2,
    worker_cpu_requests='2',
    worker_cpu_limits=8,
    worker_memory_requests=4,
    worker_memory_limits=6,
    worker_extended_resource_requests={'nvidia.com/gpu':0},
    # --- SDK behaviour ---
    write_to_file=False,  # when enabled, Ray Cluster yaml files are written to /HOME/.codeflare/resources
    verify_tls=False,
    # local_queue="local-queue-name",  # specify the local queue manually
)
cluster = Cluster(cluster_config)

Yaml resources loaded for llama-factory-test


VBox(children=(HBox(children=(Button(description='Cluster Up', icon='play', style=ButtonStyle(), tooltip='Crea…

Output()

In [21]:
# Bring up the cluster
cluster.up()
# Wait for the cluster before moving on; in the recorded run this returns once the
# requested resources are up and the dashboard is ready
cluster.wait_ready()

Ray Cluster: 'llama-factory-test' has successfully been created
Waiting for requested resources to be set up...
Requested cluster is up and running!
Dashboard is ready!


In [22]:
# Show the cluster's current status together with its head/worker resource settings
cluster.details()

RayCluster(name='llama-factory-test', status=<RayClusterStatus.READY: 'ready'>, head_cpu_requests=1, head_cpu_limits=1, head_mem_requests='6G', head_mem_limits='6G', num_workers=2, worker_mem_requests='4G', worker_mem_limits='6G', worker_cpu_requests='2', worker_cpu_limits=8, namespace='rhods-notebooks', dashboard='https://ray-dashboard-llama-factory-test-rhods-notebooks.apps.demo-01-rhsys.wzhlab.top', worker_extended_resources={'nvidia.com/gpu': 0}, head_extended_resources={'nvidia.com/gpu': 0})

This time we will demonstrate another potential method of use: working with the Ray cluster interactively.

Using the SDK, we can get both the Ray cluster URI and dashboard URI:

In [23]:
# Ask the SDK for both endpoints: the dashboard URL (https://...) and the
# Ray client address (ray://...) that ray.init() is pointed at later on
ray_dashboard_uri = cluster.cluster_dashboard_uri()
ray_cluster_uri = cluster.cluster_uri()
print(ray_dashboard_uri)
print(ray_cluster_uri)

https://ray-dashboard-llama-factory-test-rhods-notebooks.apps.demo-01-rhsys.wzhlab.top
ray://llama-factory-test-head-svc.rhods-notebooks.svc:10001


Now we can connect directly to our Ray cluster via the Ray python client:

In [24]:
from codeflare_sdk import generate_cert
# Create required TLS cert and export the environment variables to enable TLS.
# Both helpers are given the cluster name and the namespace stored in cluster.config;
# run this cell before connecting with the Ray client.
generate_cert.generate_tls_cert(cluster_name, cluster.config.namespace)
generate_cert.export_env(cluster_name, cluster.config.namespace)

In [25]:
# before proceeding make sure the cluster exists and the uri is not empty
assert ray_cluster_uri, "Ray cluster needs to be started and set before proceeding"

import ray

# reset the ray context in case there's already one.
ray.shutdown()

# Additional libraries required for model training can be requested through runtime_env, e.g.
# runtime_env = {"pip": ["transformers==4.41.2", "datasets==2.17.0", "accelerate==0.31.0", "scikit-learn==1.5.0"]}
# It is left empty here, so no extra packages are requested.
runtime_env = {}

# establish connection to ray cluster
# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines
# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb
ray.init(address=ray_cluster_uri, runtime_env=runtime_env, ignore_reinit_error=True)

print("Ray cluster is up and running: ", ray.is_initialized())

2025-01-07 16:29:14,056	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: ignore_reinit_error
SIGTERM handler is not set because current thread is not the main thread.


Ray cluster is up and running:  True


Now that we are connected (no extra package requirements are passed in here, since `runtime_env` is left empty), let's define a Ray actor that launches the training command:

In [26]:
import os
import subprocess
import re

# memory= asks the Ray scheduler to reserve 4 GiB for each actor; it is a scheduling
# request, not an enforced cap on the process.
@ray.remote(memory=4 * 1024 * 1024 * 1024)
class NetworkCommandActor:
    """Ray actor that runs network-lookup and training shell commands on its host node.

    Every public method reports failure by returning a descriptive string instead of
    raising, so callers that need a real value must check what they get back.
    """

    def get_eth0_ip(self):
        """Return the first IPv4 address reported for ``eth0`` by ``ip a show eth0``.

        Returns:
            str: The dotted-quad address on success, ``"IP address not found"`` when the
            command output has no ``inet`` entry, or ``"Error getting IP address: ..."``
            when the ``ip`` command fails or cannot be launched at all.
        """
        try:
            result = subprocess.run(['ip', 'a', 'show', 'eth0'], capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable `ip` binary, which would
            # otherwise escape as an exception although every other failure is a string.
            return f"Error getting IP address: {e}"

        ip_match = re.search(r'inet (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', result.stdout)
        if ip_match:
            return ip_match.group(1)
        return "IP address not found"

    def execute_short_command(self, ip_address, nnodes, node_rank):
        """Run the training command as a plain single-process launch.

        ``ip_address``, ``nnodes`` and ``node_rank`` are accepted only so the signature
        matches ``execute_command``; they are not used.

        Returns:
            str: Captured stdout on success, otherwise an error description.
        """
        # For images that keep the tooling in a virtualenv, prefix the command with:
        #   source /opt/py_env/bin/activate; cd /app;
        command = 'llamafactory-cli train wzh/tinyllama_lora_sft.yaml'
        return self._run_command_in_host_env(command)

    def execute_command(self, ip_address, nnodes, node_rank):
        """Run the training command as one rank of a multi-node torchrun launch.

        Args:
            ip_address: Value exported as MASTER_ADDR (the rendezvous host).
            nnodes: Value exported as NNODES.
            node_rank: Value exported as NODE_RANK.

        Returns:
            str: Captured stdout on success, otherwise an error description.
        """
        import shlex

        # The values are spliced into a shell command line, so quote them: anything
        # containing spaces or shell metacharacters (for instance an error string from
        # get_eth0_ip) must stay a single token rather than be re-parsed by the shell.
        master_addr = shlex.quote(str(ip_address))
        node_count = shlex.quote(str(nnodes))
        rank = shlex.quote(str(node_rank))

        # For images that keep the tooling in a virtualenv, prefix the command with:
        #   source /opt/py_env/bin/activate; cd /app;
        command = (
            f'FORCE_TORCHRUN=1 NNODES={node_count} NODE_RANK={rank} MASTER_ADDR={master_addr} '
            'MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'
        )
        return self._run_command_in_host_env(command)

    def _run_command_in_host_env(self, command):
        """Run ``command`` through the shell and summarise the outcome as a string.

        Returns:
            str: stdout when the exit status is 0, ``"Error executing command: <stderr>"``
            for a non-zero exit status, or ``"Unexpected error: ..."`` if the command
            could not be run.
        """
        try:
            # Run the command in a new shell
            result = subprocess.run(command, shell=True, capture_output=True, text=True)
            if result.returncode == 0:
                return result.stdout  # Command executed successfully
            else:
                return f"Error executing command: {result.stderr}"
        except Exception as e:
            return f"Unexpected error: {e}"


When we want to test our code, we can run the actor we defined above remotely on our Ray cluster:

In [27]:
import ipaddress

# Start two instances of the NetworkCommandActor defined above on the Ray cluster
actor1 = NetworkCommandActor.remote()
actor2 = NetworkCommandActor.remote()

ip1 = ray.get(actor1.get_eth0_ip.remote())
ip2 = ray.get(actor2.get_eth0_ip.remote())

print(f"Actor 1 IP: {ip1}")
print(f"Actor 2 IP: {ip2}")

# get_eth0_ip reports failures as plain strings rather than raising. Actor 1's address
# becomes MASTER_ADDR for both ranks, so stop here instead of launching a distributed
# job against something that is not an IP address.
try:
    ipaddress.ip_address(ip1)
except ValueError as err:
    raise RuntimeError(f"Actor 1 did not report a usable eth0 address: {ip1!r}") from err

# Define nnodes and node_rank
nnodes = 2  # Assuming there are 2 nodes
node_rank1 = 0  # Rank for actor 1
node_rank2 = 1  # Rank for actor 2

# Call the remote functions to execute commands; both ranks rendezvous at actor 1's address
result1_future = actor1.execute_command.remote(ip1, nnodes, node_rank1)
result2_future = actor2.execute_command.remote(ip1, nnodes, node_rank2)

# Retrieve the results (will wait for both to complete)
result1, result2 = ray.get([result1_future, result2_future])

print(f"Actor 1 command result: {result1}")
print(f"Actor 2 command result: {result2}")

Actor 1 IP: 10.132.0.152
Actor 2 IP: 10.132.0.153
[2025-01-07 16:29:35,124] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)
[INFO|2025-01-07 16:29:39] llamafactory.cli:157 >> Initializing distributed tasks at: 10.132.0.152:29500
[2025-01-07 16:29:50,354] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)
[INFO|2025-01-07 16:29:52] llamafactory.hparams.parser:355 >> Process rank: 0, device: cpu:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2025-01-07 16:29:52] llamafactory.data.template:157 >> Add pad token: </s>
[INFO|2025-01-07 16:29:52] llamafactory.data.loader:157 >> Loading dataset identity.json...
[INFO|2025-01-07 16:29:53] llamafactory.data.loader:157 >> Loading dataset alpaca_en_demo.json...
training example:
input_ids:
[1, 836, 1389, 2313, 31908, 23980, 836, 31873, 1389, 2313, 31908, 16644, 31905, 312, 705, 16717, 3227, 28035, 363, 7421, 8825, 3321, 417, 16717, 

Once complete, we can bring our Ray cluster down and clean up:

In [15]:
# Delete the Ray cluster now that the interactive work is finished
cluster.down()

Ray Cluster: 'llama-factory-test' has successfully been deleted


In [16]:
# Log out of the API server that auth.login() connected to
auth.logout()

'Successfully logged out of https://api.demo-01-rhsys.wzhlab.top:6443'