In [14]:
import os
import sys
from getpass import getpass

from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication

In [15]:
# Create the training and evaluation datasets.
# This only needs to be run once; later runs of the notebook can skip this cell.
# The `datasets` package is installed into the interpreter that runs this
# kernel (`sys.executable`) before the local `create_dataset` module is imported.
# NOTE(review): the install is not version-pinned — consider pinning for reproducibility.
!{sys.executable} -m pip install datasets
import create_dataset
# presumably writes the GSM8K question/answer datasets used by the fine-tuning job — verify in create_dataset.py
create_dataset.gsm8k_qa_no_tokens_template()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [16]:
# Authenticate the CodeFlare SDK
# On OpenShift, you can retrieve the token by running `oc whoami -t`,
# and the server with `oc cluster-info`.
#
# The token is never stored in the notebook: it is read from the
# OPENSHIFT_TOKEN environment variable when set, otherwise it is requested
# through a hidden interactive prompt.
auth = TokenAuthentication(
    token=os.environ.get('OPENSHIFT_TOKEN') or getpass('OpenShift API token: '),
    server='https://api.ac10-ocp.fpb.local:6443',
    # NOTE(review): TLS verification is skipped here — confirm this is
    # acceptable for the target cluster.
    skip_tls=True
)
# The last expression is displayed as the cell output (the login message).
auth.login()



'Logged into https://api.ac10-ocp.fpb.local:6443'

In [17]:
# List the local queues visible to the authenticated user; the cell output
# shows each queue's name together with its resource flavors.
import codeflare_sdk 
codeflare_sdk.list_local_queues()

[{'name': 'ft-ray-queue', 'flavors': ['default-flavor']}]

In [18]:
# Configure the Ray cluster
# NOTE: If running outside of RHOAI notebooks, add the following line to the cluster configuration:
# namespace="rhods-notebooks"
cluster_config = ClusterConfiguration(
    name='ray1',
    num_workers=2,
    # Head node resources
    head_cpu_requests=16,
    head_cpu_limits=16,
    head_memory_requests=128,
    head_memory_limits=256,
    # Worker node resources (applied to each of the `num_workers` workers)
    worker_cpu_requests=16,
    worker_cpu_limits=16,
    worker_memory_requests=128,
    worker_memory_limits=256,
    # Use the following parameters with NVIDIA GPUs
    # Ensure the Python version in the notebook image matches the version used in the Ray cluster to avoid compatibility issues
    image="quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26",
    head_extended_resource_requests={'nvidia.com/gpu': 1},
    worker_extended_resource_requests={'nvidia.com/gpu': 1},
    # Or replace them with these parameters for AMD GPUs
    # image="quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26",
    # head_extended_resource_requests={'amd.com/gpu':1},
    # worker_extended_resource_requests={'amd.com/gpu':1},
)

# Building the Cluster object only prepares the resources; nothing is
# created on the platform until `cluster.up()` is called.
cluster = Cluster(cluster_config)

Yaml resources loaded for ray1


VBox(children=(HBox(children=(Button(description='Cluster Up', icon='play', style=ButtonStyle(), tooltip='Crea…

Output()

In [19]:
# Create the Ray cluster
# This submits the cluster defined above; readiness is checked separately
# with `cluster.wait_ready()`.
cluster.up()

Ray Cluster: 'ray1' has successfully been created


In [20]:
# Wait until the requested cluster resources are up and the dashboard is ready.
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster is up and running!
Dashboard is ready!


In [21]:
# Show the cluster's status, resource requests/limits, namespace and dashboard URL.
cluster.details()

RayCluster(name='ray1', status=<RayClusterStatus.READY: 'ready'>, head_cpu_requests=16, head_cpu_limits=16, head_mem_requests='128G', head_mem_limits='256G', num_workers=2, worker_mem_requests='128G', worker_mem_limits='256G', worker_cpu_requests=16, worker_cpu_limits=16, namespace='ft-ray', dashboard='https://ray-dashboard-ray1-ft-ray.apps.ac10-ocp.fpb.local', worker_extended_resources={'nvidia.com/gpu': 1}, head_extended_resources={'nvidia.com/gpu': 1})

In [25]:
# Initialize the Job Submission Client
# `client` is used below to submit and stop the fine-tuning job on this cluster.
client = cluster.job_client

In [26]:
# Storage configuration
#
# The S3 bucket where checkpoints are stored.
# It can be set manually here; when left empty it is taken from the
# AWS_S3_BUCKET environment variable (retrieved from the configured data
# connection). Without any bucket, the local path below is used instead.
s3_bucket = ''
s3_bucket = s3_bucket or os.environ.get('AWS_S3_BUCKET')

storage_path = f's3://{s3_bucket}' if s3_bucket else '/opt/app-root/src'

In [27]:
# Submit Ray job
#
# Environment variables passed to the job. HF_HOME places the Hugging Face
# cache under the configured storage path.
# NOTE(review): with S3 storage HF_HOME becomes an s3:// URL — confirm the
# Hugging Face cache can use it.
env_vars = {'HF_HOME': f'{storage_path}/.cache'}

# When checkpoints go to S3, forward the AWS credentials from the notebook
# environment. Variables that are not set are skipped, since passing `None`
# as an env_vars value is not a valid string.
if storage_path.startswith('s3://'):
    for aws_var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION'):
        aws_value = os.environ.get(aws_var)
        if aws_value:
            env_vars[aws_var] = aws_value

# NOTE(review): --num-devices=8, while the cluster configured in this notebook
# requests 3 GPUs in total (head + 2 workers, 1 GPU each) — confirm the job
# can actually be scheduled with these settings.
submission_id = client.submit_job(
    entrypoint="python ray_finetune_llm_deepspeed.py "
               "--model-name=meta-llama/Meta-Llama-3.1-8B "
               "--lora "
               "--num-devices=8 "
               "--num-epochs=3 "
               "--ds-config=./deepspeed_configs/zero_3_offload_optim_param.json "
               f"--storage-path={storage_path}/ray_finetune_llm_deepspeed/ "
               "--batch-size-per-device=32 "
               "--eval-batch-size-per-device=32 ",
    runtime_env={
        "env_vars": env_vars,
        # Dependencies and working directory come from the notebook's folder;
        # docs, notebooks and markdown files are excluded from the upload.
        'pip': 'requirements.txt',
        'working_dir': './',
        "excludes": ["/docs/", "*.ipynb", "*.md"]
    },
)
# Keep the submission ID visible: it is needed to stop the job later.
print(submission_id)

2025-06-02 03:27:53,806	INFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_52b0f31952ae7a23.zip already exists, skipping upload.


raysubmit_usLE9m1eahwjzfRC


In [12]:
# Stop the job submitted above, identified by its submission ID.
client.stop_job(submission_id)

True

In [13]:
# Delete the Ray cluster once it is no longer needed.
cluster.down()

Ray Cluster: 'ray1' has successfully been deleted
