In [1]:
import json
import os
import time

import ray

In [2]:
def pprint(data):
    """Print ``data`` as JSON indented by four spaces.

    Values that ``json`` cannot encode natively are written using their
    ``str()`` form instead of raising ``TypeError``, so status dictionaries
    holding such values can still be displayed.

    Note: this helper shares its name with the standard-library ``pprint``
    module.

    Args:
        data: The object to display, typically a dict.
    """
    print(json.dumps(data, indent=4, default=str))

### CONNECT TO THE RAY CLUSTER

In [3]:
# Connect through Ray Client. Setting the RAY_ADDRESS environment variable
# overrides the endpoint; when it is unset or empty the original address is used.
ray.init(address=os.environ.get('RAY_ADDRESS') or 'ray://193.167.37.127:10001')

2024-11-11 06:21:39,413	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
    Ray: 2.38.0
    Python: 3.12.7
This process on Ray Client was started with:
    Ray: 2.38.0
    Python: 3.12.3



0,1
Python version:,3.12.7
Ray version:,2.38.0
Dashboard:,http://10.151.173.95:8265


### WHAT HARDWARE DOES THE CLUSTER HAVE AVAILABLE?

In [4]:
# Show the resources Ray currently reports as available, formatted as JSON.
pprint(ray.available_resources())

{
    "accelerator_type:G": 4.0,
    "node:__internal_head__": 1.0,
    "node:10.84.60.60": 1.0,
    "node:10.130.233.68": 1.0,
    "CPU": 55.0,
    "memory": 115816633344.0,
    "object_store_memory": 34444921650.0,
    "node:10.84.60.62": 1.0,
    "GPU": 4.0,
    "node:10.151.173.95": 1.0,
    "node:10.151.173.100": 1.0
}


### COMPUTE A LAMBDA FUNC ON THE CLUSTER

In [7]:
@ray.remote
def func():
    """Return the constant string 'foo'."""
    return 'foo'

# Submit the task, then fetch its result through the returned reference.
future = func.remote()
return_value = ray.get(future)

In [6]:
# Inspect the handle stored in `future`: it is an object reference, not the task's result.
future

ClientObjectRef(b85750e2b7774ee6ffffffffffffffffffffffff0d00000001000000)

In [8]:
# The value fetched with ray.get() is the plain string returned by func.
print(return_value)

foo


### EXECUTE MANY LAMBDA FUNCS IN PARALLEL ON THE CLUSTER

In [18]:
@ray.remote
def parallel_func():
    """Sleep for one second, then return a random float between 0 and 1."""
    import random, time

    # SLEEP FOR 1 SECOND TO DEMONSTRATE PARALLELISM
    time.sleep(1)
    return random.uniform(0, 1)

# LAUNCH 10 PARALLEL TASKS & WAIT FOR ALL OF THEM TO FINISH
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
tasks = [parallel_func.remote() for _ in range(10)]

print(ray.get(tasks))
time_delta = round(time.perf_counter() - start_time, 3)
print(f'\nEXPERIMENT TOOK {time_delta} SECONDS')

[0.545285612897436, 0.029824888231838154, 0.9125729910519942, 0.21183493423655753, 0.9021293484819745, 0.18196141929122478, 0.007404437705985423, 0.9669807054804704, 0.09714340487997652, 0.7229129538020123]

EXPERIMENT TOOK 1.069 SECONDS


### SCHEDULE A CLASS INSTANCE ON THE CLUSTER

In [19]:
@ray.remote
class Counter:
    """Integer counter that starts at zero and advances one step at a time."""

    def __init__(self):
        # Every new counter begins at zero.
        self.value = 0

    def increment(self):
        """Add one to the stored value and return the updated total."""
        self.value = self.value + 1
        return self.value

    def get_counter(self):
        """Return the stored value without modifying it."""
        return self.value

In [20]:
# Create an actor from this class; .remote() yields a handle to it rather than a local Counter.
counter = Counter.remote()

In [21]:
# Display the handle bound to `counter`.
counter

ClientActorHandle(54b961fab34f8a6dfd78e8ba0d000000)

### EXECUTE CLASS METHODS REMOTELY

In [22]:
# Invoke increment() on the actor through its handle and fetch the returned value.
# Note: every run of this cell advances the counter again.
future = counter.increment.remote()
next_value = ray.get(future)
print(next_value)

1


### REMOTE LAMBDAS CAN UTILIZE REMOTE CLASSES

In [23]:
@ray.remote
def func(counter):
    """Increment the given Counter actor and return its new value."""
    return ray.get(counter.increment.remote())

# PASS THE DISTRIBUTED COUNTER AS AN ARG
future = func.remote(counter)
return_value = ray.get(future)
print(return_value)

2


### THE EMOJI LIBRARY IS NOT INSTALLED LOCALLY

In [24]:
# The failed import is the point of this cell. Catch it and print the same
# message so that "Run All" continues to the cells below.
try:
    import emoji
except ModuleNotFoundError as err:
    print(f'{type(err).__name__}: {err}')

ModuleNotFoundError: No module named 'emoji'

### INSTALL FUNC-SPECIFIC PIP PACKAGES

In [25]:
@ray.remote(runtime_env={"pip": ["emoji"]})
def func():
    """Return 'Python is ' followed by the rendered thumbs-up emoji."""
    # Imported inside the task: the package is requested through this task's runtime_env.
    import emoji

    message = emoji.emojize('Python is :thumbs_up:')
    return message

print(ray.get(func.remote()))

Python is 👍


### INSTALL CONNECTION-WIDE PIP PACKAGES

In [None]:
# Example only -- not executed here because this notebook is already connected above:
# ray.init(address='ray://193.167.37.127:10001', runtime_env={"pip": ["emoji"]})

### RESOURCE RESERVATION
- `num_cpus=1` means that each func reserves 1 CPU core.
    - CPU-bound tasks are best with 1 core.
    - Tasks with IO often require 2-3 cores to run efficiently.
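- `max_retries=3` means that a `local` task whose worker process dies unexpectedly is automatically retried up to 3 times before the `global` job fails. Exceptions raised by the task's own code are not retried unless `retry_exceptions=True` is also set.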

In [26]:
@ray.remote(num_cpus=1, max_retries=3)
def nth_prime_cpu(num_targets=10, lower=100000, upper=150000):
    """CPU-bound benchmark: find several n-th primes and time the search.

    Draws ``num_targets`` random values of n from ``range(lower, upper)`` and,
    for each one, walks the integers upward with trial division until n
    primes have been counted. The primes themselves are discarded.

    Args:
        num_targets: How many random n values to process (default 10).
        lower: Smallest n that may be drawn, inclusive (default 100000).
        upper: Upper bound for n, exclusive (default 150000).

    Returns:
        Elapsed time of the search in seconds, as a float.

    Raises:
        ValueError: If ``range(lower, upper)`` is empty while
            ``num_targets`` is positive (raised by ``random.randrange``).
    """
    import random, time

    targets = [random.randrange(lower, upper) for _ in range(num_targets)]
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    def is_prime(num):
        # Trial division by every integer from 2 up to the square root.
        if num < 2:
            return False
        for i in range(2, int(num**0.5) + 1):
            if num % i == 0:
                return False
        return True

    # START COMPUTING
    for nth_prime in targets:
        count = 0
        num = 1

        # Advance until `count` primes have been seen; `num` is then the n-th prime.
        while count < nth_prime:
            num += 1
            if is_prime(num):
                count += 1

    return time.perf_counter() - start_time

In [27]:
# Run the benchmark once as a single task and print the seconds it reports.
print(ray.get(nth_prime_cpu.remote()))

23.899380207061768


### LAUNCH PARALLEL TASKS ON THE CLUSTER

In [29]:
# Re-check the resources Ray reports as available before launching the parallel run.
pprint(ray.available_resources())

{
    "accelerator_type:G": 4.0,
    "node:__internal_head__": 1.0,
    "node:10.84.60.60": 1.0,
    "node:10.130.233.68": 1.0,
    "CPU": 55.0,
    "object_store_memory": 34444902323.0,
    "memory": 115816633344.0,
    "node:10.84.60.62": 1.0,
    "GPU": 4.0,
    "node:10.151.173.95": 1.0,
    "node:10.151.173.100": 1.0
}


In [28]:
# One task per CPU: the resource listing above reports 55 available CPUs and
# each task reserves one (num_cpus=1).
num_tasks = 55
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()

tasks = [nth_prime_cpu.remote() for _ in range(num_tasks)]
results = ray.get(tasks)
delta_time = round(time.perf_counter() - start_time, 3)

print(f'\nEXPERIMENT TOOK {delta_time} SECONDS')


EXPERIMENT TOOK 88.626 SECONDS


### GRAFANA OUTPUT

<center><img src="cpu2.png" /></center>