In [None]:
# !pip install ray ray[tune]

In [1]:
import ray, json, math, re

In [2]:
def available_gpus():
    """Return the GPU count Ray currently reports as available, or 0 if none."""
    gpu_count = ray.available_resources().get('GPU')
    return int(gpu_count) if gpu_count else 0

In [3]:
def pprint(data):
    """Print ``data`` serialised as JSON with a four-space indent."""
    formatted = json.dumps(data, indent=4)
    print(formatted)

In [4]:
# Connect this notebook to the remote cluster through Ray Client (ray:// address).
ray.init(address='ray://193.167.37.127:10001')

2024-11-08 02:24:34,607	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.
    Ray: 2.38.0
    Python: 3.12.7
This process on Ray Client was started with:
    Ray: 2.38.0
    Python: 3.12.3



0,1
Python version:,3.12.7
Ray version:,2.38.0
Dashboard:,http://10.151.173.92:8265


# NAIVE MASS-GPU COMPUTATION

In [None]:
# @ray.remote(num_gpus=1, scheduling_strategy="DEFAULT")
# @ray.remote(num_cpus=1, scheduling_strategy="SPREAD")

In [5]:
@ray.remote(num_gpus=1)
def gpu_stress_test():
    """Run a fixed matrix-multiplication workload on CUDA device 0 and time it.

    The task reserves one GPU through ``num_gpus=1``.

    Returns:
        tuple: ``(gpu_name, elapsed_seconds)``. ``gpu_name`` is the string
        reported by ``torch.cuda.get_device_name(0)``; ``elapsed_seconds``
        covers allocating the two input matrices plus the multiplication loop.
    """
    # Local import: resolved when the task body executes.
    import torch, time
    torch.cuda.set_device(0)

    # FIND WHAT GPU WE'RE USING
    my_gpu = torch.cuda.get_device_name(0)

    MATRIX_SIZE, ITERATIONS = 4096, 100
    start_time = time.time()

    matrix_a = torch.randn((MATRIX_SIZE, MATRIX_SIZE), device='cuda')
    matrix_b = torch.randn((MATRIX_SIZE, MATRIX_SIZE), device='cuda')

    for _ in range(ITERATIONS):
        result = torch.matmul(matrix_a, matrix_b)

        # DROP THE PRODUCT AND HAND ITS MEMORY BACK BEFORE THE NEXT ROUND
        del result
        torch.cuda.empty_cache()

    # CUDA KERNELS ARE LAUNCHED ASYNCHRONOUSLY -- WAIT FOR ALL QUEUED WORK
    # SO THE TIMER MEASURES EXECUTION, NOT JUST KERNEL LAUNCHES
    torch.cuda.synchronize()

    return (my_gpu, time.time() - start_time)

In [7]:
# THE NAIVE RUN REQUIRES ALL THREE GPUS TO BE FREE
assert available_gpus() == 3, 'EXPECTED AMOUNT OF GPUS NOT AVAILABLE'

# SUBMIT EVERY STRESS TEST AT ONCE, THEN BLOCK UNTIL ALL OF THEM RETURN
NUM_TASKS = 100
tasks = [gpu_stress_test.remote() for _ in range(NUM_TASKS)]
task_durations = ray.get(tasks)

### EXPERIMENT GRAFANA OUTPUT

<center><img src="naive_profile.png" /></center>

### HOW MUCH BETTER IS A 4090 THAN A TITAN?

In [10]:
def avg_times(task_durations):
    """Summarise task durations per GPU model.

    Args:
        task_durations: iterable of ``(gpu_name, duration)`` pairs.

    Returns:
        dict with two keys:
            ``'avg_durations'``: gpu_name -> mean duration, rounded to 3 decimals.
            ``'ratios'``: gpu_name -> (slowest average / this GPU's average),
            rounded to 3 decimals, so the slowest GPU gets 1.0.
        Both inner dicts are empty when ``task_durations`` is empty.

    Raises:
        ZeroDivisionError: if a GPU's rounded average duration is 0.
    """
    # AGGREGATE EACH SUBTASK DURATION PER GPU
    durations_by_gpu = {}
    for gpu_name, task_duration in task_durations:
        durations_by_gpu.setdefault(gpu_name, []).append(task_duration)

    # NOTHING WAS MEASURED -- max() BELOW WOULD FAIL ON AN EMPTY SEQUENCE
    if not durations_by_gpu:
        return {'avg_durations': {}, 'ratios': {}}

    # FIND THE AVERAGE DURATION FOR EACH GPU
    averages = {
        gpu: round(sum(values) / len(values), 3)
        for gpu, values in durations_by_gpu.items()
    }

    # WHAT GPU IS SLOWEST
    slowest_duration = max(averages.values())

    # FIND RATIOS (COMPUTED FROM THE ROUNDED AVERAGES)
    ratios = {
        gpu: round(slowest_duration / avg_value, 3)
        for gpu, avg_value in averages.items()
    }

    return {
        'avg_durations': averages,
        'ratios': ratios
    }

In [11]:
# Summarise the naive run: average task duration per GPU model and the
# slowest-to-each ratio, then print the result as JSON.
naive_analysis = avg_times(task_durations)
pprint(naive_analysis)

{
    "avg_durations": {
        "NVIDIA GeForce RTX 4090": 0.468,
        "NVIDIA GeForce GTX TITAN X": 3.069
    },
    "ratios": {
        "NVIDIA GeForce RTX 4090": 6.558,
        "NVIDIA GeForce GTX TITAN X": 1.0
    }
}


# OPTIMIZED GPU SOLUTION

### REGISTER ONE GPU_ACTOR ON EACH CLUSTER NODE WITH A GPU

In [12]:
@ray.remote(num_gpus=1)
class gpu_actor:
    """Ray actor that reserves one GPU and runs timed matmul workloads on it.

    Attributes:
        my_gpu: device name reported by ``torch.cuda.get_device_name(0)``,
            captured once when the actor is constructed.
    """

    def __init__(self):
        self.my_gpu = self.detect_gpu()

    # DETECT WHAT GPU ACTOR HAS RESERVED
    def detect_gpu(self):
        """Return the name of CUDA device 0 as seen by this actor."""
        import torch
        return torch.cuda.get_device_name(0)

    # DO MATRIX MULTIPLICATION TO STRESSTEST
    def perform_work(self):
        """Multiply two random 4096x4096 matrices 100 times and time the loop.

        Returns:
            tuple: ``(gpu_name, elapsed_seconds)``; the elapsed time covers
            only the multiplication loop, not the matrix allocation.
        """
        import torch, time

        MATRIX_SIZE, ITERATIONS = 4096, 100

        matrix_a = torch.randn((MATRIX_SIZE, MATRIX_SIZE), device='cuda')
        matrix_b = torch.randn((MATRIX_SIZE, MATRIX_SIZE), device='cuda')

        # CUDA WORK IS QUEUED ASYNCHRONOUSLY -- MAKE SURE THE ALLOCATION
        # KERNELS HAVE FINISHED BEFORE THE TIMER STARTS
        torch.cuda.synchronize()

        start_time = time.time()
        for _ in range(ITERATIONS):
            result = torch.matmul(matrix_a, matrix_b)

            # DROP THE PRODUCT AND HAND ITS MEMORY BACK BEFORE THE NEXT ROUND
            del result
            torch.cuda.empty_cache()

        # WAIT FOR ALL QUEUED KERNELS SO THE TIMER MEASURES EXECUTION
        torch.cuda.synchronize()

        return (self.my_gpu, time.time() - start_time)

### BASED ON A RELATIVE POWER RATIO
### FIGURE OUT HOW TO SPLIT n TASKS PROPORTIONALLY AMONG x GPUS

In [13]:
def task_distribution(total_num_tasks, reserved_gpus, power_ratios):
    """Split ``total_num_tasks`` across GPUs in proportion to their power ratio.

    Args:
        total_num_tasks: number of tasks to hand out.
        reserved_gpus: list of GPU names, one entry per reserved GPU
            (the same name may appear more than once).
        power_ratios: mapping of GPU name -> relative power weight.

    Returns:
        list[int]: task count per entry of ``reserved_gpus``, in the same
        order. Each share is rounded down; any leftover tasks are added to
        the first GPU so the counts sum to ``total_num_tasks``.

    Raises:
        KeyError: if a reserved GPU has no entry in ``power_ratios``.
        TypeError: if a reserved GPU's ratio is ``None``.
        ValueError: if there are tasks to assign but no reserved GPUs.
        ZeroDivisionError: if the ratios of the reserved GPUs sum to 0.
    """
    # NO GPUS: NOTHING TO RETURN, AND NOWHERE TO PUT ANY TASKS
    if not reserved_gpus:
        if total_num_tasks > 0:
            raise ValueError('CANNOT DISTRIBUTE TASKS WITHOUT ANY RESERVED GPUS')
        return []

    # LOOK UP EACH GPU'S WEIGHT, FAILING LOUDLY ON A PLACEHOLDER RATIO
    weights = []
    for gpu_name in reserved_gpus:
        weight = power_ratios[gpu_name]
        if weight is None:
            raise TypeError(f'NO POWER RATIO DEFINED FOR GPU {gpu_name}')
        weights.append(weight)

    # HOW MANY UNITS OF WORK IN TOTAL
    total_units = sum(weights)

    # BASED ON THE POWER RATIO, DELEGATE N TASKS TO EACH GPU
    # MULTIPLY BEFORE DIVIDING: 100 * (29 / 100) FLOORS TO 28, 100 * 29 / 100 TO 29
    container = [
        math.floor(total_num_tasks * weight / total_units)
        for weight in weights
    ]

    # CHECK IF ANY TASKS ARE UNASSIGNED
    missing_tasks = total_num_tasks - sum(container)

    # IF YES, THEN DUMP THEM ON THE FIRST GPU
    if missing_tasks > 0:
        container[0] += missing_tasks

    return container

### CREATE n UNIQUE TASKS
### DISTRIBUTE THEM TO x GPUs ON THE CLUSTER

In [14]:
def simulation(total_tasks, gpu_power_ratio, expected_gpus=3):
    """Run ``total_tasks`` stress tasks, split across GPU actors by power ratio.

    Args:
        total_tasks: total number of ``perform_work`` calls to issue.
        gpu_power_ratio: mapping of GPU name -> relative power weight, passed
            through to ``task_distribution``.
        expected_gpus: number of GPUs that must be available before the run
            starts (defaults to 3).

    Returns:
        list: one ``(gpu_name, duration)`` result per task, in submission order.

    Raises:
        AssertionError: if the available GPU count differs from ``expected_gpus``.
    """
    # CHECK CLUSTER GPU STATUS
    num_gpus = available_gpus()
    assert num_gpus == expected_gpus, f'EXPECTED AMOUNT OF GPUS NOT AVAILABLE (GOT {num_gpus}, EXPECTED {expected_gpus})'

    # CREATE ONE ACTOR PER AVAILABLE GPU
    actors = [gpu_actor.remote() for _ in range(num_gpus)]

    try:
        # FIND WHICH GPU EACH ACTOR WAS ASSIGNED
        # ONE BATCHED ray.get INSTEAD OF BLOCKING ON EACH ACTOR IN TURN
        actor_gpus = ray.get([actor.detect_gpu.remote() for actor in actors])

        # FIGURE OUT HOW MANY TASKS EACH GPU SHOULD BE ASSIGNED
        # BASED ON RELATIVE COMPUTATIONAL POWER RATIO
        actor_tasks = task_distribution(total_tasks, actor_gpus, gpu_power_ratio)
        tasks = []

        # DELEGATE & START THE TASKS
        for actor, gpu_name, num_tasks in zip(actors, actor_gpus, actor_tasks):
            print(f'ASSIGNING {num_tasks} TASKS TO GPU {gpu_name}')
            tasks.extend(actor.perform_work.remote() for _ in range(num_tasks))

        # WAIT FOR EVERY TASK TO FINISH
        durations = ray.get(tasks)
    finally:
        # ALWAYS KILL THE ACTORS, EVEN IF SOMETHING ABOVE FAILED,
        # SO THEY DO NOT KEEP THEIR GPUS RESERVED
        for actor in actors:
            ray.kill(actor)

    # NOTE: THIS IS THE SUM OF THE PER-TASK DURATIONS ACROSS ALL ACTORS
    total_duration = round(sum([item[1] for item in durations]), 3)
    print(f'THE EXPERIMENT TOOK {total_duration} SECONDS')

    return durations

In [15]:
# Run 100 tasks, weighting each GPU model by its relative power ratio
# (TITAN X is the baseline at 1).
# NOTE(review): the None entries look like placeholders for GPUs without a
# measured ratio -- confirm before reserving any of those models.
task_durations = simulation(100, {
    'NVIDIA GeForce GTX TITAN X': 1,
    'NVIDIA GeForce RTX 4090': 7,
    '6000 ADA': None,
    '3080': None,
    '3060': None,
})

ASSIGNING 12 TASKS TO GPU NVIDIA GeForce GTX TITAN X
ASSIGNING 77 TASKS TO GPU NVIDIA GeForce RTX 4090
ASSIGNING 11 TASKS TO GPU NVIDIA GeForce GTX TITAN X
THE EXPERIMENT TOOK 95.32 SECONDS


In [16]:
# Per-GPU average durations and ratios for the weighted run, printed as JSON.
pprint(avg_times(task_durations))

{
    "avg_durations": {
        "NVIDIA GeForce GTX TITAN X": 2.969,
        "NVIDIA GeForce RTX 4090": 0.351
    },
    "ratios": {
        "NVIDIA GeForce GTX TITAN X": 1.0,
        "NVIDIA GeForce RTX 4090": 8.459
    }
}


### GOOD PERFORMANCE PROFILE -- GRAFANA

<center><img src="good_profile.png" /></center>

### NAIVE PERFORMANCE PROFILE -- GRAFANA

<center><img src="naive_profile.png" /></center>