In [10]:
# coding: utf-8
import sys
import time
import uuid

import yaml
from funcx.sdk.client import FuncXClient

In [11]:
# Create the funcX client; `fxc` is the handle the later cells use for every service call.
fxc = FuncXClient()
# NOTE(review): presumably checks that this SDK version is compatible with the service — confirm against the funcX docs.
fxc.version_check()

In [18]:
# Load the workflow configuration. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes instead of being left open.
with open("../pwf/config.yml") as config_file:
    config = yaml.safe_load(config_file)
print(config)

{'funcx_endpoints': {'daq_fx_ep': '190dfdbd-4d4f-40fa-a0b1-63237042f989', 'compute_fx_ep': 'ef62df54-6d3c-43ff-8f84-c334f3ab3000'}}


In [19]:
# Endpoint name -> funcX endpoint UUID, taken from the 'funcx_endpoints' section of the config.
aps_alcf_endpoints = config['funcx_endpoints']

### Receive the endpoints' status information.

We can retrieve the endpoint status information using the `fxc.get_endpoint_status()` function. Given the endpoint UUID, it returns `logs` and `status` information. `logs` holds the last 11 status records of the endpoint, taken 30 seconds apart (i.e., the last 5.5 minutes).

`ef_info = fxc.get_endpoint_status(aps_alcf_endpoints['daq_fx_ep'])`

`ef_info['status'] # If the endpoint is online, this value is set to 'online'.`

`ef_info['logs'][0]['info'] # Check the status of the endpoint within the last 30 seconds.`

In [20]:
# Fail fast as soon as any configured endpoint reports a status other than 'online'.
for ep_key, ep_uuid in aps_alcf_endpoints.items():
    ep_info = fxc.get_endpoint_status(ep_uuid)
    if ep_info['status'] == 'online':
        continue
    raise RuntimeError(f"The endpoint is not online. Endpoint name:{ep_key}; Status:{ep_info['status']}.")
print("All endpoints are online.")

All endpoints are online.


### Check if resources can be allocated.

We register and run two funcX test functions, `pi` and `hworld`, to check that resources can be allocated. This can take some time to finish, since the jobs need to be submitted and the resources allocated before the functions start executing.

`pi(num_points)` # estimates pi from `num_points` random samples (more points give higher precision). 10**8 is a reasonable input to run.

`hworld()` # returns hostname with "hello world" string.

In [5]:
def pi(num_points=10**8):
    """Estimate pi by Monte Carlo sampling.

    Draws ``num_points`` random points in the unit square and counts how many
    fall strictly inside the unit quarter circle; the hit ratio times four is
    the estimate.

    Args:
        num_points: Number of random points to draw.

    Returns:
        The estimate of pi as a float.

    Raises:
        ZeroDivisionError: If ``num_points`` is 0.
    """
    # Imported inside the body so the function carries its own dependency.
    from random import random

    # Each sample draws x first, then y, and tests x^2 + y^2 < 1.
    hits = sum(1 for _ in range(num_points) if random()**2 + random()**2 < 1)
    return (hits * 4 / num_points)

def hworld():
    """Return a greeting prefixed with the hostname of the executing machine.

    Returns:
        A string of the form ``"<hostname>: hello world"``.
    """
    # Imported inside the body so the function carries its own dependency.
    import socket

    return socket.gethostname() + ": hello world"

# A single random UUID is embedded in both descriptions so that the functions
# registered by this run can be found again with a description search.
test_funcs_desc_uuid = str(uuid.uuid4())

# Test function name -> value returned by register_function for it.
test_funcs = {
    'pi': fxc.register_function(
        pi,
        description=f"Test function to calculate pi. UUID={test_funcs_desc_uuid}",
    ),
    'hworld': fxc.register_function(
        hworld,
        description=f"Test function to get hostname of the worker. UUID={test_funcs_desc_uuid}",
    ),
}

In [7]:
# Look the freshly registered functions up again through the UUID embedded in
# their descriptions, and check that both are found with the expected ids.
search_results = fxc.search_function(str(test_funcs_desc_uuid), offset=0, limit=5)

num_found = len(search_results)
if num_found != 2:
    raise RuntimeError(f"There should be exactly two functions. The number of registered functions is: {num_found}")

for res in search_results:
    fn_name = res['function_name']
    if test_funcs[fn_name] != res['function_uuid']:
        raise RuntimeError(f"Function uuid did not match for {fn_name}!")

print("All test functions are registered successfully.")

All test functions are registered successfully.


In [8]:
# Queue every registered test function on every configured endpoint in one batch.
test_func_batch = fxc.create_batch()
for ep_key, ep_uuid in aps_alcf_endpoints.items():
    for fx_test_key, fx_test_uuid in test_funcs.items():
        test_func_batch.add(endpoint_id=ep_uuid, function_id=fx_test_uuid)

# Submit the batch and keep the returned task ids for polling.
batch_task_ids = fxc.batch_run(test_func_batch)

# Take a first status snapshot of the submitted tasks.
batch_task_status = fxc.get_batch_result(batch_task_ids)

In [9]:
# Poll the batch until every task reports 'success'.
#
# A task that ends in the 'failed' state can never become 'success', so the
# original "wait until all are success" loop would spin forever on a failure.
# Failed tasks are therefore reported immediately with an exception.
# NOTE(review): assumes failures are reported as status == 'failed' — confirm
# against the funcX SDK/service version in use.
POLL_INTERVAL_S = 2  # seconds to wait between two status polls

while True:
    batch_task_status = fxc.get_batch_result(batch_task_ids)
    # Task id -> status string, extracted once per poll.
    task_states = {task_id: batch_task_status[task_id]['status'] for task_id in batch_task_status}

    failed_tasks = [task_id for task_id, state in task_states.items() if state == 'failed']
    if failed_tasks:
        raise RuntimeError(f"Tasks failed: {failed_tasks}")

    running_tasks = [task_id for task_id, state in task_states.items() if state != 'success']
    if not running_tasks:
        break

    print(f"Tasks are still running: {running_tasks}")
    time.sleep(POLL_INTERVAL_S)
print("All tasks are completed successfully.")


Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4', '6b88e6d2-c9b9-448a-a0f6-68838df28731']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4', '6b88e6d2-c9b9-448a-a0f6-68838df28731']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4', '6b88e6d2-c9b9-448a-a0f6-68838df28731']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4', '6b88e6d2-c9b9-448a-a0f6-68838df28731']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7d12b4']
Tasks are still running: ['c7e83867-c0ab-4a67-b8bf-c7da9b17a760', 'fa92ee32-7473-4eb7-8b20-cfac6d7