In [8]:
import os
import sys
import requests
from utils.experiment import Experiment, execute_command_on_server_and_clients
from execo_g5k import oarsub, oardel, OarSubmission, get_current_oar_jobs, get_oar_job_nodes, get_oar_job_info, Deployment, deploy

In [4]:
jobname="fl-measure"
nodecount=3
walltime="3:00:00"
resources_selection="-t exotic -p estats"
site="toulouse"
force_redeploy=False
environment_dsc_file='../images/fl_jetson_image.yaml'
storage_group="energyfl"

# Reserve a job and deploy the chosen environment

In [5]:
jobs = get_current_oar_jobs()
jobid = None
waiting_jobs = []
ifdeploy=True
while jobs:
    j, site = jobs.pop()
    info = get_oar_job_info(j, site)
    if info['name'] == jobname:
        if info['state'] == 'Running':
            jobid = j
            print("A {} job is already running, using it. jobid is {}".format(jobname, jobid))
            ifdeploy=False
            break
        else:
            waiting_jobs.append(j)
if not jobid and not waiting_jobs:
    jobspec = OarSubmission(resources="/cluster=1/nodes={}".format(nodecount), walltime=walltime,
                            additional_options=resources_selection, job_type="deploy", name=jobname,
                            queue='testing')
    jobid, _ = oarsub([(jobspec, site)]).pop()
    print("New job submitted, jobid is {}".format(jobid))
elif not jobid:
    print("One or more {} jobs exist ({}) but are not running.\n"
          " Connect to the frontend to see what is happening, and/or run the cell again.".format(
          jobname, ", ".join([str(j) for j in waiting_jobs])))

New job submitted, jobid is 448947


In [6]:
nodes = get_oar_job_nodes(jobid, site)
nodes.sort(key=lambda n: n.address)
nodes

[Host('estats-7.toulouse.grid5000.fr'),
 Host('estats-8.toulouse.grid5000.fr'),
 Host('estats-9.toulouse.grid5000.fr')]

In [7]:
if ifdeploy:
    deployment = Deployment(hosts=nodes, env_file=os.path.abspath(environment_dsc_file))
    print(os.path.abspath(environment_dsc_file))
    deploy_ok, deploy_failed = deploy(deployment, check_deployed_command=not force_redeploy,
                                stdout_handlers=[sys.stdout],
                                stderr_handlers=[sys.stderr])
    print("Deployement status:\n* ok: {}\n* failed: {}".format(deploy_ok, deploy_failed))

/home/tunguyen/jetson-test/images/fl_jetson_image.yaml
Deployement status:
* ok: {'estats-9.toulouse.grid5000.fr', 'estats-8.toulouse.grid5000.fr', 'estats-7.toulouse.grid5000.fr'}
* failed: set()


# Allow access for the node to the nfs storage group and mount the storage to a folder

In [9]:
# curl -X POST 'https://api.grid5000.fr/stable/sites/toulouse/storage/storage1/energyfl/access' -H "Content-Type: application/json" -d '{"termination" : {"job": 448571, "site": "toulouse"}}'
# or
ifdeploy=True
if ifdeploy:
    url = f"https://api.grid5000.fr/stable/sites/{site}/storage/storage1/{storage_group}/access"
    requests.post(url, headers={"Content-Type": "application/json"}, json={"termination": {"job": jobid, "site": site}})
    cmd = f"mkdir /root/{storage_group} ; mount storage1.toulouse.grid5000.fr:/export/group/{storage_group} /root/{storage_group}/"
    _ = execute_command_on_server_and_clients(nodes, cmd, background=False) 

Successfully executed on Host('estats-7.toulouse.grid5000.fr') command 'mkdir /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'


# Define parameters for FL experiment and run it

In [8]:
params = {
    "params.num_rounds":[10],
    "data.num_clients": [10],
    "data.alpha": [10], #[1,2,5,10],
    "data.partition":["label_skew"],
    "client.lr" : [1e-2],#,1e-2],
    "client.local_epochs": [1],
    "client.decay_rate": [1],
    "client.decay_steps": [10],
    "neuralnet":["MobileNetV3Small"],
    "strategy": ["fedavg"],
    "optimizer": ["SGD"],
}

repository_dir = "/home/tunguyen/jetson-test"

to_remove = ["client.cid","client.dry_run","data.partition_dir","params.num_classes","tmp_result_folder","exp_datetime"]

Exps = Experiment(
    params=params,
    nodes=nodes,
    repository_dir=repository_dir,
    sleep=30,
    key_to_remove=to_remove)
#Exps.frontend_dry_run()
Exps.run()

2024-02-15 13:45:19,614 - MyEXP - INFO - Server : Host('estats-5.toulouse.grid5000.fr') 
 Clients: [Host('estats-8.toulouse.grid5000.fr'), Host('estats-9.toulouse.grid5000.fr')]
2024-02-15 13:45:19,654 - MyEXP - INFO - Experiments left : 1
2024-02-15 13:45:19,658 - MyEXP - INFO - Experiment 0 Remaining : 0
2024-02-15 13:45:19,659 - MyEXP - INFO - GETTING PARAMETERS


****************************************************************************************************


2024-02-15 13:45:20,696 - MyEXP - INFO - SERVER IP ADDRESS : 172.16.121.5


Successfully executed on Host('estats-5.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl/outputs/2024-02-15_13-45-19; mkdir -p /tmp/2024-02-15_13-45-19; echo -n > /tmp/2024-02-15_13-45-19/logs.log;'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl/outputs/2024-02-15_13-45-19; mkdir -p /tmp/2024-02-15_13-45-19; echo -n > /tmp/2024-02-15_13-45-19/logs.log;'


2024-02-15 13:45:23,289 - MyEXP - INFO - SSH PYTHON CMD TO SERVER AND CLIENTS
2024-02-15 13:45:23,291 - MyEXP - INFO - START MONITORING


Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl/outputs/2024-02-15_13-45-19; mkdir -p /tmp/2024-02-15_13-45-19; echo -n > /tmp/2024-02-15_13-45-19/logs.log;'


2024-02-15 13:45:53,324 - MyEXP - INFO - START SERVER AND CLIENTS
2024-02-15 13:45:58,328 - MyEXP - INFO - WAITING FOR SERVER AND CLIENTS TO FINISH TRAINING
2024-02-15 13:48:45,833 - MyEXP - INFO - JTOP PROCESSES FINISHED AND KILLED
2024-02-15 13:48:45,840 - MyEXP - INFO - SAVE TMP FILES AND PARAMETERS TO FRONTEND


Successfully executed on Host('estats-5.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl/outputs/2024-02-15_13-45-19/server; cp -r /tmp/2024-02-15_13-45-19/* /root/energyfl/outputs/2024-02-15_13-45-19/server'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl/outputs/2024-02-15_13-45-19/client_0; cp -r /tmp/2024-02-15_13-45-19/* /root/energyfl/outputs/2024-02-15_13-45-19/client_0'


2024-02-15 13:48:46,699 - MyEXP - INFO - FINISHED SAVING TO FRONTEND
2024-02-15 13:48:46,700 - MyEXP - INFO - SAVE HYPERPARAMS TO CSV
2024-02-15 13:48:46,708 - MyEXP - INFO - KEY defaults REPLACED


Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl/outputs/2024-02-15_13-45-19/client_1; cp -r /tmp/2024-02-15_13-45-19/* /root/energyfl/outputs/2024-02-15_13-45-19/client_1'


2024-02-15 13:48:46,923 - MyEXP - INFO - EXPERIMENT 0 DONE
2024-02-15 13:48:46,925 - MyEXP - INFO - ALL PROCESSES KILLED
2024-02-15 13:48:46,925 - MyEXP - INFO - ALL EXPERIMENTS DONE


In [None]:
Exps.kill_all()

Results will be available at `f"/srv/storage/{group_storage}@storage1.toulouse.grid5000.fr"`

# And don't forget to kill the job when you're done with the experiments

In [19]:
oardel([(jobid,site)])

TypeError: %i format: a number is required, not NoneType

In [7]:
# from execo import Process, SshProcess
# from execo_engine import logger
# import shlex
# get_ssh_processes = Process(shlex.split('pgrep -f ssh'))
# get_ssh_processes.run()
# ssh_processes = get_ssh_processes.stdout.splitlines()
# # For each PID, create a kill command and run it
# for pid in ssh_processes:
#     kill_command = 'kill -9 {0}'.format(pid)
#     kill_process = SshProcess(kill_command, 'localhost')
#     kill_process.run()

#     if kill_process.ok:
#         logger.info('Successfully killed process {0}'.format(pid))
#     else:
#         logger.error('Failed to kill process {0}'.format(pid))

2024-02-14 14:11:54,357 [31mERROR:[m Failed to kill process 605
2024-02-14 14:11:54,574 [31mERROR:[m Failed to kill process 606
2024-02-14 14:11:54,777 [31mERROR:[m Failed to kill process 2836559
2024-02-14 14:11:54,999 [31mERROR:[m Failed to kill process 3964793
2024-02-14 14:11:55,215 [31mERROR:[m Failed to kill process 3968219
2024-02-14 14:11:55,454 [31mERROR:[m Failed to kill process 3968243
2024-02-14 14:11:55,669 [31mERROR:[m Failed to kill process 3974547
2024-02-14 14:11:55,865 [35mINFO:[m Successfully killed process 3974554
2024-02-14 14:11:56,093 [31mERROR:[m Failed to kill process 4007820
2024-02-14 14:11:56,297 [35mINFO:[m Successfully killed process 4007827
2024-02-14 14:11:56,523 [31mERROR:[m Failed to kill process 4017864
2024-02-14 14:11:56,730 [31mERROR:[m Failed to kill process 4017871
