In [9]:
import os
import sys
import requests
from utils.experiment import Experiment, execute_command_on_server_and_clients
from execo_g5k import oarsub, oardel, OarSubmission, get_current_oar_jobs, get_oar_job_nodes, get_oar_job_info, Deployment, deploy

In [2]:
jobname="fl-measure"
nodecount=3
walltime="06:00:00"
resources_selection="-t exotic -p estats"
site="toulouse"
force_redeploy=False
environment_dsc_file='./images/fl_jetson_image.yaml'
storage_group="energyfl"

# Reserve a job and deploy the chosen environment

In [3]:
jobs = get_current_oar_jobs()
jobid = None
waiting_jobs = []
ifdeploy=True
while jobs:
    j, site = jobs.pop()
    info = get_oar_job_info(j, site)
    if info['name'] == jobname:
        if info['state'] == 'Running':
            jobid = j
            print("A {} job is already running, using it. jobid is {}".format(jobname, jobid))
            ifdeploy=False
            break
        else:
            waiting_jobs.append(j)
if not jobid and not waiting_jobs:
    jobspec = OarSubmission(resources="/cluster=1/nodes={}".format(nodecount), walltime=walltime,
                            additional_options=resources_selection, job_type="deploy", name=jobname,
                            queue='testing')
    jobid, _ = oarsub([(jobspec, site)]).pop()
    print("New job submitted, jobid is {}".format(jobid))
elif not jobid:
    print("One or more {} jobs exist ({}) but are not running.\n"
          " Connect to the frontend to see what is happening, and/or run the cell again.".format(
          jobname, ", ".join([str(j) for j in waiting_jobs])))

A fl-measure job is already running, using it. jobid is 448571


In [4]:
nodes = get_oar_job_nodes(jobid, site)
nodes.sort(key=lambda n: n.address)
nodes

[Host('estats-5.toulouse.grid5000.fr'),
 Host('estats-8.toulouse.grid5000.fr'),
 Host('estats-9.toulouse.grid5000.fr')]

In [5]:
if ifdeploy:
    deployment = Deployment(hosts=nodes, env_file=os.path.abspath(environment_dsc_file))
    deploy_ok, deploy_failed = deploy(deployment, check_deployed_command=not force_redeploy,
                                stdout_handlers=[sys.stdout],
                                stderr_handlers=[sys.stderr])
    print("Deployement status:\n* ok: {}\n* failed: {}".format(deploy_ok, deploy_failed))

# Allow access for the node to the nfs storage group and mount the storage to a folder

In [None]:
# curl -X POST 'https://api.grid5000.fr/stable/sites/toulouse/storage/storage1/energyfl/access' -H "Content-Type: application/json" -d '{"termination" : {"job": 448571, "site": "toulouse"}}'
url = f"https://api.grid5000.fr/stable/sites/{site}/storage/storage1/{storage_group}/access"
requests.post(url, headers={"Content-Type": "application/json"}, json={"termination": {"job": jobid, "site": site}})

In [11]:
cmd = f"mkdir /root/{storage_group} ; mount storage1.toulouse.grid5000.fr:/export/group/{storage_group} /root/{storage_group}/"
_ = execute_command_on_server_and_clients(nodes, cmd, background=False)

Successfully executed on Host('estats-5.toulouse.grid5000.fr') command 'mkdir /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'


# Define parameters for FL experiment and run it

In [13]:
params = {
    "params.num_rounds":[3, 3],
    "client.lr" : [1e-1],
    #"client.local_epochs": [1, 3, 5],
    "client.decay_rate": [0.1],
    "client.decay_steps": [10],
    
    "neuralnet":["MobileNetV3Small"],
    "strategy": ["fedavg"],
    "optimizer": ["SGD"],
}

repository_dir = "/home/mjay/FL-G5K-Test"

to_remove = ["client.cid","client.dry_run","params.root_data","params.num_classes","tmp_result_folder","exp_datetime"]

Exps = Experiment(
    params=params,
    nodes=nodes,
    repository_dir=repository_dir,
    sleep=0,
    key_to_remove=to_remove)
Exps.run()

2024-02-02 16:52:34,888 - MyEXP - INFO - Server : Host('estats-5.toulouse.grid5000.fr') 
 Clients: [Host('estats-8.toulouse.grid5000.fr'), Host('estats-9.toulouse.grid5000.fr')]
2024-02-02 16:52:34,893 - MyEXP - INFO - Experiments left : 1
2024-02-02 16:52:34,896 - MyEXP - INFO - Experiment 0 Remaining : 0
2024-02-02 16:52:34,897 - MyEXP - INFO - GETTING PARAMETERS


****************************************************************************************************


2024-02-02 16:52:35,952 - MyEXP - INFO - SERVER IP ADDRESS : 172.16.121.5


Successfully executed on Host('estats-5.toulouse.grid5000.fr') command 'mkdir -p /tmp/2024-02-02_16-52-34; echo -n > /tmp/2024-02-02_16-52-34/logs.log;'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir -p /tmp/2024-02-02_16-52-34; echo -n > /tmp/2024-02-02_16-52-34/logs.log;'


2024-02-02 16:52:37,839 - MyEXP - INFO - SSH PYTHON CMD TO SERVER AND CLIENTS
2024-02-02 16:52:37,841 - MyEXP - INFO - START MONITORING
2024-02-02 16:52:37,842 - MyEXP - INFO - START SERVER AND CLIENTS


Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir -p /tmp/2024-02-02_16-52-34; echo -n > /tmp/2024-02-02_16-52-34/logs.log;'


2024-02-02 16:52:42,848 - MyEXP - INFO - WAITING FOR SERVER AND CLIENTS TO FINISH TRAINING
2024-02-02 16:54:12,061 - MyEXP - INFO - JTOP PROCESSES FINISHED AND KILLED
2024-02-02 16:54:12,065 - MyEXP - INFO - SAVE TMP FILES AND PARAMETERS TO FRONTEND


Successfully executed on Host('estats-5.toulouse.grid5000.fr') command 'mkdir -p /home/mjay/FL-G5K-Test/outputs/2024-02-02_16-52-34/server; cp -r /tmp/2024-02-02_16-52-34/. /home/mjay/FL-G5K-Test/outputs/2024-02-02_16-52-34/server'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir -p /home/mjay/FL-G5K-Test/outputs/2024-02-02_16-52-34/client_0; cp -r /tmp/2024-02-02_16-52-34/. /home/mjay/FL-G5K-Test/outputs/2024-02-02_16-52-34/client_0'


2024-02-02 16:54:12,873 - MyEXP - INFO - FINISHED SAVING TO FRONTEND
2024-02-02 16:54:12,874 - MyEXP - INFO - SAVE HYPERPARAMS TO CSV
2024-02-02 16:54:12,877 - MyEXP - INFO - KEY defaults REPLACED
2024-02-02 16:54:12,896 - MyEXP - INFO - EXPERIMENT 0 DONE
2024-02-02 16:54:12,898 - MyEXP - INFO - ALL PROCESSES KILLED
2024-02-02 16:54:12,898 - MyEXP - INFO - ALL EXPERIMENTS DONE


Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir -p /home/mjay/FL-G5K-Test/outputs/2024-02-02_16-52-34/client_1; cp -r /tmp/2024-02-02_16-52-34/. /home/mjay/FL-G5K-Test/outputs/2024-02-02_16-52-34/client_1'


In [7]:
Exps.kill_all()

NameError: name 'Exps' is not defined

Results will be available at `f"/srv/storage/{group_storage}@storage1.toulouse.grid5000.fr"`

In [None]:
oardel([(jobid,site)])