In [None]:
import os
import sys
import requests
from utils.experiment import Experiment, execute_command_on_server_and_clients
from execo_g5k import oarsub, oardel, OarSubmission, get_current_oar_jobs, get_oar_job_nodes, get_oar_job_info, Deployment, deploy

In [None]:
# --- OAR reservation settings ---
jobname = "fl-measure"                         # name used to find or submit the job
nodecount = 3                                  # number of nodes requested in one cluster
walltime = "06:00:00"
resources_selection = "-t exotic -p estats"    # passed as additional oarsub options
site = "toulouse"

# --- Deployment settings ---
force_redeploy = False                         # when False, the deployed-check command is used
environment_dsc_file = './images/fl_jetson_image.yaml'

# --- Group storage settings ---
storage_group = "energyfl"

# Reserve a job and deploy the chosen environment

In [None]:
# Reuse a running job named `jobname` if one exists; otherwise submit a new
# one, unless same-named jobs are already queued but not yet running.
jobs = get_current_oar_jobs()
jobid = None
waiting_jobs = []
# `ifdeploy` is read by the deployment and storage cells: it stays True
# unless an already-running job is reused.
ifdeploy = True
while jobs:
    # Use dedicated loop names. Unpacking into `site` would overwrite the
    # configured site with each listed job's frontend (which appears to be
    # None for the default frontend -- confirm against execo_g5k), breaking
    # the submission, node lookup and storage URL that rely on `site`.
    job, job_site = jobs.pop()
    info = get_oar_job_info(job, job_site)
    # .get(): do not crash on job info that lacks a 'name' or 'state' entry.
    if info.get('name') != jobname:
        continue
    if info.get('state') == 'Running':
        jobid = job
        if job_site is not None:
            # The job was listed with an explicit frontend: keep using it.
            site = job_site
        print("A {} job is already running, using it. jobid is {}".format(jobname, jobid))
        ifdeploy = False
        break
    waiting_jobs.append(job)

if not jobid and not waiting_jobs:
    # Nothing running or queued under this name: submit a fresh deploy job.
    jobspec = OarSubmission(resources="/cluster=1/nodes={}".format(nodecount), walltime=walltime,
                            additional_options=resources_selection, job_type="deploy", name=jobname,
                            queue='testing')
    jobid, _ = oarsub([(jobspec, site)]).pop()
    print("New job submitted, jobid is {}".format(jobid))
elif not jobid:
    # Same-named jobs exist but none is running yet; `jobid` stays None.
    print("One or more {} jobs exist ({}) but are not running.\n"
          " Connect to the frontend to see what is happening, and/or run the cell again.".format(
          jobname, ", ".join(str(j) for j in waiting_jobs)))

In [None]:
# Fetch the nodes of the reservation, ordered by address so the node
# order is the same on every run.
nodes = sorted(get_oar_job_nodes(jobid, site), key=lambda host: host.address)
nodes

In [None]:
# Deploy the environment only when `ifdeploy` is set (i.e. no running job
# was reused by the reservation cell).
if ifdeploy:
    deployment = Deployment(
        hosts=nodes,
        env_file=os.path.abspath(environment_dsc_file),
    )
    # The deployed-check command is enabled unless a redeploy is forced;
    # deployment output is streamed to the notebook.
    deploy_ok, deploy_failed = deploy(
        deployment,
        check_deployed_command=not force_redeploy,
        stdout_handlers=[sys.stdout],
        stderr_handlers=[sys.stderr],
    )
    print("Deployement status:\n* ok: {}\n* failed: {}".format(deploy_ok, deploy_failed))

# Grant the nodes access to the NFS storage group and mount the storage on a folder

In [None]:
# Equivalent shell command:
# curl -X POST 'https://api.grid5000.fr/stable/sites/toulouse/storage/storage1/energyfl/access' -H "Content-Type: application/json" -d '{"termination" : {"job": 448571, "site": "toulouse"}}'
# or
if ifdeploy:
    # Ask the storage API to grant access to the group storage; the request
    # body ties the access termination to this job.
    url = f"https://api.grid5000.fr/stable/sites/{site}/storage/storage1/{storage_group}/access"
    response = requests.post(url, headers={"Content-Type": "application/json"},
                             json={"termination": {"job": jobid, "site": site}})
    if not response.ok:
        # Report a refused request instead of discarding the response:
        # the mount below is unlikely to work without the access grant.
        print("Storage access request failed ({}): {}".format(response.status_code, response.text))
    # Use `site` for the storage server too, so the mount matches the URL above.
    # `mkdir -p` keeps the command quiet when the folder already exists.
    cmd = f"mkdir -p /root/{storage_group} ; mount storage1.{site}.grid5000.fr:/export/group/{storage_group} /root/{storage_group}/"
    _ = execute_command_on_server_and_clients(nodes, cmd, background=False)

# Define parameters for FL experiment and run it

In [None]:
# Location of the experiment repository handed to Experiment.
repository_dir = "/home/mjay/FL-G5K-Test"

# Keys handed to Experiment as `key_to_remove`.
to_remove = [
    "client.cid",
    "client.dry_run",
    "params.root_data",
    "params.num_classes",
    "tmp_result_folder",
    "exp_datetime",
]

# Parameter values for the experiment; each entry maps a parameter name to
# the list of values to use.
params = {
    "params.num_rounds": [5, 7],
    "client.lr": [1e-1],
    # "client.local_epochs": [1, 3, 5],
    "client.decay_rate": [0.1],
    "client.decay_steps": [10],

    "neuralnet": ["MobileNetV3Small"],
    "strategy": ["fedavg"],
    "optimizer": ["SGD"],
}

Exps = Experiment(
    params=params,
    nodes=nodes,
    repository_dir=repository_dir,
    sleep=30,
    key_to_remove=to_remove,
)
Exps.run()

In [None]:
# Uncomment and run to call Exps.kill_all() (presumably stops the running
# experiments -- confirm in utils.experiment):
# Exps.kill_all()

Results will be available at `f"/srv/storage/{storage_group}@storage1.toulouse.grid5000.fr"`.

# And don't forget to kill the job when you're done with the experiments

In [None]:
# Delete the OAR job identified by (jobid, site).
oardel([(jobid, site)])