In [1]:
import os
import sys
import requests
from utils.experiment import Experiment, execute_command_on_server_and_clients
from execo_g5k import oarsub, oardel, OarSubmission, get_current_oar_jobs, get_oar_job_nodes, get_oar_job_info, Deployment, deploy

In [2]:
# --- Experiment configuration: everything a reader may want to tune ---------

# OAR job name; also used to recognise an already existing reservation.
jobname = "fl-measure-imagenet"
# Number of nodes requested in the reservation.
nodecount = 11
# Reservation walltime, passed as-is to the OAR submission.
walltime = "13:50:00"
# Extra options forwarded to the OAR submission (job types / cluster filter).
resources_selection = "-t exotic -p estats -t night"
# Site on which the job is submitted and whose storage API is queried.
site = "toulouse"
# When True, the deployment is asked not to rely on the "already deployed" check.
force_redeploy = False
# Environment description handed to the deployment (relative to the notebook).
environment_dsc_file = '../images/fl_jetson_image_complet.yaml'
# Name of the storage group to grant access to and mount on the nodes.
storage_group = "energyfl"

# Reserve a job and deploy the chosen environment

In [3]:
# State shared with the reservation cell below.
jobid = None          # id of the job to use, once one is found or submitted
waiting_jobs = []     # ids of matching jobs that exist but are not running
ifdeploy = True       # switched off when an already-running job is reused

# Current OAR jobs, as (job id, frontend) tuples.
jobs = get_current_oar_jobs()

In [4]:
# Look for an existing reservation named `jobname` among the current OAR jobs:
# reuse it when it is running, remember it when it is still waiting, and
# submit a new job on the configured `site` only when none exists at all.
while jobs:
    # Unpack into dedicated names. The second element of each tuple is the
    # frontend the job was listed on; assigning it to `site` would overwrite
    # the configured site that the submission below and later cells rely on.
    # NOTE(review): that element may be None for the default frontend - confirm
    # against the execo_g5k documentation.
    candidate_id, candidate_site = jobs.pop()
    info = get_oar_job_info(candidate_id, candidate_site)
    # .get(): a job whose info has no name/state is skipped instead of raising.
    if info.get('name') == jobname:
        if info.get('state') == 'Running':
            jobid = candidate_id
            print("A {} job is already running, using it. jobid is {}".format(jobname, jobid))
            # The running job is reused as-is, so later cells skip deployment.
            ifdeploy=False
            break
        else:
            waiting_jobs.append(candidate_id)
if not jobid and not waiting_jobs:
    # No matching job at all: reserve `nodecount` nodes of a single cluster.
    jobspec = OarSubmission(resources="/cluster=1/nodes={}".format(nodecount), walltime=walltime,
                            additional_options=resources_selection, job_type="deploy", name=jobname,
                            queue='default')
    jobid, _ = oarsub([(jobspec, site)]).pop()
    print("New job submitted, jobid is {}".format(jobid))
elif not jobid:
    # Matching jobs exist but none is running yet: report them and do nothing.
    print("One or more {} jobs exist ({}) but are not running.\n"
          " Connect to the frontend to see what is happening, and/or run the cell again.".format(
          jobname, ", ".join([str(j) for j in waiting_jobs])))

New job submitted, jobid is 451642


In [4]:
# Use the job id resolved by the reservation cell rather than a pasted literal:
# a hard-coded id points at a stale job as soon as a new reservation is made.
if jobid is None:
    raise RuntimeError(
        "No usable {} job: run the reservation cell again once the job is running.".format(jobname))
# Sort by address so the node order is stable between runs. The ordering is
# plain string order, so e.g. 'estats-10' comes before 'estats-2'.
nodes = sorted(get_oar_job_nodes(jobid, site), key=lambda n: n.address)
nodes

[Host('estats-10.toulouse.grid5000.fr'),
 Host('estats-11.toulouse.grid5000.fr'),
 Host('estats-12.toulouse.grid5000.fr'),
 Host('estats-2.toulouse.grid5000.fr'),
 Host('estats-3.toulouse.grid5000.fr'),
 Host('estats-4.toulouse.grid5000.fr'),
 Host('estats-5.toulouse.grid5000.fr'),
 Host('estats-6.toulouse.grid5000.fr'),
 Host('estats-7.toulouse.grid5000.fr'),
 Host('estats-8.toulouse.grid5000.fr'),
 Host('estats-9.toulouse.grid5000.fr')]

In [5]:
# Deploy the chosen environment on the reserved nodes, unless an already
# running job is being reused (ifdeploy is False in that case).
if ifdeploy:
    # Resolve the environment description once; it is used for the deployment
    # and echoed so the reader can see which file was picked up.
    env_file_path = os.path.abspath(environment_dsc_file)
    deployment = Deployment(hosts=nodes, env_file=env_file_path)
    print(env_file_path)
    # Deployment output is streamed to the notebook through the std handlers.
    deploy_ok, deploy_failed = deploy(
        deployment,
        check_deployed_command=not force_redeploy,
        stdout_handlers=[sys.stdout],
        stderr_handlers=[sys.stderr],
    )
    status_template = "Deployement status:\n* ok: {}\n* failed: {}"
    print(status_template.format(deploy_ok, deploy_failed))

/home/tunguyen/jetson-multiclient/images/fl_jetson_image_complet.yaml
Deployement status:
* ok: {'estats-4.toulouse.grid5000.fr', 'estats-3.toulouse.grid5000.fr', 'estats-10.toulouse.grid5000.fr', 'estats-6.toulouse.grid5000.fr', 'estats-7.toulouse.grid5000.fr', 'estats-9.toulouse.grid5000.fr', 'estats-12.toulouse.grid5000.fr', 'estats-8.toulouse.grid5000.fr', 'estats-5.toulouse.grid5000.fr', 'estats-2.toulouse.grid5000.fr', 'estats-11.toulouse.grid5000.fr'}
* failed: set()




stdout:
Deployment #D-83a077de-0e61-439d-a7f5-e54efe901784 started
Grab the key file /home/tunguyen/.ssh/authorized_keys
Grab the tarball file /home/tunguyen/public/ubuntul4t35-pytorch-estats.tar.zst
Grab the postinstall file server:///grid5000/postinstalls/g5k-postinstall.tgz
Using classical reboot instead of kexec (SetDeploymentMiniOS) because the last deployment is not trusted
Launching a deployment of ubuntul4t35-pytorch-estats-datasets:tunguyen:aarch64:2024030113 on estats-[2-12].toulouse.grid5000.fr
Performing a Deploy[SetDeploymentMiniOSUntrusted] step
  switch_pxe
[dbg] -------------------------
[dbg] NODE: estats-2.toulouse.grid5000.fr
[dbg] -------------------------
[dbg] COMMAND: KADEPLOY_BOOTLOADER_NO_GRUB_FROM_DEST="1" KADEPLOY_BOOTLOADER_NO_GRUB_MKCONFIG_FROM_DEST="1" KADEPLOY_CLUSTER="estats" KADEPLOY_VERSION="3.8.2.stable" KADEPLOY_USER="tunguyen" KADEPLOY_DEPLOY_PART="/dev/disk/by-partlabel/KDPL_DEPLOY_disk0" KADEPLOY_SWAP_PART="/dev/disk/by-partlabel/KDPL_SWAP_disk0" 

stdout:
Deployment #D-83a077de-0e61-439d-a7f5-e54efe901784 started
Grab the key file /home/tunguyen/.ssh/authorized_keys
Grab the tarball file /home/tunguyen/public/ubuntul4t35-pytorch-estats.tar.zst
Grab the postinstall file server:///grid5000/postinstalls/g5k-postinstall.tgz
Using classical reboot instead of kexec (SetDeploymentMiniOS) because the last deployment is not trusted
Launching a deployment of ubuntul4t35-pytorch-estats-datasets:tunguyen:aarch64:2024030113 on estats-[2-12].toulouse.grid5000.fr
Performing a Deploy[SetDeploymentMiniOSUntrusted] step
  switch_pxe
[dbg] -------------------------
[dbg] NODE: estats-2.toulouse.grid5000.fr
[dbg] -------------------------
[dbg] COMMAND: KADEPLOY_BOOTLOADER_NO_GRUB_FROM_DEST="1" KADEPLOY_BOOTLOADER_NO_GRUB_MKCONFIG_FROM_DEST="1" KADEPLOY_CLUSTER="estats" KADEPLOY_VERSION="3.8.2.stable" KADEPLOY_USER="tunguyen" KADEPLOY_DEPLOY_PART="/dev/disk/by-partlabel/KDPL_DEPLOY_disk0" KADEPLOY_SWAP_PART="/dev/disk/by-partlabel/KDPL_SWAP_disk0" 

2024-06-18 20:20:33,254 [31mERROR:[m <Kadeployer(Deployment(hosts={'estats-3.toulouse.grid5000.fr', 'estats-4.toulouse.grid5000.fr', 'estats-10.toulouse.grid5000.fr', 'estats-5.toulouse.grid5000.fr', 'estats-6.toulouse.grid5000.fr', 'estats-7.toulouse.grid5000.fr', 'estats-9.toulouse.grid5000.fr', 'estats-8.toulouse.grid5000.fr', 'estats-12.toulouse.grid5000.fr', 'estats-2.toulouse.grid5000.fr', 'estats-11.toulouse.grid5000.fr'}, env_file='/home/tunguyen/jetson-multiclient/images/fl_jetson_image_complet.yaml'), name=Kadeployer on 11 hosts / 1 frontends, started=True, start_date='2024-06-18 19:53:03+02:00', ended=False, end_date='2024-06-18 20:20:33+02:00', num_processes=1, num_started=1, num_ended=1, num_timeouts=0, num_errors=0, num_forced_kills=0, num_non_zero_exit_codes=1, num_ok=0, num_finished_ok=0, ok=False, total/deployed/undeployed = 11/0/0)>:
deploy on toulouse, total/deployed/undeployed = 11/0/0:
<SshProcess('kadeploy3 -k -d -a /home/tunguyen/jetson-multiclient/images/fl_je

2024-06-18 20:20:33,254 - execo - ERROR - <Kadeployer(Deployment(hosts={'estats-3.toulouse.grid5000.fr', 'estats-4.toulouse.grid5000.fr', 'estats-10.toulouse.grid5000.fr', 'estats-5.toulouse.grid5000.fr', 'estats-6.toulouse.grid5000.fr', 'estats-7.toulouse.grid5000.fr', 'estats-9.toulouse.grid5000.fr', 'estats-8.toulouse.grid5000.fr', 'estats-12.toulouse.grid5000.fr', 'estats-2.toulouse.grid5000.fr', 'estats-11.toulouse.grid5000.fr'}, env_file='/home/tunguyen/jetson-multiclient/images/fl_jetson_image_complet.yaml'), name=Kadeployer on 11 hosts / 1 frontends, started=True, start_date='2024-06-18 19:53:03+02:00', ended=False, end_date='2024-06-18 20:20:33+02:00', num_processes=1, num_started=1, num_ended=1, num_timeouts=0, num_errors=0, num_forced_kills=0, num_non_zero_exit_codes=1, num_ok=0, num_finished_ok=0, ok=False, total/deployed/undeployed = 11/0/0)>:
deploy on toulouse, total/deployed/undeployed = 11/0/0:
<SshProcess('kadeploy3 -k -d -a /home/tunguyen/jetson-multiclient/images/fl

# Grant the nodes access to the NFS storage group and mount the storage on a local folder

In [10]:
# Ask the storage API of `site` to give the job's nodes access to the storage
# group, then mount the group's export under /root/<storage_group> on every node.
# Skipped when an already-running job is reused (ifdeploy is False).
if ifdeploy:
    url = f"https://api.grid5000.fr/stable/sites/{site}/storage/storage1/{storage_group}/access"
    # TLS verification stays enabled (no verify=False), and a timeout keeps the
    # cell from hanging forever if the API does not answer.
    response = requests.post(url, headers={"Content-Type": "application/json"},
                             json={"termination": {"job": jobid, "site": site}}, timeout=60)
    # Fail loudly here: without the access grant the mount below cannot succeed.
    response.raise_for_status()
    # The storage host is derived from `site` so it always matches the API call.
    # NOTE(review): the recorded output shows mount failing on every node with
    # "bad option ... you might need a /sbin/mount.<type> helper program", which
    # suggests the deployed image lacks the NFS mount helper - confirm.
    cmd = f"mkdir -p /root/{storage_group} ; mount storage1.{site}.grid5000.fr:/export/group/{storage_group} /root/{storage_group}/"
    _ = execute_command_on_server_and_clients(nodes, cmd, background=False)



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-10.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-10.toulouse.grid5000.fr closed.



Failed to execute on Host('estats-10.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Failed to execute on Host('estats-11.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Failed to execute on Host('estats-12.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Failed to execute on Host('estats-2.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Failed to execute on Host('estats-3.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Failed to execute on Host('estats-4.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/ene

stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-11.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-11.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-12.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-12.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-2.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-2.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-3.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-3.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-4.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-4.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-5.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-5.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-6.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-6.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-7.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-7.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-8.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-8.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-9.toulouse.grid5000.fr closed.



stdout:
mount: /root/energyfl: bad option; for several filesystems (e.g. nfs, cifs) you might need a /sbin/mount.<type> helper program.

stderr:
Connection to estats-9.toulouse.grid5000.fr closed.

