In [2]:
import os
import sys
import requests
from utils.experiment import Experiment, execute_command_on_server_and_clients
from execo_g5k import oarsub, oardel, OarSubmission, get_current_oar_jobs, get_oar_job_nodes, get_oar_job_info, Deployment, deploy

In [3]:
# --- Experiment configuration: everything a reader may want to tune ---

# Name given to the OAR job; also used to recognise an existing job.
jobname = "fl-measure"

# Number of nodes requested in the reservation.
nodecount = 3

# Reservation walltime (a longer alternative used previously: "13:50:00").
walltime = "4:40:00"

# Extra options passed to the OAR submission
# (previous alternative: "-t exotic -p estats -t night").
resources_selection = "-p estats -t exotic"

# Site on which the job is submitted and queried.
site = "toulouse"

# When False, the deployment cell passes check_deployed_command=True to deploy().
force_redeploy = False

# Environment description file handed to the deployment
# (previous alternative: '../images/fl_jetson_image_complet.yaml').
environment_dsc_file = "../images/ubuntul4t35-pytorch-estats-datasets-2.yaml"

# Storage group whose export is mounted on the nodes.
storage_group = "energyfl"

# Reserve a job and deploy the chosen environment

In [4]:
# State shared with the following cells; re-running this cell resets it.
jobid = None         # ID of the job to use; stays None until one is found or submitted
waiting_jobs = []    # IDs of jobs with the expected name that are not running
ifdeploy = True      # switched to False when an already running job is reused

# Current OAR jobs as (job ID, site) pairs; the lookup loop consumes this list.
jobs = get_current_oar_jobs()

In [5]:
# Look for an existing OAR job named `jobname`: reuse it when it is running,
# remember it when it is not, and submit a new deploy job when none exists.
while jobs:
    # Unpack into `job_site`, not `site`: reusing `site` here would overwrite
    # the configured target site with the site of whichever job was popped
    # last, and a new job could then be submitted to the wrong site.
    j, job_site = jobs.pop()
    info = get_oar_job_info(j, job_site)
    # OAR may not report every key yet, so read them with .get() rather than
    # indexing (a missing 'name' or 'state' would otherwise raise KeyError).
    if info.get('name') != jobname:
        continue
    if info.get('state') == 'Running':
        jobid = j
        # Later cells query the job through `site`; point it at the site that
        # actually hosts the job being reused.
        site = job_site
        print("A {} job is already running, using it. jobid is {}".format(jobname, jobid))
        ifdeploy=False
        break
    waiting_jobs.append(j)

if not jobid and not waiting_jobs:
    jobspec = OarSubmission(resources="/cluster=1/nodes={}".format(nodecount), walltime=walltime,
                            additional_options=resources_selection, job_type="deploy", name=jobname,
                            queue='default')
    jobid, _ = oarsub([(jobspec, site)]).pop()
    # oarsub reports a failed submission with a job ID of None; stop here
    # instead of announcing a job that does not exist.
    if jobid is None:
        raise RuntimeError("Submission of the {} job on {} failed".format(jobname, site))
    print("New job submitted, jobid is {}".format(jobid))
elif not jobid:
    # `jobs` has been emptied by the loop above, so re-running only this cell
    # cannot find the job again: the job list must be fetched again first.
    print("One or more {} jobs exist ({}) but are not running.\n"
          " Connect to the frontend to see what is happening, and/or re-run the job-listing cell and then this cell.".format(
          jobname, ", ".join([str(j) for j in waiting_jobs])))

New job submitted, jobid is 451688


In [6]:
# Resolve the hosts allocated to the job selected or submitted by the
# reservation cell. The job ID comes from that cell: a pasted job number would
# silently point at a stale job on the next run.
if jobid is None:
    raise RuntimeError(
        "No usable {} job: run the reservation cell first and wait for the job to start.".format(jobname))

# Sort by address so the node order does not depend on the order OAR returns.
nodes = sorted(get_oar_job_nodes(jobid, site), key=lambda n: n.address)
nodes

[Host('estats-7.toulouse.grid5000.fr'),
 Host('estats-8.toulouse.grid5000.fr'),
 Host('estats-9.toulouse.grid5000.fr')]

In [7]:
# Deploy the configured environment on the reserved nodes. Nothing is done
# when `ifdeploy` is False (set when an already running job is reused).
if ifdeploy:
    env_path = os.path.abspath(environment_dsc_file)
    deployment = Deployment(hosts=nodes, env_file=env_path)
    print(env_path)

    # deploy() is given check_deployed_command=False only when a redeploy is forced.
    check_deployed = not force_redeploy
    deploy_ok, deploy_failed = deploy(
        deployment,
        check_deployed_command=check_deployed,
        stdout_handlers=[sys.stdout],
        stderr_handlers=[sys.stderr],
    )

    status_report = "Deployement status:\n* ok: {}\n* failed: {}".format(deploy_ok, deploy_failed)
    print(status_report)

/home/tunguyen/jetson-imagenet/images/ubuntul4t35-pytorch-estats-datasets-2.yaml
Deployement status:
* ok: {'estats-7.toulouse.grid5000.fr', 'estats-8.toulouse.grid5000.fr', 'estats-9.toulouse.grid5000.fr'}
* failed: set()






toulouse: Connection to toulouse.grid5000.fr closed.toulouse: 

# Grant the nodes access to the NFS storage group and mount the storage on each node

In [8]:
# Ask the Grid'5000 API to open the group storage to the job (the payload names
# the job and its site under "termination"), then mount the export on every node.
# Equivalent shell call:
#   curl -X POST 'https://api.grid5000.fr/stable/sites/<site>/storage/storage1/<storage_group>/access' \
#        -H "Content-Type: application/json" -d '{"termination": {"job": <jobid>, "site": "<site>"}}'
if ifdeploy:
    url = f"https://api.grid5000.fr/stable/sites/{site}/storage/storage1/{storage_group}/access"
    # NOTE(review): verify=False turns off TLS certificate checking; kept from
    # the original call — confirm whether it is still required here.
    response = requests.post(url, headers={"Content-Type": "application/json"},
                             json={"termination": {"job": jobid, "site": site}},
                             verify=False, timeout=60)
    # Report a rejected request instead of discarding the response: the mount
    # below is unlikely to succeed without the access grant.
    if not response.ok:
        print(f"Storage access request to {url} failed: HTTP {response.status_code} {response.text}")
    # Build the storage host from `site`, like the API URL, rather than
    # hard-coding one site name.
    cmd = f"mkdir -p /root/{storage_group} ; mount storage1.{site}.grid5000.fr:/export/group/{storage_group} /root/{storage_group}/"
    _ = execute_command_on_server_and_clients(nodes, cmd, background=False)



Successfully executed on Host('estats-7.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Successfully executed on Host('estats-8.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
Successfully executed on Host('estats-9.toulouse.grid5000.fr') command 'mkdir -p /root/energyfl ; mount storage1.toulouse.grid5000.fr:/export/group/energyfl /root/energyfl/'
