# Very basic `qsub`, `qstat`, `qdel` Python wrapper

## Basic code

In [180]:
# qsub_utils

import os
import subprocess
import platform
import logging

PBS_CONFIGURATION = {}
logger = logging.getLogger('qsub_utils')


def setup_configuration(nodes=''):
    """
    Method to setup PBS configuration
    :param nodes: specify nodes with list of features. `-l nodes={nodes}`
    """
    global PBS_CONFIGURATION   
    if len(nodes) > 0:
        PBS_CONFIGURATION['nodes'] = nodes
        

def get_configuration_str(conf_dict):
    """
    Method to convert configuration dictionary to string
    """
    conf_str = "#PBS"    
    if 'nodes' in conf_dict:
        conf_str += " -l nodes=%s" % conf_dict['nodes']
    if 'name' in conf_dict:
        conf_str += " -N %s" % conf_dict['name']
    if 'cwd' in conf_dict:         
        conf_str += " -d %s" % conf_dict['cwd'] 
    if 'stdout' in conf_dict:
        conf_str += " -o %s" % conf_dict['stdout']
    if 'stderr' in conf_dict:        
        conf_str += " -e %s" % conf_dict['stderr']           
    return conf_str

    
def write_launch_file(cmd_str, conf_dict, env=''):
    """
    Method to write a PBS launch file for qsub

    :param cmd_str: command string, e.g "python -c \'import sys; sys.print\'"
    :param name: name of the job
    :param cwd: current working directory
    :param env: environmanet string, e.g. "export PATH=$PATH:/path/to/bin"
    """
    with open(conf_dict['launch'], 'w') as w:        
        w.write(get_configuration_str(conf_dict) + '\n\n')
        if len(env) > 0:
            w.write(env + '\n')
        w.write(cmd_str)


def submit_job(cmd, name, cwd='', env=''):
    """
    Method to submit a job writing a launch file and using qsub
    `qsub job_{name}.launch`

    :param cmd: list of commands, e.g. ['python', '-c', '\"import sys; print sys.path\"']
    :param name: name of the job
    :param cwd: current working directory
    :param env: environmanet string, e.g. "export PATH=$PATH:/path/to/bin"
    """
    assert len(name) > 0, "Job name can not be empty"
    assert len(cmd) > 0, "Job command can not be empty"     
    
    if ' ' in name:
        name = name.replace(' ', '_')    

    filename = os.path.join(cwd, '%s.launch' % name)
        
    job_conf = dict()
    job_conf['nodes'] = PBS_CONFIGURATION['nodes']
    job_conf['name'] = name
    if len(cwd) > 0: job_conf['cwd'] = cwd
    job_conf['launch'] = filename
    job_conf['stdout'] = os.path.join(cwd, "log.out")
    job_conf['stderr'] = os.path.join(cwd, "log.err")
        
    write_launch_file(' '.join(cmd), job_conf, env)
    program = ['qsub', filename]
    process = subprocess.Popen(program,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=False if platform.system() == 'Windows' else True)
    process.wait()
    job_id = process.stdout.read()
    assert job_id is not None and len(job_id) > 0, "Failed to fetch job id from qsub"

    job_conf['id'] = job_id.replace('\n', '')
    return job_conf


def delete_job(job_id):
    if not job_is_running(job_id):
        logger.warn("Job '%s' is not running. Can not delete job" % job_id)
        return False
    _id = _get_id(job_id)
    program = ['qdel', _id]
    process = subprocess.Popen(program,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=False if platform.system() == 'Windows' else True)
    returncode = process.wait()
    return returncode == 0


def _get_id(job_id):
    _id = job_id.split('.')[0]
    return _id


def get_stats(job_id):
    _id = _get_id(job_id)
    program = ['qstat', '-f', _id]
    process = subprocess.Popen(program,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=False if platform.system() == 'Windows' else True)
    process.wait()
    out = process.stdout.read()
    out = out.split('\n')
    stats = {}
    if len(out) > 0:
        for line in out:
            kv = line.split(' = ')
            if len(kv) > 1:
                stats[kv[0].strip()] = kv[1].strip()
    return stats


def job_is_running(job_id):
    stats = get_stats(job_id)
    if len(stats) > 0:
        """
        the job states
            E -    Job is exiting after having run.
            H -    Job is held.
            Q -    job is queued, eligable to run or routed.
            R -    job is running.
            T -    job is being moved to new location.
            W -    job is waiting for its execution time (-a option) to be reached.
            S -    (Unicos only) job is suspend.
        """
        if stats['job_state'] in ['R', 'Q', 'H', 'T', 'W']:
            return True
    else:
        return False


def get_stdout(job_info):
    filename = job_info['stdout']
    if not os.path.exists(filename):
        logger.warn("Stdout filename %s' is not found" % filename)
        return None
    out = []
    with open(filename, 'r') as r:
        while True:
            line = r.readline()
            if len(line) == 0:
                break
            out.append(line)
    return out


def get_stderr(job_info):
    filename = job_info['stderr']
    if not os.path.exists(filename):
        logger.warn("Stdout filename %s' is not found" % filename)
        return None
    out = []
    with open(filename, 'r') as r:
        while True:
            line = r.readline()
            if len(line) == 0:
                break
        out.append(line)
    return out


## Example

In [181]:
setup_configuration(nodes='1')

In [182]:
PBS_CONFIGURATION

{'nodes': '1'}

In [183]:
caffe_help_cmd = [
    "caffe",
    "--help", 
]
job_info = submit_job(caffe_help_cmd, 'caffe_help', env='export PATH=$PATH:/opt/caffe-master/build/tools/')

print('Before while')
while job_is_running(job_info['id']):        
    print('-')
    time.sleep(0.5)
#     print delete_job(job_info['id'])
    
stats = get_stats(job_info['id'])
print stats

Before while
{}


In [184]:
job_info

{'id': 'qsub: waiting for job 3958.c001 to startqsub: job 3958.c001 ready\r  ########################################################################\r  # Colfax Cluster - https://colfaxresearch.com/\r  #      Date:           Tue Mar 21 10:10:12 PDT 2017\r  #    Job ID:           3958.c001\r  #      User:           u2459\r  # Resources:           neednodes=1,nodes=1,walltime=24:00:00\r  ########################################################################\r  \r-bash: caffe_help.launch: command not found\rqsub: job 3958.c001 completed',
 'launch': 'caffe_help.launch',
 'name': 'caffe_help',
 'nodes': '1',
 'stderr': 'log.err',
 'stdout': 'log.out'}

In [150]:
# import time 

# python_inf_cmd = [
#     "python",
#     "-c \"print(\'YOU SHOULD SEE THIS\')\"", 
# ]

# job_info = submit_job(python_inf_cmd, 'python_sys', env='export PATH=$PATH:/opt/caffe-master/build/tools/')

# print('Before while')
# while job_is_running(job_info['id']):        
#     print('-')
#     time.sleep(0.5)
# #     print delete_job(job_info['id'])
    
# stats = get_stats(job_info['id'])
# print stats

In [151]:
job_info

{'id': '3916.c001',
 'launch': 'caffe_help.launch',
 'name': 'caffe_help',
 'nodes': '1',
 'stderr': 'log.err',
 'stdout': 'log.out'}

In [172]:
!cat {job_info['launch']}

#!/bin/bash
#PBS -l nodes=1 -N caffe_help -o log.out -e log.err

caffe --help


In [166]:
!{job_info['launch']}

/usr/bin/sh: caffe_help.launch: command not found


In [178]:
job_info['launch']

'caffe_help.launch'

In [179]:
!export PATH="$PATH:/opt/caffe-master/build/tools/" && qsub -V -I -x ~/Intel_MobileODT/notebooks/caffe_help.launch
#!echo /opt/caffe-master/build/tools/caffe --help | qsub -l nodes=1 -N python_sys -o log.out -e log.err 

qsub: waiting for job 3957.c001 to start
qsub: job 3957.c001 ready


  ########################################################################
  # Colfax Cluster - https://colfaxresearch.com/
  #      Date:           Tue Mar 21 10:01:15 PDT 2017
  #    Job ID:           3957.c001
  #      User:           u2459
  # Resources:           neednodes=1,nodes=1,walltime=24:00:00
  ########################################################################
  
caffe: command line brew
usage: caffe <command> <args>

commands:
  train           train or finetune a model
  test            score a model
  data_server     run data server - remote data source
  device_query    show GPU diagnostic information
  time            benchmark model execution time
  collect         collects layer data on specified device
  compare         collects layer data using inputs from other device

  Flags from /builddir/build/BUILD/gflags-2.1.1/src/gflags.cc:
    -flagfile (load flags from file) type: string default: 

In [160]:
!ls -all 

total 15720
drwxrwxr-x.  4 u2459 u2459     4096 Mar 21  2017 .
drwxrwxr-x. 10 u2459 u2459     4096 Mar 20 03:28 ..
-rw-rw-r--.  1 u2459 u2459      114 Mar 21  2017 caffe_help.launch
-rw-rw-r--.  1 u2459 u2459      821 Mar 20 18:55 cervix_detector.ipynb
-rw-rw-r--.  1 u2459 u2459     9947 Mar 20 18:55 colfax_intel_caffe_tryouts.ipynb
-rw-rw-r--.  1 u2459 u2459 15299022 Mar 20 03:28 data_exploration.ipynb
-rw-rw-r--.  1 u2459 u2459   174919 Mar 20 03:28 dev_unet.ipynb
-rw-rw-r--.  1 u2459 u2459    36857 Mar 20 03:28 digits__data_preparation.ipynb
-rw-rw-r--.  1 u2459 u2459    58493 Mar 21  2017 example_qsub_utils.ipynb
drwxr-xr-x.  2 u2459 u2459     4096 Mar 20 18:56 .ipynb_checkpoints
-rw-rw-r--.  1 u2459 u2459      154 Mar 21  2017 job_caffe_help.launch
-rw-rw-r--.  1 u2459 u2459      155 Mar 21 04:20 job_python_inf.launch
-rw-------.  1 u2459 u2459        0 Mar 21 08:26 log.err
-rw-------.  1 u2459 u2459      634 Mar 21 08:26 log.out
-rw-rw-r--.  1 u2459 u2459      142 

In [161]:
out = get_stdout(job_info)
err = get_stderr(job_info)

print out
print err

['\n', '  ########################################################################\n', '  # Colfax Cluster - https://colfaxresearch.com/\n', '  #      Date:           Tue Mar 21 08:26:36 PDT 2017\n', '  #    Job ID:           3917.c001\n', '  #      User:           u2459\n', '  # Resources:           neednodes=1,nodes=1,walltime=24:00:00\n', '  ########################################################################\n', '  \n', '\n', '  ########################################################################\n', '  # Colfax Cluster\n', '  # End of output for job 3917.c001\n', '  # Date: Tue Mar 21 08:26:37 PDT 2017\n', '  ########################################################################\n', '  \n']
['']


In [162]:
!cat log.out


  ########################################################################
  # Colfax Cluster - https://colfaxresearch.com/
  #      Date:           Tue Mar 21 08:26:36 PDT 2017
  #    Job ID:           3917.c001
  #      User:           u2459
  # Resources:           neednodes=1,nodes=1,walltime=24:00:00
  ########################################################################
  

  ########################################################################
  # Colfax Cluster
  # End of output for job 3917.c001
  # Date: Tue Mar 21 08:26:37 PDT 2017
  ########################################################################
  


In [130]:
!cat /var/log/messages

cat: /var/log/messages: Permission denied


In [97]:
!ls

cervix_detector.ipynb		  log.err
colfax_intel_caffe_tryouts.ipynb  log.out
data_exploration.ipynb		  python_sys.launch
dev_unet.ipynb			  resnet_with_keras.ipynb
digits__data_preparation.ipynb	  test_data_iterator.ipynb
example_qsub_utils.ipynb	  unet_with_keras.ipynb
job_caffe_help.launch		  weights
job_python_inf.launch


In [94]:
job_info

{'cwd': '',
 'id': '3855.c001',
 'launch_filename': 'job_python_inf.launch',
 'name': 'python_inf',
 'stderr_filename': '/home/u2459/python_inf.e3855',
 'stdout_filename': '/home/u2459/python_inf.o3855'}

In [96]:
print delete_job(job_info['id'])

None


In [23]:
get_stats(job_id)

{}

In [24]:
!cat {filename}

#PBS -l nodes=1:knl7210:ram96gb -o /home/u2459 -e /home/u2459 -N python_inf
export PATH=$PATH:/opt/caffe-master/build/tools/
python -c "import time;time.sleep(1000000000)"

In [27]:
#!rm ~/python_inf.*

In [28]:
!ls ~/

build_opencv	     Intel_MobileODT  start_digits.sh	test_launch
DIGITS		     keras_source     start_digits.sh~	test.py
digits_dependencies  opencv	      STDIN.e3535	tmp
env.local	     opencv_source    STDIN.o3535


In [29]:
!qstat -f {job_id.split(".")[0]}

qstat: Unknown Job Id Error 3840.c001


In [25]:
get_stats(job_id)

out split ['']

STATS: {}

ERR:  qstat: Unknown Job Id Error 3835.c001



In [14]:
!qstat -f 38454

qstat: Unknown Job Id Error 38454.c001


In [104]:
!ls ~/

build_opencv	     keras_source      STDIN.e3535  Train_Caffe_Model.e3859
DIGITS		     opencv	       STDIN.o3535  Train_Caffe_Model.o3859
digits_dependencies  opencv_source     test_launch
env.local	     start_digits.sh   test.py
Intel_MobileODT      start_digits.sh~  tmp


In [107]:
!cat ~/Train_Caffe_Model.e3859

/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 34:ln=01: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 36:mh=00:pi=40: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 33:so=01: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 35:do=01: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 35:bd=40: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 33: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 01:cd=40: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 33: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 01:or=40: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 31: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 01:mi=01: command not found
/var/spool/torque/mom_priv/jobs/3859.c001.SC: line 3: 05: command not found
/var/spool/torque/mom_

In [108]:
!cat ~/Train_Caffe_Model.o3859


  ########################################################################
  # Colfax Cluster - https://colfaxresearch.com/
  #      Date:           Tue Mar 21 04:17:42 PDT 2017
  #    Job ID:           3859.c001
  #      User:           u2459
  # Resources:           neednodes=1:knl7210:ram96gb,nodes=1:knl7210:ram96gb,walltime=24:00:00
  ########################################################################
  

  ########################################################################
  # Colfax Cluster
  # End of output for job 3859.c001
  # Date: Tue Mar 21 04:17:43 PDT 2017
  ########################################################################
  


In [None]:
!ls 

In [33]:
!ls /home/u2459/DIGITS/digits/jobs/20170321-071415-7265

caffe_output.log	      original.prototxt  Train_Caffe_Model.e3889
deploy.prototxt		      solver.prototxt	 Train_Caffe_Model.o3889
job_Train_Caffe_Model.launch  status.pickle	 train_val.prototxt


In [34]:
!cat /home/u2459/DIGITS/digits/jobs/20170321-071415-7265/job_Train_Caffe_Model.launch

#PBS -l nodes=1:knl7210:ram96gb -N Train_Caffe_Model -d /home/u2459/DIGITS/digits/jobs/20170321-071415-7265 -o /home/u2459/DIGITS/digits/jobs/20170321-071415-7265 -e /home/u2459/DIGITS/digits/jobs/20170321-071415-7265

/opt/caffe-master/build/tools/caffe --help

In [40]:
!cat /home/u2459/DIGITS/digits/jobs/20170321-071415-7265/Train_Caffe_Model.e3893

In [36]:
!qsub /home/u2459/DIGITS/digits/jobs/20170321-071415-7265/job_Train_Caffe_Model.launch

3893.c001


In [27]:
#!echo /opt/caffe-master/build/tools/caffe --help | qsub -l nodes=1:knl7210:ram96gb

3890.c001


In [None]:
!echo caffe_help.launch

In [49]:
!qstat

In [43]:
!ls
#!cat STDIN.o3890

cervix_detector.ipynb		  job_caffe_help.launch
colfax_intel_caffe_tryouts.ipynb  job_python_inf.launch
data_exploration.ipynb		  resnet_with_keras.ipynb
dev_unet.ipynb			  test_data_iterator.ipynb
digits__data_preparation.ipynb	  unet_with_keras.ipynb
example_qsub_utils.ipynb	  weights


In [44]:
!cat job_caffe_help.launch

#PBS -l nodes=1:knl7210:ram96gb -o /home/u2459 -e /home/u2459 -N caffe_help
export PATH=$PATH:/opt/caffe-master/build/tools/
caffe --help

In [53]:
!qsub job_caffe_help.launch

3897.c001


In [54]:
!cat /home/u2459/log.out


  ########################################################################
  # Colfax Cluster - https://colfaxresearch.com/
  #      Date:           Tue Mar 21 07:30:16 PDT 2017
  #    Job ID:           3897.c001
  #      User:           u2459
  # Resources:           neednodes=1:knl7210:ram96gb,nodes=1:knl7210:ram96gb,walltime=24:00:00
  ########################################################################
  
caffe: command line brew
usage: caffe <command> <args>

commands:
  train           train or finetune a model
  test            score a model
  data_server     run data server - remote data source
  device_query    show GPU diagnostic information
  time            benchmark model execution time
  collect         collects layer data on specified device
  compare         collects layer data using inputs from other device

  Flags from /builddir/build/BUILD/gflags-2.1.1/src/gflags.cc:
    -flagfile (load flags from file) type: string default: ""
    -frome