## Run Diamond

Runs `diamond` on the input files against the given reference file.
The diamond tool must already be uploaded to HDFS before running this notebook.

In [None]:
import os
import subprocess
import sys
import stat

from hops import hdfs
from pyspark import SparkContext

import utils



#### load arguments

In [None]:

# Parse the job configuration passed on the command line.
args = utils.load_arguments(sys.argv)

# Guard clause: without a configuration there is nothing to run.
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)

# Only the diamond section of the configuration is used from here on.
args = args[utils.KEY_DIAMOND]

sc = SparkContext.getOrCreate()

# --- settings taken from the diamond configuration section ---
INPUT_ROOT = args['INPUT_ROOT']                # passed to utils.load_file_names to list inputs
OUTPUT_ROOT = args['OUTPUT_ROOT']              # HDFS destination for diamond output files
REFERENCE_FASTA_PATH = args['REFERENCE_FASTA'] # reference the diamond DB is built from
DIAMOND_RUN = args['DIAMOND_RUN']              # appended to the diamond executable in the command
TOP = args['TOP']                              # value of the --top option
COMPRESS = args['COMPRESS']                    # value of the --compress option
OUTPUT_FORMAT = args['OUTPUT_FORMAT']          # value of the --outfmt option
OUTPUT_PREFIX = args['OUTPUT_PREFIX']          # prepended to every output file name
# NOTE: the configuration key is 'ADVANCED_TUNING' while the variable is ADVANCE_TUNING.
ADVANCE_TUNING = args['ADVANCED_TUNING']       # optional extra command-line text
LOG_DIR = args['LOGS_ROOT']                    # HDFS destination for per-file diamond logs
DIAMOND_TOOL = args['DIAMOND_TOOL']            # HDFS path of the diamond executable

# --- derived constants ---
inputRef = os.path.basename(REFERENCE_FASTA_PATH)  # local file name of the reference
DIAMOND_PREFIX = './diamond'                       # local path the executable is run from
SPACE = utils.SPACE

#### Functions to load diamond and to make diamond db

In [None]:
# load diamond scripts from hdfs
def install_diamond(diamond_path):
    """Copy the diamond tool from HDFS and make it executable.

    Args:
        diamond_path: HDFS path of the diamond executable.

    Note:
        The permission change is applied to DIAMOND_PREFIX rather than to
        the copied path, so the file is presumably expected to land in the
        working directory under the name 'diamond' -- confirm against the
        configured DIAMOND_TOOL path.
    """
    hdfs.copy_to_local(diamond_path)

    # Keep the existing permission bits and add the owner-execute bit.
    current_mode = os.stat(DIAMOND_PREFIX).st_mode
    os.chmod(DIAMOND_PREFIX, current_mode | stat.S_IEXEC)


### MAKE DIAMOND DB
def make_diamond_db(reference_path):
    """Build a diamond database from a reference file stored on HDFS.

    Installs the diamond tool, downloads the reference, runs
    ``diamond makedb`` on it and, on success, uploads the resulting
    ``<reference name>_DB.dmnd`` file next to the reference on HDFS.

    Args:
        reference_path: HDFS path of the reference file.

    Returns:
        The HDFS path of the uploaded database, or None when
        ``diamond makedb`` exits with a non-zero status.
    """
    install_diamond(DIAMOND_TOOL)

    hdfs_dir, reference_name = os.path.split(reference_path)
    diamond_DB = reference_name + '_DB.dmnd'
    hdfs.copy_to_local(reference_path)

    # Feed makedb the file downloaded for *this* call; relying on the
    # module-level inputRef would ignore the reference_path argument.
    params = {'makedb --in': reference_name, '-d': diamond_DB}
    cmd_DB = utils.build_command(DIAMOND_PREFIX, params)

    status = subprocess.run(cmd_DB.split(' '), stdout=subprocess.PIPE)

    if status.returncode != 0:
        # Report the failure instead of returning None without a trace.
        print('ERROR: diamond makedb failed with exit code', status.returncode)
        return None

    hdfs.copy_to_hdfs(diamond_DB, hdfs_dir, overwrite=True)
    return os.path.join(hdfs_dir, diamond_DB)




#### Map function running diamond on single file

In [None]:


def apply_diamond(file, diamondDB_path):
    """Run diamond via subprocess on a single input file.

    The diamond tool and the database are downloaded only when they are not
    already present in the working directory; the input file is always
    downloaded. The diamond log is copied to LOG_DIR and, when diamond
    produced it, the output file is copied to OUTPUT_ROOT on HDFS. All
    per-file local artefacts (input, output, log) are removed afterwards,
    whether or not diamond succeeded.

    Args:
        file: HDFS path of the input file.
        diamondDB_path: HDFS path of the diamond database file.

    Returns:
        The local name of the expected output file (OUTPUT_PREFIX plus the
        input name without its extensions, plus '.gz' when COMPRESS is
        truthy). The name is returned even when diamond did not create it.
    """
    # Install diamond only if it is not already in the working directory.
    if not os.path.exists(DIAMOND_PREFIX):
        install_diamond(DIAMOND_TOOL)

    # Download the diamond DB file unless a local copy is already there.
    diamond_DB = os.path.basename(diamondDB_path)
    if not os.path.exists(diamond_DB):
        hdfs.copy_to_local(diamondDB_path)

    # Download the input file.
    hdfs.copy_to_local(file, overwrite=True)
    file = os.path.basename(file)
    # Strip up to two extensions from the file name (e.g. 'x.fastq.gz' -> 'x').
    filename = os.path.splitext(os.path.splitext(file)[0])[0]

    outfile = OUTPUT_PREFIX + filename

    # Extra tuning text is appended to the --compress value so that it ends
    # up at the end of the generated command line.
    if ADVANCE_TUNING:
        advanced_params = str(COMPRESS) + SPACE + ADVANCE_TUNING
    else:
        advanced_params = str(COMPRESS)

    parameters = {'-d': diamond_DB, '-q': file, '-o': outfile, '--top': TOP,
                  '--outfmt': OUTPUT_FORMAT, '--compress': advanced_params}
    cmd = utils.build_command(DIAMOND_PREFIX + SPACE + DIAMOND_RUN, parameters)
    print('INFO: Running diamond with command:', cmd)

    log_file = os.path.splitext(outfile)[0] + '.txt'
    # Presumably diamond appends '.gz' to the output name when compression
    # is enabled -- confirm against the diamond manual.
    target = outfile + '.gz' if COMPRESS else outfile

    try:
        with open(log_file, "w") as f:
            status = subprocess.run(cmd.split(' '), stdout=f, stderr=f)

        if status.returncode != 0:
            print('WARNING: diamond exited with code', status.returncode,
                  'for', file)

        hdfs.copy_to_hdfs(log_file, LOG_DIR, overwrite=True)

        if os.path.exists(target):
            hdfs.copy_to_hdfs(target, OUTPUT_ROOT, overwrite=True)
    finally:
        # Always free local disk space, including after a failed run.
        for leftover in (target, file, log_file):
            if os.path.exists(leftover):
                os.remove(leftover)

    return target
    

#### Run

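* First, build the diamond database from the reference file.
* Then load the HDFS paths of the input files.
* Finally, create a Spark RDD from the paths and map the diamond function over it so the files are processed in parallel.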


In [None]:
# Build the diamond DB on the driver; the result is the DB's HDFS path.
diamondDB_path = make_diamond_db(REFERENCE_FASTA_PATH)

# make_diamond_db returns None when makedb fails. Stop here instead of
# submitting a Spark job in which every task would fail on the missing path.
if diamondDB_path is None:
    sys.exit('ERROR: could not build the diamond database from '
             + str(REFERENCE_FASTA_PATH))

# Load the input file paths.
inputFiles = utils.load_file_names(INPUT_ROOT)

# Run diamond on every input file in parallel and collect the output names.
final = (
    sc.parallelize(inputFiles)
    .map(lambda x: apply_diamond(x, diamondDB_path))
    .collect()
)


