## Run Diamond

Builds a DIAMOND database from the reference FASTA file, then runs `diamond` on each input file in parallel via Spark.

#### WIP notes



In [None]:
import os
import subprocess
import sys

from hops import hdfs
from pyspark import SparkContext

import utils



In [None]:
# Load the job configuration derived from the command-line arguments; without
# it there is nothing to run, so stop with the shared "no config" error.
args = utils.load_arguments(sys.argv)
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)

# Only the 'Diamond' section of the configuration is used from here on.
args = args['Diamond']

sc = SparkContext.getOrCreate()

In [None]:

# Settings taken from the loaded configuration mapping `args`.
INPUT_ROOT = args['INPUT_ROOT']
OUTPUT_ROOT = args['OUTPUT_ROOT']
REFERENCE_FASTA_PATH = args['REFERENCE_FASTA']
DIAMOND_RUN = args['DIAMOND_RUN']
TOP = args['TOP']
COMPRESS = args['COMPRESS']
OUTPUT_FORMAT = args['OUTPUT_FORMAT']
OUTPUT_PREFIX = args['OUTPUT_PREFIX']
# NOTE: the variable is ADVANCE_TUNING but the configuration key is
# 'ADVANCED_TUNING'.
ADVANCE_TUNING = args['ADVANCED_TUNING']
LOG_DIR = args['LOGS_ROOT']

# File name of the reference FASTA without its directory part.
inputRef = os.path.basename(REFERENCE_FASTA_PATH)
# Command prefix used for every diamond invocation (binary referenced
# relative to the current working directory).
DIAMOND_PREFIX = './diamond'
SPACE = utils.SPACE

In [None]:

### MAKE DIAMOND DB
def make_diamond_db(reference_path):
    """Build a DIAMOND database from a reference FASTA stored in HDFS.

    The reference is copied out of HDFS with ``hdfs.copy_to_local`` and
    ``diamond makedb`` is run on it. The database file is named after the
    reference: ``<reference name without extension>_DB.dmnd``.

    Args:
        reference_path: HDFS path of the reference FASTA file.

    Returns:
        The file name of the generated database, or ``None`` when
        ``diamond makedb`` exits with a non-zero status (the failure is
        reported on stdout before returning).
    """
    ref = os.path.basename(reference_path)
    diamond_DB = os.path.splitext(ref)[0] + '_DB.dmnd'
    hdfs.copy_to_local(reference_path)
    params = {'makedb --in': ref, '-d': diamond_DB}
    cmd_DB = utils.build_command(DIAMOND_PREFIX, params)

    # NOTE: the command string is tokenised on single spaces, so file names
    # containing spaces are not supported.
    status = subprocess.run(cmd_DB.split(' '), stdout=subprocess.PIPE)

    if status.returncode != 0:
        # Do not fail silently: callers only see ``None``, so say why here.
        print('ERROR: diamond makedb failed with exit status',
              status.returncode, 'for command:', cmd_DB)
        if status.stdout:
            print(status.stdout.decode(errors='replace'))
        return None

    return diamond_DB




In [None]:



def apply_diamond(file, diamondDB_path):
    """Run DIAMOND on one input file and upload its output and log to HDFS.

    The input file and the database are copied out of HDFS, diamond is run
    with stdout/stderr redirected to a local log file, and the log and the
    result (if produced) are uploaded to ``LOG_DIR`` and ``OUTPUT_ROOT``.
    Local working copies of the input, log and result are removed afterwards.

    Args:
        file: Name of the input file, joined onto ``INPUT_ROOT`` to locate
            it in HDFS.
        diamondDB_path: HDFS path of the DIAMOND database file.

    Returns:
        The expected local output name: ``OUTPUT_PREFIX`` + the input name
        with up to two extensions stripped, plus ``.gz`` when ``COMPRESS``
        is truthy. The name is returned even when diamond failed and no
        output was uploaded; such failures are reported on stdout.
    """
    # Strip up to two extensions, e.g. ``reads.fastq.gz`` -> ``reads``.
    filename = os.path.splitext(os.path.splitext(file)[0])[0]
    outfile = OUTPUT_PREFIX + filename
    log_file = os.path.splitext(outfile)[0] + '.txt'

    # NOTE(review): this relies on COMPRESS being falsy when compression is
    # off (e.g. int 0); a string such as '0' would be truthy -- confirm
    # against the configuration format.
    target = outfile + '.gz' if COMPRESS else outfile

    # Extra tuning flags are appended to the value of ``--compress`` so that
    # they end up as additional tokens at the end of the command string.
    if ADVANCE_TUNING:
        advanced_params = str(COMPRESS) + SPACE + ADVANCE_TUNING
    else:
        advanced_params = str(COMPRESS)

    try:
        hdfs.copy_to_local(os.path.join(INPUT_ROOT, file), overwrite=True)
        hdfs.copy_to_local(diamondDB_path)
        diamond_DB = os.path.basename(diamondDB_path)

        parameters = {
            '-d': diamond_DB,
            '-q': file,
            '-o': outfile,
            '--top': TOP,
            '--outfmt': OUTPUT_FORMAT,
            '--compress': advanced_params,
        }
        cmd = utils.build_command(DIAMOND_PREFIX + SPACE + DIAMOND_RUN, parameters)
        print('INFO: Running diamond with command:', cmd)

        # NOTE: the command string is tokenised on single spaces, so file
        # names containing spaces are not supported.
        with open(log_file, "w") as f:
            status = subprocess.run(cmd.split(' '), stdout=f, stderr=f)

        if status.returncode != 0:
            print('WARN: diamond exited with status', status.returncode,
                  'for input', file, '- see log', log_file)

        # Upload the log first so it is available even when the run failed.
        hdfs.copy_to_hdfs(log_file, LOG_DIR, overwrite=True)
        os.remove(log_file)

        if os.path.exists(target):
            hdfs.copy_to_hdfs(target, OUTPUT_ROOT, overwrite=True)
            os.remove(target)
        else:
            print('WARN: expected diamond output', target,
                  'was not produced for input', file)
    finally:
        # Always drop the local copy of the input, including after a failed
        # run, so working files do not pile up on the executor.
        if os.path.exists(file):
            os.remove(file)

    return target
    

In [None]:
# make diamond DB
diamond_DB = make_diamond_db(REFERENCE_FASTA_PATH)

# Without a database no input can be processed; stop with an error instead of
# finishing "successfully" without having produced any output.
if diamond_DB is None:
    sys.exit('ERROR: could not build the diamond database from '
             + str(REFERENCE_FASTA_PATH))

# Store the database next to the reference FASTA in HDFS so that
# apply_diamond can fetch it from there.
hdfs_path = os.path.dirname(REFERENCE_FASTA_PATH)
hdfs.copy_to_hdfs(diamond_DB, hdfs_path, overwrite=True)
diamondDB_path = os.path.join(hdfs_path, diamond_DB)

# load input data
inputFiles = utils.load_file_names(INPUT_ROOT)

# run diamond on every input file; `final` holds the output name returned by
# apply_diamond for each input
final = (
    sc.parallelize(inputFiles)
    .map(lambda x: apply_diamond(x, diamondDB_path))
    .collect()
)


