## Merge BAM

This job merges BAM files per sample using samtools. Unique files are found by combining:

* Lanes
* Paired/Unpaired
* All parts, in case the input FASTQ was split into smaller parts

In [None]:
import os
import subprocess
import sys

import pysam
from hops import hdfs
from pyspark import SparkContext

import utils



#### Load arguments

In [None]:
# Load the job arguments from sys.argv and keep only the merge section;
# exit with utils.NO_CONFIG_ERR when no configuration could be loaded.
args = utils.load_arguments(sys.argv)
#args=utils.load_arguments([0,'hdfs:///Projects/4C/Jupyter/settingsDJ.yml'])
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)
args = args[utils.KEY_MERGE]

# Spark context used to distribute the per-sample merge.
sc = SparkContext.getOrCreate()

SPACE = utils.SPACE
INPUT_ROOT = args['INPUT_ROOT']      # location of the BAM files to merge
OUTPUT_MERGE = args['OUTPUT_MERGE']  # destination of the merged BAM files
# Stored as a string because it is inserted into the samtools command.
THREADS = str(args['THREADS'])




#### Function to map


In [None]:

def merge_files(file):
    """Merge all BAM files of one sample with 'samtools merge'.

    Every entry of the module-level ``inputFiles`` list whose name contains
    ``file`` is copied from ``INPUT_ROOT`` to the local working directory and
    merged into ``<file>.bam``. The merged file is uploaded to
    ``OUTPUT_MERGE`` only when samtools exits successfully. All local copies
    (inputs and merged output) are removed afterwards, whether or not the
    merge succeeded, so failed runs do not leave files on the executor.

    Args:
        file: sample name; used both to select the input files and as the
            base name of the merged output.

    Returns:
        The local file name of the merged BAM (``<file>.bam``). It is
        returned even when the merge failed; a warning is printed in that
        case.
    """
    print("INFO: Run merge files", file)
    merged_file = file + '.bam'

    # All files for the sample name.
    # NOTE(review): this is a substring match, so a sample named "S1" would
    # also select files of "S10" -- confirm that utils.find_unique_names
    # yields names that cannot be prefixes of each other.
    group_files = [name for name in inputFiles if file in name]

    try:
        # Copy all files to local.
        for name in group_files:
            hdfs.copy_to_local(os.path.join(INPUT_ROOT, name), overwrite=True)

        # The joined input names are used as a parameter key with an empty
        # value so they end up between the output name and the thread option.
        input_names = ' '.join(group_files)
        params = {'merge': merged_file, input_names: '', '-@': THREADS}
        cmd = utils.build_command(utils.SAMTOOLS, params)

        result = subprocess.run(cmd.split(), stdout=subprocess.PIPE)
        if result.returncode != 0:
            print("WARN: samtools merge failed with exit code",
                  result.returncode, "for", file)
        elif os.path.exists(merged_file):
            hdfs.copy_to_hdfs(merged_file, OUTPUT_MERGE, overwrite=True)
    finally:
        # Always clean up local copies; a file may be missing when an
        # earlier step failed, so only remove what is actually there.
        for local_path in [merged_file] + group_files:
            if os.path.exists(local_path):
                os.remove(local_path)

    return merged_file
  

        
    


#### Load input file names

In [None]:
# List the input files under INPUT_ROOT and keep only their base names.
inputFiles = list(map(os.path.basename, utils.load_file_names(INPUT_ROOT)))
# Derive the unique sample names; each one becomes a separate merge task.
uniques = utils.find_unique_names(inputFiles)


#### Run in parallel 

In [None]:
## Distribute the sample names over Spark, merge each sample on an executor
## and collect the resulting merged file names on the driver.
mergedList = (
    sc.parallelize(uniques)
    .map(merge_files)
    .collect()
)

