### Filtering SAM to BAM unmapped 

This job filters the SAM files output by NextGenMap. It runs `samtools` via `pysam` to convert each file to BAM and keep only the unmapped content.

In [None]:
import pysam
import os
from hops import hdfs
import utils
import sys
from pyspark import SparkContext
import subprocess


#### Load arguments

In [None]:

# Read the job configuration from the command line; without one there is
# nothing to do, so exit with the shared "no config" status.
args = utils.load_arguments(sys.argv)
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)
# Only the SAM-filtering section of the configuration is used by this job.
args = args[utils.KEY_SAM]

sc = SparkContext.getOrCreate()

# Settings read by convert_sam below.
inputRoot = args['INPUT_ROOT']
outputBam = args['OUTPUT_BAM']

# Kept as a string because it is passed as a samtools command-line argument.
threads = str(args['THREADS'])


##### Map function

In [None]:
"""
Run the pysam equivalent of 'samtools view'.
The output is copied back to HDFS.
"""
def convert_sam(file_path):
    """Convert one SAM file to a BAM file holding only its unmapped reads.

    The SAM file is copied from HDFS to the local working directory,
    filtered with the pysam equivalent of ``samtools view -b -f 4`` and
    the resulting BAM is copied to the ``outputBam`` HDFS location.

    Args:
        file_path: HDFS path of the SAM file to convert.

    Returns:
        ``[-1]`` when ``utils.skip_file`` reports the file should be
        skipped, otherwise the local name of the BAM file (returned even
        if samtools produced no output, as before).
    """
    sam_name = os.path.basename(file_path)
    bam_file = sam_name.split('.')[0] + utils.UNMAPPED_BAM
    if utils.skip_file(sam_name, bam_file, outputBam):
        return [-1]

    try:
        hdfs.copy_to_local(file_path, overwrite=True)
        print("INFO: Run unmapped sequences BAM : ", sam_name)
        # '-f 4' keeps only reads with the "unmapped" flag set; the option
        # and its value are passed as separate arguments, as on a real
        # command line.
        pysam.view('-o', bam_file, '-b', sam_name, '-f', '4',
                   '-@', threads, catch_stdout=False)

        if os.path.exists(bam_file):
            hdfs.copy_to_hdfs(bam_file, outputBam, overwrite=True)
    finally:
        # Always clean up local copies, including when samtools failed or
        # produced no output; otherwise the downloaded SAM files pile up
        # on the executor's disk.
        for local_path in (bam_file, sam_name):
            if os.path.exists(local_path):
                os.remove(local_path)

    return bam_file




    


In [None]:
# Collect the input file paths to process, starting from inputRoot.
inputFiles = utils.load_file_names(inputRoot)





#### Run in parallel

In [None]:
# Distribute the input paths across the cluster, run convert_sam on each one
# and gather the per-file results back on the driver.
unMapped = (
    sc.parallelize(inputFiles)
    .map(convert_sam)
    .collect()
)