## Filter non human

This job filters non-human (unmapped) reads out of a BAM file using samtools (via pysam). It needs a BED file that specifies the regions whose reads are to be filtered out.

In [None]:
import pysam
import os
from hops import hdfs
import utils
import sys
from pyspark import SparkContext
import subprocess


#### Load arguments

In [None]:
# Debug alternative that points at a fixed settings file:
# args=utils.load_arguments([0,'hdfs:///Projects/TCGA_viruses/Jupyter/pipeline/settings_DJ.yml'])

# Load the job configuration from the command line; stop when none is available.
args = utils.load_arguments(sys.argv)
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)

# Keep only the settings section used by this job.
args = args['Unhuman']

sc = SparkContext.getOrCreate()

# Settings read by the rest of the job.
inputRoot = args['INPUT_ROOT']
outputBam = args['OUTPUT_BAM']
humanFilterPath = args['FILTER_BED']

# Stored as a string because it is passed to pysam.view as an argument.
threads = str(args['THREADS'])


#### map function

In [None]:
def remove_human(file):
    """Filter one BAM file against the BED regions and upload the result to HDFS.

    The BAM at ``file`` is copied from HDFS into the local working directory
    and run through ``samtools view`` (via pysam). Reads selected by the BED
    file (``-L``) are sent to ``/dev/null``; the reads that are *not* selected
    are written (``-U``) to ``<name>_NH.bam``, where ``<name>`` is the part of
    the input file name before its first dot. That file is then copied to
    ``outputBam`` on HDFS.

    If the output file name is already present at the destination, processing
    is skipped so that resubmitting a failed run does not redo finished files.

    Local copies of the input and output BAM are always removed afterwards,
    including when samtools or the upload fails, so that failed tasks do not
    leave large files behind on the executor.

    Args:
        file: HDFS path of the input BAM file.

    Returns:
        The output file name (``<name>_NH.bam``) when the input was processed,
        or ``None`` when it was skipped because the output already exists.
        The name is returned even if samtools produced no output file; a
        warning is printed in that case.
    """
    filename = os.path.basename(file)
    bam_file = filename.split('.')[0] + '_NH.bam'

    # Skip inputs whose output already exists at the destination.
    if hdfs.exists(os.path.join(outputBam, bam_file)):
        print('skipping existing file: ', filename)
        return None

    try:
        hdfs.copy_to_local(humanFilterPath)
        humanFilter = os.path.basename(humanFilterPath)
        hdfs.copy_to_local(file, overwrite=True)
        print("INFO: Run non human BAM : ", filename)
        # -L selects reads overlapping the BED regions (discarded through -o),
        # -U receives the reads that were not selected.
        pysam.view('-o', '/dev/null', '-L', humanFilter, '-U', bam_file,
                   filename, '-@', threads, catch_stdout=False)

        if os.path.exists(bam_file):
            hdfs.copy_to_hdfs(bam_file, outputBam, overwrite=True)
        else:
            print('WARNING: no output produced for: ', filename)
    finally:
        # Always clean up the local BAM copies, even after a failure.
        for local_copy in (bam_file, filename):
            if os.path.exists(local_copy):
                os.remove(local_copy)

    return bam_file




    


#### Load the HDFS paths of the input files

In [None]:

# Build the list of inputs from inputRoot using the project helper
# utils.load_file_names (presumably the files under that root -- confirm in
# utils). Each entry is later passed to remove_human, which copies it from HDFS.
inputFiles=utils.load_file_names(inputRoot)



In [None]:
# Report how many inputs were loaded.
print('Number of input files: ', len(inputFiles))

#### Run in parallel

In [None]:

# Turn the input list into a Spark RDD, apply remove_human to every entry,
# and collect the per-file return values back on the driver.
unMapped = (
    sc.parallelize(inputFiles)
    .map(remove_human)
    .collect()
)
