## Filtering BAM 

This job filters BAM files. It runs `samtools` to convert them with the following steps:

* Merge lanes per sample
* Sort BAM
* Convert BAM to FASTQ

In [None]:
import os
import subprocess
import sys

import pysam
from hops import hdfs
from pyspark import SparkContext

import utils



In [None]:
args = utils.load_arguments(sys.argv)

# Without a configuration there is nothing to run: exit with the shared
# "no config" error. Otherwise keep only this job's section.
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)
args = args['FilterSAM']

# Reuse the running Spark context if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

In [None]:

# Absolute path of the samtools binary used for the BAM -> FASTQ conversion.
SAMTOOLS = '/srv/hops/anaconda/envs/4C/bin/samtools'
SPACE = utils.SPACE

# Directories taken from the 'FilterSAM' section of the job configuration.
OUTPUT_BAM = args['OUTPUT_BAM']
OUTPUT_MERGE = args['OUTPUT_MERGE']
OUTPUT_SORT = args['OUTPUT_SORT']
OUTPUT_FASTQ = args['OUTPUT_FASTQ']

# Kept as a string because it is passed straight to command-line tools.
THREADS = str(args['THREADS'])

## The input of this job is the directory holding the unmapped BAM files.
inputRoot = OUTPUT_BAM

In [None]:


def merge_files(file):
    """Merge the per-lane BAM files of one sample into a single BAM.

    Each input is copied from ``OUTPUT_BAM`` to the local working directory,
    merged with ``samtools merge`` (through pysam), and the merged file is
    uploaded to ``OUTPUT_MERGE``. Local copies are removed once the merged
    file has been uploaded.

    Args:
        file: Sequence of BAM file names (relative to ``OUTPUT_BAM``), one
            per lane. Any number of lanes is accepted.

    Returns:
        The name of the merged BAM file: the first input name with its
        extension dropped, cut at ``utils.LANE_SEPAROTOR``, plus ``.bam``.
        The name is returned even if the merged file was not produced.
    """
    print("INFO: Run merge files", file)
    first_name = os.path.splitext(file[0])[0]
    merged_file = first_name.split(utils.LANE_SEPAROTOR)[0] + '.bam'

    for lane_file in file:
        hdfs.copy_to_local(os.path.join(OUTPUT_BAM, lane_file), overwrite=True)

    # Pass every lane file instead of a fixed four, so the merge follows the
    # number of lanes that was actually detected for the sample.
    pysam.merge(merged_file, *file, '--threads', THREADS)

    if os.path.exists(merged_file):
        hdfs.copy_to_hdfs(merged_file, OUTPUT_MERGE, overwrite=True)
        os.remove(merged_file)
        for lane_file in file:
            os.remove(lane_file)

    return merged_file
  
    
def sort(file):
    """Sort a local BAM file with ``samtools sort -n`` (through pysam).

    Args:
        file: Name of the local BAM file to sort.

    Returns:
        The name of the sorted file, ``'sorted_' + file``.
    """
    sorted_name = 'sorted_' + file
    samtools_args = ['-@', THREADS, '-n', file, '-o', sorted_name]
    pysam.sort(*samtools_args, catch_stdout=False)
    return sorted_name
    

def convert_fastq(file):
    """Convert a local BAM file to a gzip-compressed FASTQ file.

    Runs ``samtools bam2fq`` and pipes its standard output through
    ``gzip -1`` into ``<name>.fastq.gz``, where ``<name>`` is ``file``
    without its extension.

    Args:
        file: Name of the local BAM file to convert.

    Returns:
        The name of the compressed FASTQ file. If either ``samtools`` or
        ``gzip`` exits with a non-zero status, the partial output is deleted
        before returning, so callers that check ``os.path.exists`` on the
        result see the failure.
    """
    filename = os.path.splitext(file)[0]

    params = {'bam2fq': file, '-@': THREADS}
    cmd = utils.build_command(SAMTOOLS, params)
    out_comprssd = filename + '.fastq' + '.gz'
    zipp = 'gzip -1'

    with open(out_comprssd, 'wb') as fout:
        p1 = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE)
        try:
            gz = subprocess.run(zipp.split(' '), stdin=p1.stdout, stdout=fout)
        finally:
            # Close our end of the pipe and reap samtools so that no file
            # descriptor or zombie process is left behind on the worker.
            p1.stdout.close()
            p1.wait()

    if p1.returncode != 0 or gz.returncode != 0:
        # The output file exists as soon as it is opened, so a failed
        # conversion would otherwise look like a success to the caller.
        print("ERROR: Convert to fastq failed for", file,
              "samtools exit:", p1.returncode, "gzip exit:", gz.returncode)
        if os.path.exists(out_comprssd):
            os.remove(out_comprssd)

    return out_comprssd


def sort_convert(file):
    """Sort one merged BAM file and convert it to compressed FASTQ.

    The BAM is copied from ``OUTPUT_MERGE`` to the local working directory,
    sorted with ``sort``, converted with ``convert_fastq``, and the result
    is uploaded to ``OUTPUT_FASTQ``.

    Args:
        file: Name of the merged BAM file, relative to ``OUTPUT_MERGE``.

    Returns:
        The name of the uploaded FASTQ file, or ``False`` when the
        conversion did not produce an output file.
    """
    # Local files created so far; all of them are removed on every exit
    # path so failed runs do not fill up the worker's local disk.
    local_files = [file]
    try:
        hdfs.copy_to_local(os.path.join(OUTPUT_MERGE, file), overwrite=True)
        print("INFO: Run sort ", file)
        sort_file = sort(file)
        local_files.append(sort_file)
        print("INFO: Run convert to fastq ", sort_file)
        outfile = convert_fastq(sort_file)
        local_files.append(outfile)

        if not os.path.exists(outfile):
            return False

        hdfs.copy_to_hdfs(outfile, OUTPUT_FASTQ, overwrite=True)
        return outfile
    finally:
        for path in local_files:
            if os.path.exists(path):
                os.remove(path)
        
    


In [None]:

# List the BAM files found under the input directory.
inputFiles = utils.load_file_names(inputRoot)


In [None]:
### Group the input files so that all lanes of a sample end up together
nbrLanes = utils.find_number_of_lanes(inputFiles)
print('INFO: Number of Lanes found: ', nbrLanes)
combinedBam = utils.combine_all_lanes(
    inputFiles,
    nbrLanes,
)


In [None]:
## Merge the lanes of every sample in parallel, one Spark task per sample
laneGroupsRdd = sc.parallelize(combinedBam)
mergedList = laneGroupsRdd.map(merge_files).collect()

# The per-lane BAM directory is removed once the merge stage has returned.
hdfs.delete(OUTPUT_BAM, recursive=True)


In [None]:
### Sort every merged BAM and convert it to FASTQ, one Spark task per file
finalList = sc.parallelize(mergedList).map(sort_convert).collect()

# sort_convert returns False for a file whose FASTQ was not produced. Only
# drop the merged BAMs when every file succeeded, so that failed samples can
# be re-run without repeating the merge stage.
failedFiles = [name for name, result in zip(mergedList, finalList) if not result]
if failedFiles:
    print("ERROR: Convert to fastq failed, keeping merged BAM files for",
          failedFiles)
else:
    hdfs.delete(OUTPUT_MERGE, recursive=True)