### Split large FASTQ files into smaller chunks

Uses the fastqsplitter Python library. Input files are assumed to be paired (R1 and R2)!

In [None]:
import os
import sys
import traceback
from hops import hdfs
import subprocess
import time
import math
from pyspark import SparkContext,SparkConf
import utils


### Initialise input and output folders and load input file HDFS paths

In [None]:
# Load the run configuration; stop immediately when none is available.
args = utils.load_arguments(sys.argv)
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)
# Keep only the configuration section that belongs to this step.
args = args[utils.KEY_SPLITFASTQ]

# Reuse the active Spark context if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

# Root location handed to the file listing helper below.
INPUT_ROOT = args['INPUT_ROOT']

# File names returned by utils.load_file_names for INPUT_ROOT.
all_files = utils.load_file_names(INPUT_ROOT)




#### Helper functions

In [105]:
"""
Finds R1 and R2 total size for given pair.
Returns array of pair(R1,R2) and total size.
"""
def get_pair_size(x):
    """
    Look up the combined size of one R1/R2 pair.

    x is indexed as x[0] (R1 file name) and x[1] (R2 file name).
    Returns a two-element list: the pair x itself, followed by the sum of
    the st_size values that hdfs.stat reports for the two files.
    """
    total_size = sum(hdfs.stat(path).st_size for path in (x[0], x[1]))
    return [x, total_size]


"""
Returns string with names of splits of file depending on the number of splits

eg. part1_sampleName.fq.gz part2_sampleName.fq.gz part3_sampleName.fq.gz
"""
def get_parts_file_names(file_path,number_of_splits,ext='.fastq.gz'):
    """
    Build the list of output file names for the splits of one input file.

    The base name of file_path is stripped of up to two trailing extensions
    (for example '.fastq.gz'); each split is then named
    'part<i>_<name><ext>' with i running from 1 to number_of_splits.

    eg. ['part1_sampleName.fastq.gz', 'part2_sampleName.fastq.gz']
    """
    stem = os.path.basename(file_path)
    for _ in range(2):  # e.g. drop '.gz' first, then '.fastq'
        stem = os.path.splitext(stem)[0]

    part_names = []
    for index in range(1, number_of_splits + 1):
        part_names.append('part' + str(index) + '_' + stem + ext)
    return part_names


"""
Calculates number of splits based on file size and upper threshold.
Default threshold = 4GB

eg. For a file size of 8GB output is 2 with default threshold.
"""
def find_nbr_splits(size, threshold=4096000000):
    """
    Calculate the number of splits as the ceiling of size / threshold.

    size: size in bytes used to decide the number of splits.
    threshold: upper size limit per split in bytes
               (default 4096000000, about 4GB).

    Returns an int. eg. size 8192000000 with the default threshold gives 2.
    Raises ValueError when threshold is not positive, because the result
    would be undefined (zero) or meaningless (negative).
    """
    if threshold <= 0:
        raise ValueError('threshold must be a positive number of bytes')
    # Ceiling via floor division of the negated size: exact for integers of
    # any magnitude, whereas true division goes through a float and can
    # round away the remainder for very large values.
    return int(-(-size // threshold))


"""
Splits single fastq file by calling fastqsplitter via subprocess.
If successful, original input file is deleted.
"""
def split_file(input_file_path,files):
    """
    Split a single fastq file by calling fastqsplitter via subprocess.

    The file is fetched from HDFS with hdfs.copy_to_local and then referred
    to by its base name (so it is presumably placed in the current working
    directory - confirm against the hops API). fastqsplitter writes the
    local output files named in `files`; on success they are uploaded to a
    "split" folder next to the original HDFS file and the original HDFS
    file is deleted. When fastqsplitter exits with a non-zero code the
    failure is reported and the original HDFS file is left in place.
    Local copies (input and parts) are removed in every case.

    input_file_path: HDFS path of the file to split.
    files: list of local output file names, one per split.
    Returns None.
    """
    input_file_path = hdfs.get_plain_path(input_file_path)
    print('Starting to split file: ', input_file_path)

    hdfs.copy_to_local(input_file_path)
    dirname, input_file = os.path.split(input_file_path)

    # Build the argument vector directly instead of splitting a command
    # string on spaces, so file names containing spaces stay intact.
    cmd = ['fastqsplitter', '-i', input_file]
    for part in files:
        cmd.extend(['-o', part])
    print('Running fastqsplitter with cmd: ', ' '.join(cmd))

    try:
        start = time.time()
        out = subprocess.run(cmd, stdout=subprocess.PIPE)
        end = time.time()
        print(" time diff ", end - start)

        if out.returncode == 0:
            target_dir = os.path.join(dirname, "split")
            for part in files:
                hdfs.copy_to_hdfs(part, target_dir, overwrite=True)
            print('Splitting successful. Deleting original file at: ', input_file_path)
            hdfs.delete(input_file_path)
        else:
            # Report the failure instead of ignoring it; the original HDFS
            # file is kept so the split can be retried.
            print('Splitting failed with return code ', out.returncode,
                  ' for file: ', input_file_path)
    finally:
        # Always free local disk space, also after a failed split or upload.
        for local_path in [input_file] + list(files):
            if os.path.exists(local_path):
                os.remove(local_path)

    return

    



### Map function


* Input is a (file name, size) pair.
* The number of splits is calculated from the size. The default threshold of 4GB is the upper limit on the size of a file.
* The command arguments are formed from the number of parts.
* Finally, fastqsplitter is called.


In [None]:
"""
Map function.
Input is array of file name, size.
Depending on the size numnber of splits is calculated.
Depending on the number of parts the command argument is formed.
Finally fastqsplitter is called.
"""
def mapSplit(input_pair):
    """
    Map function applied to one (file name, size) entry.

    The size determines the number of splits, the number of splits
    determines the output file names, and the file is then split with
    fastqsplitter. Returns None.
    """
    input_file, size = input_pair[0], input_pair[1]
    # One output name per split; the split count is derived from the size.
    files = get_parts_file_names(input_file, find_nbr_splits(size))
    split_file(input_file, files)

### Steps

* group R1 and R2 pairs
* calculate each pair's total size (R1+R2)
* keep only the pairs whose total size is greater than or equal to the specified threshold
* flatten the list to get separate entries for R1 and R2





In [None]:
# Pair up the R1 and R2 files.
group = utils.group_R1R2(all_files)

# Attach the combined (R1+R2) size to every pair.
paired_size = sc.parallelize(group).map(get_pair_size)

# Keep only the pairs whose combined size reaches the minimum for splitting.
minSize = 8192000000  # minimum size of file to select for splitting
filtered_toSplit = paired_size.filter(lambda entry: entry[1] >= minSize)
paired_path = filtered_toSplit.collect()

# Flatten: one (file, total pair size) tuple for R1 and one for R2, so both
# files of a pair carry the same size.
flat_list = [
    (entry[0][member], entry[1])
    for entry in paired_path
    for member in (0, 1)  # 0 -> R1, 1 -> R2
]






### Run flat list in parallel

In [None]:

# run split: apply mapSplit to every (file, size) entry through Spark
data = sc.parallelize(flat_list)
try:
    data.map(mapSplit).collect()
except Exception as e:
    # Print the full traceback, then fail the job while keeping the
    # original exception attached as the explicit cause.
    traceback.print_exc()
    raise SystemError('Failed job execution') from e
