### Pre-processing of raw FASTQ files
This notebook performs quality trimming of raw FASTQ files. The raw files are expected to have already been uploaded to the Hopsworks HDFS. It applies the `trimmomatic` and `cutadapt` tools. Trimmomatic is run in paired-end mode, and both the paired and unpaired output files are preserved. For cutadapt processing, only the **paired R2** files are used as input. The arguments can be set in the `settings.yml` file.

Note: Currently only RNA sequence processing is supported, selected by setting the `IS_RNA` flag to `True`. DNA sequencing is not yet supported.


#### WIP notes

* copying the trimmomatic jar to local throws an error
* subprocess throws an error on invalid adapters

In [None]:
import os
import subprocess
import sys

from hops import hdfs
from pyspark import SparkContext

import utils



In [None]:
# Load the run configuration; stop the notebook when no configuration is available.
args = utils.load_arguments(sys.argv)
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)

# Only the Trimmomatic section of the configuration is used from here on.
args = args['Trimmomatic']

# Reuse the running Spark context or create one.
sc = SparkContext.getOrCreate()



In [None]:

# Locations and pipeline switches taken from the Trimmomatic settings.
OUTPUT_ROOT = args['OUTPUT_ROOT']
OUTPUT_PAIRED = args['OUTPUT_PAIRED']
OUTPUT_UNPAIRED = args['OUTPUT_UNPAIRED']
INPUT_ROOT = args['INPUT_ROOT']
USE_CUTADAPT = args['USE_CUTADAPT']
IS_RNA = args['IS_RNA']

# Trimming options; the step values are converted to strings.
PHRED = args['PHRED']
LEADING, TRAILING, SLIDING_WINDOW, MIN_LEN = (
    str(args[key]) for key in ('LEADING', 'TRAILING', 'SLIDINGWINDOW', 'MINLEN')
)
THREADS = args['THREADS']

# Command prefixes and the separator used when assembling command lines.
JAVA = "java -jar"
CUTADPAT_ARGS = 'cutadapt -j 0 -u 3 -o'
SPACE = utils.SPACE


In [None]:
# Locate the trimmomatic jar and the adapter file; stop if either is missing.
tool = utils.find_file_like('trimmomatic')
if tool is None:
    sys.exit(utils.TRIMMOMATIC_NOT_FOUND)

ADAPTER = utils.find_file_like('TruSeq')
# Test the adapter lookup itself: a missing adapter file would otherwise only
# surface later as a TypeError when the ILLUMINACLIP string is assembled.
if ADAPTER is None:
    sys.exit(utils.TRIMMOMATIC_ADAPTER_NOT_FOUND)
    


In [None]:

# List the contents of the HDFS input directory.
files = hdfs.ls(INPUT_ROOT)

# Group the R1 and R2 files so trimmomatic can process them as pairs.
pairedList = utils.group_R1R2(files)


In [None]:

### TODO
# flag for RNA/DNA to change adapters
# single-ended reads


### paired end attributes

# ILLUMINACLIP step: the adapter file followed by fixed clipping settings.
illuminaclip_adapters = "".join(["ILLUMINACLIP:", ADAPTER, ":2:30:10:2:keepBothReads"])

# Remaining trimming steps, separated by SPACE.
illuminaclip_Attribute = SPACE.join([
    "LEADING:" + LEADING,
    "TRAILING:" + TRAILING,
    "SLIDINGWINDOW:" + SLIDING_WINDOW,
    "MINLEN:" + MIN_LEN,
])

# cutadapt
def cut(input_file):
    """Run cutadapt on a local file and replace the file with the result.

    The command is assembled from ``CUTADPAT_ARGS`` with ``utils.build_command``
    and writes to a temporary ``cut_<input_file>`` file, which then takes the
    place of ``input_file``.

    Args:
        input_file: path of the local file to process.

    Raises:
        subprocess.CalledProcessError: if cutadapt exits with a non-zero status.
        FileNotFoundError: if cutadapt did not produce the output file; the
            input file is left untouched in that case.
    """
    out_trim = 'cut_' + input_file
    cmd_cut = utils.build_command(CUTADPAT_ARGS, {out_trim: input_file})

    # split() without a separator collapses repeated/trailing whitespace, so no
    # empty strings end up in the argument list handed to cutadapt.
    subprocess.run(cmd_cut.split(), check=True)

    # os.replace fails before touching input_file when out_trim is missing, so
    # the input is never deleted without a replacement being available.
    os.replace(out_trim, input_file)



### trimmomatic on paired files
def apply_trim_paired(x):
    """Trim one R1/R2 pair with trimmomatic and upload the results to HDFS.

    Both input files are copied from ``INPUT_ROOT`` to the local working
    directory, trimmomatic is run in paired-end mode, cutadapt is optionally
    applied to the paired R2 output (``USE_CUTADAPT``), and the four output
    files are copied to ``OUTPUT_PAIRED`` / ``OUTPUT_UNPAIRED``.

    Args:
        x: indexable pair where ``x[0]`` is the R1 file name and ``x[1]`` is
            the R2 file name.

    Returns:
        A list ``[status, forward_paired, forward_unpaired, reverse_paired,
        reverse_unpaired]``; ``status`` is True only when trimmomatic exited
        with status 0, produced the forward paired output, and all uploads
        completed.
    """
    filename_forward = x[0]  # R1
    filename_reverse = x[1]  # R2

    output_forward_paired = 'trim_paired_' + filename_forward
    output_forward_unpaired = 'trim_unpaired_' + filename_forward
    output_reverse_paired = 'trim_paired_' + filename_reverse
    output_reverse_unpaired = 'trim_unpaired_' + filename_reverse

    # Every local file this call may create; all are removed before returning.
    local_files = (
        filename_forward,
        filename_reverse,
        output_forward_paired,
        output_forward_unpaired,
        output_reverse_paired,
        output_reverse_unpaired,
    )

    status = False
    try:
        hdfs.copy_to_local(os.path.join(INPUT_ROOT, filename_forward), overwrite=True)
        hdfs.copy_to_local(os.path.join(INPUT_ROOT, filename_reverse), overwrite=True)

        parameters = {
            'PE ': '-' + PHRED,
            '-threads': THREADS,
            filename_forward: utils.EMPTY,
            filename_reverse: utils.EMPTY,
            output_forward_paired: utils.EMPTY,
            output_forward_unpaired: utils.EMPTY,
            output_reverse_paired: utils.EMPTY,
            output_reverse_unpaired: utils.EMPTY,
            illuminaclip_adapters: utils.EMPTY,
            illuminaclip_Attribute: utils.EMPTY,
        }
        cmd_paired = utils.build_command(JAVA + SPACE + tool, parameters)

        exit_status = os.system(cmd_paired)  # run

        # Require a clean exit as well as the output file: a failed run can
        # leave a partially written output behind, which must not be uploaded.
        if exit_status == 0 and os.path.exists(output_forward_paired):
            if USE_CUTADAPT:
                cut(output_reverse_paired)

            hdfs.copy_to_hdfs(output_forward_paired, OUTPUT_PAIRED, overwrite=True)
            hdfs.copy_to_hdfs(output_forward_unpaired, OUTPUT_UNPAIRED, overwrite=True)
            hdfs.copy_to_hdfs(output_reverse_paired, OUTPUT_PAIRED, overwrite=True)
            hdfs.copy_to_hdfs(output_reverse_unpaired, OUTPUT_UNPAIRED, overwrite=True)
            status = True
    finally:
        # Clean up on failure too, so inputs and partial outputs do not pile up
        # in the local working directory.
        for local_file in local_files:
            if os.path.exists(local_file):
                os.remove(local_file)

    return [status, output_forward_paired, output_forward_unpaired,
            output_reverse_paired, output_reverse_unpaired]

In [None]:
# Distribute the R1/R2 pairs as a Spark RDD.
dataPairedRdd = sc.parallelize(pairedList)


In [None]:
# Run the trimming on every pair and gather the per-pair results on the driver.
trimmedFiles = dataPairedRdd.map(apply_trim_paired).collect()
