### Quality trimming of raw FASTQ files
This notebook performs quality trimming of raw FASTQ files. The raw files are expected to be already uploaded to the Hopsworks HDFS. It applies the `trimmomatic` and `cutadapt` tools. For trimmomatic, both single-end and paired-end modes are used, chosen according to the sample name (if the sample name contains `_R`, the file is treated as paired-end; otherwise it is treated as single-end). For cutadapt processing, only **paired R2** files are used as input. The arguments can be set in the `settings.yml` file.

Note: Currently only RNA sequence processing is supported, with the `IS_RNA` flag set to `True`. DNA sequencing is not yet supported.


In [None]:
import os
import subprocess
import sys

from hops import hdfs
from pyspark import SparkContext

import utils



#### Load arguments

In [None]:
# Parse the job arguments; without a configuration there is nothing to run.
args = utils.load_arguments(sys.argv)
if args is None:
    sys.exit(utils.NO_CONFIG_ERR)

# From here on only the trimmomatic section of the configuration is used.
args = args[utils.KEY_TRIMMOMATIC]

sc = SparkContext.getOrCreate()

# Input / output / log locations (used with the hops hdfs API below).
OUTPUT_PAIRED = args['OUTPUT_PAIRED']
OUTPUT_UNPAIRED = args['OUTPUT_UNPAIRED']
IS_SAVE_UNPAIRED = args['SAVE_UNPAIRED']
INPUT_ROOT = args['INPUT_ROOT']
USE_CUTADAPT = args['USE_CUTADAPT']
IS_RNA = args['IS_RNA']
LOGS_ROOT = args['LOGS_ROOT']

# Trimmomatic parameters; the numeric ones are stringified because they are
# concatenated into the command line later.
PHRED = args['PHRED']
LEADING = str(args['LEADING'])
TRAILING = str(args['TRAILING'])
SLIDING_WINDOW = str(args['SLIDINGWINDOW'])
MIN_LEN = str(args['MINLEN'])
THREADS = args['THREADS']

# Fixed command prefixes and the separator used when assembling commands.
JAVA = "java -jar"
CUTADPAT_ARGS = 'cutadapt -j 0 -u 3 -o'
SPACE = utils.SPACE
OUTPUT_SINGLE = args['OUTPUT_SINGLE']

#### Get trimmomatic jar and adapter files
# Paths of the trimmomatic jar and the adapter files; they are fetched with
# hdfs.copy_to_local by the trimming functions, which refer to the local
# copies by their base names.
JAR_PATH = args['JAR']
ADAPTER_PAIR_PATH = args['ADAPTER_PAIR']
ADAPTER_SINGLE_PATH = args['ADAPTER_SINGLE']
if JAR_PATH is None:
    sys.exit(utils.TRIMMOMATIC_NOT_FOUND)

# Both adapter files are required because both base names are computed below.
# The previous test, `(pair or single) is None`, only fired when the *result*
# of the `or` was None, so a single missing adapter slipped through and
# crashed in os.path.basename(None).
if ADAPTER_PAIR_PATH is None or ADAPTER_SINGLE_PATH is None:
    sys.exit(utils.TRIMMOMATIC_ADAPTER_NOT_FOUND)

# Local file names of the jar and adapters once copied to the working directory.
tool = os.path.basename(JAR_PATH)
ADAPTER_PAIR = os.path.basename(ADAPTER_PAIR_PATH)
ADAPTER_SINGLE = os.path.basename(ADAPTER_SINGLE_PATH)




In [None]:



"""
Run cutadapt on a trimmed file and replace the file with the result.
"""
def cut(input_file,log_file):
    """Run cutadapt on ``input_file`` and replace it in place with the result.

    The command is assembled by ``utils.build_command`` from the module-level
    ``CUTADPAT_ARGS`` prefix and a ``{output: input}`` mapping (presumably
    rendered as ``<prefix> <output> <input>`` -- confirm in ``utils``).
    cutadapt's stdout and stderr are appended to ``log_file``.

    Args:
        input_file: Local path of the file to process. On success it is
            overwritten with cutadapt's output.
        log_file: Local path of the log file to append to.

    Raises:
        RuntimeError: If cutadapt exits with a non-zero status or produces no
            output file. ``input_file`` is left untouched in that case.
    """
    # Keep the temporary output next to the input, even if a directory is given.
    out_dir, base_name = os.path.split(input_file)
    out_trim = os.path.join(out_dir, 'cut_' + base_name)

    cmd_cut = utils.build_command(CUTADPAT_ARGS, {out_trim: input_file})
    with open(log_file, "a") as f:
        # split() without an argument tolerates repeated spaces in the command.
        result = subprocess.run(cmd_cut.split(), stdout=f, stderr=f)

    # Only swap the files when cutadapt really succeeded. The former
    # `if out_trim:` test was always true, so a failed run deleted the input
    # and then crashed in os.rename (or kept a partial output).
    if result.returncode != 0 or not os.path.exists(out_trim):
        if os.path.exists(out_trim):
            os.remove(out_trim)  # drop a partial/corrupt output
        raise RuntimeError(
            'cutadapt failed for {} (exit code {}); see {}'.format(
                input_file, result.returncode, log_file))

    # Atomically replace the input with the processed file.
    os.replace(out_trim, input_file)

    
"""
Trimmomatic on single end file.
Output files are copied back to hdfs
"""
def apply_trim_single(file_input):
    """Run Trimmomatic in single-end (SE) mode on one file stored in HDFS.

    The input, the Trimmomatic jar and the single-end adapter file are copied
    to the local working directory, Trimmomatic is run with its output
    captured in a per-sample log file, and the trimmed file plus the log are
    copied back to HDFS (``OUTPUT_SINGLE`` and ``LOGS_ROOT``).

    Args:
        file_input: HDFS path of the single-end input file.

    Returns:
        ``[-1]`` when the trimmed output already exists in ``OUTPUT_SINGLE``
        and the file is skipped; otherwise the output file name
        (``'trim_' + <input base name>``). The name is returned even when
        Trimmomatic failed, as before; check the uploaded log in that case.
    """
    # get file name
    file_name = os.path.split(file_input)[1]
    file_output = 'trim_' + file_name

    # Check for the *output* name: the uploaded file is 'trim_<name>', so
    # testing for the bare input name (as before) never detected prior runs.
    if hdfs.exists(os.path.join(OUTPUT_SINGLE, file_output)):
        print('Skipping input file as output file already exists', file_name)
        return [-1]

    log_file = utils.get_sampleName_with_lane(file_name) + '.txt'

    try:
        # copy input, jar and adapters to local (jar/adapters only once)
        hdfs.copy_to_local(file_input, overwrite=True)
        if not os.path.exists(tool):
            hdfs.copy_to_local(JAR_PATH)
        if not os.path.exists(ADAPTER_SINGLE):
            hdfs.copy_to_local(ADAPTER_SINGLE_PATH)

        # Build the argument list directly instead of splitting a joined
        # string, so file names containing whitespace stay a single argument.
        cmd_single = JAVA.split() + [
            tool,
            'SE', '-' + PHRED,
            '-threads', str(THREADS),
            file_name,
            file_output,
            "ILLUMINACLIP:" + ADAPTER_SINGLE + ":2:30:10",
            "LEADING:" + LEADING,
            "TRAILING:" + TRAILING,
            "SLIDINGWINDOW:" + SLIDING_WINDOW,
            "MINLEN:" + MIN_LEN,
        ]
        print('INFO: Run trimmomatic command: ', SPACE.join(cmd_single))

        # run, sending the tool's stdout and stderr to the log file
        with open(log_file, "w") as f:
            result = subprocess.run(cmd_single, stdout=f, stderr=f)

        # copy output to hdfs only after a successful run
        if result.returncode == 0 and os.path.exists(file_output):
            hdfs.copy_to_hdfs(file_output, OUTPUT_SINGLE, overwrite=True)

        # copy logs (also for failed runs, so the failure can be inspected)
        hdfs.copy_to_hdfs(log_file, LOGS_ROOT, overwrite=True)
    finally:
        # Remove local copies even on error. The output may not exist when
        # Trimmomatic failed, so only delete what is actually there.
        for local_path in (file_output, file_name):
            if os.path.exists(local_path):
                os.remove(local_path)

    return file_output
    


"""
Trimmomatic on paired end file via subprocess.
Output files are copied to hdfs.
"""
def apply_trim_paired(x):
    """Run Trimmomatic in paired-end (PE) mode on one R1/R2 pair from HDFS.

    Both reads, the Trimmomatic jar and the paired-end adapter file are copied
    to the local working directory and Trimmomatic is run with its output
    captured in a per-sample log file. On success the paired outputs are
    copied to ``OUTPUT_PAIRED`` (after running ``cut`` on the R2 paired output
    when ``USE_CUTADAPT`` is set) and, when ``IS_SAVE_UNPAIRED`` is set, the
    unpaired outputs are copied to ``OUTPUT_UNPAIRED``. The log is copied to
    ``LOGS_ROOT`` whether or not the run succeeded.

    Args:
        x: Sequence whose first two items are the HDFS paths of the R1 and R2
            files.

    Returns:
        ``[-1]`` when the forward paired output already exists in
        ``OUTPUT_PAIRED`` and the pair is skipped; otherwise
        ``[status, forward_paired, forward_unpaired, reverse_paired,
        reverse_unpaired]`` where ``status`` is True only if Trimmomatic
        exited with 0 and all uploads completed.
    """
    r1, r2 = x[0], x[1]  # R1, R2

    # get file names
    filename_forward = os.path.basename(r1)
    filename_reverse = os.path.basename(r2)
    # prepend the paired / unpaired prefixes to build the output names
    output_forward_paired = utils.TRIM_PAIRED + filename_forward
    output_forward_unpaired = utils.TRIM_UNPAIRED + filename_forward
    output_reverse_paired = utils.TRIM_PAIRED + filename_reverse
    output_reverse_unpaired = utils.TRIM_UNPAIRED + filename_reverse

    # check if output already exists
    if hdfs.exists(os.path.join(OUTPUT_PAIRED, output_forward_paired)):
        print('Skipping input file as output file already exists', filename_forward)
        return [-1]

    log_file = utils.get_sampleName_with_lane(filename_forward) + '.txt'
    local_files = (
        output_reverse_paired,
        output_forward_unpaired,
        output_forward_paired,
        output_reverse_unpaired,
        filename_forward,
        filename_reverse,
    )
    status = False

    try:
        # copy inputs, jar and adapters to local (jar/adapters only once)
        hdfs.copy_to_local(r1, overwrite=True)
        hdfs.copy_to_local(r2, overwrite=True)
        if not os.path.exists(tool):
            hdfs.copy_to_local(JAR_PATH)
        if not os.path.exists(ADAPTER_PAIR):
            hdfs.copy_to_local(ADAPTER_PAIR_PATH)

        # Build the argument list directly instead of splitting a joined
        # string, so file names containing whitespace stay a single argument.
        cmd_paired = JAVA.split() + [
            tool,
            'PE', '-' + PHRED,
            '-threads', str(THREADS),
            filename_forward,
            filename_reverse,
            output_forward_paired,
            output_forward_unpaired,
            output_reverse_paired,
            output_reverse_unpaired,
            "ILLUMINACLIP:" + ADAPTER_PAIR + ":2:30:10:2:keepBothReads",
            "LEADING:" + LEADING,
            "TRAILING:" + TRAILING,
            "SLIDINGWINDOW:" + SLIDING_WINDOW,
            "MINLEN:" + MIN_LEN,
        ]
        print('INFO: Run trimmomatic command: ', SPACE.join(cmd_paired))

        # run, sending the tool's stdout and stderr to the log file
        with open(log_file, "w") as f:
            result = subprocess.run(cmd_paired, stdout=f, stderr=f)

        try:
            if result.returncode == 0:
                if USE_CUTADAPT:  # run cutadapt on R2
                    cut(output_reverse_paired, log_file)

                # copy output to hdfs
                hdfs.copy_to_hdfs(output_forward_paired, OUTPUT_PAIRED, overwrite=True)
                hdfs.copy_to_hdfs(output_reverse_paired, OUTPUT_PAIRED, overwrite=True)
                if IS_SAVE_UNPAIRED:
                    hdfs.copy_to_hdfs(output_forward_unpaired, OUTPUT_UNPAIRED, overwrite=True)
                    hdfs.copy_to_hdfs(output_reverse_unpaired, OUTPUT_UNPAIRED, overwrite=True)

                status = True
        finally:
            # copy logs even if cutadapt or an upload raised: the log is the
            # only record of what went wrong on the executor
            hdfs.copy_to_hdfs(log_file, LOGS_ROOT, overwrite=True)
    finally:
        # Remove local copies even on error. Outputs do not exist after a
        # failed Trimmomatic run, so an unconditional os.remove would raise
        # FileNotFoundError; only delete what is actually there.
        for local_path in local_files:
            if os.path.exists(local_path):
                os.remove(local_path)

    return [status, output_forward_paired, output_forward_unpaired,
            output_reverse_paired, output_reverse_unpaired]

#### Load input files hdfs path

In [None]:
# Collect the input file paths found under INPUT_ROOT.
all_files = utils.load_file_names(INPUT_ROOT)


#### Get list of all single end files and run trimmomatic in single mode in parallel

In [None]:
### single
# A path that does not contain utils.R_IDENTIFIER is treated as single-end.
# NOTE(review): the test runs on the whole path, not just the file name --
# confirm that parent directories can never contain the identifier.
single_files = [path for path in all_files if utils.R_IDENTIFIER not in path]
print('number of single input  files processing ', len(single_files))

# Distribute the paths over the cluster and trim each file in its own task.
dataRdd = sc.parallelize(single_files)
trimmedSingleFiles = dataRdd.map(apply_trim_single).collect()

#### Pair R1 and R2 as a tuple in a list and run trimmomatic in paired end in parallel

In [None]:
### pair R1 and R2
# utils.group_R1R2 builds the R1/R2 pairs; each pair is then trimmed in its
# own Spark task.
pairedList = utils.group_R1R2(all_files)
print('number of input paired files processing ', len(pairedList))

dataPairedRdd = sc.parallelize(pairedList)
trimmedFiles = dataPairedRdd.map(apply_trim_paired).collect()
