### This notebook was made to create (and submit jobs for) JankyPipe, a pipeline that takes fastq files of Mycobacterium tuberculosis isolates as input, aligns the reads to H37Rv and calls variants. The output is a VCF file, a lineage call and a Qualimap report. This notebook also submits a job that runs JankyPipe on each of the isolates in our study.

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
from slurmpy import Slurm
import vcf
import shutil

#### Function to launch JankyPipe as a Job

In [40]:
def Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir):
    
    '''
    Build the JankyPipe shell script for one isolate and submit it as a SLURM job.
    
    The script calls variants for the input fastq files against H37Rv. The important 
    output (prinseq log, QualiMap report, pilon VCF, lineage call) is written to the 
    output directory while the intermediary files (reference index, unzipped & trimmed 
    fastqs, SAM, BAMs, VRT) are written to the scratch directory.
    
    Parameters
    ----------
    fqf1, fqf2 : str
        Paths to the gzipped paired-end fastq files. The last 9 characters of each
        file name are stripped to name the unzipped copy, so the names are expected
        to end in '.fastq.gz'.
    tag : str
        Isolate ID; used to name the intermediary / output files and as the job name.
    output_dir : str
        Directory for the final output. Sub-folders are created inside it by the job,
        so it is expected to exist already.
    scratch_dir : str
        Directory for intermediary files. The job starts by changing into it, so it 
        is expected to exist already.
    O2_SLURM_logs_dir : str
        Directory this process changes into while the job is submitted 
        (where you want the output + error files).
    
    Returns
    -------
    The job id returned by slurmpy's Slurm.run() (also printed as '<tag> : <job id>').
    '''
    
    #store all commands in a list
    commands_list = []
    
    #change directory to scratch
    commands_list.append( 'cd ' + scratch_dir )

    ###################################
    ### Load Necessary Modules ########
    ###################################

    #perl, java, BWA, Samtools, BCFtools & Picard (loaded in this order)
    for module_i in ['perl/5.24.0' , 'java/jdk-1.8u112' , 'bwa/0.7.15' , 'samtools/1.3.1' , 'bcftools/1.3.1' , 'picard/2.8.0']:
        commands_list.append( 'module load ' + module_i )

    #create a folder (inside scratch) for the Reference Genome & its index files
    commands_list.append( 'mkdir RefGen' )

    #copy reference genome over to RefGen folder
    commands_list.append( 'cp /home/rv76/Farhat_Lab/Reference_Seqs/H37Rv/h37rv.fasta RefGen/TBRefGen.fasta' )

    #change directory to RefGen folder
    commands_list.append( 'cd RefGen' )

    ###################################
    ### Create Index Files for H37Rv ##
    ###################################
    commands_list.append( 'samtools faidx TBRefGen.fasta' )
    commands_list.append( 'bwa index TBRefGen.fasta' )

    RefGen = scratch_dir + '/RefGen/TBRefGen.fasta' #H37Rv reference

    #go back to parent directory
    commands_list.append( 'cd ..' )

    ###################################
    ### UnZip FastQ files #############
    ###################################
    
    #drop the 9-character '.fastq.gz' extension from the file names
    gz_extension_length = len('.fastq.gz')
    fqf1_base_name = fqf1.split('/')[-1][0:-gz_extension_length]
    fqf2_base_name = fqf2.split('/')[-1][0:-gz_extension_length]

    #work with the unzipped files for the rest of the pipeline (after unzipping them)
    fqf1_unzipped = scratch_dir + '/{}'.format(fqf1_base_name) + '.fastq'
    fqf2_unzipped = scratch_dir + '/{}'.format(fqf2_base_name) + '.fastq'

    commands_list.append( 'zcat {0} > {1}'.format(fqf1, fqf1_unzipped) )
    commands_list.append( 'zcat {0} > {1}'.format(fqf2, fqf2_unzipped) )

    ####################################
    ### PRINSEQ (trim reads) ##########
    ###################################

    #create directory for prinseq (trim log) in output directory
    commands_list.append( 'mkdir ' + output_dir + '/prinseq' )

    #trimmed reads go to scratch, the log goes to the output directory
    commands_list.append( 'perl /n/data1/hms/dbmi/farhat/bin/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {0} -fastq2 {1} -out_format 3 -out_good {2}/{3}-trimmed -out_bad null -log {4}/{3}-trimmed.log -min_qual_mean 20 -verbose'.format(fqf1_unzipped, fqf2_unzipped, scratch_dir, tag , output_dir+'/prinseq') )

    #use newly trimmed fastq files now
    fqf1_trimmed = scratch_dir + '/{}'.format(tag) + '-trimmed_1.fastq'
    fqf2_trimmed = scratch_dir + '/{}'.format(tag) + '-trimmed_2.fastq'

    ######################################
    ### BWA (align reads to reference) ###
    ######################################

    #create SAM file
    samfile = scratch_dir + '/{}.sam'.format(tag)

    #run BWA
    commands_list.append( 'bwa mem -M {3} {0} {1} > {2}'.format(fqf1_trimmed , fqf2_trimmed , samfile , RefGen) )

    #####################################
    ### PICARD (sort & convert to BAM) ##
    #####################################

    #create BAM file
    bamfile = scratch_dir + '/{0}.sorted.bam'.format(tag)

    commands_list.append( 'java -Xmx16G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar SortSam INPUT={0} OUTPUT={1} SORT_ORDER=coordinate'.format(samfile, bamfile) )

    ####################################
    ### PICARD (remove duplicates) ####
    ###################################

    #create BAM file with removed duplicates
    #(swap only the trailing '.bam'; str.replace would also rewrite a '.bam' found earlier in the path)
    drbamfile = bamfile[:-len('.bam')] + '.duprem.bam'

    #remove duplicates from BAM file
    commands_list.append( "java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar MarkDuplicates I={0} O={1} REMOVE_DUPLICATES=true M={2} ASSUME_SORT_ORDER=coordinate".format(bamfile, drbamfile, drbamfile[:-4]+'.metrics') )

    ####################################
    ### SAMTOOLS (to index BAM file) ###
    ####################################
    
    commands_list.append( "samtools index {0}".format(drbamfile) )
    
    ######################################
    ### QUALIMAP (quality of BAM file) ###
    ######################################
    
    #store quality report, pilon VCF & lineage call information all in Output directory
    commands_list.append( 'cd ' + output_dir )
    commands_list.append( 'mkdir QualiMap' ) #make a folder for QualiMap output in output directory
    commands_list.append( 'unset DISPLAY' ) #unset JAVA virtual machine variable [http://qualimap.bioinfo.cipf.es/doc_html/faq.html]
    commands_list.append( "/n/data1/hms/dbmi/farhat/bin/qualimap_v2.2.1/qualimap bamqc -bam {0} --outdir {1} --outfile {2}.pdf --outformat PDF".format(drbamfile, output_dir+'/QualiMap', tag+'_stats') )

    ###################################
    ### PILON (call variants) #########
    ###################################
    
    #the job is still inside the output directory at this point
    commands_list.append( 'mkdir pilon' ) #make a folder for pilon output in output directory
    out_pilon_dir = output_dir + '/pilon/' #variable for pilon output path

    commands_list.append( 'java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/pilon/pilon-1.22.jar --genome {0} --bam {1} --output {2} --outdir {3} --variant'.format(RefGen, drbamfile, tag, out_pilon_dir) )

    #####################################
    ### Luca's LINEAGE CALLING script ###
    #####################################

    #create directories 
    commands_list.append( 'mkdir ' + scratch_dir + '/fast-lineage-caller/' )#make a folder for lineage call in scratch directory
    commands_list.append( 'mkdir ' + output_dir + '/fast-lineage-caller/' )#make a folder for lineage call in output directory

    #create VRT file
    vrtfile = scratch_dir + '/fast-lineage-caller/{}.vrt'.format(tag)

    commands_list.append( 'cd ' + scratch_dir + '/fast-lineage-caller' )#change directory to store output in scratch

    #convert VCF to VRT
    commands_list.append( 'vrtTools-vcf2vrt.py {0} {1} 1'.format(out_pilon_dir+tag+'.vcf', vrtfile) )

    #call lineage with SNP database and VRT file
    commands_list.append( 'cd ' + output_dir + '/fast-lineage-caller' )#change directory so lineage_call.txt lands in the output directory

    commands_list.append( 'FastLineageCaller-assign2lineage.py /home/rv76/Bio_Pipelines/fast-lineage-caller-master/example/db_snps.tsv ' + vrtfile + ' &> ' + 'lineage_call.txt' )

    ###############################################################################################################
    ######################################## SUBMIT as a job to O2 ################################################
    ###############################################################################################################
    
    #append all commands in a single string to be submitted as a job (each command preceded by a newline)
    JankyPipe_job = '\n'.join( [''] + commands_list )
    
    job_name = tag

    #remember where we are so the working directory of the notebook is not changed for good
    try:
        starting_dir = os.getcwd()
    except OSError:
        starting_dir = None #current directory no longer exists, nothing to go back to

    #directory where you want output + error files
    os.chdir(O2_SLURM_logs_dir)

    try:
        s = Slurm(job_name , {'partition':'short' , 'n':'1' , 't':'0-6:00:00' , 'mem-per-cpu':'36G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

        #submits the job
        job_id = s.run(JankyPipe_job)
    finally:
        if starting_dir is not None:
            os.chdir(starting_dir)

    #parenthesised form prints the same line under Python 2 and Python 3
    print(job_name  + ' : ' +  str(job_id))

    return job_id

#### Pull all relevant sequenced isolates and their corresponding FastQ file paths

In [3]:
#load the sample / fastq path table, then index it by sample number
sample_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/annotation CSV files/sample_fastq_path_names.csv' , sep = ',')
sample_annotation = sample_annotation.set_index('sample_num')

In [4]:
#preview the first two rows of the table
sample_annotation.head(2)

Unnamed: 0_level_0,sample_id,fastq_files
sample_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Undetermined,/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_c...
1,TC100810,/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_c...


In [5]:
#(number of rows, number of columns) of the table
sample_annotation.shape

(26, 2)

## DO NOT RUN CELLS BELOW UNLESS YOU WANT TO RUN EVERY SAMPLE THROUGH JANKYPIPE AGAIN!

################################################################################################################################################################################################

################################################################################################################################################################################################

#### Create directories for each isolate and launch JankyPipe

IMPORTANT PARENT DIRECTORIES 

--->   /n/scratch2/rv76/JankyPipe_dennis_mujuni_collaboration/intermediary_files

[to store intermediate files (unzipped fastq, trimmed fastq, SAM, sorted BAM, etc)]


--->   /n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/output

[to store final files (pilon VCF, lineage, QualiMap, trim logs)]


--->   /n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/O2_SLURM_logs

[to store submitted SLURM script, and SLURM error & verbose logs]

In [44]:
#iterate over the fastq path prefixes themselves instead of using positions 0..N-1 as index labels
#(works no matter how the sample_num index is numbered)
for isolate_fastq_paths in sample_annotation['fastq_files']:

    #paths & names for fastq files
    fqf1 = isolate_fastq_paths + '_L001_R1_001.fastq.gz'
    fqf2 = isolate_fastq_paths + '_L001_R2_001.fastq.gz'

    #get the tag ID for the fastq files (2nd '_'-separated field of the fastq file name)
    tag = fqf1.split('/')[-1].split('_')[1]

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/output/' + tag

    #where everything else happens (trimming, aligning, etc.) [SCRATCH FOLDER]
    scratch_dir = '/n/scratch2/rv76/JankyPipe_dennis_mujuni_collaboration/intermediary_files/' + tag

    #store O2 job log files [LAB FOLDER]
    O2_SLURM_logs_dir = '/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/O2_SLURM_logs/' + tag

    #if path already exists, remove current contents, then recreate empty directory
    #if path doesn't exist, create new directory
    for isolate_dir in (output_dir , scratch_dir , O2_SLURM_logs_dir):
        if os.path.exists(isolate_dir):
            shutil.rmtree(isolate_dir)
        os.makedirs(isolate_dir)

    #Launch JankyPipe after making necessary directories!!!
    Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir)

submitted: Submitted batch job 2230933


S0 : 2230933


submitted: Submitted batch job 2230938
submitted: Submitted batch job 2230941


S1 : 2230938
S2 : 2230941


submitted: Submitted batch job 2230945
submitted: Submitted batch job 2230950


S3 : 2230945
S4 : 2230950


submitted: Submitted batch job 2230956
submitted: Submitted batch job 2230958


S5 : 2230956
S6 : 2230958


submitted: Submitted batch job 2230960


S7 : 2230960


submitted: Submitted batch job 2230963
submitted: Submitted batch job 2230967


S8 : 2230963
S9 : 2230967


submitted: Submitted batch job 2230973
submitted: Submitted batch job 2230976


S10 : 2230973
S11 : 2230976


submitted: Submitted batch job 2230981
submitted: Submitted batch job 2230984


S12 : 2230981
S13 : 2230984


submitted: Submitted batch job 2230990
submitted: Submitted batch job 2230996


S14 : 2230990
S15 : 2230996


submitted: Submitted batch job 2231003
submitted: Submitted batch job 2231009
submitted: Submitted batch job 2231013


S16 : 2231003
S17 : 2231009
S18 : 2231013


submitted: Submitted batch job 2231018
submitted: Submitted batch job 2231026


S19 : 2231018
S20 : 2231026


submitted: Submitted batch job 2231031


S21 : 2231031


submitted: Submitted batch job 2231038
submitted: Submitted batch job 2231043


S22 : 2231038
S23 : 2231043
S24 : 2231049
S25 : 2231054


submitted: Submitted batch job 2231049
submitted: Submitted batch job 2231054


################################################################################################################################################################################################

################################################################################################################################################################################################

##### Determine if jobs ran successfully or not

In [6]:
#'yes' / 'no' for every isolate, in the same order as the rows of sample_annotation
successful_run = []

for sample_num in sample_annotation.index:

    #tag ID of THIS isolate ('S' + its sample number); the row label must change with 
    #every iteration, otherwise every isolate is judged by the output folder of S0
    tag = 'S' + str(sample_num)

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/output/' + tag
    
    #check to see 'Lineage Call' folder exists in the output directory (created by one of the last steps of JankyPipe)
    if os.path.exists(output_dir + '/fast-lineage-caller/'):
        successful_run.append('yes')
    else:
        successful_run.append('no')
        
sample_annotation['successful_run'] = successful_run

In [7]:
#show the isolates whose run was flagged as unsuccessful
sample_annotation.loc[sample_annotation['successful_run'] == 'no']

Unnamed: 0_level_0,sample_id,fastq_files,successful_run
sample_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


#### Re-run the isolates that hit the run-time limit through the pipeline

In [None]:
#isolates that don't have a lineage-call directory didn't finish running through pipeline
sample_annotation_ReRun = sample_annotation[sample_annotation.successful_run == 'no']

#the filtered table keeps the original sample_num labels, so positions 0..k-1 are NOT valid
#row labels here; iterate over the fastq path prefixes themselves instead
for isolate_fastq_paths in sample_annotation_ReRun['fastq_files']:

    #paths & names for fastq files
    fqf1 = isolate_fastq_paths + '_L001_R1_001.fastq.gz'
    fqf2 = isolate_fastq_paths + '_L001_R2_001.fastq.gz'

    #get the tag ID for the fastq files (2nd '_'-separated field of the fastq file name)
    tag = fqf1.split('/')[-1].split('_')[1]

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/output/' + tag

    #where everything else happens (trimming, aligning, etc.) [SCRATCH FOLDER]
    scratch_dir = '/n/scratch2/rv76/JankyPipe_dennis_mujuni_collaboration/intermediary_files/' + tag

    #store O2 job log files [LAB FOLDER]
    O2_SLURM_logs_dir = '/n/data1/hms/dbmi/farhat/Roger/dennis_mujuni_collaboration/JankyPipe/O2_SLURM_logs/' + tag

    #if path already exists, remove current contents, then recreate empty directory
    #if path doesn't exist, create new directory
    for isolate_dir in (output_dir , scratch_dir , O2_SLURM_logs_dir):
        if os.path.exists(isolate_dir):
            shutil.rmtree(isolate_dir)
        os.makedirs(isolate_dir)

    #Launch JankyPipe after making necessary directories!!!
    Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir)