# ASE calling pipeline

Mostly based on Yanyu's work in 2017. The pipeline aligns `fastq` sequences to `bam` using `STAR`, then adjusts the mapping via `WASP` to account for allele specificity, and finally calls genotypes and ASE via `QuASAR`.

## Overview of procedure

1. Gather required resources
2. Align reads to genome
3. Remove biased reads
4. Genotype and ASE calling

The pipeline is implemented in SoS as displayed in the rest of this SoS notebook. The pipeline can be executed from this notebook directly on a local or remote computer. For more information see the [SoS website](https://github.com/vatlab/SOS).

### Environment configurations

In [5]:
[global]
# Settings shared by the steps of this workflow.
# Root directory of the project; the other directories are derived from it
cwd = path('~/Documents/ASE')
# Resource files
# Directory that receives the reference downloads and is used as the STAR genome directory
resource_dir = f"{cwd:a}/hg19"
ref_fa = "hg19.fa"
# NOTE(review): the FASTA is hg19 while this annotation is a GRCh38 release -- confirm the two builds are meant to be combined
ref_gtf = 'Homo_sapiens.GRCh38.91.gtf.gz'
# Directory of the WASP checkout
wasp_dir = f"{cwd:a}/WASP"
# Sample files
sample_dir = f"{cwd:a}/samples"
from collections import OrderedDict
# Maps a sample name to the list of its FASTQ file names (without extension)
samples = OrderedDict({'ENCLB279NMT': ['ENCFF824TZM', 'ENCFF176JNE']})
# One path per FASTQ file, laid out as <sample_dir>/<sample>/<file>.fastq.gz
fastq = paths([[f"{sample_dir}/{s}/{q}.fastq.gz" for q in samples[s]] for s in samples])

[wasp]
# Clone the WASP repository into the project directory.
# The clone runs inside `cwd` so that the checkout ends up at `wasp_dir`,
# whose README.md is the file this step declares as its output.
output: f"{wasp_dir}/README.md"
bash: workdir = f'{cwd:a}'
    git clone https://github.com/bmvdgeijn/WASP

[star]
# One time deal for all projects, quite time / resource consuming (32GB memory)
# Generates the STAR genome index in `resource_dir` using `ncpu` threads.
# NOTE(review): `--sjdbGTFtagExonParentTranscript` is given the GTF path here; in the
# STAR manual the option that takes an annotation file is `--sjdbGTFfile` -- confirm
# which one is intended before relying on annotated splice junctions.
parameter: ncpu = 2
depends: Py_Module('docker')
output: f"{resource_dir}/genomeParameters.txt"
bash: docker_image = 'bschiffthaler/ngs', expand=True
     STAR --runMode genomeGenerate \
        --genomeDir {resource_dir} \
        --genomeFastaFiles {resource_dir}/{ref_fa} \
        --sjdbGTFtagExonParentTranscript {resource_dir}/{ref_gtf} \
        --runThreadN {ncpu}

## Resource preparation

### hg19 human reference data

Obtain `hg19.fa` and `Homo_sapiens.GRCh38.91.gtf.gz`.

In [3]:
[hg19_reference_1 (download)]
# Download `hg19.2bit` and `twoBitToFa` from {ucsc_url}
# Both files are saved to `resource_dir` and declared as the outputs of this step.
ucsc_url = "http://hgdownload.cse.ucsc.edu"
output: f"{resource_dir}/hg19.2bit", f"{resource_dir}/twoBitToFa"
download: dest_dir = resource_dir, expand = True
    {ucsc_url}/goldenPath/hg19/bigZips/hg19.2bit
    {ucsc_url}/admin/exe/linux.x86_64/twoBitToFa

[hg19_reference_2 (decompress hg19.fa)]
# Use `twoBitToFa` to extract `hg19.fa` from `hg19.2bit`
# `_input[1]` is made executable and run as the converter, with `_input[0]` as its source file.
# Presumably these are the two outputs of the previous step, in that order -- verify.
output: f"{resource_dir}/{ref_fa}"
bash: expand = True
    chmod +x {_input[1]}
    {_input[1]} {_input[0]} {_output}

[hg19_reference_3 (gene annotations)]
# Download `Homo_sapiens.GRCh38.91.gtf.gz` from Ensembl
# https://useast.ensembl.org/info/data/ftp/index.html
# The base URL carries no trailing slash because the download line adds the separator.
ensembl_ftp = 'ftp://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens'
output: f"{resource_dir}/{ref_gtf}"
download: dest_dir = resource_dir, expand = True
    {ensembl_ftp}/{ref_gtf}

## Get samples

FIXME: add description -- where do samples come from, etc

In [None]:
[obtain_samples]
# Download samples from ENCODE
# https://www.encodeproject.org/experiments/ENCSR384KAN/
# Each file is saved into the directory of its entry in `fastq`
# (<sample_dir>/<sample>/), so the download lands on the declared output path.
encode_url = 'https://www.encodeproject.org/files'
input: for_each = 'fastq', concurrent = True
output: fastq, group_by = 1
download: dest_dir = f'{_fastq:d}', expand = True
    {encode_url}/{_fastq:bnn}/@@download/{_fastq:b}

## Alignment

In [None]:
[align_1 (STAR prefiltering alignment)]
# Align with STAR
# Followed by samtools remove reads with quality less than {qual_cutoff}
# FASTQ files are consumed two at a time and each pair yields one filtered BAM.
parameter: qual_cutoff = 10
# Declared here as well because `{ncpu}` is expanded in this step's script
parameter: ncpu = 2
depends: sos_step('star')
input: fastq, group_by = 2, concurrent = True
output: [f"{sample_dir}/{x}.qual{qual_cutoff}.bam" for x in samples], group_by = 1
bash: docker_image = 'bschiffthaler/ngs', expand=True
    STAR --genomeDir {resource_dir} \
        --genomeFastaFiles {resource_dir}/{ref_fa} \
        --readFilesIn {_input[0]} {_input[1]} \
        --runThreadN {ncpu} --outStd BAM_SortedByCoordinate \
        --outSAMtype BAM SortedByCoordinate \
        --sjdbGTFtagExonParentTranscript {resource_dir}/{ref_gtf} |
    samtools view -bq {qual_cutoff} > {_output}

[align_2 (WASP intersecting SNP)]
# WASP finding unbiased reads intersecting with SNP
# Runs WASP's `find_intersecting_snps.py` on the input BAM files, one file per group.
# NOTE(review): `--snp_dir FIXME` is a placeholder; the SNP directory still has to be supplied.
depends: sos_step('wasp')
input: group_by = 1, pattern = '{name}.bam', concurrent = True
output: expand_pattern(f'{_name:n}.remap.fq.gz')
bash: expand = True
    python {wasp_dir}/find_intersecting_snps.py \
        --snp_dir FIXME --is_sorted {_input}

[align_3 (STAR postfiltering alignment)]
# Align WASP remap with STAR
# Followed by samtools remove reads with quality less than {qual_cutoff}
# Each `*.remap.fq.gz` file is realigned on its own and filtered into one BAM.
parameter: qual_cutoff = 10
# Declared here as well because `{ncpu}` is expanded in this step's script
parameter: ncpu = 2
input: group_by = 1, pattern = '{name}.remap.fq.gz', concurrent = True
output: expand_pattern(f'{_name}.wasp.qual{qual_cutoff}.bam')
bash: docker_image = 'bschiffthaler/ngs', expand=True
    STAR --genomeDir {resource_dir} \
        --genomeFastaFiles {resource_dir}/{ref_fa} \
        --readFilesIn {_input} \
        --readFilesCommand zcat \
        --runThreadN {ncpu} --outStd BAM_SortedByCoordinate \
        --outSAMtype BAM SortedByCoordinate \
        --sjdbGTFtagExonParentTranscript {resource_dir}/{ref_gtf} |
    samtools view -bq {qual_cutoff} > {_output}

In [None]:
# Script setup: read the six positional command-line arguments, start a log
# file for this run and create the first alignment working directory.
infile = sys.argv[1]
list_of_snps = sys.argv[2]
nthreads = sys.argv[3]
genesymbol = sys.argv[4]
genome_folder = sys.argv[5]
wasp_path = sys.argv[6]
# All files produced by this run are named after the input file's basename.
prefix = basename(infile)
# Random index in [0, 999] embedded in the log file name; presumably there to
# tell apart logs of repeated runs on the same input.
log_idx = str(int(random.random() * 1000))
logfilename = '.'.join([prefix, log_idx, 'log'])
# The context manager closes the log file even if the write fails.
with open(logfilename, 'w') as logfile:
    print('fastq2bam.' + logfilename + ' is generated')
    logfile.write('The index is ' + log_idx)

# Create the working directory directly instead of through a shell command
# built from a command-line argument, so names containing spaces or shell
# metacharacters are handled; an already existing directory is accepted.
os.makedirs(prefix + '_align1', exist_ok=True)

# ############################################################
# #START# STAR pre-filtering alignment #START#
# # input/output:
# working_folder1 = prefix + '_align1'
# inputfile = infile
# outputfile = os.sep.join([working_folder1, 'Aligned.sortedByCoord.out.bam'])
# ###
# # cmd:
# cmd = 'STAR --runThreadN ' + nthreads + ' \
# --genomeDir ' + genome_folder + ' \
# --readFilesIn ' + inputfile + ' \
# --readFilesCommand bzcat \
# --outFileNamePrefix ' + working_folder1 + '/ \
# --outSAMtype BAM SortedByCoordinate \
# --sjdbGTFtagExonParentTranscript ' + genesymbol + ' \
# > ' + outputfile + '.out'
# ###
# # run cmd
# do_and_report(cmd, logfilename, 'STAR-align-prefiltering')
# ###
# # post step: release memory
#  # cmd = 'STAR --genomeLoad Remove'
# # do_and_report(cmd, logfilename, 'STAR-align-releasing-memory')
# ###
# #END# STAR pre-filtering alignment #END#
############################################################


# ############################################################
# #START# samtools removing reads with quuality less than 10 #START#
# # input/output:
# inputfile = outputfile
# outputfile = os.sep.join([working_folder1, 'Aligned.sortedByCoord.out.filtered-quality-10.bam'])
# ###
# # cmd:
# cmd = 'samtools view -bq 10 ' + inputfile + ' > ' + outputfile # here the quality is hard coded as 10
# ###
# # run cmd:
# do_and_report(cmd, logfilename, 'samtools-filter-quality-less-than-10')
# ###
# #END# STAR pre-filtering alignment #END#
############################################################


############################################################
#START# WASP finding unbiased reads intersecting with SNP #START#
# # input/output:
# inputfile = outputfile
# temp = inputfile.split('.')
# wasp_name = basename(inputfile)
# wasp_name = wasp_name.split('.')
# wasp_name = '.'.join(wasp_name[ : -1])
# outputfile = '.'.join(temp[ : -1] + ['remap.fq.gz'])
# ###
# # cmd:
# cmd = 'python-conda ' + wasp_path + '/find_intersecting_snps.py \
# --snp_dir ' + list_of_snps + ' \
# --is_sorted '\
# + inputfile
# ###
# # run cmd:
# do_and_report(cmd, logfilename, 'WASP-intersect-without-bias')
###
#END# WASP finding unbiased reads intersecting with SNP #END#
############################################################

# cmd = 'mkdir ' + outputfile + '_align2'
# os.system(cmd)

############################################################
# #START# STAR post-filtering alignment #START#
# # input/output:
# working_folder2 = outputfile + '_align2'
# inputfile = outputfile
# outputfile = os.sep.join([working_folder2, 'Aligned.sortedByCoord.out.bam'])
# ###
# # cmd:
# cmd = 'STAR --runThreadN ' + nthreads + ' \
# --genomeDir ' + genome_folder + ' \
# --readFilesIn ' + inputfile + ' \
# --readFilesCommand zcat \
# --outFileNamePrefix ' + working_folder2 + '/ \
# --outSAMtype BAM SortedByCoordinate \
# --sjdbGTFtagExonParentTranscript ' + genesymbol + ' \
# > ' + outputfile + '.out'
# ###
# # run cmd:
# do_and_report(cmd, logfilename, 'STAR-align-postfiltering')
# ###
# # post step: release memory
# # cmd = 'STAR --genomeLoad Remove'
# # do_and_report(cmd, logfilename, 'STAR-align-releasing-memory')
# ###
# #END# STAR post-filtering alignment #END#
# ############################################################


# ############################################################
# #START# samtools removing reads with quuality less than 10 #START#
# # input/output:
# inputfile = outputfile
# outputfile = os.sep.join([working_folder2, 'Aligned.sortedByCoord.out.filtered-quality-10.bam'])
# ###
# # cmd:
# cmd = 'samtools view -bq 10 ' + inputfile + ' > ' + outputfile # here the quality is hard coded as 10
# ###
# # run cmd:
# do_and_report(cmd, logfilename, 'samtools-filter-quality-less-than-10')
# ###
# #END# STAR pre-filtering alignment #END#
# ############################################################


############################################################
#START# WASP removing ambiguously mapped reads #START#
# The stages that used to assign `working_folder1`, `wasp_name` and
# `outputfile` are commented out above, so this stage raised NameError when
# it referenced them. The same paths those stages built are reconstructed
# here; the files themselves must already exist on disk.
working_folder1 = prefix + '_align1'
# Basename (without '.bam') of the quality-filtered first-pass alignment.
wasp_name = 'Aligned.sortedByCoord.out.filtered-quality-10'
# Working directory of the second STAR alignment (the remap fastq + '_align2').
working_folder2 = os.sep.join([working_folder1, wasp_name + '.remap.fq.gz_align2'])
# input/output:
inputfile_to_remap = os.sep.join([working_folder1, wasp_name + '.to.remap.bam'])
inputfile_keep = os.sep.join([working_folder1, wasp_name + '.keep_remapped.bam'])
# Quality-filtered BAM of the second alignment.
inputfile_remap = os.sep.join([working_folder2, 'Aligned.sortedByCoord.out.filtered-quality-10.bam'])
# The file written by this stage is the input of the next one.
outputfile = inputfile_keep
###
# cmd:
cmd = 'python-conda ' + wasp_path + '/filter_remapped_reads.py '\
+ inputfile_to_remap + ' '\
+ inputfile_remap + ' '\
+ inputfile_keep
###
# run cmd:
do_and_report(cmd, logfilename, 'WASP-remove-ambiguous-reads')
###
#END# WASP removing ambiguously mapped reads #END#
############################################################


############################################################
#START# samtools merging kept reads #START#
# Merges the BAM left by the previous stage with the '.keep.bam' file, then
# sorts and indexes the merged result. `outputfile` ends up pointing at the
# sorted BAM for the next stage.
# input/output:
inputfile1 = outputfile
inputfile2 = f"{working_folder1}{os.sep}{wasp_name}.keep.bam"
temp = f"{working_folder1}{os.sep}{wasp_name}.merged_keep.bam"
outputfile = f"{working_folder1}{os.sep}{wasp_name}.merged_keep.sorted.bam"
###
# merge the two inputs into the intermediate file
cmd = ' '.join(['samtools merge', temp, inputfile1, inputfile2])
do_and_report(cmd, logfilename, 'samtools-merge-unbiased-reads')
###
# post step: sort the merged file, then index the sorted output
cmd = ' '.join(['samtools sort -o', outputfile, temp])
do_and_report(cmd, logfilename, 'samtools-merge-unbiased-reads-sort')
cmd = ' '.join(['samtools index', outputfile])
do_and_report(cmd, logfilename, 'samtools-merge-unbiased-reads-index')
#END# samtools merging kept reads #END#
############################################################


############################################################
#START# WASP removing duplicated reads randomly #START#
# Feeds the previous stage's output to WASP's rmdup.py and writes the result
# to 'final.bam' in the first working folder.
# input/output: the old output becomes the input, then a new output is set
inputfile, outputfile = outputfile, os.sep.join([working_folder1, 'final.bam'])
###
# cmd: interpreter, script path, input BAM, output BAM
cmd = ' '.join(['python-conda', wasp_path + '/rmdup.py', inputfile, outputfile])
###
# run cmd:
do_and_report(cmd, logfilename, 'WASP-remove-duplicated-reads')
###
#END# WASP removing duplicated reads randomly #END#
############################################################