## Alignment Statistics and Deletion Simulation with pysam

In [40]:
import pysam
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
#import sys
#sys.path.insert(0, '/Users/velina/Documents/GitHub/genepeeks-science')
#from genepeeks.common import utilities as util

In [None]:
# DMD_ensembl = util.Mongo.get_collection_data('gene', wanted_db='prod', query={'_id': 'DMD'}, find_one=True, single_field='ensembl')
# DMD_exons = util.get_nested_value(DMD_ensembl, ('is_primary', 'transcripts', 'is_primary', 'exons'))

In [42]:
# DMD exon/gene coordinates and useful regions (coordinates taken from IGV b37 reference)
EXON46 = [31950197, 31950344]
EXON47 = [31947713, 31947862]
EXON48 = [31893308, 31893490]
EXON49 = [31854835, 31854939]
EXON50 = [31838092, 31838200]
EXON51 = [31792077, 31792309]
DMD = [31115794, 33357558]  # taken from ensembl GRCh37

# deletion of partial exons
EX48_50_PART = [31838130, 31893308]

In [8]:
# Velina's local bam files for testing (names are consistent with subject/sample names)
bamfiles = ['FGPC-0000-444F.bam', 'FGPC-8V7M-82J3.bam', 'FPWB-0000-840T.bam', 'MGPC-6KHG-N0LS.bam']

In [9]:
# workaround to force real time print output in notebooks
# see http://stackoverflow.com/questions/29772158/make-ipython-notebook-print-in-real-time
import sys
oldsysstdout = sys.stdout
class flushfile():
    def __init__(self, f):
        self.f = f
    def __getattr__(self,name): 
        return object.__getattribute__(self.f, name)
    def write(self, x):
        self.f.write(x)
        self.f.flush()
    def flush(self):
        self.f.flush()
sys.stdout = flushfile(sys.stdout)

## Playing around with pysam and samtools

In [18]:
bamfile = pysam.AlignmentFile('bams/FGPC-0000-444F.bam', 'rb')
print bamfile.references

('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT')


In [19]:
# header breakdown
print bamfile.header.keys()
# take template header from existing bamfile
template_header = bamfile.header
bamfile.header['SQ']

['SQ', 'RG', 'PG', 'HD']


[{'LN': 249250621, 'SN': '1'},
 {'LN': 243199373, 'SN': '2'},
 {'LN': 198022430, 'SN': '3'},
 {'LN': 191154276, 'SN': '4'},
 {'LN': 180915260, 'SN': '5'},
 {'LN': 171115067, 'SN': '6'},
 {'LN': 159138663, 'SN': '7'},
 {'LN': 146364022, 'SN': '8'},
 {'LN': 141213431, 'SN': '9'},
 {'LN': 135534747, 'SN': '10'},
 {'LN': 135006516, 'SN': '11'},
 {'LN': 133851895, 'SN': '12'},
 {'LN': 115169878, 'SN': '13'},
 {'LN': 107349540, 'SN': '14'},
 {'LN': 102531392, 'SN': '15'},
 {'LN': 90354753, 'SN': '16'},
 {'LN': 81195210, 'SN': '17'},
 {'LN': 78077248, 'SN': '18'},
 {'LN': 59128983, 'SN': '19'},
 {'LN': 63025520, 'SN': '20'},
 {'LN': 48129895, 'SN': '21'},
 {'LN': 51304566, 'SN': '22'},
 {'LN': 155270560, 'SN': 'X'},
 {'LN': 59373566, 'SN': 'Y'},
 {'LN': 16569, 'SN': 'MT'}]

In [20]:
# does not output in same format as actual samfile -- see documentation for breakdown
iterable = bamfile.fetch('X', *EXON49)
example_read = iterable.next()
print example_read

NS500214:178:H5FGFAFXX:1:11101:1977:9301	147	22	31854685	60	151M	22	31854633	151	AAGCTTTGCCTCTTCTATTACAGTATGGCCAGTATTTCCTTACAAGTTATTTCACTGATTATAAATAGTCCACGTCAATGGCAAATGTACAACAGGGGAAGCATAACCCATTATGAGGTAATGGATATTGCTAGAGGTTGCTTCATTACCT	array('B', [12, 23, 18, 28, 27, 28, 25, 27, 29, 12, 26, 29, 29, 10, 33, 24, 29, 29, 27, 28, 30, 31, 26, 27, 29, 25, 9, 31, 32, 13, 32, 31, 27, 29, 24, 29, 28, 32, 33, 24, 27, 26, 34, 30, 32, 31, 29, 27, 29, 29, 29, 28, 34, 28, 33, 10, 29, 30, 30, 23, 30, 23, 31, 31, 30, 28, 33, 25, 29, 33, 35, 29, 28, 33, 29, 29, 31, 30, 31, 31, 32, 35, 31, 31, 30, 30, 33, 28, 29, 35, 31, 29, 35, 33, 31, 32, 25, 35, 32, 34, 33, 36, 31, 23, 32, 30, 34, 34, 36, 31, 31, 23, 31, 31, 35, 34, 32, 33, 29, 32, 31, 31, 32, 35, 31, 29, 31, 24, 31, 33, 35, 29, 34, 35, 34, 32, 33, 31, 31, 33, 35, 31, 30, 36, 31, 31, 28, 30, 29, 31, 29])	[('BD', 'KMNLCKONPONMMNNLLKMLMMMKNMMLMMMMKKBLMNLJMLLKMKJKKBLNMMNLNKIJKJDKJKMMLLLLKMMKJMLLLKCJMJLLKKJKLLIILKJLLMJIJKHLMJIJMLKLKMJJNMMNKKKJNNMLLLNMLKPPNN

In [21]:
example_read.reference_end

31854836

In [22]:
coverage = bamfile.count_coverage('X', *EXON49)

In [34]:
print len(coverage[0])
print coverage[0]

104
array('L', [0L, 0L, 1L, 260L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 281L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 299L, 0L, 287L, 284L, 285L, 1L, 0L, 1L, 0L, 0L, 2L, 0L, 0L, 0L, 1L, 0L, 319L, 0L, 308L, 1L, 315L, 323L, 307L, 305L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 2L, 301L, 1L, 303L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 339L, 0L, 0L, 0L, 0L, 0L, 326L, 303L, 0L, 1L, 0L, 0L, 0L, 289L, 0L, 0L, 0L, 0L, 307L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L])


In [119]:
print example_read.is_unmapped
example_read.tid

False


22

In [75]:
# getting reference name
bamfile.getrname(example_read.reference_id)

'X'

In [202]:
# next two cells testing pysam's samtools integration (specifically samtools view and merge)
%%time
infile = 'FGPC-8V7M-82J3.bam'
outfile = 'FGPC-8V7M-82J3_ch22.bam'
pysam.view('-b', '-o' + outfile, infile, '21', '22', catch_stdout=False)
pysam.index(outfile)

CPU times: user 6.68 s, sys: 107 ms, total: 6.79 s
Wall time: 6.84 s


In [169]:
%%time
infile = 'FGPC-8V7M-82J3.bam'
outfile = 'FGPC-8V7M-82J3_ch20.bam'
pysam.view('-b', '-o' + outfile, infile, '20', catch_stdout=False)
# pysam.index(outfile)

pysam.merge('combined.bam', outfile, 'FGPC-8V7M-82J3_ch22.bam', catch_stdout=False)

CPU times: user 14 s, sys: 192 ms, total: 14.2 s
Wall time: 14.3 s


## Recording Alignment Statistics 
(Exon 49 and full DMD)
Exon 49 is located in deletion hotspot in DMD

In [96]:
def alignment_stats(bamfile_name, reference, region, minmapQ=30):
    '''Prints alignment stats across reads in particular region in BAM. 
       reference -- ref or chrm number
       region -- list or tuple with region bounds
    '''
    with pysam.AlignmentFile(bamfile_name, 'rb') as bamfile:
        stats = Counter()
        for read in bamfile.fetch(reference, *region):
            stats['total'] += 1
            stats['qcfail'] += int(read.is_qcfail)

            stats['paired'] += int(read.is_paired)
            stats['read1'] += int(read.is_read1)
            stats['read2'] += int(read.is_read2)
            
            if read.is_unmapped:
                stats['unmapped'] += 1
                continue

            stats['mapq <= 30'] += int(read.mapping_quality <= minmapQ)

            stats['mapped'] += 1
            if read.is_proper_pair:
                stats['proper pair'] += 1
                if abs(read.template_length) > 850:
                    stats['long_insert'] += 1
                    print 'long read: {}'.format(read.template_length)
                else:
                    stats['insert_lengths'] += (abs(read.template_length) if read.is_read1 else 0)

        stats['avg insert length'] = stats['insert_lengths'] / float(stats['read1'])
        output_order = ('total', 'mapped', 'unmapped', 'paired', 'read1', 'read2', 
                        'proper pair', 'qcfail', 'mapq <= 30', 'long_insert', 'avg insert length')

        print '{} \nChromosome {} Region {} Stats:'.format(bamfile_name, reference, region)
        for key in output_order:
            if key == 'avg insert length':
                print '\t{}: {}'.format(key, stats[key])
            else:
                print '\t{}: {}, Prop: {}'.format(key, stats[key], stats[key] / float(stats['total']))

In [97]:
alignment_stats('bams/FGPC-0000-444F.bam', 'X', EXON49)

bams/FGPC-0000-444F.bam 
Chromosome X Region [31854835, 31854939] Stats:
	total: 680, Prop: 1.0
	mapped: 680, Prop: 1.0
	unmapped: 0, Prop: 0.0
	paired: 680, Prop: 1.0
	read1: 342, Prop: 0.502941176471
	read2: 338, Prop: 0.497058823529
	proper pair: 680, Prop: 1.0
	qcfail: 0, Prop: 0.0
	mapq <= 30: 0, Prop: 0.0
	long_insert: 0, Prop: 0.0
	avg insert length: 226.011695906


In [98]:
for bam in bamfiles:
    alignment_stats('bams/{}'.format(bam), 'X', DMD)

long read: 87733677
long read: 36406070
long read: 67357711
bams/FGPC-0000-444F.bam 
Chromosome X Region [31115794, 33357558] Stats:
	total: 88693, Prop: 1.0
	mapped: 88577, Prop: 0.998692117755
	unmapped: 116, Prop: 0.0013078822455
	paired: 88693, Prop: 1.0
	read1: 44343, Prop: 0.499960538036
	read2: 44350, Prop: 0.500039461964
	proper pair: 88301, Prop: 0.995580259998
	qcfail: 0, Prop: 0.0
	mapq <= 30: 180, Prop: 0.00202947244991
	long_insert: 3, Prop: 3.38245408319e-05
	avg insert length: 246.341519518
long read: 8528942
long read: -3886263
bams/FGPC-8V7M-82J3.bam 
Chromosome X Region [31115794, 33357558] Stats:
	total: 19239, Prop: 1.0
	mapped: 19235, Prop: 0.999792088986
	unmapped: 4, Prop: 0.000207911014086
	paired: 19239, Prop: 1.0
	read1: 9621, Prop: 0.50007796663
	read2: 9618, Prop: 0.49992203337
	proper pair: 19209, Prop: 0.998440667394
	qcfail: 0, Prop: 0.0
	mapq <= 30: 29, Prop: 0.00150735485212
	long_insert: 2, Prop: 0.000103955507043
	avg insert length: 232.779232928
long

### Conclusions
Mapping quality seems to be fairly high across DMD gene (only 0.1-0.4% of reads have mapQ less than 30) for 4 samples tested here. 

Repeat with more samples -- could also check specific regions, add check to see if reads with low mapQ are also short reads. 

## Writing basic BAM with simulated deletion

In [10]:
def simulate_exon_del(outfile, infile_bam, chrom_ref, del_region, depletion_factor=2, output_full=False,
                      before_wanted=None, after_wanted=None):
    ''' Simulates deletion in specified region by reducing the number of aligned reads in region 
        by specified depletion_factor.
        depletion_factor -- int indicating factor by which total read number reduced
        output_full -- option to generate complete bam file (all chromosomes) vs bam file with just chrom_ref
    '''
    if del_region[0] > del_region[1]:
        raise ValueError('Region boundaries invalid')
        
    template_bam = pysam.AlignmentFile(infile_bam, 'rb')
    
    with pysam.AlignmentFile(outfile, 'wb', template=template_bam) as outf:
        for i, read in enumerate(template_bam.fetch(chrom_ref)):
            if read.reference_start <= del_region[1] and read.reference_end >= del_region[0]:
                if random.random() < 1. / depletion_factor:
                    outf.write(read)
            else:
                outf.write(read)
        print 'Finished generating reads in bamfile for desired chrom_ref'
    
    # generates bam file for all chroms before desired and separate bam file for all chroms after; 
    # then merges three bam files -- might require clean up of extra files after running
    if output_full:
        ref_list = list(template_bam.references)
        index = ref_list.index(chrom_ref)
        
        if not before_wanted:   
            before = ref_list[:index]
            before_wanted = '{}_before_{}.bam'.format(infile_bam.split('.')[0], chrom_ref)
            pysam.view('-b', '-o' + before_wanted, infile_bam, *before, catch_stdout=False)
            print 'Finished generating bamfile for refs before chrom_ref'
            
        if not after_wanted:
            after = ref_list[(index + 1):]
            after_wanted = '{}_after_{}.bam'.format(infile_bam.split('.')[0], chrom_ref)
            pysam.view('-b', '-o' + after_wanted, infile_bam, *after, catch_stdout=False)
            print 'Finished generating bamfile for refs after chrom_ref'

        outfile_full = '{}_full.bam'.format(outfile.split('.')[0])
        pysam.merge(outfile_full, before_wanted, outfile, after_wanted, catch_stdout=False)
        print 'Finished merging all bamfiles into final output'
        
        template_bam.close()
        pysam.index(outfile_full)
    else:
        pysam.index(outfile)
        
    print 'Finished creating index file for final output bamfile'

In [12]:
%%time
# assuming bams live in bams directory
outfile = 'bams/FGPC-0000-444F_49_del.bam'
simulate_exon_del(outfile, 'bams/FGPC-0000-444F.bam', 'X', EXON49, depletion_factor=2)

Finished generating reads in bamfile for desired chrom_ref
Finished creating index file for final output bamfile
CPU times: user 1min, sys: 1.29 s, total: 1min 1s
Wall time: 1min 2s


In [207]:
%%time
# this is a relatively small bamfile (only about 1.5 GB) -- outputting full bamfile is quite slow in general
infile = 'bams/FGPC-8V7M-82J3.bam'
outfile = 'bams/FGPC-8V7M-82J3_48-50_del.bam'
simulate_exon_del(outfile, infile, 'X', EX48_50_PART, depletion_factor=3, output_full=True)

Finished generating reads in bamfile for desired chrom_ref
Finished generating bamfile for refs before chrom_ref
Finished generating bamfile for refs after chrom_ref
Finished merging all bamfiles into final output
Finished creating index file for final output bamfile
CPU times: user 7min 10s, sys: 7.01 s, total: 7min 17s
Wall time: 7min 20s


In [17]:
%%time
infile = 'bams/FGPC-8V7M-82J3.bam'
outfile = 'bams/FGPC-8V7M-82J3_48-50_del.bam'
before_wanted = 'bams/FGPC-8V7M-82J3_before_X.bam'
after_wanted = 'bams/FGPC-8V7M-82J3_after_X.bam'
simulate_exon_del(outfile, infile, 'X', EX48_50_PART, depletion_factor=2, output_full=True, 
                  before_wanted=before_wanted, after_wanted=after_wanted)

Finished generating reads in bamfile for desired chrom_ref
Finished merging all bamfiles into final output
Finished creating index file for final output bamfile
CPU times: user 4min 3s, sys: 4.14 s, total: 4min 7s
Wall time: 4min 9s
