In [15]:
from genepeeks.common import utilities as util
import pysam, os
import tabletext
import numpy as np
import pandas as pd

#### Get the DMD exons from the gene collection in Mongo using the GenePeeks science utilities

In [3]:
# Fetch only the 'ensembl' field of the DMD document from the prod gene collection.
DMD_ensembl = util.Mongo.get_collection_data(
    'gene',
    wanted_db='prod',
    query={'_id': 'DMD'},
    find_one=True,
    single_field='ensembl',
)

# Follow the nested annotation down to the exon list.
exon_path = ('is_primary', 'transcripts', 'is_primary', 'exons')
DMD_exons = util.get_nested_value(DMD_ensembl, exon_path)

# NOTE(review): presumably merges exons lying within min_dist bases of each other -
# confirm against util.merge_intervals. Each merged entry carries an 'index' label;
# the recorded output shows a merged entry labelled '65|66'.
DMD_exons_merged = util.merge_intervals(DMD_exons, min_dist=200, include_index=True)

# Column labels for the coverage matrix, one per merged exon.
exon_indexes = [exon['index'] for exon in DMD_exons_merged]
exon_labels = ['Ex' + exon_index for exon_index in exon_indexes]

# Report how many intervals remain and which index labels are not plain numbers.
non_numeric_indexes = [exon_index for exon_index in exon_indexes if not exon_index.isdigit()]
print('{} {}'.format(len(DMD_exons_merged), non_numeric_indexes))

Finished loading ensembl within the gene collection from the genepeeks-prod database with a single entries with a query of {'_id': 'DMD'} after 0.13 sec
78 ['65|66']


#### Count the number of males and females

In [4]:
# Tally BAM files by the first letter of the file name ('F' or 'M').
F_count = M_count = 0
for root, dirs, files in os.walk('/mnt/vep/subjects'):
    for file_name in files:
        # Only BAM files are counted; everything else is ignored.
        if not file_name.endswith('.bam'):
            continue
        first_letter = file_name[:1]
        if first_letter == 'F':
            F_count += 1
        elif first_letter == 'M':
            M_count += 1
print(F_count)
print(M_count)

180
263


#### Gather identifying info for each sample

In [21]:
def get_sample_info(RG, root=None, bwa_version=None, date_modified=None):
    """Build the list of identifying fields for one BAM read group.

    The read-group ID is split on '_' into subject, specimen/sample and, for
    the newer format, flow cell + lane:

        newer: FCLR-GP01-2121_1-M1-1_HGGF5AFXX-L004
        older: FPWB-0000-429L_1-P1-1   (no flow cell / lane, both reported as None)

    The specimen/sample part is split on '-' into specimen number, sequencer
    and sample, except when it starts with 'ACGT' or 'Omega', in which case it
    is split on '.' into lab, specimen number, sequencer and sample.

    Args:
        RG: read-group entry; only its 'ID' key is read.
        root: directory the BAM file was found in. When it ends with 're86'
            the returned id gets a '_re86' suffix and the last field is True.
        bwa_version: value reported in the bwa_version field.
        date_modified: value reported in the date_modified field.

    Returns:
        list: [full_id, subject, specimen, sample, gender, sequencer,
        flow_cell_id, lane, bwa_version, date_modified, is_re86].

    Raises:
        ValueError: if the ID, or its specimen/sample part, does not split
            into the expected number of pieces.
    """
    # bwa_version and date_modified are explicit arguments (None when omitted)
    # so that this function never depends on names that merely happen to exist
    # in the notebook's global namespace.
    full_id = RG['ID']
    try:
        subject, specimen_sample, flow_cell_lane = full_id.split('_')
    except ValueError:
        # Older IDs have no flow cell / lane part; an ID that does not fit
        # either layout still raises ValueError from this second unpacking.
        subject, specimen_sample = full_id.split('_')
        flow_cell_id = lane = None
    else:
        flow_cell_id, lane = flow_cell_lane.rsplit('-', 1)

    # The first character of the subject ID is used as the gender.
    gender = subject[0]
    if specimen_sample.startswith(('ACGT', 'Omega')):
        lab, specimen_num, sequencer, sample = specimen_sample.split('.')
        specimen_num = '{}_{}'.format(lab, specimen_num)
    else:
        specimen_num, sequencer, sample = specimen_sample.split('-')
    specimen = '{}_{}'.format(subject, specimen_num)
    # The reported sample is subject + the whole specimen/sample part.
    sample = '{}_{}'.format(subject, specimen_sample)

    is_re86 = bool(root and root.endswith('re86'))
    if is_re86:
        full_id += '_re86'

    sample_info = [full_id, subject, specimen, sample, gender, sequencer, flow_cell_id, lane, bwa_version, date_modified, is_re86]
    return sample_info

#### Determine which exon a read overlaps with, if any

In [90]:
def get_exon_num(pair_start, pair_end, skipped_counts, min_overlap=20):
    """Return the position in DMD_exons_merged of the exon a read overlaps.

    Only the first exon whose end lies beyond pair_start is examined, so the
    scan relies on DMD_exons_merged being ordered by coordinate
    (NOTE(review): ordering is assumed, not checked here).

    Args:
        pair_start: start coordinate of the read.
        pair_end: end coordinate of the read.
        skipped_counts: dict of tallies; 'outside_of_exon' or 'in_two_exons'
            is incremented through util.add_to_dict when the read is rejected
            for that reason.
        min_overlap: accepted for interface compatibility; currently unused.

    Returns:
        The exon's list position, or None when the read lies outside every
        exon, reaches into the following exon as well, or starts at or after
        the end of the last exon (the latter is not tallied).
    """
    last_position = len(DMD_exons_merged) - 1
    for position, exon in enumerate(DMD_exons_merged):
        # This exon finishes at or before the read begins: keep scanning.
        if exon['end'] <= pair_start:
            continue

        # The first exon ending after the read start decides the outcome.
        if not exon['start'] < pair_end:
            util.add_to_dict(skipped_counts, 'outside_of_exon')
            return None

        # Unless this is the last exon, reject reads that also reach the next one.
        if position != last_position:
            following_exon = DMD_exons_merged[position + 1]
            if following_exon['start'] < pair_end:
                util.add_to_dict(skipped_counts, 'in_two_exons')
                return None
        return position

    return None

#### Get each sample's coverage across all DMD exons

In [92]:
def create_coverage_matrix():
    """Count reads per DMD exon for every read group of every indexed BAM file.

    Walks '/mnt/vep/subjects'. A '.bam' file is processed when a
    '<name>.bai' file sits in the same directory and the name does not
    contain 'FPWB-0001-0309'. For each read group in the BAM header one row
    is created: the identifying fields from get_sample_info followed by one
    counter per entry of DMD_exons_merged. Reads fetched from contig 'X',
    31137345-33229636, are counted when they are mapped, have mapping quality
    exactly 60 and get_exon_num assigns them to an exon.

    Progress is printed every 20 processed files, and the tallies of skipped
    reads are printed at the end.

    Returns:
        pandas.DataFrame: one row per read group, with the identifying
        columns followed by the columns named in exon_labels.
    """
    base_headers = ['id', 'subject', 'specimen', 'sample', 'gender', 'sequencer', 'flow_cell_id', 'lane', 'bwa_version', 'date_modified', 'is_rerun']
    full_headers = base_headers + exon_labels
    bwa_version_col = base_headers.index('bwa_version')
    date_modified_col = base_headers.index('date_modified')
    subject_count = 0
    skipped_counts = {}
    coverage_matrix = []
    for root, dirs, files in os.walk('/mnt/vep/subjects'):
        for file_name in files:
            if not file_name.endswith('.bam'):
                continue
            if '{}.bai'.format(file_name) not in files:
                print('{} is missing an index file'.format(file_name))
                continue
            # The following subject does not have legit data
            if 'FPWB-0001-0309' in file_name:
                continue
            file_path = os.path.join(root, file_name)
            date_modified = os.path.getmtime(file_path)

            bamfile = pysam.AlignmentFile(file_path, "rb")
            # Close every BAM once it has been read, even if parsing fails,
            # so that file handles do not pile up over the whole walk.
            try:
                # Gather identifying info for each sample
                subject_coverages = {}
                # None when the header has no @PG entry with ID 'bwa'.
                bwa_version = next((PG['VN'] for PG in bamfile.header['PG'] if PG.get('ID') == 'bwa'), None)
                for RG in bamfile.header['RG']:
                    # Initialize each row with identifying info for the sample plus each exon's coverage of 0
                    sample_info = get_sample_info(RG, root=root)
                    if len(sample_info) != len(base_headers):
                        util.stop_err('Unequal number of sample info fields vs base headers: {}'.format(zip(base_headers, sample_info)))
                    # get_sample_info is not handed these two per-file values,
                    # so write them into the row here; that way they always
                    # describe the file that is actually being processed.
                    sample_info[bwa_version_col] = bwa_version
                    sample_info[date_modified_col] = date_modified
                    subject_coverages[RG['ID']] = sample_info + [0] * len(DMD_exons_merged)

                # Get coverage data for each sample within each exon
                for read in bamfile.fetch('X', start=31137345, end=33229636):
                    if read.is_unmapped:
                        continue
                    if read.mapping_quality == 60:
                        # Find what exon each read falls in, and increase that exon's coverage by 1
                        exon_num = get_exon_num(read.reference_start, read.reference_end, skipped_counts)
                        if exon_num is not None:
                            subject_coverages[read.get_tag('RG')][exon_num + len(base_headers)] += 1
                    else:
                        util.add_to_dict(skipped_counts, 'MAPQ below 60')
            finally:
                bamfile.close()

            coverage_matrix += subject_coverages.values()
            subject_count += 1
            if subject_count % 20 == 0:
                print('Finished parsing {} subjects'.format(subject_count))

    coverage_matrix_df = pd.DataFrame(coverage_matrix, columns=full_headers)

    # Print counts of skipped reads
    print('Finished parsing all {} subjects'.format(subject_count))
    for k, v in skipped_counts.items():
        print('{} {}'.format(k, v))
    return coverage_matrix_df

In [93]:
%%time
# Build the coverage table by parsing every BAM file (about 4.5 minutes of wall
# time in the recorded run) and save it as CSV for later use.
coverage_matrix_df = create_coverage_matrix()
# The path is relative to the notebook's working directory.
coverage_matrix_df.to_csv("../exon_data/coverage_matrix.csv")

Finished parsing 20 subjects
Finished parsing 40 subjects
Finished parsing 60 subjects
Finished parsing 80 subjects
Finished parsing 100 subjects
Finished parsing 120 subjects
Finished parsing 140 subjects
Finished parsing 160 subjects
FK2R-0000-151I.bam is missing an index file
Finished parsing 180 subjects
Finished parsing 200 subjects
Finished parsing 220 subjects
Finished parsing 240 subjects
Finished parsing 260 subjects
Finished parsing 280 subjects
Finished parsing 300 subjects
Finished parsing 320 subjects
Finished parsing 340 subjects
Finished parsing 360 subjects
Finished parsing 380 subjects
Finished parsing 400 subjects
Finished parsing 420 subjects
MCLR-NA12-8911.bam is missing an index file
Finished parsing 440 subjects
Finished parsing all 440 subjects
outside_of_exon 5263221
MAPQ below 60 125282
CPU times: user 4min 8s, sys: 132 ms, total: 4min 8s
Wall time: 4min 32s


#### Print summary stats for each exon across all female subjects

In [94]:
# Keep only the rows whose gender field is 'F'.
is_female = coverage_matrix_df['gender'] == 'F'
female_coverage_matrix = coverage_matrix_df.loc[is_female]
# Summary statistics for the exon columns (names containing 'Ex'), one exon per row.
female_exon_coverage = female_coverage_matrix.filter(regex='Ex')
female_exon_coverage.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Ex1,543.0,443.679558,336.413788,30.0,180.0,416.0,583.5,2647.0
Ex2,543.0,54.329650,33.910453,8.0,32.0,48.0,67.0,283.0
Ex3,543.0,135.235727,83.554276,30.0,85.0,121.0,162.0,754.0
Ex4,543.0,205.519337,142.111109,45.0,114.5,184.0,252.0,1177.0
Ex5,543.0,114.408840,67.922590,27.0,75.0,106.0,136.0,563.0
Ex6,543.0,166.158379,110.632319,32.0,97.0,148.0,202.0,890.0
Ex7,543.0,215.769797,171.498412,10.0,76.5,207.0,288.5,1151.0
Ex8,543.0,180.012891,127.034099,11.0,77.0,177.0,237.0,871.0
Ex9,543.0,73.537753,43.716610,5.0,44.0,67.0,97.0,336.0
Ex10,543.0,151.325967,108.949829,14.0,73.0,141.0,196.0,830.0
