In [90]:
! cd ../../genepeeks-science/ && python setup.py develop

running develop
running egg_info
writing requirements to genepeeks.egg-info/requires.txt
writing genepeeks.egg-info/PKG-INFO
writing top-level names to genepeeks.egg-info/top_level.txt
writing dependency_links to genepeeks.egg-info/dependency_links.txt
reading manifest file 'genepeeks.egg-info/SOURCES.txt'
writing manifest file 'genepeeks.egg-info/SOURCES.txt'
running build_ext
Creating /home/ec2-user/anaconda2/lib/python2.7/site-packages/genepeeks.egg-link (link to .)
genepeeks 1.0.1.1.dev0 is already the active version in easy-install.pth

Installed /home/ec2-user/git/genepeeks-science
Processing dependencies for genepeeks==1.0.1.1.dev0
Searching for apiclient==1.0.3
Best match: apiclient 1.0.3
Adding apiclient 1.0.3 to easy-install.pth file

Using /home/ec2-user/anaconda2/lib/python2.7/site-packages
Searching for config==0.3.9
Best match: config 0.3.9
Adding config 0.3.9 to easy-install.pth file

Using /home/ec2-user/anaconda2/lib/python2.7/site-packages
Searching for dropbox==7.1.1

In [7]:
from genepeeks.common import utilities as util
import pysam, os
import tabletext
import numpy as np

#### Get DMD exons from gene collection in mongo using genepeeks science utilities function

In [8]:
DMD_ensembl = util.Mongo.get_collection_data('gene', wanted_db='prod', query={'_id': 'DMD'}, find_one=True, single_field='ensembl')
DMD_exons = util.get_nested_value(DMD_ensembl, ('is_primary', 'transcripts', 'is_primary', 'exons'))
DMD_exons_merged = util.merge_intervals(DMD_exons, min_dist=200, include_index=True)
exon_labels = ['Ex' + exon['index'] for exon in DMD_exons_merged]
print exon_labels


Finished loading ensembl within the gene collection from the genepeeks-prod database with a single entries with a query of {'_id': 'DMD'} after 0.17 sec
['Ex1', 'Ex2', 'Ex3', 'Ex4', 'Ex5', 'Ex6', 'Ex7', 'Ex8', 'Ex9', 'Ex10', 'Ex11', 'Ex12', 'Ex13', 'Ex14', 'Ex15', 'Ex16', 'Ex17', 'Ex18', 'Ex19', 'Ex20', 'Ex21', 'Ex22', 'Ex23', 'Ex24', 'Ex25', 'Ex26', 'Ex27', 'Ex28', 'Ex29', 'Ex30', 'Ex31', 'Ex32', 'Ex33', 'Ex34', 'Ex35', 'Ex36', 'Ex37', 'Ex38', 'Ex39', 'Ex40', 'Ex41', 'Ex42', 'Ex43', 'Ex44', 'Ex45', 'Ex46', 'Ex47', 'Ex48', 'Ex49', 'Ex50', 'Ex51', 'Ex52', 'Ex53', 'Ex54', 'Ex55', 'Ex56', 'Ex57', 'Ex58', 'Ex59', 'Ex60', 'Ex61', 'Ex62', 'Ex63', 'Ex64', 'Ex65|66', 'Ex67', 'Ex68', 'Ex69', 'Ex70', 'Ex71', 'Ex72', 'Ex73', 'Ex74', 'Ex75', 'Ex76', 'Ex77', 'Ex78', 'Ex79']


#### Count the number of males and females

In [116]:
F_count = M_count = 0
for root, dirs, files in os.walk('/mnt/vep/subjects'):
    for file_name in files:
        if file_name.endswith('.bam'):
            if file_name.startswith('F'):
                F_count += 1
            elif file_name.startswith('M'):
                M_count += 1
print F_count
print M_count

180
263


#### Get each sample's coverage across all DMD exons

In [124]:
base_headers = ['id', 'subject', 'specimen', 'sample', 'gender', 'sequencer', 'flow_cell_id', 'lane', 'bwa_version', 'date_modified']
subject_count = 0
coverage_matrix = [base_headers + exon_labels]
for root, dirs, files in os.walk('/mnt/vep/subjects'):
    for file_name in files:
        if file_name.endswith('.bam'):
            if '{}.bai'.format(file_name) not in files:
                print '{} is missing an index file'.format(file_name)
                continue
            file_path = os.path.join(root, file_name)
            date_modified = os.path.getmtime(file_path)
            
            bamfile = pysam.AlignmentFile(file_path, "rb")
            
            # Gather identifying info for each sample
            subject_coverages = {}
            bwa_version = next(PG['VN'] for PG in bamfile.header['PG'] if PG.get('ID') == 'bwa')
            for RG in bamfile.header['RG']:
                try:
                    # normal RG['ID'] format: FCLR-GP01-2121_1-M1-1_HGGF5AFXX-L004
                    subject, specimen_sample, flow_cell_lane = RG['ID'].split('_')
                except:
                    # older RG['ID'] format: FPWB-0000-429L_1-P1-1
                    subject, specimen_sample = RG['ID'].split('_')
                    flow_cell_id = lane = None
                else:
                    flow_cell_id, lane = flow_cell_lane.rsplit('-', 1)

                gender = subject[0]
                if specimen_sample.startswith(('ACGT', 'Omega')):
                    lab, specimen_num, sequencer, sample = specimen_sample.split('.')
                    specimen_num = '{}_{}'.format(lab, specimen_num)
                else:
                    specimen_num, sequencer, sample = specimen_sample.split('-')
                specimen = '{}_{}'.format(subject, specimen_num)
                sample = '{}_{}'.format(subject, specimen_sample)
                full_id = RG['ID']
                if root.endswith('re86'):
                    full_id += '_re86'
                    
                id_info = [full_id, subject, specimen, sample, gender, sequencer, flow_cell_id, lane, bwa_version, date_modified]
                # Initialize each row with identifying info for the sample plus each exon's coverage of 0
                subject_coverages[RG['ID']] = id_info + [0] * len(DMD_exons_merged)

            # Get coverage data for each sample within each exon
            for read in bamfile.fetch('X', start=31137345, end=33229636):
                if not read.is_unmapped and read.mapping_quality == 60:
                    # Find what exon each read falls in, and increase that exon's coverage by 1
                    interval_info = util.in_interval(read.reference_start, DMD_exons_merged, get_interval=True)
                    if not interval_info[0]:
                        # If the start of the read is not in an exon, check the end of the read
                        interval_info = util.in_interval(read.reference_end, DMD_exons_merged, get_interval=True)
#                     else:
#                         # Make sure that the end of the read does not map to a different exon than the start
#                         end_interval_info = util.in_interval(read.reference_end, DMD_exons_merged, get_interval=True)
#                         if end_interval_info[0] and end_interval_info[1] != interval_info[1]:
#                             print read.reference_start, interval_info[1], read.reference_end, end_interval_info[1]
                        
                    if interval_info[0]:
                        exon_num = interval_info[1]
                        subject_coverages[read.get_tag('RG')][exon_num + len(base_headers)] += 1
                        
            coverage_matrix += subject_coverages.values()
            subject_count += 1
            if subject_count % 10 == 0:
                print 'Finished parsing {} subjects'.format(subject_count)
coverage_matrix = np.array(coverage_matrix)
print subject_count
print coverage_matrix

Finished parsing 10 subjects
Finished parsing 20 subjects
Finished parsing 30 subjects
Finished parsing 40 subjects
Finished parsing 50 subjects
Finished parsing 60 subjects
Finished parsing 70 subjects
Finished parsing 80 subjects
Finished parsing 90 subjects
Finished parsing 100 subjects
Finished parsing 110 subjects
Finished parsing 120 subjects
Finished parsing 130 subjects
Finished parsing 140 subjects
Finished parsing 150 subjects
Finished parsing 160 subjects
FK2R-0000-151I.bam is missing an index file
Finished parsing 170 subjects
Finished parsing 180 subjects
Finished parsing 190 subjects
Finished parsing 200 subjects
Finished parsing 210 subjects
Finished parsing 220 subjects
Finished parsing 230 subjects
Finished parsing 240 subjects
Finished parsing 250 subjects
Finished parsing 260 subjects
Finished parsing 270 subjects
Finished parsing 280 subjects
Finished parsing 290 subjects
Finished parsing 300 subjects
Finished parsing 310 subjects
Finished parsing 320 subjects
Finis

In [121]:
import pandas as pd

df = pd.DataFrame(coverage_matrix)
df.to_csv("../exon_data/coverage_matrix.csv")

#### Print each exon's mean, std, min, and max coverage, and turn the stats into an array

In [122]:
print len(coverage_matrix), 'rows'
exon_cov_stats = [['Exon', 'Mean', 'Std', 'Min', '1%', '99%', 'Max']]
exon_max_cov = []
female_coverage_matrix = np.array([list(coverage_matrix[0])] + [list(item) for item in coverage_matrix if item[0].startswith('F')])
for i in range(len(DMD_exons_merged)):
    col = i + len(base_headers)
    exon_coverage = map(int, female_coverage_matrix[1:,col])
    exon_stats = [
        coverage_matrix[0][col],
        round(np.mean(exon_coverage), 2),
        round(np.std(exon_coverage), 2),
        min(exon_coverage),
        np.percentile(exon_coverage, 1),
        np.percentile(exon_coverage, 99),
        max(exon_coverage)
    ]
    exon_cov_stats.append(exon_stats)
# exon_stats_array = np.array(exon_cov_stats[1:,[1:5]])
print tabletext.to_text(exon_cov_stats)

1267 rows
┌─────────┬────────┬────────┬─────┬───────┬────────┬──────┐
│ Exon    │ Mean   │ Std    │ Min │ 1%    │ 99%    │ Max  │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex1     │ 440.51 │ 336.99 │   0 │ 39.46 │ 2051.7 │ 2647 │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex2     │  15.19 │   9.32 │   0 │   0.0 │   39.0 │   42 │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex3     │  106.5 │  56.55 │   0 │ 24.92 │  287.1 │  476 │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex4     │ 179.87 │ 118.86 │   0 │ 41.46 │ 720.26 │  982 │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex5     │ 113.31 │  66.95 │   0 │ 29.46 │  411.4 │  552 │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex6     │ 155.07 │  86.83 │   0 │ 25.46 │ 476.54 │  654 │
├─────────┼────────┼────────┼─────┼───────┼────────┼──────┤
│ Ex7     │ 130.78 │  86.01 │   0 │  4.46 │ 328.48 │  461 │
├─────────┼────────┼────────┼─

#### Normalize each subject's coverage on each exon compared to the mean and std of that exon's coverage

In [15]:
coverage_matrix_vs_max_exon = coverage_matrix / exon_stats_array[:,-1]
print coverage_matrix_vs_max_exon

normalized_coverage_matrix = (coverage_matrix - exon_stats_array[:,1]) / exon_stats_array[:,2]
print normalized_coverage_matrix

[[ 0.67197452  0.34883721  0.56981132 ...,  0.7337884   0.79245283
   0.73668639]
 [ 0.65059145  0.41860465  0.55471698 ...,  0.6894198   0.49056604
   0.73964497]
 [ 0.78207461  0.93023256  0.68301887 ...,  0.83959044  0.73584906
   0.74556213]
 ..., 
 [ 0.58826206  0.62790698  0.57735849 ...,  0.47098976  0.73584906
   0.5443787 ]
 [ 0.51319381  0.37209302  0.35471698 ...,  0.49829352  0.58490566
   0.53846154]
 [ 0.39126479  0.44186047  0.40754717 ...,  0.56313993  0.43396226
   0.46153846]]
[[-0.09158981 -1.14850299 -0.52688771 ..., -0.18404207  0.81959379
   0.12059012]
 [-0.24263264 -0.78922156 -0.63833937 ..., -0.48125286 -1.09199522
   0.14197135]
 [ 0.68612013  1.84550898  0.30899972 ...,  0.52469136  0.46117085
   0.1847338 ]
 ..., 
 [-0.68290645  0.28862275 -0.47116188 ..., -1.94444444  0.46117085
  -1.26918965]
 [-1.21316322 -1.02874251 -2.11507384 ..., -1.7615455  -0.49462366
  -1.31195211]
 [-2.07442877 -0.66946108 -1.72499303 ..., -1.32716049 -1.45041816
  -1.86786402]]


#### Get a matrix of the probability that any subject's reads will be for any particular exon

In [90]:
probability_in_an_exon_matrix = [list(coverage_matrix[0])]
for row in coverage_matrix[1:]:
    total_subj_reads = sum(map(int, row[len(base_headers):]))
    new_row = list(row[:len(base_headers)])
    new_row += [float(item) / total_subj_reads for item in row[len(base_headers):]]
    probability_in_an_exon_matrix.append(new_row)
probability_in_an_exon_matrix = np.array(probability_in_an_exon_matrix)
print probability_in_an_exon_matrix

[['id' 'subject' 'specimen' ..., 'Ex77' 'Ex78' 'Ex79']
 ['FCLR-GP01-2121_1-M1-1_HGGF5AFXX-L001' 'FCLR-GP01-2121'
  'FCLR-GP01-2121_1' ..., 0.010921396708198739 0.0018458698661744347
  0.012459621596677434]
 ['FCLR-GP01-2121_1-M1-1_HGGF5AFXX-L002' 'FCLR-GP01-2121'
  'FCLR-GP01-2121_1' ..., 0.011123296255679148 0.0021933260222465924
  0.010496631677894407]
 ..., 
 ['FPWB-0000-952M_1-M1-1_H753VAFXX-L003' 'FPWB-0000-952M'
  'FPWB-0000-952M_1' ..., 0.011693296438453986 0.0016959742926001963
  0.012853699901812015]
 ['FPWB-0000-952M_1-M1-1_H753VAFXX-L002' 'FPWB-0000-952M'
  'FPWB-0000-952M_1' ..., 0.011130434782608696 0.0019130434782608696
  0.01017391304347826]
 ['FPWB-0000-952M_1-M1-1_H753VAFXX-L004' 'FPWB-0000-952M'
  'FPWB-0000-952M_1' ..., 0.011900684931506849 0.0021404109589041095
  0.010787671232876713]]
