In [2]:
from genepeeks.common import utilities as util
import pysam, os
import tabletext
import numpy as np

INFO:config:Using ini file set by environment: /home/ec2-user/config/genepeeks.ini


#### Get the DMD exons from the `gene` collection in MongoDB using the GenePeeks science utilities

In [4]:
# Fetch the 'ensembl' field of the DMD document from the 'gene' collection
# (prod database). Presumably returns a nested dict -- confirm against
# util.Mongo.get_collection_data.
DMD_ensembl = util.Mongo.get_collection_data(
    'gene',
    wanted_db='prod',
    query={'_id': 'DMD'},
    find_one=True,
    single_field='ensembl',
)

# Follow the key path down to the exon list used by the cells below.
DMD_exons = util.get_nested_value(
    DMD_ensembl,
    ('is_primary', 'transcripts', 'is_primary', 'exons'),
)

Finished loading ensembl within the gene collection from the genepeeks-prod database with a single entries with a query of {'_id': 'DMD'} after 0.07 sec


#### Get each subject's coverage across all 79 exons

In [30]:
# Inputs for the coverage scan, named so they can be tuned in one place.
SUBJECTS_DIR = '/mnt/vep/subjects'   # directory tree searched for *.bam files
REGION_CONTIG = 'X'                  # region passed to bamfile.fetch();
REGION_START = 31137345              # presumably spans the DMD locus --
REGION_END = 33229636                # TODO confirm the genome build
REQUIRED_MAPQ = 60                   # only reads with exactly this MAPQ are counted

# One row per subject (BAM file), one column per entry of DMD_exons.
coverage_matrix = []
subjects = []
for root, dirs, files in os.walk(SUBJECTS_DIR):
    for file_name in files:
        if not file_name.endswith('.bam'):
            continue

        # Initialize all exons to coverage of 0 for each subject
        subject_coverage = [0] * len(DMD_exons)
        file_path = os.path.join(root, file_name)
        subject = os.path.splitext(file_name)[0]

        bamfile = pysam.AlignmentFile(file_path, "rb")
        try:
            for read in bamfile.fetch(REGION_CONTIG, start=REGION_START, end=REGION_END):
                if read.is_unmapped or read.mapping_quality != REQUIRED_MAPQ:
                    continue
                # A read is attributed to an exon by its start position only.
                pos = read.reference_start
                # Find what exon each position falls in, and increase that exon's coverage by 1
                interval_info = util.in_interval(pos, DMD_exons, get_interval=True)
                if interval_info[0]:
                    exon_num = interval_info[1]
                    subject_coverage[exon_num] += 1
        finally:
            # Always release the BAM handle, even if reading fails part-way,
            # so a large cohort does not exhaust open file descriptors.
            bamfile.close()

        coverage_matrix.append(subject_coverage)
        subjects.append(subject)

coverage_matrix = np.array(coverage_matrix)
print(coverage_matrix)

[[ 928    9   96 ...,  149   28  147]
 [ 651   12   80 ...,  108   22  109]
 [ 790   12   95 ...,  147   22  103]
 ..., 
 [ 650    8   83 ...,  123   19   85]
 [ 889    6  112 ...,  130   23  154]
 [1166   17  110 ...,  155   21  165]]


#### Print each exon's mean, std, min, and max coverage, and turn the stats into an array

In [24]:
# Per-exon summary table: header row followed by one row per exon.
exon_cov_stats = [['Exon', 'Mean', 'Std', 'Min', 'Max']]
exon_max_cov = []  # never filled in this cell; kept in case other cells reference it

# Take the exon count from the matrix itself instead of hard-coding 79, so the
# table always matches the number of columns actually collected.
num_exons = coverage_matrix.shape[1]
for exon in range(num_exons):
    exon_coverage = coverage_matrix[:, exon]
    exon_cov_stats.append([
        exon + 1,  # 1-based exon number for display
        round(np.mean(exon_coverage), 2),
        round(np.std(exon_coverage), 2),
        min(exon_coverage),
        max(exon_coverage),
    ])

# Numeric copy of the table without the header row.
# Columns: exon number, mean, std, min, max.
exon_stats_array = np.array(exon_cov_stats[1:])
print(tabletext.to_text(exon_cov_stats))

┌──────┬────────┬────────┬─────┬──────┐
│ Exon │ Mean   │ Std    │ Min │ Max  │
├──────┼────────┼────────┼─────┼──────┤
│    1 │ 827.43 │ 188.44 │ 578 │ 1259 │
├──────┼────────┼────────┼─────┼──────┤
│    2 │  11.36 │   4.05 │   4 │   17 │
├──────┼────────┼────────┼─────┼──────┤
│    3 │   94.5 │  18.75 │  58 │  128 │
├──────┼────────┼────────┼─────┼──────┤
│    4 │  95.93 │  16.97 │  59 │  124 │
├──────┼────────┼────────┼─────┼──────┤
│    5 │ 127.86 │  29.88 │  77 │  192 │
├──────┼────────┼────────┼─────┼──────┤
│    6 │ 137.21 │  26.63 │  88 │  180 │
├──────┼────────┼────────┼─────┼──────┤
│    7 │ 158.21 │   34.7 │ 108 │  219 │
├──────┼────────┼────────┼─────┼──────┤
│    8 │ 115.36 │  18.09 │  84 │  141 │
├──────┼────────┼────────┼─────┼──────┤
│    9 │   40.0 │   9.57 │  31 │   69 │
├──────┼────────┼────────┼─────┼──────┤
│   10 │ 130.43 │  34.65 │  69 │  179 │
├──────┼────────┼────────┼─────┼──────┤
│   11 │  86.79 │  14.69 │  67 │  120 │
├──────┼────────┼────────┼─────┼──────┤


#### Normalize each subject's coverage on each exon: first as a fraction of that exon's maximum coverage, then as a z-score relative to that exon's mean and std

In [31]:
# Compute the per-exon statistics directly from coverage_matrix instead of
# reading them back from the display table, whose mean/std were rounded to two
# decimals. Casting to float first keeps the divisions below true divisions
# under Python 2 as well.
coverage_float = coverage_matrix.astype(float)

# Each exon column scaled by that exon's maximum coverage across subjects.
exon_max = coverage_float.max(axis=0)
coverage_matrix_vs_max_exon = coverage_float / exon_max
print(coverage_matrix_vs_max_exon)

# Per-exon z-score: (coverage - exon mean) / exon std. np.std's default
# (population std, ddof=0) matches what the stats table reports.
# NOTE: an exon with identical coverage in every subject has std 0 and yields nan.
exon_mean = coverage_float.mean(axis=0)
exon_std = coverage_float.std(axis=0)
normalized_coverage_matrix = (coverage_float - exon_mean) / exon_std
print(normalized_coverage_matrix)

[[ 0.73709293  0.52941176  0.75       ...,  0.96129032  1.          0.79891304]
 [ 0.51707705  0.70588235  0.625      ...,  0.69677419  0.78571429
   0.5923913 ]
 [ 0.62748213  0.70588235  0.7421875  ...,  0.9483871   0.78571429
   0.55978261]
 ..., 
 [ 0.51628276  0.47058824  0.6484375  ...,  0.79354839  0.67857143
   0.46195652]
 [ 0.70611597  0.35294118  0.875      ...,  0.83870968  0.82142857
   0.83695652]
 [ 0.92613185  1.          0.859375   ...,  1.          0.75        0.89673913]]
[[ 0.53369773 -0.58271605  0.08       ...,  1.17672648  1.75824176
   0.60651629]
 [-0.93626619  0.15802469 -0.77333333 ..., -1.05274606  0.43956044
  -0.5839599 ]
 [-0.19863086  0.15802469  0.02666667 ...,  1.06797172  0.43956044
  -0.77192982]
 ..., 
 [-0.94157291 -0.82962963 -0.61333333 ..., -0.23708537 -0.21978022
  -1.3358396 ]
 [ 0.3267353  -1.32345679  0.93333333 ...,  0.14355628  0.65934066
   0.82581454]
 [ 1.79669921  1.39259259  0.82666667 ...,  1.50299076  0.21978022
   1.17042607]]


#### Get summary stats for each subject across all exons

In [39]:
# Per-subject summary table: mean/min/max of the max-scaled coverage, followed
# by mean/min/max of the z-scored coverage, each rounded to 3 decimals.
subject_normalized_stats = [['Subject', 'Mean', 'Min', 'Max', 'Norm-Mean', 'Norm-Min', 'Norm-Max']]
for idx, subject_id in enumerate(subjects):
    row = [subject_id]
    # Same three statistics for both matrices, in table-column order.
    for matrix in (coverage_matrix_vs_max_exon, normalized_coverage_matrix):
        values = matrix[idx]
        row.append(round(np.mean(values), 3))
        row.append(round(min(values), 3))
        row.append(round(max(values), 3))
    subject_normalized_stats.append(row)
print(tabletext.to_text(subject_normalized_stats))

┌────────────────┬───────┬───────┬───────┬───────────┬──────────┬──────────┐
│ Subject        │ Mean  │ Min   │ Max   │ Norm-Mean │ Norm-Min │ Norm-Max │
├────────────────┼───────┼───────┼───────┼───────────┼──────────┼──────────┤
│ MRMR-0011-266B │ 0.863 │ 0.507 │   1.0 │     0.855 │   -0.644 │    2.453 │
├────────────────┼───────┼───────┼───────┼───────────┼──────────┼──────────┤
│ MRMR-0080-829K │ 0.656 │ 0.404 │ 0.963 │    -0.569 │   -1.968 │    1.142 │
├────────────────┼───────┼───────┼───────┼───────────┼──────────┼──────────┤
│ MRMR-0040-6444 │ 0.769 │ 0.522 │   1.0 │     0.195 │   -1.085 │    1.789 │
├────────────────┼───────┼───────┼───────┼───────────┼──────────┼──────────┤
│ MRMR-0080-518T │ 0.717 │ 0.388 │ 0.993 │    -0.139 │   -2.228 │    1.741 │
├────────────────┼───────┼───────┼───────┼───────────┼──────────┼──────────┤
│ MRMR-0052-1169 │ 0.894 │ 0.622 │   1.0 │     1.013 │   -1.146 │    2.327 │
├────────────────┼───────┼───────┼───────┼───────────┼──────────┼──────────┤

In [40]:
# IPython shell escape: print the notebook kernel's current working directory.
! pwd

/home/ec2-user
