In [1]:
# NOTE(review): star import - which names this notebook takes from `config` cannot be seen from here.
from config import *
import vcf
import gpcrdb
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported; the cells below write figures with savefig().
matplotlib.use('Agg')
# pdf.fonttype 42 makes the PDF backend emit Type 42 (TrueType) fonts instead of the default Type 3.
matplotlib.rc('pdf', fonttype=42)
import matplotlib.pyplot as plt
# Use Arial for all figure text.
plt.rcParams['font.family'] = "Arial"
import numpy as np
from utils import VariationType, Segment
import ensembl
import json
from scipy import stats

In [2]:
# Allele-frequency cut-off above which a missense SNV is counted as "frequent".
FREQUENT_AF_THRESHOLD = 0.5


def _read_call_positions(vcf_path):
    """Collect the distinct call sites listed in one VCF file.

    Every line is passed to ``vcf.VariationEntry.load``; lines for which it
    raises ``vcf.NotPassedError`` or ``vcf.BlankLineError`` are skipped.

    Args:
        vcf_path: path of the VCF file to read.

    Returns:
        A set of ``(chromosome, position)`` tuples, one per distinct site.
    """
    positions = set()
    with open(vcf_path) as f:
        # Iterate the handle directly rather than loading the whole file with readlines().
        for line in f:
            try:
                var = vcf.VariationEntry.load(line)
                positions.add((var.chromosome, var.position))
            except (vcf.NotPassedError, vcf.BlankLineError):
                continue
    return positions


# Distinct call sites summed over all receptors: coding sequence only / whole gene.
num_cds, num_gene = 0, 0

# Per-segment SNV counts by consequence, accumulated over all receptors.
seg_missense = {s: 0 for s in Segment}
seg_silent = {s: 0 for s in Segment}
seg_nonsense = {s: 0 for s in Segment}
# Per-segment occurrence counts of each label in `ensembl_entry.segments`, summed over all
# receptors (presumably one label per residue, i.e. a length in amino acids - TODO confirm).
seg_length = {s: 0 for s in Segment}

# Frequent-missense bookkeeping: per-segment SNV counts, plus segment lengths and the
# receptor count restricted to receptors carrying at least one frequent missense SNV.
num_frequent_missense = {s: 0 for s in Segment}
seg_length_with_frequent_missense = {s: 0 for s in Segment}
num_receptor_with_frequent_missense = 0

for receptor in gpcrdb.get_filtered_receptor_list():
    ensembl_entry = ensembl.EnsemblGeneEntry(receptor)

    # Count each segment label once per receptor; reused below for the frequent-missense subset.
    receptor_seg_length = {s: ensembl_entry.segments.count(s) for s in Segment}
    for s in Segment:
        seg_length[s] += receptor_seg_length[s]

    calls_gene = _read_call_positions(receptor.japan_gene_vcf_path)
    calls_cds = _read_call_positions(receptor.japan_cds_vcf_path)

    # Sanity check: every coding-sequence call must also be present in the gene-level file.
    assert calls_cds.issubset(calls_gene), "CDS calls are not a subset of the gene calls"
    num_cds += len(calls_cds)
    num_gene += len(calls_gene)

    has_frequent_missense = False
    with open(receptor.japan_cds_csv_path) as f:
        for line in f:
            try:
                anno = ensembl.Annotation.from_csv_line(line)
                if anno.var_type == VariationType.MISSENSE:
                    seg_missense[anno.segment] += 1

                    if anno.snv.AF > FREQUENT_AF_THRESHOLD:
                        has_frequent_missense = True
                        num_frequent_missense[anno.segment] += 1
                elif anno.var_type == VariationType.SILENT:
                    seg_silent[anno.segment] += 1
                elif anno.var_type == VariationType.NONSENSE:
                    seg_nonsense[anno.segment] += 1
            except ensembl.BlankLineError:
                continue

    if has_frequent_missense:
        num_receptor_with_frequent_missense += 1
        for s in Segment:
            seg_length_with_frequent_missense[s] += receptor_seg_length[s]

In [53]:
# Derive the per-segment points here so the cell works on a fresh kernel
# (Restart & Run All) instead of relying on leftovers from an earlier run of the plotting cell.
# x: segment length averaged over receptors with >= 1 frequent missense SNV
# y: number of frequent missense SNVs in that segment
segs = [s for s in Segment if s not in (Segment.FailedToGuess, Segment.NONE)]
xs = [seg_length_with_frequent_missense[s] / num_receptor_with_frequent_missense for s in segs]
ys = [num_frequent_missense[s] for s in segs]

print(seg_length_with_frequent_missense)
print(list(zip(xs, ys, segs)))

{<Segment.Nterm: 'N-term'>: 23093, <Segment.TM1: 'TM1'>: 2726, <Segment.ICL1: 'ICL1'>: 364, <Segment.TM2: 'TM2'>: 2423, <Segment.ECL1: 'ECL1'>: 443, <Segment.TM3: 'TM3'>: 2966, <Segment.ICL2: 'ICL2'>: 722, <Segment.TM4: 'TM4'>: 2232, <Segment.ECL2: 'ECL2'>: 1593, <Segment.TM5: 'TM5'>: 3076, <Segment.ICL3: 'ICL3'>: 839, <Segment.TM6: 'TM6'>: 2845, <Segment.ECL3: 'ECL3'>: 345, <Segment.TM7: 'TM7'>: 2344, <Segment.ICL4: 'ICL4'>: 1, <Segment.H8: 'H8'>: 1168, <Segment.Cterm: 'C-term'>: 7178, <Segment.NONE: 'None'>: 0, <Segment.FailedToGuess: 'Failed to guess'>: 0}
[(278.2289156626506, 42, <Segment.Nterm: 'N-term'>), (32.8433734939759, 5, <Segment.TM1: 'TM1'>), (4.385542168674699, 4, <Segment.ICL1: 'ICL1'>), (29.19277108433735, 2, <Segment.TM2: 'TM2'>), (5.337349397590361, 0, <Segment.ECL1: 'ECL1'>), (35.734939759036145, 4, <Segment.TM3: 'TM3'>), (8.698795180722891, 0, <Segment.ICL2: 'ICL2'>), (26.89156626506024, 8, <Segment.TM4: 'TM4'>), (19.19277108433735, 6, <Segment.ECL2: 'ECL2'>), (37.0

In [55]:
# Scatter of mean region length against the number of frequent missense SNVs.
# Left panel: all regions. Right panel: zoom on the shaded rectangle of the left panel.
fig, (ax, ax2) = plt.subplots(1, 2, dpi=300, figsize=(7, 2.5))

segs = [s for s in Segment if s not in (Segment.FailedToGuess, Segment.NONE)]
# x: segment length averaged over receptors carrying at least one frequent missense SNV.
xs = [seg_length_with_frequent_missense[s] / num_receptor_with_frequent_missense for s in segs]
# y: number of frequent missense SNVs in the segment.
ys = [num_frequent_missense[s] for s in segs]

# One scatter call per point so each segment gets its own colour and legend entry.
for x, y, s in zip(xs, ys, segs):
    for axis in (ax, ax2):
        axis.scatter(x, y, color=s.color, marker='.', label=s.value)

# Lower-left corner and size (data units) of the zoom window shown in the right panel.
xy = (-5, -1)
w, h = 48, 12
ax.fill_between([xy[0], xy[0] + w], xy[1], xy[1] + h, color='whitesmoke', zorder=-1)
ax2.set_facecolor('whitesmoke')
ax.set_xlabel("Mean region length [AA]")
ax2.set_xlabel("Mean region length [AA]")
ax.set_ylabel("Number of missense SNVs\n(AF > 0.5)")
ax2.set_xlim(xy[0], xy[0] + w)
ax2.set_ylim(xy[1], xy[1] + h)
ax2.legend(bbox_to_anchor=(1.05, 0.5), loc='center left', ncol=2)

# Pearson correlation over all plotted regions (left panel).
res = stats.pearsonr(xs, ys)
ax.text(0.05, 0.95, f"r = {res.statistic:.2f}\np = {res.pvalue:.1e}", color='tab:gray', va='top', ha='left', transform=ax.transAxes)

# Pearson correlation with the segments returned by Segment.terms() excluded (right panel).
# The filter is evaluated once and shared by both coordinate lists.
terms = Segment.terms()
kept = [(x_val, y_val) for x_val, y_val, seg in zip(xs, ys, segs) if seg not in terms]
res = stats.pearsonr([x_val for x_val, _y in kept], [y_val for _x, y_val in kept])
ax2.text(0.05, 0.95, f"r = {res.statistic:.2f}\np = {res.pvalue:.1e}", color='tab:gray', va='top', ha='left', transform=ax2.transAxes)

fig.tight_layout()
fig.savefig("./figures/R1_region-length_vs_number-of-frequent-missenses.pdf")
fig.savefig("./figures/R1_region-length_vs_number-of-frequent-missenses.png")
# No fig.show(): the notebook selects the non-interactive Agg backend, under which
# show() cannot display anything and only emits a warning.
plt.close(fig)

meta NOT subset; don't know how to subset; dropped
  fig.show()


In [20]:
# Report the Pearson correlation between xs and ys at full precision.
res = stats.pearsonr(xs, ys)
r_value, p_value = res.statistic, res.pvalue
print("Pearson correlation coefficient:", r_value, "p-value:", p_value)

Pearson correlation coefficient: 0.9696063776068062 p-value: 1.4033525004848267e-10


In [32]:
# Hypergeometric comparison of nonsense SNVs falling in H8 + C-term against the
# share of summed segment length that H8 + C-term represents.

# Observed nonsense SNV counts: H8 + C-term versus every other segment.
N = sum(seg_nonsense.values())
obs_H8_C = sum(seg_nonsense[seg] for seg in (Segment.H8, Segment.Cterm))
obs_non_H8_C = N - obs_H8_C

# Summed segment lengths: H8 + C-term versus every other segment.
M = sum(seg_length.values())
len_H8_C = sum(seg_length[seg] for seg in (Segment.H8, Segment.Cterm))
len_non_H8_C = M - len_H8_C

# scipy parametrisation: M = population size, n = "success" states in the population,
# N = number of draws, k = observed successes.
n = len_H8_C
k = obs_H8_C

print(M, n, N, k)
# cdf gives the lower tail P(X <= k).
# NOTE(review): an enrichment test would use the upper tail (sf(k - 1)) - confirm which direction is intended.
p_hypergeom = stats.hypergeom.cdf(k, M, n, N)
print("C-term hypergemometric test p-value:", p_hypergeom)

201611 30242 804 142
C-term hypergemometric test p-value: 0.9833479437826564


In [10]:
# Figure 1a/c. Top panel: share of calls in non-coding vs coding regions.
# Bottom panel: one stacked bar per SNV consequence, split by receptor segment.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 6), dpi=300, height_ratios=[1, 5], sharex=True)

# --- Top panel: non-coding vs coding calls, as a percentage of all gene-level calls ---
num_gene_only = num_gene - num_cds
x = num_gene_only / num_gene * 100
ax1.barh(0, x, color='tab:gray', height=0.25, edgecolor='k', lw=0.2, alpha=0.6)
non_coding_text = "Non-coding region\n{:,} calls ({:.1f}%)".format(num_gene_only, x)
ax1.text(x / 2, 0, non_coding_text, ha='center', va='center')

ax1.barh(0, num_cds / num_gene * 100, left=x, height=0.25, edgecolor='k', lw=0.2, color='tab:orange')
coding_text = "Coding region\n{:,} calls\n({:.1f}%)".format(num_cds, num_cds / num_gene * 100)
ax1.text(x + num_cds / num_gene * 100, 0.15, coding_text, ha='center', va='bottom')
ax1.set_axis_off()

# --- Bottom panel: per-consequence segment composition ---
num_missense = sum(seg_missense.values())
num_silent = sum(seg_silent.values())
num_nonsense = sum(seg_nonsense.values())
nums = sum([num_missense, num_silent, num_nonsense])

# Row label -> (fraction of all SNVs, used as the bar height; per-segment counts).
heights_and_segs = {
    "Missense": (num_missense / nums, seg_missense),
    "Silent": (num_silent / nums, seg_silent),
    "Nonsense": (num_nonsense / nums, seg_nonsense)
}

bottom = -1  # lower edge of the first row; later rows are stacked below it
for row, (var_type, (frac, seg_counts)) in enumerate(heights_and_segs.items()):
    is_top_row = row == 0
    if not is_top_row:
        # Step down by this row's own height plus a fixed gap.
        bottom = bottom - frac - 0.05

    left = 0
    total = sum(seg_counts.values())
    non_h8_or_cter = 0  # cumulative width [%] of every segment other than H8 / C-term
    for seg in Segment:
        width = seg_counts.get(seg, 0) / total * 100
        ax2.barh(bottom, width, height=frac, left=left, color=seg.color, edgecolor='k', lw=0.2, align='edge')

        # Name the TM helices and the termini above the top row only.
        label = seg.value
        if is_top_row and (label.startswith('TM') or label.endswith('-term')):
            ax2.text(left + width / 2, bottom + frac, label, ha='center', va='bottom', size=7)
        left += width

        if seg not in (Segment.H8, Segment.Cterm):
            non_h8_or_cter += width

    text = f"{var_type}\n{total:,} SNVs\n({frac * 100:.1f}%)"
    ax2.text(110, bottom + frac / 2, text, ha='center', va='center', multialignment='center')

    # Under the nonsense row, bracket the share of SNVs located outside H8 / C-term.
    if var_type == "Nonsense":
        ax2.plot([0, non_h8_or_cter], [bottom - 0.03] * 2, lw=1, color='tab:gray')
        ax2.text(non_h8_or_cter / 2, bottom - 0.04, f"{non_h8_or_cter:.1f}%", ha='center', va='top', size=8, color='tab:gray')

ax2.spines['right'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.set_yticks([])
ax2.set_xlim(0, 100)
ax2.set_xlabel("Calls / SNVs [%]")

fig.tight_layout()
fig.savefig("./figures/1ac_variations.pdf")
# Release the 300-dpi figure; pyplot keeps figures registered until they are closed.
plt.close(fig)

meta NOT subset; don't know how to subset; dropped


In [17]:
# Compact, label-free version of the variation summary.
# Left: non-coding vs coding calls. Right: missense / silent / nonsense shares.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 1), dpi=300, sharex=True)

# --- Left bar: percentage of gene-level calls outside / inside the coding sequence ---
num_gene_only = num_gene - num_cds
x = num_gene_only / num_gene * 100
ax1.barh(0, x, color='tab:gray', height=0.25, edgecolor='k', lw=0.2, alpha=0.6)

ax1.barh(0, num_cds / num_gene * 100, left=x, height=0.25, edgecolor='k', lw=0.2, color='tab:orange')
ax1.set_axis_off()

# --- Right bar: percentage of coding SNVs per consequence, stacked left to right ---
num_missense = sum(seg_missense.values())
num_silent = sum(seg_silent.values())
num_nonsense = sum(seg_nonsense.values())
nums = sum([num_missense, num_silent, num_nonsense])

# (count, bar styling) in drawing order: missense, silent, nonsense.
stacked_parts = (
    (num_missense, {'color': 'tab:orange'}),
    (num_silent, {'color': 'tab:gray', 'alpha': 0.6}),
    (num_nonsense, {'color': 'tab:gray'}),
)
left = 0
for count, bar_style in stacked_parts:
    width = count / nums * 100
    ax2.barh(0, width, left=left, height=0.25, edgecolor='k', lw=0.2, **bar_style)
    left += width

ax2.set_axis_off()

fig.tight_layout()
fig.savefig("./figures/PR1_variations.pdf")
# Release the figure; pyplot keeps figures registered until they are closed.
plt.close(fig)