In [255]:
import altair as alt
import itertools
import pandas as pd
import numpy as np
import yaml
import os
import math

# Lift Altair's default cap on the number of rows a chart may embed, so the
# larger tables loaded in this notebook can be passed straight to alt.Chart.
alt.data_transformers.disable_max_rows()

from vega_datasets import data

# config.yaml is read once; its entries are used below as the paths of the
# tab-separated score tables (presumably one entry per predictor).
with open('config.yaml') as f:
    config = yaml.safe_load(f)

# REVEL table. read_table splits on tabs by default, so no explicit
# separator is needed.
revel = pd.read_table(config['revel'])

revel


Unnamed: 0,chr,Pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid,Codon,Site,Ref_Codon,codon_site,Alt_Codon,Amino_Acid_Ref,Amino_Acid_Alt,AA_Substitution
0,22,23787170,A,C,M,L,0.575,ENST00000417137;ENST00000344921;ENST0000026312...,1,1,ATG,1,CTG,M,L,M1L
1,22,23787170,A,G,M,V,0.613,ENST00000417137;ENST00000344921;ENST0000026312...,1,1,ATG,1,GTG,M,V,M1V
2,22,23787170,A,T,M,L,0.565,ENST00000417137;ENST00000344921;ENST0000026312...,1,1,ATG,1,TTG,M,L,M1L
3,22,23787171,T,A,M,K,0.602,ENST00000417137;ENST00000344921;ENST0000026312...,1,2,ATG,2,AAG,M,K,M1K
4,22,23787171,T,C,M,T,0.586,ENST00000417137;ENST00000344921;ENST0000026312...,1,2,ATG,2,ACG,M,T,M1T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,22,23834175,T,G,W,G,0.853,ENST00000344921;ENST00000263121;ENST0000040742...,385,1153,TGG,1,GGG,W,G,W385G
2624,22,23834176,G,C,W,S,0.778,ENST00000344921;ENST00000263121;ENST0000040742...,385,1154,TGG,2,TCG,W,S,W385S
2625,22,23834176,G,T,W,L,0.738,ENST00000344921;ENST00000263121;ENST0000040742...,385,1154,TGG,2,TTG,W,L,W385L
2626,22,23834177,G,C,W,C,0.795,ENST00000344921;ENST00000263121;ENST0000040742...,385,1155,TGG,3,TGC,W,C,W385C


In [256]:
# Columns carried forward from the REVEL table: original name -> label used
# by the heatmap (axis fields and tooltips).
revel_to_keep = {
    'REVEL': 'REVEL',
    'Codon': 'Residue',
    'Amino_Acid_Ref': 'Wildtype_Amino_Acid',
    'AA_Substitution': 'Substitution',
    'Amino_Acid_Alt': 'Amino Acid',
}

# Keep only those columns (in dict order) and apply the new labels.
# No merge happens here; this is a pure select + rename.
revel_final = revel[list(revel_to_keep)].rename(columns=revel_to_keep)

revel_final



Unnamed: 0,REVEL,Residue,Wildtype_Amino_Acid,Substitution,Amino Acid
0,0.575,1,M,M1L,L
1,0.613,1,M,M1V,V
2,0.565,1,M,M1L,L
3,0.602,1,M,M1K,K
4,0.586,1,M,M1T,T
...,...,...,...,...,...
2623,0.853,385,W,W385G,G
2624,0.778,385,W,W385S,S
2625,0.738,385,W,W385L,L
2626,0.795,385,W,W385C,C


In [257]:
# Colour-scale limits for the REVEL heatmap. DMS_heatmaps reads these
# module-level names when it is called, so they must be set before the call.
minimum_domain, maximum_domain = 0, 1

In [258]:
def DMS_heatmaps(data, metric, domain=None, domain_mid=0.5, selector=None):
    """Create an interactive residue-by-amino-acid heatmap for one score.

    The chart is a single ``mark_rect`` layer with one cell per row of
    `data`, positioned by 'Residue' (x) and 'Amino Acid' (y) and coloured
    by `metric` on a reversed "redblue" diverging scale. Values outside the
    colour domain are clamped. No wildtype markers or grey "null" cells are
    drawn.

    Parameters
    ----------
    data : pandas.DataFrame
        Main dataframe. Must contain the columns 'Residue', 'Amino Acid'
        and `metric`; every column except 'Residue' (and 'wildtype_code',
        if present) is shown in the tooltip.
    metric : str
        Column in `data` with values to color by. Underscores are replaced
        by spaces to build the chart title.
    domain : sequence of two numbers, optional
        ``[min, max]`` of the colour scale. When omitted, the notebook-level
        ``minimum_domain`` and ``maximum_domain`` are used, looked up at
        call time.
    domain_mid : float, optional
        Midpoint of the diverging colour scale. Defaults to 0.5.
    selector : altair selection parameter, optional
        Parameter that decides which cell gets a black outline. When
        omitted, the notebook-level ``cell_selector`` is used.

    Returns
    -------
    altair.Chart
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave exactly as before.
    if domain is None:
        domain = [minimum_domain, maximum_domain]
    if selector is None:
        selector = cell_selector

    aa_order = ['Avg', 'D', 'P', 'N', 'R', 'K', 'Y', 'E', 'T', 'H', 'G',
                'S', 'Q', 'V', 'I', 'L', 'C', 'F', 'W', 'A', 'M']
    hidden_in_tooltip = {'Residue', 'wildtype_code'}
    tooltips = [c for c in data.columns if c not in hidden_in_tooltip]

    # everything is site v mutant
    base = alt.Chart(data).encode(
        x=alt.X('Residue:O',
                axis=alt.Axis(values=[0, 50, 100, 150, 200, 250, 300, 350])),
        y=alt.Y('Amino Acid:O',
                sort=aa_order,
                axis=alt.Axis(labelFontSize=12, titleFontSize=15)),
    )

    colour = alt.Color(metric).scale(scheme="redblue",
                                     domain=domain,
                                     domainMid=domain_mid,
                                     clamp=True,
                                     reverse=True)
    heatmap = (
        base
        .mark_rect()
        .encode(color=colour,
                stroke=alt.value('black'),
                # outline only the cell picked by the selection
                strokeWidth=alt.condition(selector,
                                          alt.value(0.5),
                                          alt.value(0)),
                tooltip=tooltips)
        .properties(width=1000)
    )

    return (
        heatmap
        .interactive()
        .add_params(selector)  # mouse over highlighting
        .properties(height=250, title=metric.replace('_', ' '))
    )


In [259]:
# Hover selection used by DMS_heatmaps to outline the cell under the pointer.
# empty=False means nothing is outlined until a cell is hovered; the boolean
# replaces the deprecated string form empty='none', which raises a
# deprecation warning in Altair 5.
cell_selector = alt.selection_point(on='mouseover', empty=False)

revel_heatmap = DMS_heatmaps(revel_final, 'REVEL')

revel_heatmap

  stacklevel=1,


In [260]:
# CADD table: tab-separated file whose path is the 'cadd' config entry.
# read_table splits on tabs by default.
cadd = pd.read_table(config['cadd'])

cadd

Unnamed: 0,#Chrom,Pos,Ref,Alt,RawScore,PHRED,Site,Codon,Ref_Codon,codon_site,Alt_Codon,Amino_Acid_Ref,Amino_Acid_Alt,AA_Substitution
0,22,23787170,A,C,3.008461,21.10,1,1,ATG,1,CTG,M,L,M1L
1,22,23787170,A,G,3.538106,22.80,1,1,ATG,1,GTG,M,V,M1V
2,22,23787170,A,T,3.023243,21.20,1,1,ATG,1,TTG,M,L,M1L
3,22,23787171,T,A,3.813093,23.60,2,1,ATG,2,AAG,M,K,M1K
4,22,23787171,T,C,3.647749,23.10,2,1,ATG,2,ACG,M,T,M1T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3469,22,23834179,A,G,1.056781,10.96,1157,386,TAA,2,TGA,Stop,Stop,Stop386Stop
3470,22,23834179,A,T,1.506653,14.22,1157,386,TAA,2,TTA,Stop,L,Stop386L
3471,22,23834180,A,C,1.824643,15.92,1158,386,TAA,3,TAC,Stop,Y,Stop386Y
3472,22,23834180,A,G,1.286353,12.81,1158,386,TAA,3,TAG,Stop,Stop,Stop386Stop


In [261]:
# Columns carried forward from the CADD table: original name -> label used
# by the heatmap (axis fields and tooltips).
cadd_to_keep = {
    'PHRED': 'PHRED',
    'Codon': 'Residue',
    'Amino_Acid_Ref': 'Wildtype_Amino_Acid',
    'AA_Substitution': 'Substitution',
    'Amino_Acid_Alt': 'Amino Acid',
}

# Keep only those columns (in dict order) and apply the new labels.
# No merge happens here; this is a pure select + rename.
cadd_final = cadd[list(cadd_to_keep)].rename(columns=cadd_to_keep)

cadd_final


Unnamed: 0,PHRED,Residue,Wildtype_Amino_Acid,Substitution,Amino Acid
0,21.10,1,M,M1L,L
1,22.80,1,M,M1V,V
2,21.20,1,M,M1L,L
3,23.60,1,M,M1K,K
4,23.10,1,M,M1T,T
...,...,...,...,...,...
3469,10.96,386,Stop,Stop386Stop,Stop
3470,14.22,386,Stop,Stop386L,L
3471,15.92,386,Stop,Stop386Y,Y
3472,12.81,386,Stop,Stop386Stop,Stop


In [262]:
# Colour-scale limits for the CADD PHRED heatmap. DMS_heatmaps reads these
# module-level names when it is called, so they must be set before the call.
minimum_domain, maximum_domain = 0, 45

In [263]:
def DMS_heatmaps(data, metric, domain=None, domain_mid=17.5, selector=None):
    """Create an interactive residue-by-amino-acid heatmap for one score.

    The chart is a single ``mark_rect`` layer with one cell per row of
    `data`, positioned by 'Residue' (x) and 'Amino Acid' (y) and coloured
    by `metric` on a reversed "redblue" diverging scale. Values outside the
    colour domain are clamped. No wildtype markers or grey "null" cells are
    drawn.

    Parameters
    ----------
    data : pandas.DataFrame
        Main dataframe. Must contain the columns 'Residue', 'Amino Acid'
        and `metric`; every column except 'Residue' is shown in the
        tooltip.
    metric : str
        Column in `data` with values to color by. Underscores are replaced
        by spaces to build the chart title.
    domain : sequence of two numbers, optional
        ``[min, max]`` of the colour scale. When omitted, the notebook-level
        ``minimum_domain`` and ``maximum_domain`` are used, looked up at
        call time.
    domain_mid : float, optional
        Midpoint of the diverging colour scale. Defaults to 17.5, the value
        this definition previously hard-coded.
    selector : altair selection parameter, optional
        Parameter that decides which cell gets a black outline. When
        omitted, the notebook-level ``cell_selector`` is used.

    Returns
    -------
    altair.Chart
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave exactly as before.
    if domain is None:
        domain = [minimum_domain, maximum_domain]
    if selector is None:
        selector = cell_selector

    aa_order = ['Avg', 'D', 'P', 'N', 'R', 'K', 'Y', 'E', 'T', 'H', 'G',
                'S', 'Q', 'V', 'I', 'L', 'C', 'F', 'W', 'A', 'M']
    hidden_in_tooltip = {'Residue'}
    tooltips = [c for c in data.columns if c not in hidden_in_tooltip]

    # everything is site v mutant
    base = alt.Chart(data).encode(
        x=alt.X('Residue:O',
                axis=alt.Axis(values=[0, 50, 100, 150, 200, 250, 300, 350])),
        y=alt.Y('Amino Acid:O',
                sort=aa_order,
                axis=alt.Axis(labelFontSize=12, titleFontSize=15)),
    )

    colour = alt.Color(metric).scale(scheme="redblue",
                                     domain=domain,
                                     domainMid=domain_mid,
                                     clamp=True,
                                     reverse=True)
    heatmap = (
        base
        .mark_rect()
        .encode(color=colour,
                stroke=alt.value('black'),
                # outline only the cell picked by the selection
                strokeWidth=alt.condition(selector,
                                          alt.value(0.5),
                                          alt.value(0)),
                tooltip=tooltips)
        .properties(width=1000)
    )

    return (
        heatmap
        .interactive()
        .add_params(selector)  # mouse over highlighting
        .properties(height=250, title=metric.replace('_', ' '))
    )


In [264]:


# Hover selection used by DMS_heatmaps to outline the cell under the pointer.
# empty=False means nothing is outlined until a cell is hovered; the boolean
# replaces the deprecated string form empty='none', which raises a
# deprecation warning in Altair 5.
cell_selector = alt.selection_point(on='mouseover', empty=False)

cadd_heatmap = DMS_heatmaps(cadd_final, 'PHRED')

cadd_heatmap

  stacklevel=1,


In [265]:
# AlphaMissense table: tab-separated file whose path is the 'alphamissense'
# config entry. read_table splits on tabs by default.
alphamissense = pd.read_table(config['alphamissense'])

alphamissense

Unnamed: 0,uniprot_id,protein_variant,am_pathogenicity,am_class,Residue,Amino_Acid_Alt
0,Q12824,M1A,0.1541,benign,1,A
1,Q12824,M1C,0.1262,benign,1,C
2,Q12824,M1D,0.5555,ambiguous,1,D
3,Q12824,M1E,0.3386,benign,1,E
4,Q12824,M1F,0.1195,benign,1,F
...,...,...,...,...,...,...
7310,Q12824,W385R,0.9817,pathogenic,385,R
7311,Q12824,W385S,0.6411,pathogenic,385,S
7312,Q12824,W385T,0.7140,pathogenic,385,T
7313,Q12824,W385V,0.6735,pathogenic,385,V


In [266]:
# Columns carried forward from the AlphaMissense table: original name ->
# label used by the heatmap (axis fields and tooltips).
alphamissense_to_keep = {
    'am_pathogenicity': 'Score',
    'Residue': 'Residue',
    'protein_variant': 'Substitution',
    'Amino_Acid_Alt': 'Amino Acid',
}

# Keep only those columns (in dict order) and apply the new labels.
# No merge happens here; this is a pure select + rename.
alphamissense_final = (
    alphamissense[list(alphamissense_to_keep)]
    .rename(columns=alphamissense_to_keep)
)

alphamissense_final


Unnamed: 0,Score,Residue,Substitution,Amino Acid
0,0.1541,1,M1A,A
1,0.1262,1,M1C,C
2,0.5555,1,M1D,D
3,0.3386,1,M1E,E
4,0.1195,1,M1F,F
...,...,...,...,...
7310,0.9817,385,W385R,R
7311,0.6411,385,W385S,S
7312,0.7140,385,W385T,T
7313,0.6735,385,W385V,V


In [267]:
# Colour-scale limits for the AlphaMissense heatmap. DMS_heatmaps reads these
# module-level names when it is called, so they must be set before the call.
minimum_domain, maximum_domain = 0, 1

In [268]:
def DMS_heatmaps(data, metric, domain=None, domain_mid=0.5, selector=None):
    """Create an interactive residue-by-amino-acid heatmap for one score.

    The chart is a single ``mark_rect`` layer with one cell per row of
    `data`, positioned by 'Residue' (x) and 'Amino Acid' (y) and coloured
    by `metric` on a reversed "redblue" diverging scale. Values outside the
    colour domain are clamped. No wildtype markers or grey "null" cells are
    drawn.

    Parameters
    ----------
    data : pandas.DataFrame
        Main dataframe. Must contain the columns 'Residue', 'Amino Acid'
        and `metric`; every column except 'Residue' is shown in the
        tooltip.
    metric : str
        Column in `data` with values to color by. Underscores are replaced
        by spaces to build the chart title.
    domain : sequence of two numbers, optional
        ``[min, max]`` of the colour scale. When omitted, the notebook-level
        ``minimum_domain`` and ``maximum_domain`` are used, looked up at
        call time.
    domain_mid : float, optional
        Midpoint of the diverging colour scale. Defaults to 0.5.
    selector : altair selection parameter, optional
        Parameter that decides which cell gets a black outline. When
        omitted, the notebook-level ``cell_selector`` is used.

    Returns
    -------
    altair.Chart
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave exactly as before.
    if domain is None:
        domain = [minimum_domain, maximum_domain]
    if selector is None:
        selector = cell_selector

    aa_order = ['Avg', 'D', 'P', 'N', 'R', 'K', 'Y', 'E', 'T', 'H', 'G',
                'S', 'Q', 'V', 'I', 'L', 'C', 'F', 'W', 'A', 'M']
    hidden_in_tooltip = {'Residue'}
    tooltips = [c for c in data.columns if c not in hidden_in_tooltip]

    # everything is site v mutant
    base = alt.Chart(data).encode(
        x=alt.X('Residue:O',
                axis=alt.Axis(values=[0, 50, 100, 150, 200, 250, 300, 350])),
        y=alt.Y('Amino Acid:O',
                sort=aa_order,
                axis=alt.Axis(labelFontSize=12, titleFontSize=15)),
    )

    colour = alt.Color(metric).scale(scheme="redblue",
                                     domain=domain,
                                     domainMid=domain_mid,
                                     clamp=True,
                                     reverse=True)
    heatmap = (
        base
        .mark_rect()
        .encode(color=colour,
                stroke=alt.value('black'),
                # outline only the cell picked by the selection
                strokeWidth=alt.condition(selector,
                                          alt.value(0.5),
                                          alt.value(0)),
                tooltip=tooltips)
        .properties(width=1000)
    )

    return (
        heatmap
        .interactive()
        .add_params(selector)  # mouse over highlighting
        .properties(height=250, title=metric.replace('_', ' '))
    )

In [269]:


# Hover selection used by DMS_heatmaps to outline the cell under the pointer.
# empty=False means nothing is outlined until a cell is hovered; the boolean
# replaces the deprecated string form empty='none', which raises a
# deprecation warning in Altair 5.
cell_selector = alt.selection_point(on='mouseover', empty=False)

alphamissense_heatmap = DMS_heatmaps(alphamissense_final, 'Score')

alphamissense_heatmap

  stacklevel=1,


In [270]:
# Show the column dtypes of the AlphaMissense frame that feeds the heatmap.
print(alphamissense_final.dtypes)

Score           float64
Residue           int64
Substitution     object
Amino Acid       object
dtype: object
