For those using google colab, uncomment the lines in the next two cells below then head to the
visualization section

In [2]:
!git clone https://github.com/wesleymsmith/Piezo_PIP2_binding_analysis.git
!pip install bokeh
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import scipy as sp
from scipy import stats

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import os
import sys
import gc
import copy
import glob

import tqdm
import itertools

fatal: destination path 'Piezo_PIP2_binding_analysis' already exists and is not an empty directory.


In [0]:
#baseDir="./"
baseDir="Piezo_PIP2_binding_analysis/"

In [4]:
cg_occupancy_data_files=glob.glob(baseDir+"Coarse_Grain_Occupancy_Data.chunk*")
print "first cg data frame: '%s'"%np.sort(cg_occupancy_data_files)[0]

first cg data frame: 'Piezo_PIP2_binding_analysis/Coarse_Grain_Occupancy_Data.chunk.aa'


In [16]:
cg_frames=[]
for iFile,cg_file in tqdm.tqdm_notebook(enumerate(np.sort(cg_occupancy_data_files))):
    if iFile==0:
        cg_frames.append(pd.read_csv(cg_file))
        colNames=cg_frames[0].columns
    else:
        cg_frames.append(pd.read_csv(cg_file,names=colNames))
cg_occupancy_data=pd.concat(cg_frames)
cg_frames=[]
gc.collect()
aa_occupancy_data=pd.read_csv(baseDir+"All_Atom_long_Occupancy_Data.csv")

print cg_occupancy_data.head()
print aa_occupancy_data.head()

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


   ResID  SeqID  Frame    Time  Occupancy
0   4236   2528      0     0.0          0
1   4236   2528      1  1000.0          0
2   4236   2528      2  2000.0          0
3   4236   2528      3  3000.0          0
4   4236   2528      4  4000.0          0
   ResID  SeqID  Frame  Time  Occupancy
0   1233   2361      0   0.0          0
1   1233   2361      1   1.0          0
2   1233   2361      2   2.0          0
3   1233   2361      3   3.0          0
4   1233   2361      4   4.0          0


Now we can extract residence times from this occupancy by first finding
all 'runs' within the occupancy series of each residue.

This can be accomplished using itertools.groupby to obtain the lengths of each continguous interval where occupancy was non-zero. Since these are discrete integer values, we can easily use the function 'unique' to bin them into a histogram like form by setting the 'return_counts' option to 'True'

We can then plot the resulting distribution...
It seems to look quite exponential like, so it would make sense to try and fit either a geometric (if we think of each frame like an individual 'trial') or an exponential (if want to think of each run's length as a 'wait time')

The exponential distribution with a mean (or characteristic length, $\lambda$) is given by:
$$p_{exp}(x,\lambda)=\frac{e^{-x/\lambda}}{\lambda}$$


To fit an exponential distribution to this data we first compute the characteristic length ($\gamma$) as the mean ($\bar{X}$) of the observed residence times ($X_i$)

It is possible, however, that the distribution is not perfectly exponential. One common alternative would be to use a gamma distribution instead.

$$p_\Gamma(x,\theta,k)=\frac{x^{k-1}e^{-\frac{x}{\theta}}}{\Gamma(k)\theta^k}$$

where $\Gamma(k)$ is the gamma function, which is essentially an extrapolation of the factorial onto real numbers.

For the gamma distribution, we can use closed form estimators based on a generalized gamma distribution:

$$\theta=\frac{1}{N^2}\left(N\sum\limits_{i=1}^{N}x_i\ln(x_i)-\sum\limits_{i=1}^{N}\ln(x_i)\sum\limits_{i=1}^{N}x_i\right)$$

$$k=\frac{N\sum\limits_{i=1}^{N}x_i}{
    N\sum\limits_{i=1}^{N}x_i\ln(x_i)-\sum\limits_{i=1}^{N}\ln(x_i)\sum\limits_{i=1}^{N}x_i}$$
    
We can also correct for bias:

$$\tilde{\theta}=\frac{N}{N-1}\theta$$

$$\tilde{k}=k-\frac{1}{N}\left(
    3k-
    \frac{2}{3}\left(\frac{k}{1+k}\right)-
    \frac{4}{5}\frac{k}{(1+k)^2}
  \right)$$
  
for reference: [Wikipedia Gamma Distribution Closed Form Estimators](https://en.wikipedia.org/wiki/Gamma_distribution#Closed-form_estimators)

In [0]:
def extract_runs(x):
    return [len(list(gg)) for kk,gg in itertools.groupby(x, bool) if kk]

def extract_resDist(x):
    return(np.unique(extract_runs(x),return_counts=True))

def bin_runs(x,binWidth=1.000,frameRate=1.,center='right'):
    init_dist=extract_resDist(x)
    x_dist=(init_dist[0]*frameRate,init_dist[1])
    binMax=np.max(x_dist[0])
    nBins=np.ceil(binMax/binWidth)+1
    hbins=np.arange(nBins)*binWidth
    temp_dist=np.histogram(x_dist[0],weights=x_dist[1],bins=hbins)
    outDat=[temp_dist[1],temp_dist[0]]
    if center=='midpoint':
        outDat[0]=(temp_dist[1][1:]+temp_dist[1][:-1])/2.
    elif center=="left":
        outDat[0]=temp_dist[1][:-1]
    elif center=="right":
        outDat[0]=temp_dist[1][1:]
    return(outDat)

def expDist(x,l):
    return np.exp(-x/l)/l

def frequencyDistribution_mle_exp_params(bin_dist,bias_correction=True):
    Nv=np.sum(bin_dist[1])
    if bias_correction & (Nv>2):
        Nv=Nv-2
    return (np.sum(bin_dist[0]*bin_dist[1])/(1.*Nv))

def beta_dist(x,a,b):
    return (x**(a-1.)*(1.-x)**(b-1.))/\
        (sp.special.gamma(a)*sp.special.gamma(b)/sp.special.gamma(a+b))
def gamma_dist(x,t,k):
    return 1/(sp.special.gamma(k)*(t**k))*x**(k-1.)*np.exp(-x/t)

def frequencyDistribution_mle_gamma_params(bin_dist,bias_correction=True):
    xi,fi=bin_dist
    Nv=np.sum(fi)
    sum_f=np.sum(fi*xi)
    sum_lnf=np.sum(fi*np.log(xi))
    sum_flnf=np.sum(fi*xi*np.log(xi))
    k_est=(Nv*sum_f)/(Nv*sum_flnf-sum_lnf*sum_f)
    t_est=1./(Nv**2)*(Nv*sum_flnf-sum_lnf*sum_f)
    if bias_correction:
        t_est=Nv*t_est/(Nv-1)
        k_est=k_est-1./Nv*(3.*k_est-2./3.*(k_est/(1+k_est))-4./5.*(k_est/(1+k_est)**2))
    return [t_est,k_est]

In [19]:
@interact_manual
def fit_aa_res(tempResID=aa_occupancy_data[aa_occupancy_data.Occupancy>0].ResID.sort_values().unique(),
               bwidth=(1.,10.),binCenter=["midpoint","right"]):
    occData=aa_occupancy_data[aa_occupancy_data.ResID==tempResID].Occupancy
    #bwidth=10.0
    #tempDist=bin_runs(occData,binWidth=bwidth,frameRate=.120,center=binCenter)
    tempDist=bin_runs(occData,binWidth=bwidth,frameRate=1.0,center=binCenter)
    fig,axs=plt.subplots(1,2)
    fig.set_figheight(9)
    fig.set_figwidth(12)
    ax=axs.flat[0]
    ax.bar(tempDist[0],1.*tempDist[1]/np.sum(tempDist[1]),width=bwidth,alpha=.5)
    #plt.show()
    tempGammaParms=frequencyDistribution_mle_gamma_params(tempDist,bias_correction=True)
    expLambda=np.sum(1.*tempDist[0]*tempDist[1])/np.sum(tempDist[1])
    ax.plot(tempDist[0],gamma_dist(tempDist[0],tempGammaParms[0],tempGammaParms[1]),
                label='"Gamma:\n t=%.2f, k=%.2f;\n Mean=%.2f"'%(
                    tempGammaParms[0],tempGammaParms[1],tempGammaParms[0]*tempGammaParms[1]*1.))
    ax.plot(tempDist[0],
                expDist(tempDist[0],
                        expLambda),
                label='"Exp:\n l=%.2f"'%expLambda)
    ax.legend()
    ax.set_title("All Atom (N = %g)"%np.sum(tempDist[1]))
    print "aa RMSE expDist:",
    print "%.4f"%np.sqrt(
        np.sum(
            (tempDist[1]/np.sum(tempDist[1])-\
            expDist(tempDist[0],expLambda))**2)/\
        len(tempDist[1]))
    print "aa RMSE gammaDist:",
    print "%.4f"%np.sqrt(
        np.sum(
            (tempDist[1]/np.sum(tempDist[1])-\
            gamma_dist(tempDist[0],tempGammaParms[0],tempGammaParms[1]))**2)/\
        len(tempDist[1]))
    
    occData=cg_occupancy_data[cg_occupancy_data.ResID==tempResID].Occupancy
    tempDist=bin_runs(occData,binWidth=bwidth,frameRate=1.0,center=binCenter)
    ax=axs.flat[1]
    ax.bar(tempDist[0],1.*tempDist[1]/np.sum(tempDist[1]),width=bwidth,alpha=.5)
    #plt.show()
    tempGammaParms=frequencyDistribution_mle_gamma_params(tempDist,bias_correction=True)
    expLambda=frequencyDistribution_mle_exp_params(tempDist,bias_correction=True)
    ax.plot(tempDist[0],gamma_dist(tempDist[0],tempGammaParms[0],tempGammaParms[1]),
                label='"Gamma:\n t=%.2f, k=%.2f;\n Mean=%.2f"'%(
                    tempGammaParms[0],tempGammaParms[1],tempGammaParms[0]*tempGammaParms[1]*1.))
    ax.plot(tempDist[0],
                expDist(tempDist[0],
                        expLambda),
                label='"Exp:\n l=%.2f"'%expLambda)
    ax.legend()
    ax.set_title('Coarse Grain (N = %g)'%np.sum(tempDist[1]))
    print "cg RMSE expDist:",
    print "%.4f"%np.sqrt(
        np.sum(
            (tempDist[1]/np.sum(tempDist[1])-\
            expDist(tempDist[0],expLambda))**2)/\
        len(tempDist[1]))
    print "cg RMSE gammaDist:",
    print "%.4f"%np.sqrt(
        np.sum(
            (tempDist[1]/np.sum(tempDist[1])-\
            gamma_dist(tempDist[0],tempGammaParms[0],tempGammaParms[1]))**2)/\
        len(tempDist[1]))
    
    plt.show()

interactive(children=(Dropdown(description=u'tempResID', options=(15, 21, 22, 30, 63, 65, 161, 162, 167, 168, …

In [20]:
binwidth=5.50
aaFitDat=aa_occupancy_data #[aa_occupancy_data.ResID.isin([3086,3256])]
aaFitFrame=aaFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total_Occupancy":lambda x: .12*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max_ResTime":lambda x: np.max(extract_runs(x))*.12 if np.sum(x) > 0 else 0,
         "GammaDist_Params":lambda x: list(list(frequencyDistribution_mle_gamma_params(
         bin_runs(x,frameRate=.12,binWidth=binwidth),bias_correction=True))) if np.sum(x)>0 else [np.nan,np.nan],
         "ExpDist_Mean":lambda x: frequencyDistribution_mle_exp_params(
            bin_runs(x,frameRate=.12,binWidth=binwidth)) if np.sum(x)>0 else np.nan}
    })
aaFitFrame.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Occupancy,Occupancy,Occupancy,Occupancy,Occupancy
Unnamed: 0_level_1,Unnamed: 1_level_1,ExpDist_Mean,N,GammaDist_Params,Max_ResTime,Total_Occupancy
ResID,SeqID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,782,,0,"[nan, nan]",0.0,0.0
15,796,5.94,27,"[0.0, nan]",5.28,27.96
21,802,7.15,42,"[1.3106197560413353, 4.958103927827665]",19.08,95.16
22,803,14.85,12,"[12.352612730827046, 0.8653090957259573]",64.68,108.0
30,811,6.875,14,"[0.27230782093426575, 18.359043525718626]",6.0,10.32


In [21]:
binwidth=4.8

aaFitDat=aa_occupancy_data #[aa_occupancy_data.ResID.isin([3086,3256])]
aaFitFrame=aaFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total_Occupancy":lambda x: .12*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max_ResTime":lambda x: np.max(extract_runs(x))*.12 if np.sum(x) > 0 else 0,
         "GammaDist_Params":lambda x: list(list(frequencyDistribution_mle_gamma_params(
         bin_runs(x,frameRate=.12,binWidth=binwidth),bias_correction=True))) if np.sum(x)>0 else [np.nan,np.nan],
         "ExpDist_Mean":lambda x: frequencyDistribution_mle_exp_params(
            bin_runs(x,frameRate=.12,binWidth=binwidth)) if np.sum(x)>0 else np.nan}
    })
aaFitFrame.columns=aaFitFrame.columns.map(lambda x: x[1])
aaFitFrame=aaFitFrame.reset_index()
aaFitFrame['GammaDist_Mean']=aaFitFrame.GammaDist_Params.map(np.product)
aaFitFrame['GammaDist_k']=aaFitFrame.GammaDist_Params.map(lambda x: x[1])
aaFitFrame=aaFitFrame.drop(columns='GammaDist_Params')
aaFitFrame['SimType']='All_Atom'
aaFitFrame=aaFitFrame[["SimType","ResID","SeqID","Total_Occupancy","N","Max_ResTime",
                       "ExpDist_Mean","GammaDist_Mean","GammaDist_k"]]
#aaFitFrame=aaFitFrame.dropna()
print aaFitFrame.head()

cgFitDat=cg_occupancy_data #[aa_occupancy_data.ResID.isin([3086,3256])]
cgFitFrame=cgFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
        "Total_Occupancy":np.sum,
        "N":lambda x: len(extract_runs(x)),
        "Max_ResTime":lambda x: np.max(extract_runs(x)) if np.sum(x) > 0 else 0,
        "GammaDist_Params":lambda x: list(list(frequencyDistribution_mle_gamma_params(
        bin_runs(x,frameRate=1.,binWidth=binwidth),bias_correction=True))) if np.sum(x)>0 else [np.nan,np.nan],
        "ExpDist_Mean":lambda x: frequencyDistribution_mle_exp_params(
            bin_runs(x,frameRate=1.,binWidth=binwidth)) if np.sum(x)>0 else np.nan}
    })
cgFitFrame.columns=cgFitFrame.columns.map(lambda x: x[1])
cgFitFrame=cgFitFrame.reset_index()
cgFitFrame['GammaDist_Mean']=cgFitFrame.GammaDist_Params.map(np.product)
cgFitFrame['GammaDist_k']=cgFitFrame.GammaDist_Params.map(lambda x: x[1])
cgFitFrame=cgFitFrame.drop(columns='GammaDist_Params')
cgFitFrame['SimType']='Coarse_Grain'
cgFitFrame=cgFitFrame[["SimType","ResID","SeqID","Total_Occupancy","N","Max_ResTime",
                       "ExpDist_Mean","GammaDist_Mean","GammaDist_k"]]
#cgFitFrame=cgFitFrame.dropna()
print cgFitFrame.head()



    SimType  ResID  SeqID  ...  ExpDist_Mean  GammaDist_Mean  GammaDist_k
0  All_Atom      1    782  ...           NaN             NaN          NaN
1  All_Atom     15    796  ...         5.376        4.597927    37.312908
2  All_Atom     21    802  ...         6.240        5.671147     4.958104
3  All_Atom     22    803  ...        14.400       10.422661     0.752637
4  All_Atom     30    811  ...         6.000        4.363035    18.359044

[5 rows x 9 columns]
        SimType  ResID  SeqID  ...  ExpDist_Mean  GammaDist_Mean  GammaDist_k
0  Coarse_Grain      1    782  ...     23.187402       22.868932     1.155617
1  Coarse_Grain     15    796  ...      9.771987        9.667189     0.672089
2  Coarse_Grain     21    802  ...           NaN             NaN          NaN
3  Coarse_Grain     22    803  ...           NaN             NaN          NaN
4  Coarse_Grain     30    811  ...           NaN             NaN          NaN

[5 rows x 9 columns]


In [22]:
jointFitFrame=pd.concat([aaFitFrame,cgFitFrame])
jointFitFrame.to_csv(baseDir+"joint_distribution_fit_frame.csv",index=False)
jointFitFrame.head()

Unnamed: 0,SimType,ResID,SeqID,Total_Occupancy,N,Max_ResTime,ExpDist_Mean,GammaDist_Mean,GammaDist_k
0,All_Atom,1,782,0.0,0,0.0,,,
1,All_Atom,15,796,27.96,27,5.28,5.376,4.597927,37.312908
2,All_Atom,21,802,95.16,42,19.08,6.24,5.671147,4.958104
3,All_Atom,22,803,108.0,12,64.68,14.4,10.422661,0.752637
4,All_Atom,30,811,10.32,14,6.0,6.0,4.363035,18.359044


In [23]:
jointFitMelt=jointFitFrame.melt(id_vars=["SimType","ResID","SeqID"],var_name="Fit_Param")
print jointFitMelt.head()

jointFitWide=jointFitMelt
jointFitWide["Measurement"]=jointFitWide.SimType+"."+jointFitWide.Fit_Param
jointFitWide=jointFitWide.drop(columns=["SimType","Fit_Param"])
jointFitWide=jointFitWide.reset_index()
jointFitWide=pd.pivot_table(index=["ResID","SeqID"],columns="Measurement",values="value",data=jointFitWide)
#jointFitWide.columns=jointFitWide.columns.map(lambda x: x[1])
jointFitWide=jointFitWide.reset_index()
print jointFitWide.columns
jointFitWide.to_csv(baseDir+"joint_Fit_Data_wide.csv",index=False)
jointFitWide.head()

    SimType  ResID  SeqID        Fit_Param   value
0  All_Atom      1    782  Total_Occupancy    0.00
1  All_Atom     15    796  Total_Occupancy   27.96
2  All_Atom     21    802  Total_Occupancy   95.16
3  All_Atom     22    803  Total_Occupancy  108.00
4  All_Atom     30    811  Total_Occupancy   10.32
Index([u'ResID', u'SeqID', u'All_Atom.ExpDist_Mean',
       u'All_Atom.GammaDist_Mean', u'All_Atom.GammaDist_k',
       u'All_Atom.Max_ResTime', u'All_Atom.N', u'All_Atom.Total_Occupancy',
       u'Coarse_Grain.ExpDist_Mean', u'Coarse_Grain.GammaDist_Mean',
       u'Coarse_Grain.GammaDist_k', u'Coarse_Grain.Max_ResTime',
       u'Coarse_Grain.N', u'Coarse_Grain.Total_Occupancy'],
      dtype='object', name=u'Measurement')


Measurement,ResID,SeqID,All_Atom.ExpDist_Mean,All_Atom.GammaDist_Mean,All_Atom.GammaDist_k,All_Atom.Max_ResTime,All_Atom.N,All_Atom.Total_Occupancy,Coarse_Grain.ExpDist_Mean,Coarse_Grain.GammaDist_Mean,Coarse_Grain.GammaDist_k,Coarse_Grain.Max_ResTime,Coarse_Grain.N,Coarse_Grain.Total_Occupancy
0,1,782,,,,0.0,0.0,0.0,23.187402,22.868932,1.155617,141.0,256.0,5212.0
1,15,796,5.376,4.597927,37.312908,5.28,27.0,27.96,9.771987,9.667189,0.672089,423.0,309.0,2012.0
2,21,802,6.24,5.671147,4.958104,19.08,42.0,95.16,,,,0.0,0.0,0.0
3,22,803,14.4,10.422661,0.752637,64.68,12.0,108.0,,,,0.0,0.0,0.0
4,30,811,6.0,4.363035,18.359044,6.0,14.0,10.32,,,,0.0,0.0,0.0


In [24]:
jointFitWide=pd.read_csv(baseDir+"joint_Fit_Data_Wide.csv")
jointFitWide.head()

IOError: ignored

# Load SASA Data and compute trajectory means

In [25]:
sasa_data=pd.read_csv(baseDir+"basicResidueSASA.csv")
sasa_data.head()

Unnamed: 0,System,Frame,ARG_1,ARG_15,ARG_21,ARG_22,HSD_27,LYS_30,LYS_42,ARG_63,ARG_65,LYS_83,LYS_89,HSD_94,ARG_122,ARG_134,LYS_135,HSD_146,ARG_161,ARG_162,HSD_165,ARG_167,ARG_168,HSD_170,ARG_186,ARG_188,LYS_198,LYS_207,ARG_224,HSD_232,ARG_242,ARG_243,ARG_244,ARG_245,ARG_250,ARG_284,LYS_287,LYS_297,ARG_306,ARG_334,...,LYS_3890,LYS_3891,LYS_3892,LYS_3893,LYS_3896,ARG_3922,LYS_3939,ARG_4003,ARG_4009,LYS_4013,ARG_4026,ARG_4033,LYS_4037,LYS_4047,HSD_4048,ARG_4059,ARG_4060,ARG_4069,HSD_4078,LYS_4082,ARG_4085,LYS_4096,ARG_4110,ARG_4114,ARG_4115,LYS_4130,LYS_4146,LYS_4160,LYS_4187,ARG_4190,HSD_4198,ARG_4210,LYS_4213,ARG_4222,ARG_4225,LYS_4236,ARG_4242,LYS_4249,ARG_4252,ARG_4254
0,Piezo_PIP2,0,287.44056,274.89896,324.281827,540.105388,382.686118,172.673159,102.452724,221.306778,447.452766,256.713873,123.055267,647.090862,287.144073,480.432309,162.146745,538.637445,279.11426,309.664123,493.260306,412.461478,272.704988,272.02027,286.435516,338.64868,82.832637,113.01208,618.717804,389.831344,320.093197,285.941343,308.71783,615.201133,287.006336,286.609076,90.224717,158.183706,218.890732,436.556465,...,101.631402,101.179433,90.288226,95.192149,261.378236,530.278606,94.7484,332.894528,614.004874,150.628491,645.040691,642.0641,126.091789,88.136492,283.464645,771.120494,312.335585,484.185575,541.909565,101.086655,525.457905,91.846126,625.423263,274.383184,852.235735,102.560998,126.875329,238.354097,90.964038,649.34243,494.523318,278.618895,116.343258,629.556884,299.282841,140.083915,646.955674,103.634013,282.838529,205.066848
1,Piezo_PIP2,1,334.816766,183.629963,283.576915,578.829623,384.278017,103.769143,109.288093,243.110699,370.97997,212.753228,119.317227,592.189073,285.33235,443.445815,153.197755,544.783199,405.113736,282.547501,421.201768,277.032741,275.104472,253.813191,335.991463,263.115869,92.167881,102.354427,609.426657,381.269639,297.508413,347.085257,256.827351,573.862608,267.140462,311.641375,92.441512,118.909685,230.185243,430.338297,...,97.202201,97.192355,88.362295,98.472066,226.750411,474.589078,96.900493,394.674426,640.765523,136.451945,800.380022,666.115328,98.196137,95.612674,366.515708,811.833375,272.883738,461.856651,505.806757,89.715053,494.81827,77.908978,570.885504,246.136822,793.153795,102.445837,101.317612,233.272066,82.018671,723.031303,355.490064,257.677793,98.921705,453.885378,416.310184,131.922552,462.089912,95.179939,242.496894,223.123915
2,Piezo_PIP2,2,275.073755,202.25545,342.219028,560.460974,336.459908,111.187833,102.056689,240.717753,413.829923,204.340364,131.207151,633.302923,323.950184,444.54434,162.225697,500.465132,351.410413,297.810963,472.923123,423.697811,269.165927,263.472785,327.336885,282.952038,94.976494,93.558031,511.985051,414.468705,323.390843,380.47693,266.484577,568.860474,269.764553,277.321575,101.511456,166.745362,200.365043,369.274142,...,100.511038,102.361447,91.616967,97.819953,214.533927,354.307258,91.374107,397.169595,624.641417,160.29556,712.601343,666.183947,151.059962,82.315898,379.928395,608.345811,227.020788,453.307721,570.459487,94.489914,518.299306,90.14459,605.150987,278.592711,780.79767,99.98649,119.78257,172.316175,100.770357,705.15506,403.09153,300.392173,85.993276,465.20244,257.272547,72.081927,535.732493,82.812868,218.384865,218.979063
3,Piezo_PIP2,3,290.577028,193.34233,351.953645,680.849566,356.34826,126.432774,103.091409,239.646221,382.292417,224.541501,133.538158,605.98735,310.435661,456.473836,150.490118,472.80166,378.931323,338.920833,428.637975,374.127404,282.815727,282.998247,284.865582,338.722129,91.322402,97.385285,506.331895,378.786766,346.371982,383.483469,231.834973,532.108923,327.069526,237.677766,100.568303,198.413178,248.712891,401.828359,...,82.608833,102.854583,85.093879,87.650063,255.298459,269.217214,106.427334,405.936646,656.480971,147.755725,717.362583,706.361147,138.714971,172.864009,377.990101,720.284731,237.047564,300.496922,612.179085,80.870841,400.810417,79.990129,593.053242,245.845673,870.880766,106.58984,117.951184,192.603032,103.737948,637.624491,418.481454,332.128525,96.114586,567.150851,259.985159,88.943817,501.348161,95.815112,247.253744,220.677821
4,Piezo_PIP2,4,233.69904,203.015984,240.717316,603.387966,328.979551,127.112652,111.054757,232.711257,324.249437,188.708462,128.994597,658.256793,259.240203,359.896157,182.236721,493.329878,356.240146,274.884406,453.036909,434.970427,250.035094,266.687841,238.831302,277.684765,90.02478,99.626553,458.289825,382.886414,355.63292,381.645916,229.136131,707.935882,309.801904,245.153684,99.068535,182.672426,240.273366,393.459566,...,96.684743,97.340489,96.150862,89.09191,174.601638,301.956674,104.109032,394.882407,624.539525,171.334129,684.533366,713.199003,156.458767,148.336416,407.323405,663.824466,254.166178,261.593245,529.502194,86.154048,418.017987,93.05701,620.347957,226.259997,649.368035,103.825499,118.91189,193.628586,99.503775,557.085547,410.296426,309.629827,104.504957,468.422037,291.298963,93.244969,460.571667,93.286728,271.298287,222.94999


In [26]:
sasa_data_long=sasa_data.melt(id_vars=['System','Frame'],var_name='Residue',value_name='SASA')
sasa_data_long['ResID']=sasa_data_long.Residue.map(lambda x: x.split('_')[-1])
sasa_data_long['ResName']=sasa_data_long.Residue.map(lambda x: x.split('_')[0])
sasa_data_long['SimType']='All_Atom'
sasa_data_long=sasa_data_long[['System','SimType','ResID','ResName','Frame','SASA']]
sasa_data_long.head()

Unnamed: 0,System,SimType,ResID,ResName,Frame,SASA
0,Piezo_PIP2,All_Atom,1,ARG,0,287.44056
1,Piezo_PIP2,All_Atom,1,ARG,1,334.816766
2,Piezo_PIP2,All_Atom,1,ARG,2,275.073755
3,Piezo_PIP2,All_Atom,1,ARG,3,290.577028
4,Piezo_PIP2,All_Atom,1,ARG,4,233.69904


In [27]:
sasa_summary=sasa_data_long.groupby(['System','SimType','ResID','ResName']).agg({'SASA':np.mean})
sasa_summary=sasa_summary.reset_index()
sasa_summary.ResID=pd.Series(sasa_summary.ResID,dtype=int)
sasa_summary.head()

Unnamed: 0,System,SimType,ResID,ResName,SASA
0,Piezo_PIP2,All_Atom,1,ARG,227.088237
1,Piezo_PIP2,All_Atom,1007,ARG,375.62868
2,Piezo_PIP2,All_Atom,1038,LYS,112.130846
3,Piezo_PIP2,All_Atom,1041,ARG,286.585983
4,Piezo_PIP2,All_Atom,1045,LYS,110.187461


In [28]:
sasa_corr=jointFitWide.set_index("ResID").join(
        other=sasa_summary.drop(columns=["System","SimType"]).set_index("ResID")
    ).reset_index()
sasa_corr.to_csv(baseDir+"SASA_correlation_dataFrame.csv",index=False)
sasa_corr.head()

Unnamed: 0,ResID,SeqID,All_Atom.ExpDist_Mean,All_Atom.GammaDist_Mean,All_Atom.GammaDist_k,All_Atom.Max_ResTime,All_Atom.N,All_Atom.Total_Occupancy,Coarse_Grain.ExpDist_Mean,Coarse_Grain.GammaDist_Mean,Coarse_Grain.GammaDist_k,Coarse_Grain.Max_ResTime,Coarse_Grain.N,Coarse_Grain.Total_Occupancy,ResName,SASA
0,1,782,,,,0.0,0.0,0.0,23.187402,22.868932,1.155617,141.0,256.0,5212.0,ARG,227.088237
1,15,796,5.376,4.597927,37.312908,5.28,27.0,27.96,9.771987,9.667189,0.672089,423.0,309.0,2012.0,ARG,269.308386
2,21,802,6.24,5.671147,4.958104,19.08,42.0,95.16,,,,0.0,0.0,0.0,ARG,278.032111
3,22,803,14.4,10.422661,0.752637,64.68,12.0,108.0,,,,0.0,0.0,0.0,ARG,684.723468
4,30,811,6.0,4.363035,18.359044,6.0,14.0,10.32,,,,0.0,0.0,0.0,LYS,130.928857


# Visualization

For those who just want to visualize the final results, run the loading cells at the top to
load the needed packages and set the 'baseDir' variable, then start at the cell below.

In [0]:
sasa_corr=pd.read_csv(baseDir+"SASA_correlation_dataFrame.csv")

In [34]:
sasa_corr.head()

Unnamed: 0,ResID,SeqID,All_Atom.ExpDist_Mean,All_Atom.GammaDist_Mean,All_Atom.GammaDist_k,All_Atom.Max_ResTime,All_Atom.N,All_Atom.Total_Occupancy,Coarse_Grain.ExpDist_Mean,Coarse_Grain.GammaDist_Mean,Coarse_Grain.GammaDist_k,Coarse_Grain.Max_ResTime,Coarse_Grain.N,Coarse_Grain.Total_Occupancy,ResName,SASA
0,1,782,,,,0.0,0.0,0.0,23.187402,22.868932,1.155617,141.0,256.0,5212.0,ARG,227.088237
1,15,796,5.376,4.597927,37.312908,5.28,27.0,27.96,9.771987,9.667189,0.672089,423.0,309.0,2012.0,ARG,269.308386
2,21,802,6.24,5.671147,4.958104,19.08,42.0,95.16,,,,0.0,0.0,0.0,ARG,278.032111
3,22,803,14.4,10.422661,0.752637,64.68,12.0,108.0,,,,0.0,0.0,0.0,ARG,684.723468
4,30,811,6.0,4.363035,18.359044,6.0,14.0,10.32,,,,0.0,0.0,0.0,LYS,130.928857


In [0]:
sasa_corr=sasa_corr[sasa_corr['All_Atom.N']>3]

In [0]:
import bokeh
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, CDSView, GroupFilter, HoverTool
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6

In [39]:
@interact_manual
def plot_columns(xCol=sasa_corr.drop(columns=["ResName","ResID","SeqID"]).columns,
                 yCol=sasa_corr.drop(columns=["ResName","ResID","SeqID"]).columns):
    bokeh.io.output_notebook()
    plotData=sasa_corr[[xCol,yCol,"ResName","ResID","SeqID"]].dropna() #,"All_Atom.N",'Coarse_Grain.N']]
    plotData=plotData[(plotData[xCol]>0) & (plotData[yCol]>0)]
    source=ColumnDataSource(plotData)
    view1=CDSView(source=source)
    
    ResNames=list(plotData.ResName.unique())
    
    plot_size_and_tools={'plot_height':640,
                         'plot_width':640,
                         'tools':['pan','wheel_zoom',
                                  'undo','redo','reset','save',
                                  'crosshair','hover']}
    
    p1=figure(**plot_size_and_tools)
    p1.circle(x=xCol,y=yCol,
              source=source,
              color=factor_cmap('ResName',palette=Spectral6,factors=ResNames))
    p1.legend.orientation="vertical"
    hover = p1.select(dict(type=HoverTool))
    hover.tooltips = [(colName,"@{"+colName+"}") for colName in plotData.columns]
    show(p1)

interactive(children=(Dropdown(description=u'xCol', options=('All_Atom.ExpDist_Mean', 'All_Atom.GammaDist_Mean…

In [0]:
sasa_bySeq=sasa_corr
sasa_bySeq=sasa_bySeq.drop(columns='ResID').groupby(['SeqID','ResName']).agg(
    lambda x: x.dropna().mean()).reset_index()
sasa_bySeq.head()

In [0]:
@interact_manual
def plot_columns(xCol=sasa_bySeq.drop(columns=["ResName","SeqID"]).columns,
                 yCol=sasa_bySeq.drop(columns=["ResName","SeqID"]).columns):
    bokeh.io.output_notebook()
    plotData=sasa_bySeq[[xCol,yCol,"ResName","SeqID"]].dropna() #,"All_Atom.N",'Coarse_Grain.N']]
    plotData=plotData[(plotData[xCol]>0) & (plotData[yCol]>0)]
    source=ColumnDataSource(plotData)
    view1=CDSView(source=source)
    
    ResNames=list(plotData.ResName.unique())
    
    plot_size_and_tools={'plot_height':640,
                         'plot_width':640,
                         'tools':['pan','wheel_zoom',
                                  'undo','redo','reset','save',
                                  'crosshair','hover']}
    
    p1=figure(**plot_size_and_tools)
    p1.circle(x=xCol,y=yCol,
              source=source,
              color=factor_cmap('ResName',palette=Spectral6,factors=ResNames))
    p1.legend.orientation="vertical"
    hover = p1.select(dict(type=HoverTool))
    hover.tooltips = [(colName,"@{"+colName+"}") for colName in plotData.columns]
    show(p1)