In [89]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import scipy as sp

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import os
import sys
import gc
import copy

import tqdm
import itertools

import subprocess

In [8]:
baseDir='./'

If you are viewing this in google colab, you will need to clone the repository first.
To do so uncomment the two code cells below

In [9]:
#!git clone https://github.com/wesleymsmith/Piezo_PIP2_binding_analysis.git

In [10]:
#baseDir='Piezo_PIP2_binding_analysis/'

In [11]:
xcelData=pd.read_excel(baseDir+'Residue_ID_total_occupancy_10_1_2019.xlsx',
              sheet_name=None)

In [12]:
@interact
def show_data(sheet_name=xcelData.keys()):
    return xcelData[sheet_name]

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnc2hlZXRfbmFtZScsIG9wdGlvbnM9KHUnY2cnLCB1J1NoZWV0MScsIHUnYWEnLCB1J3RlbnNpb25fMzBucycsIHXigKY=


In [13]:
#the above tables are
#aa - result summary for all atom simulation
#cg - result summary for coarse grain simulation
#tension_30ns - results of all atom simulation with membrane tension
#sheet 1 is apparently blank...
#resinfo_table - mapping between cryo-em structure sequence and all atom residue ids
resinfoDataSheet=xcelData['resinfo_table']
resinfoTable=resinfoDataSheet[
    resinfoDataSheet.columns[[0,3,5,7]]][2:]
resinfoTable.columns=['PDB_ID','Arm1_Resid','Arm2_Resid','Arm3_Resid']
resinfoTable.head()

Unnamed: 0,PDB_ID,Arm1_Resid,Arm2_Resid,Arm3_Resid
2,782,1,1419,2837
3,783,2,1420,2838
4,784,3,1421,2839
5,785,4,1422,2840
6,786,5,1423,2841


In [14]:
#the pdb residue id's are sequential, but have gaps corresponding
#to unresolved amino acids in the Cryo-EM structure.
#The easy solution is just to fill in linearly.
#The three arms in our simulation structure are identical, so
#we can generate our back-map by just repeating the pdb sequence 3 times
simResid_to_pdbResid=map(int,list(np.arange(
    np.array(resinfoTable['PDB_ID'])[0],
    782+len(resinfoTable['PDB_ID'])))*3)
', '.join(map(str,simResid_to_pdbResid))

'782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981,

While the above excel sheet provides a useful summary at a glance, we would like to have direct access to the distribution of residence times rather than just the mean max and cummulative sum over all lipids. 

Using the set of all individually measured residence times, we can fit a model distribution. More specifically, the reciporical of residence time would correspond to a frequency. Specifically, the reciporical of residence time gives us the corresponding unbinding frequency. 

This can then be used to fit an appropriate distribution (geometric distribution would be one choice) and provide a characteristic unbinding frequency (or characteristic residence time as its reciporical). More over, it can give us a bound / confidence interval of this distribution as well.

We can then repeat this process for the all atom model. While CG is expected to have shorter residence times due to the notably lower membrane viscocity, we should still be able to see if the ranking and / or relative characteristic residence time / unbinding frequencies match (for each protein amino acid). If CG can rank amino acids in the correct order, based upon PIP2 residence times, then we can be confident that it is functioning well as a model for correctly predicting lipid binding sites.

Below, this residence time distribution data can be exctracted from the coarse grain simulation data files which list individual PIP2 residence time observations for each protein residue (amino acid)

in the Raw_PIP2_CG_residence_time_data directory, the .xvg files contain
the 'occupancy' of each amino acid at each output time step.
The occupancy is zero if there were no PIP2 lipids in contact
and non-zero if there was at least one PIP2 lipid in contact.
The first step is to extract these individual timeseries
into a joint table.

In [15]:
#os.listdir can be used to generate a list of all xvg files present
cg_RawData_dir=baseDir+'Raw_PIP2_CG_residence_time_data/'
cg_dataFile_list=[dataFileName for dataFileName in os.listdir(cg_RawData_dir) \
             if 'xvg' in dataFileName]
cg_dataFile_list

['id_4236_mask.xvg',
 'id_3365_mask.xvg',
 'id_2250_mask.xvg',
 'id_3630_mask.xvg',
 'id_3456_mask.xvg',
 'id_843_mask.xvg',
 'id_3516_mask.xvg',
 'id_548_mask.xvg',
 'id_186_mask.xvg',
 'id_3043_mask.xvg',
 'id_1988_mask.xvg',
 'id_3679_mask.xvg',
 'id_4110_mask.xvg',
 'id_2218_mask.xvg',
 'id_373_mask.xvg',
 'id_1483_mask.xvg',
 'id_1813_mask.xvg',
 'id_3034_mask.xvg',
 'id_1354_mask.xvg',
 'id_2857_mask.xvg',
 'id_162_mask.xvg',
 'id_3794_mask.xvg',
 'id_577_mask.xvg',
 'id_244_mask.xvg',
 'id_4026_mask.xvg',
 'id_245_mask.xvg',
 'id_3529_mask.xvg',
 'id_2772_mask.xvg',
 'id_782_mask.xvg',
 'id_2521_mask.xvg',
 'id_1046_mask.xvg',
 'id_2388_mask.xvg',
 'id_3415_mask.xvg',
 'id_1983_mask.xvg',
 'id_378_mask.xvg',
 'id_2212_mask.xvg',
 'id_1580_mask.xvg',
 'id_4013_mask.xvg',
 'id_800_mask.xvg',
 'id_788_mask.xvg',
 'id_475_mask.xvg',
 'id_613_mask.xvg',
 'id_2325_mask.xvg',
 'id_3523_mask.xvg',
 'id_1642_mask.xvg',
 'id_680_mask.xvg',
 'id_2416_mask.xvg',
 'id_3350_mask.xvg',
 'id_17

In [84]:
#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
cgDataTables=[]
for dataFileName in tqdm.tqdm_notebook(cg_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        cg_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    cgDataTables.append(copy.deepcopy(tempTable))
    
cg_occupancy_data=pd.concat(cgDataTables)
cg_occupancy_data.to_csv(baseDir+"Coarse_Grain_Occupancy_Data.csv",index=False)
cg_occupancy_data.head()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))




Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy
0,4236,2181,0,0.0,0
1,4236,2181,1,1000.0,0
2,4236,2181,2,2000.0,0
3,4236,2181,3,3000.0,0
4,4236,2181,4,4000.0,0


In [95]:
#The coarse grain data table is huge, so we need to split it into chunks
os.system("split -l 100000 "+\
          baseDir+"Coarse_Grain_Occupancy_Data.csv "+\
          baseDir+"Coarse_Grain_Occupancy_Data.chunk.")
os.system("rm "+baseDir+"Coarse_Grain_Occupancy_Data.csv")

0

Next, lets collect the occupancy data for the all atom simulation into a single data frame.

In [85]:
aa_RawData_dir=baseDir+'Raw_PIP2_AA_residence_time_data/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]
aa_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
aaDataTables=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(copy.deepcopy(tempTable))
    
aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.to_csv(baseDir+"All_Atom_Occupancy_Data.csv",index=False)
aa_occupancy_data.head()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))




Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy
0,4236,2181,0,0.0,0
1,4236,2181,1,1.0,0
2,4236,2181,2,2.0,0
3,4236,2181,3,3.0,0
4,4236,2181,4,4.0,0


Now we can extract residence times from this occupancy by first finding
all 'runs' within the occupancy series of each residue.

This can be accomplished using itertools.groupby to obtain the lengths of each continguous interval where occupancy was non-zero. Since these are discrete integer values, we can easily use the function 'unique' to bin them into a histogram like form by setting the 'return_counts' option to 'True'

We can then plot the resulting distribution...
It seems to look quite exponential like, so it would make sense to try and fit either a geometric (if we think of each frame like an individual 'trial') or an exponential (if want to think of each run's length as a 'wait time')

The exponential distribution with a mean (or characteristic length, $\lambda$) is given by:
$$p(x,\lambda)=\frac{e^{-x/\lambda}}{\lambda}$$


To fit an exponential distribution to this data we first compute the characteristic length ($\gamma$) as the mean ($\bar{X}$) of the observed residence times ($X_i$)

The variance of an exponential distribution with a characteristic length (mean) of $\lambda$ is $\lambda^2$.

Thus for a given confidence level $\gamma$ we may construct the interval:

$$ \lambda \in (\bar{X}-Z_\gamma\sqrt{\frac{\bar{X}^2}{n}},\bar{X}+Z_\gamma\sqrt{\frac{\bar{X}^2}{n}}) $$

Below, we show the estimation procedure for an arbitrarily selected residue.

In [88]:
?sp.special.gamma

In [18]:
def extract_runs(x):
    return [len(list(g)) for k,g in itertools.groupby(x, bool) if k]
def expDist(x,l):
    return np.exp(-x/l)/l
def beta_dist(x,a,b):
    return (x**(a-1.)*(1.-x)**(b-1.))/\
        (sp.special.gamma(a)*sp.special.gamma(b)/sp.special.gamma(a+b))

In [19]:
@interact
def fit_exp_dist(tempResID=cg_occupancy_data[cg_occupancy_data.Occupancy>0].ResID.unique()):
    tempDat=cg_occupancy_data[cg_occupancy_data['ResID']==tempResID]
    print "First five entries of occupancy data for resid %g"%tempResID
    print tempDat.Occupancy.head()
    print '---'
    print "first five observed 'run' lengths in occupancy"
    runLengths=extract_runs(np.array(tempDat.Occupancy))
    print runLengths[:5]
    print '---'
    print "Histogram of run lengths"
    residenceTimeDist=np.unique(runLengths,return_counts=True)
    print residenceTimeDist
    plt.scatter(residenceTimeDist[0],
                residenceTimeDist[1]/(1.*(np.sum(residenceTimeDist[1]))),
                s=8,marker='x',
                label='Data')
    #plt.show()

    print "number of events: %g"%np.sum(residenceTimeDist[0])
    
    #lets get a 95% confidence interval... the needed coefficient is found as sp.stats.norm.ppf(q=.975)
    #n, the number of observations, is equal to the sum of our histogram counts
    lambdaEst=np.sum(residenceTimeDist[0]*residenceTimeDist[1])/np.sum(residenceTimeDist[1])*1.
    lambdaCIradius=sp.stats.norm.ppf(q=.975)*np.sqrt(1.0*lambdaEst**2/np.sum(residenceTimeDist[1]))

    print "Most likely estimate: lambda = %.3f"%lambdaEst
    print "At 95%% confidence: %.3f <= lambda <= %.3f"%(lambdaEst-lambdaCIradius,
                                    lambdaEst+lambdaCIradius)
    lval=lambdaEst-lambdaCIradius
    #plt.plot(residenceTimeDist[0],map(lambda x: np.exp(-x/(lval))/lval,residenceTimeDist[0]),
    #         '#aa0000')
    lval=lambdaEst
    plt.plot(residenceTimeDist[0],map(lambda x: np.exp(-x/(lval))/lval,residenceTimeDist[0]),
             'b',label='l=dataMean')
    lval=lambdaEst+lambdaCIradius
    #plt.plot(residenceTimeDist[0],map(lambda x: np.exp(-x/(lval))/lval,residenceTimeDist[0]),
    #         '#aa0000')
    
    cfitModel=sp.optimize.curve_fit(expDist,
                                residenceTimeDist[0],
                                1.*residenceTimeDist[1]/np.sum(residenceTimeDist[1]),
                                p0=lambdaEst)
    plt.plot(residenceTimeDist[0],
         expDist(residenceTimeDist[0],cfitModel[0][0]),
         'g',label='l=lmsFit')
    print "lambda fit= %.3f"%cfitModel[0][0]
    
    plt.show()
    

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUndGVtcFJlc0lEJywgb3B0aW9ucz0oMzM2NSwgMjI1MCwgODQzLCAxNDgzLCAzMDM0LCAxNjIsIDI0NCwgMjQ1LCDigKY=


After inspecting the results of several resids, we can see that the least mean square fitting result seems to be too low for values at the elbow, while simply using the mean residence time seems to often be slightly too high, but generally a tighter fit.

We can now apply the residence time procedure over the whole CG data set and compare our results.

To begin with, lets encapsulate the procedure a bit better

In [20]:
def extract_runs(x):
    return [len(list(g)) for k,g in itertools.groupby(x, bool) if k]
def expDist(x,l):
    return np.exp(-x/l)/l

def extract_resDist(x):
    return(np.unique(extract_runs(x),return_counts=True))

def mean_residenceTime(x):
    timeDist=extract_resDist(x)
    return np.sum(1.*timeDist[0]*timeDist[1]/np.sum(timeDist[1]))

def min_residenceTime(x):
    timeDist=extract_resDist(x)
    if len( timeDist[0])>0:
        return np.min(1.*timeDist[0])
    else:
        return 0

def median_residenceTime(x):
    timeDist=extract_resDist(x)
    if len( timeDist[0])>0:
        return np.median(1.*timeDist[0])
    else:
        return 0
    
def max_residenceTime(x):
    timeDist=extract_resDist(x)
    if len(timeDist[0])>0:
        return np.max(1.*timeDist[0])
    else:
        return 0


In [21]:
#apply the above calculations over all the raw occupancy data
meanDat=cg_occupancy_data.groupby(
        ['ResID','SeqID']
    ).aggregate(
        {'Occupancy':{'Total_Occupancy':np.sum,'Mean_ResTime':mean_residenceTime,
                      'Medain_ResTime':median_residenceTime,
                      'Min_ResTime':min_residenceTime,'Max_ResTime':max_residenceTime}}
    )
meanDat.columns=meanDat.columns.map(lambda x: x[1])
meanDat=meanDat.reset_index()
meanDat.sort_values('Mean_ResTime',ascending=False).head(n=15)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,ResID,SeqID,Medain_ResTime,Mean_ResTime,Max_ResTime,Min_ResTime,Total_Occupancy
22,242,1023,58.0,67.190476,664,1,11288
214,2105,1468,52.0,36.188356,385,1,10567
165,1660,1023,47.5,23.065708,299,1,11233
0,1,782,34.5,20.359375,141,1,5212
292,2899,844,48.0,16.494186,339,1,8511
252,2475,1838,42.0,16.284053,202,1,9803
219,2162,1525,36.5,16.233333,927,1,8766
26,250,1031,44.0,16.110236,258,1,8184
106,1054,1835,38.5,15.134551,252,1,9111
86,841,1622,31.5,14.232365,281,1,3430


In [22]:
#interactively plot the results
@interact
def scatter_plot(x=list(meanDat.select_dtypes('number').columns),
                 y=list(meanDat.select_dtypes('number').columns)[1:]):
    plt.scatter(meanDat[x],meanDat[y])
    plt.show()

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUneCcsIG9wdGlvbnM9KCdSZXNJRCcsICdTZXFJRCcsICdNZWRhaW5fUmVzVGltZScsICdNZWFuX1Jlc1RpbWUnLCDigKY=


Now lets repeat this for all atom simulations

In [23]:
aa_RawData_dir=baseDir+'Raw_PIP2_AA_residence_time_data/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]
aa_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
aaDataTables=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(copy.deepcopy(tempTable))
    
aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.head()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))




Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy
0,4236,2181,0,0.0,0
1,4236,2181,1,1.0,0
2,4236,2181,2,2.0,0
3,4236,2181,3,3.0,0
4,4236,2181,4,4.0,0


One problem needs to be addressed before continuing.

The coarse grain simulation has an output frequency of 1 ns, whereas a frequency of 120 ps was used for all atom. This means, for instance, that if you had a 'run' between 1 and 9 frames on all atom, it would only show up as a single frame run in coarse grain.

This poses a problem in trying to compare the histogram results. More specifically, it means that the effective bin size of the all atom simulation is much finer than the coarse grain. This is further compounded by the fact that the all atom simulation will get far less sampling since it was only 200 ns in length while the coarse grain simulation was 12 microseconds. Thus, the all atom samples a time scale that is about 60 fold smaller than coarse grain, and has an effective bin size that is 10 times finer. This poses a problem when sampling from the 'tail' of our distributions...

While there is little that can be done to make up for the 60 fold difference in simulated time, we can compensate for the difference in bin width easily by re-binning.

In [73]:
def bin_aa(x,binWidth=1.000,frameRate=.120):
    init_dist=extract_resDist(x)
    x_dist=(init_dist[0]*frameRate,init_dist[1])
    binMax=np.max(x_dist[0])
    nBins=np.ceil(binMax/binWidth)+1
    hbins=np.arange(nBins)*binWidth
    temp_dist=np.histogram(x_dist[0],weights=x_dist[1],bins=hbins)
    return (temp_dist[1],temp_dist[0])

In [54]:
?plt.bar

In [79]:
@interact_manual
def rebin_hist(binW=(.1,25,.1)):
    fr=.120

    tempX=aa_occupancy_data[aa_occupancy_data.ResID==242].Occupancy
    dist1=extract_resDist(tempX)
    print (dist1[0]*fr,dist1[1])
    plt.bar(dist1[0]*fr,height=1.*dist1[1]/np.sum(dist1[1]),width=fr)
    plt.show()

    dist2=bin_aa(tempX,binWidth=binW)
    print (dist2[0],dist2[1])
    plt.bar(dist2[0][1:],height=1.*dist2[1]/np.sum(dist2[1]),width=binW)
    plt.show()

aW50ZXJhY3RpdmUoY2hpbGRyZW49KEZsb2F0U2xpZGVyKHZhbHVlPTEyLjUsIGRlc2NyaXB0aW9uPXUnYmluVycsIG1heD0yNS4wLCBtaW49MC4xKSwgQnV0dG9uKGRlc2NyaXB0aW9uPXUnUnXigKY=


In [235]:
meanDat=aa_occupancy_data.groupby(
        ['ResID','SeqID']
    ).aggregate(
        {'Occupancy':{'Total_Occupancy':np.sum,'Mean_ResTime':mean_residenceTime,
                      'Medain_ResTime':median_residenceTime,
                      'Min_ResTime':min_residenceTime,'Max_ResTime':max_residenceTime}}
    )
meanDat.columns=meanDat.columns.map(lambda x: x[1])
meanDat=meanDat.reset_index()
meanDat.sort_values('Mean_ResTime',ascending=False).head(n=15)

Unnamed: 0,ResID,SeqID,Medain_ResTime,Mean_ResTime,Max_ResTime,Min_ResTime,Total_Occupancy
233,2330,1693,1592.0,1592.0,1592,1592,1592
395,3893,1838,1590.0,1590.0,1590,1590,1590
310,3080,1025,795.5,795.5,1202,389,1591
392,3890,1835,794.5,794.5,795,794,1589
75,743,1524,430.0,530.0,1067,93,1590
384,3821,1766,399.5,396.5,786,1,1586
362,3580,1525,53.5,389.25,1440,10,1557
250,2473,1836,160.0,387.5,1229,1,1550
308,3078,1023,24.0,317.2,926,2,1586
91,913,1694,84.5,264.333333,1087,19,1586


Note that the residence times seem larger, but this is because the frame rate for the all atom simulation was much higher. For all atom 1 frame = 120.0 ps vs 1 frame = 1000.0 ps for coarse grain.


In [241]:
cg_RawData_dir=baseDir+'Raw_PIP2_CG_residence_time_data/'
cg_dataFile_list=[dataFileName for dataFileName in os.listdir(cg_RawData_dir) \
             if 'xvg' in dataFileName]
cg_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
cgDataTables=[]
for dataFileName in tqdm.tqdm_notebook(cg_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        cg_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    cgDataTables.append(copy.deepcopy(tempTable))
    
cg_occupancy_data=pd.concat(cgDataTables)
cg_occupancy_data.head()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))




Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy
0,4236,2181,0,0.0,0
1,4236,2181,1,1000.0,0
2,4236,2181,2,2000.0,0
3,4236,2181,3,3000.0,0
4,4236,2181,4,4000.0,0


In [4]:
?np.histogram

In [242]:
meanDat=cg_occupancy_data.groupby(
        ['ResID','SeqID']
    ).aggregate(
        {'Occupancy':{'Total_Occupancy':np.sum,'Mean_ResTime':mean_residenceTime,
                      'Medain_ResTime':median_residenceTime,
                      'Min_ResTime':min_residenceTime,'Max_ResTime':max_residenceTime}}
    )
meanDat.columns=meanDat.columns.map(lambda x: x[1])
meanDat=meanDat.reset_index()
meanDat.sort_values('Mean_ResTime',ascending=False).head(n=15)

Unnamed: 0,ResID,SeqID,Medain_ResTime,Mean_ResTime,Max_ResTime,Min_ResTime,Total_Occupancy
22,242,1023,58.0,67.190476,664,1,11288
214,2105,1468,52.0,36.188356,385,1,10567
165,1660,1023,47.5,23.065708,299,1,11233
0,1,782,34.5,20.359375,141,1,5212
292,2899,844,48.0,16.494186,339,1,8511
252,2475,1838,42.0,16.284053,202,1,9803
219,2162,1525,36.5,16.233333,927,1,8766
26,250,1031,44.0,16.110236,258,1,8184
106,1054,1835,38.5,15.134551,252,1,9111
86,841,1622,31.5,14.232365,281,1,3430


lets combine these data tables into a joint data frame.
We will normalize the all atom times by mulitplying by .120 (120 ps per frame / 1000 ps per frame).

In [266]:
cg_meanDat=cg_occupancy_data.groupby(
        ['ResID','SeqID']
    ).aggregate(
        {'Occupancy':{'Total_Occupancy':np.sum,'Mean_ResTime':mean_residenceTime,
                      'Medain_ResTime':median_residenceTime,
                      'Min_ResTime':min_residenceTime,'Max_ResTime':max_residenceTime}}
    )
cg_meanDat.columns=cg_meanDat.columns.map(lambda x: x[1])
cg_meanDat=cg_meanDat.reset_index()
cg_meanDat['SimType']='Coarse_Grain'
cg_meanDat=cg_meanDat[np.concatenate([
    ['SimType'],
    cg_meanDat.columns[:-1]
])]
print cg_meanDat.sort_values('Mean_ResTime',ascending=False).head(n=15)


aa_meanDat=aa_occupancy_data.groupby(
        ['ResID','SeqID']
    ).aggregate(
        {'Occupancy':{'Total_Occupancy':np.sum,'Mean_ResTime':mean_residenceTime,
                      'Medain_ResTime':median_residenceTime,
                      'Min_ResTime':min_residenceTime,'Max_ResTime':max_residenceTime}}
    )
aa_meanDat.columns=aa_meanDat.columns.map(lambda x: x[1])
aa_meanDat=aa_meanDat.reset_index()
for colName in aa_meanDat.columns[2:]:
    aa_meanDat[colName]=aa_meanDat[colName]*.120
aa_meanDat['SimType']='All_Atom'
aa_meanDat=aa_meanDat[np.concatenate([
    ['SimType'],
    aa_meanDat.columns[:-1]
])]
print aa_meanDat.sort_values('Mean_ResTime',ascending=False).head(n=15)

joint_meanDat=pd.concat([cg_meanDat,aa_meanDat])
joint_meanDat.head()

          SimType  ResID  SeqID  Medain_ResTime  Mean_ResTime  Max_ResTime  \
22   Coarse_Grain    242   1023            58.0     67.190476          664   
214  Coarse_Grain   2105   1468            52.0     36.188356          385   
165  Coarse_Grain   1660   1023            47.5     23.065708          299   
0    Coarse_Grain      1    782            34.5     20.359375          141   
292  Coarse_Grain   2899    844            48.0     16.494186          339   
252  Coarse_Grain   2475   1838            42.0     16.284053          202   
219  Coarse_Grain   2162   1525            36.5     16.233333          927   
26   Coarse_Grain    250   1031            44.0     16.110236          258   
106  Coarse_Grain   1054   1835            38.5     15.134551          252   
86   Coarse_Grain    841   1622            31.5     14.232365          281   
76   Coarse_Grain    744   1525            33.5     12.593131          153   
218  Coarse_Grain   2161   1524            37.0     12.476471   

Unnamed: 0,SimType,ResID,SeqID,Medain_ResTime,Mean_ResTime,Max_ResTime,Min_ResTime,Total_Occupancy
0,Coarse_Grain,1,782,34.5,20.359375,141.0,1.0,5212.0
1,Coarse_Grain,15,796,18.5,6.511327,423.0,1.0,2012.0
2,Coarse_Grain,21,802,0.0,0.0,0.0,0.0,0.0
3,Coarse_Grain,22,803,0.0,0.0,0.0,0.0,0.0
4,Coarse_Grain,30,811,0.0,0.0,0.0,0.0,0.0


In [267]:
joint_meanDat_wide=joint_meanDat.melt(id_vars=['SimType','ResID','SeqID'])
joint_meanDat_wide['Measurement']=joint_meanDat_wide.SimType+'.'+joint_meanDat_wide.variable
joint_meanDat_wide=joint_meanDat_wide[['ResID','SeqID','Measurement','value']]
joint_meanDat_wide=pd.pivot_table(index=['ResID','SeqID'],
                             columns='Measurement',values='value',
                             fill_value=np.nan,data=joint_meanDat_wide)


joint_meanDat_wide.head()

Unnamed: 0_level_0,Measurement,All_Atom.Max_ResTime,All_Atom.Mean_ResTime,All_Atom.Medain_ResTime,All_Atom.Min_ResTime,All_Atom.Total_Occupancy,Coarse_Grain.Max_ResTime,Coarse_Grain.Mean_ResTime,Coarse_Grain.Medain_ResTime,Coarse_Grain.Min_ResTime,Coarse_Grain.Total_Occupancy
ResID,SeqID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,782,1.56,0.44,0.36,0.12,2.64,141,20.359375,34.5,1,5212
15,796,0.0,0.0,0.0,0.0,0.0,423,6.511327,18.5,1,2012
21,802,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
22,803,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
30,811,0.12,0.12,0.12,0.12,0.24,0,0.0,0.0,0,0


In [271]:
#interactively plot the results
@interact
def scatter_plot(x=list(joint_meanDat_wide.columns[:]),
                 y=list(joint_meanDat_wide.columns[:])):
    plt.figure(figsize=(12,12))
    plt.scatter(joint_meanDat_wide[x],joint_meanDat_wide[y])
    plt.show()

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUneCcsIG9wdGlvbnM9KCdBbGxfQXRvbS5NYXhfUmVzVGltZScsICdBbGxfQXRvbS5NZWFuX1Jlc1RpbWUnLCAnQWzigKY=


In [301]:
#interactively plot the results
@interact
def scatter_plot(xx=list(joint_meanDat_wide.columns[:]),
                 yy=list(joint_meanDat_wide.columns[:]),
                 cgCutCol=[colName for colName in joint_meanDat_wide.columns \
                           if 'Coarse' in colName],
                 cgCutVal=(0,1200,1),
                 maxRankCut=(0,400,1)):
    #xx='All_Atom.Mean_ResTime'
    #yy='Coarse_Grain.Mean_ResTime'
    tempData=joint_meanDat_wide[joint_meanDat_wide[cgCutCol]<=cgCutVal][[xx,yy]].dropna()
    tempData=tempData[(tempData[xx]>0) | (tempData[yy]>0)]
    tempData['xRank']=tempData[xx].rank()
    tempData['yRank']=tempData[yy].rank()
    tempData=tempData[(tempData['xRank']<=maxRankCut) & (tempData['yRank']<=maxRankCut)]
    plt.figure(figsize=(12,12))
    plt.scatter(tempData['xRank'],tempData['yRank'])
    plt.show()

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUneHgnLCBvcHRpb25zPSgnQWxsX0F0b20uTWF4X1Jlc1RpbWUnLCAnQWxsX0F0b20uTWVhbl9SZXNUaW1lJywgJ0HigKY=


In [307]:
#interactively plot the results
@interact
def scatter_plot(xx=list(joint_meanDat_wide.columns[:]),
                 yy=list(joint_meanDat_wide.columns[:]),
                 cgCutCol=[colName for colName in joint_meanDat_wide.columns \
                           if 'Coarse' in colName],
                 cgCutVal=(0,1200,1),
                 maxRankCut=(0,400,1)):
    #xx='All_Atom.Mean_ResTime'
    #yy='Coarse_Grain.Mean_ResTime'
    tempData=joint_meanDat_wide[joint_meanDat_wide[cgCutCol]<=cgCutVal][[xx,yy]].dropna()
    tempData=tempData[(tempData[xx]>0) | (tempData[yy]>0)]
    tempData['xRank']=tempData[xx].rank()
    tempData['yRank']=tempData[yy].rank()
    print tempData.sort_values('xRank',ascending=False).head(n=10)
    print tempData.sort_values('yRank',ascending=False).head(n=10)

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUneHgnLCBvcHRpb25zPSgnQWxsX0F0b20uTWF4X1Jlc1RpbWUnLCAnQWxsX0F0b20uTWVhbl9SZXNUaW1lJywgJ0HigKY=


In [309]:
@interact
def fit_exp_dist(tempResID=aa_occupancy_data[aa_occupancy_data.Occupancy>0].ResID.unique()):
    tempDat=aa_occupancy_data[aa_occupancy_data['ResID']==tempResID]
    print "First five entries of occupancy data for resid %g"%tempResID
    print tempDat.Occupancy.head()
    print '---'
    print "first five observed 'run' lengths in occupancy"
    runLengths=extract_runs(np.array(tempDat.Occupancy))
    print runLengths[:5]
    print '---'
    print "Histogram of run lengths"
    residenceTimeDist=np.unique(runLengths,return_counts=True)
    print residenceTimeDist
    plt.figure(figsize=(12,12))
    plt.scatter(residenceTimeDist[0],
                residenceTimeDist[1]/(1.*(np.sum(residenceTimeDist[1]))),
                s=8,marker='x',
                label='Data')
    #plt.show()

    print "number of events: %g"%np.sum(residenceTimeDist[0])
    
    #lets get a 95% confidence interval... the needed coefficient is found as sp.stats.norm.ppf(q=.975)
    #n, the number of observations, is equal to the sum of our histogram counts
    lambdaEst=np.sum(residenceTimeDist[0]*residenceTimeDist[1])/np.sum(residenceTimeDist[1])*1.
    lambdaCIradius=sp.stats.norm.ppf(q=.975)*np.sqrt(1.0*lambdaEst**2/np.sum(residenceTimeDist[1]))

    print "Most likely estimate: lambda = %.3f"%lambdaEst
    print "At 95%% confidence: %.3f <= lambda <= %.3f"%(lambdaEst-lambdaCIradius,
                                    lambdaEst+lambdaCIradius)
    lval=lambdaEst-lambdaCIradius
    #plt.plot(residenceTimeDist[0],map(lambda x: np.exp(-x/(lval))/lval,residenceTimeDist[0]),
    #         '#aa0000')
    lval=lambdaEst
    plt.plot(residenceTimeDist[0],map(lambda x: np.exp(-x/(lval))/lval,residenceTimeDist[0]),
             'b',label='l=dataMean')
    lval=lambdaEst+lambdaCIradius
    #plt.plot(residenceTimeDist[0],map(lambda x: np.exp(-x/(lval))/lval,residenceTimeDist[0]),
    #         '#aa0000')
    
    cfitModel=sp.optimize.curve_fit(expDist,
                                residenceTimeDist[0],
                                1.*residenceTimeDist[1]/np.sum(residenceTimeDist[1]),
                                p0=lambdaEst)
    plt.plot(residenceTimeDist[0],
         expDist(residenceTimeDist[0],cfitModel[0][0]),
         'g',label='l=lmsFit')
    print "lambda fit= %.3f"%cfitModel[0][0]
    
    plt.show()
    

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUndGVtcFJlc0lEJywgb3B0aW9ucz0oODQzLCAzMDM0LCAxNjIsIDI0NCwgMjQ1LCAxNTgwLCAxNjgsIDMyNTksIDHigKY=


In [329]:
joint_meanDat_wide[
        joint_meanDat_wide['Coarse_Grain.Total_Occupancy']>5000
    ].sort_values('Coarse_Grain.Total_Occupancy',ascending=False).reset_index().head(n=40)

Measurement,ResID,SeqID,All_Atom.Max_ResTime,All_Atom.Mean_ResTime,All_Atom.Medain_ResTime,All_Atom.Min_ResTime,All_Atom.Total_Occupancy,Coarse_Grain.Max_ResTime,Coarse_Grain.Mean_ResTime,Coarse_Grain.Medain_ResTime,Coarse_Grain.Min_ResTime,Coarse_Grain.Total_Occupancy
0,242,1023,72.96,4.82625,1.32,0.12,154.44,664,67.190476,58.0,1,11288
1,1660,1023,60.6,3.670769,2.16,0.12,143.16,299,23.065708,47.5,1,11233
2,2105,1468,85.2,18.972,1.32,0.24,189.72,385,36.188356,52.0,1,10567
3,3080,1025,144.24,95.46,95.46,46.68,190.92,522,11.861798,40.5,1,10557
4,3893,1838,190.8,190.8,190.8,190.8,190.8,168,10.967708,39.5,1,10529
5,3078,1023,111.12,38.064,2.88,0.24,190.32,297,8.554615,46.5,1,10103
6,2475,1838,105.36,31.68,1.68,0.12,190.08,202,16.284053,42.0,1,9803
7,744,1525,75.84,5.012571,2.7,0.12,175.44,153,12.593131,33.5,1,9533
8,1057,1838,9.6,0.901132,1.38,0.12,47.76,200,11.188014,43.5,1,9521
9,3004,949,28.44,5.49375,3.48,0.12,175.8,121,6.303333,27.5,1,9455


In [328]:
joint_meanDat_wide[
        joint_meanDat_wide['All_Atom.Total_Occupancy']>100
    ].sort_values('All_Atom.Total_Occupancy',ascending=False).reset_index().head(n=40)

Measurement,ResID,SeqID,All_Atom.Max_ResTime,All_Atom.Mean_ResTime,All_Atom.Medain_ResTime,All_Atom.Min_ResTime,All_Atom.Total_Occupancy,Coarse_Grain.Max_ResTime,Coarse_Grain.Mean_ResTime,Coarse_Grain.Medain_ResTime,Coarse_Grain.Min_ResTime,Coarse_Grain.Total_Occupancy
0,2330,1693,191.04,191.04,191.04,191.04,191.04,42,3.108038,14.0,1,5955
1,3080,1025,144.24,95.46,95.46,46.68,190.92,522,11.861798,40.5,1,10557
2,3893,1838,190.8,190.8,190.8,190.8,190.8,168,10.967708,39.5,1,10529
3,743,1524,128.04,63.6,51.6,11.16,190.8,79,4.38561,22.0,1,7802
4,3890,1835,95.4,95.34,95.34,95.28,190.68,155,5.335977,30.0,1,7401
5,3821,1766,94.32,47.58,47.94,0.12,190.32,54,6.859122,22.0,1,8910
6,3078,1023,111.12,38.064,2.88,0.24,190.32,297,8.554615,46.5,1,10103
7,913,1694,130.44,31.72,10.14,2.28,190.32,344,5.424271,26.5,1,5766
8,2475,1838,105.36,31.68,1.68,0.12,190.08,202,16.284053,42.0,1,9803
9,2105,1468,85.2,18.972,1.32,0.24,189.72,385,36.188356,52.0,1,10567


In [337]:
print 'CG: '+', '.join(np.array(joint_meanDat_wide[
        joint_meanDat_wide['Coarse_Grain.Total_Occupancy']>5000
    ].sort_values('Coarse_Grain.Total_Occupancy',ascending=False).reset_index().head(n=40).ResID,
              dtype=str))
print '---'
print 'AA: '+', '.join(np.array(joint_meanDat_wide[
        joint_meanDat_wide['All_Atom.Total_Occupancy']>100
    ].sort_values('All_Atom.Total_Occupancy',ascending=False).reset_index().head(n=40).ResID,
              dtype=str))

CG: 242, 1660, 2105, 3080, 3893, 3078, 2475, 744, 1057, 3004, 3820, 168, 1054, 243, 3821, 2162, 3079, 1056, 2474, 3086, 2899, 2161, 3892, 2331, 1662, 63, 250, 1586, 1661, 743, 3579, 244, 1481, 3890, 1055, 3580, 2473, 2472, 2330, 2901
---
AA: 2330, 3080, 3893, 743, 3890, 3821, 3078, 913, 2475, 2105, 3579, 740, 3580, 2473, 2464, 420, 1579, 1586, 912, 2899, 3891, 1668, 3882, 3004, 744, 63, 1054, 1662, 3086, 3892, 2474, 1580, 168, 3649, 242, 1752, 2851, 2108, 3820, 1433


In [None]:
joint_meanDat_wide[
        joint_meanDat_wide['Coarse_Grain.Total_Occupancy']>5000
    ].sort_values('Coarse_Grain.Total_Occupancy',ascending=False).reset_index().head(n=40).ResID

In [80]:
?np.random.poisson