In [0]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import scipy as sp
from scipy import stats

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import os
import sys
import gc
import copy
import glob

import tqdm
import itertools

import subprocess

In [0]:
#baseDir='./'
!rm -r sample_data

If you are viewing this in google colab, you will need to clone the repository first.
To do so uncomment the two code cells below

In [4]:
!git clone https://github.com/wesleymsmith/Piezo_PIP2_binding_analysis.git

Cloning into 'Piezo_PIP2_binding_analysis'...
remote: Enumerating objects: 1531, done.[K
remote: Counting objects: 100% (1531/1531), done.[K
remote: Compressing objects: 100% (257/257), done.[K
remote: Total 5090 (delta 1250), reused 1520 (delta 1241), pack-reused 3559
Receiving objects: 100% (5090/5090), 172.89 MiB | 23.12 MiB/s, done.
Resolving deltas: 100% (3982/3982), done.
Checking out files: 100% (6644/6644), done.


In [0]:
baseDir='Piezo_PIP2_binding_analysis/'

In [0]:
xcelData=pd.read_excel(baseDir+'Residue_ID_total_occupancy_10_1_2019.xlsx',
              sheet_name=None)

In [0]:
@interact
def show_data(sheet_name=xcelData.keys()):
    return xcelData[sheet_name]

In [13]:
#the above tables are
#aa - result summary for all atom simulation
#cg - result summary for coarse grain simulation
#tension_30ns - results of all atom simulation with membrane tension
#sheet 1 is apparently blank...
#resinfo_table - mapping between cryo-em structure sequence and all atom residue ids
resinfoDataSheet=xcelData['resinfo_table']
resinfoTable=resinfoDataSheet[
    resinfoDataSheet.columns[[0,3,5,7]]][2:]
resinfoTable.columns=['PDB_ID','Arm1_Resid','Arm2_Resid','Arm3_Resid']
resinfoTable.head()

Unnamed: 0,PDB_ID,Arm1_Resid,Arm2_Resid,Arm3_Resid
2,782,1,1419,2837
3,783,2,1420,2838
4,784,3,1421,2839
5,785,4,1422,2840
6,786,5,1423,2841


Before continuing, lets check to see if we have any missing values.

In [14]:
for colName in resinfoTable.columns:
    print '%s:'%colName,
    print resinfoTable[colName].isna().sum()

PDB_ID: 53
Arm1_Resid: 0
Arm2_Resid: 0
Arm3_Resid: 0


In [0]:
resinfoTable[resinfoTable['Arm1_Resid']==1329]

In [0]:
np.isnan(resinfoTableFilled.PDB_ID[97])

After our inspection above, we see there is a problem with trying to directly use the resinfo table above.
Specifically, the 'PDB_ID' column we want to use has missing values!

Fortunately, we know that these missing values are gaps in the pdb sequence for structures that could not be resolved in the cryo-em. The corresponding sequence should, therefore, increase linearly across these gaps.

This makes our imputation strategy relatively straight forward. We will iterate over the PDB_ID column. And keep track of the last value we see. When we find valid (integer) values, we just update the last value variable. If we see a missing value we simply increment that last value variable then set the missing entry to be equal to that value.

In [15]:
resinfoTableFilled=copy.deepcopy(resinfoTable)
print resinfoTableFilled.PDB_ID.isna().sum()
for iEntry in resinfoTableFilled.PDB_ID.index:
    entry=resinfoTableFilled.PDB_ID[iEntry]
    if np.isnan(entry):
        lastVal=lastVal+1
        resinfoTableFilled.PDB_ID[iEntry]=lastVal
    else:
        lastVal=entry
resinfoTableFilled.PDB_ID.isna().sum()

53


0

In [16]:
#the pdb residue id's are sequential, but have gaps corresponding
#to unresolved amino acids in the Cryo-EM structure.
#The easy solution is just to fill in linearly.
#The three arms in our simulation structure are identical, so
#we can generate our back-map by just repeating the pdb sequence 3 times
simResid_to_pdbResid=list(np.array(resinfoTableFilled.PDB_ID))*3
print ', '.join(map(str,simResid_to_pdbResid))

782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 

While the above excel sheet provides a useful summary at a glance, we would like to have direct access to the distribution of residence times rather than just the mean max and cummulative sum over all lipids. 

Using the set of all individually measured residence times, we can fit a model distribution. More specifically, the reciporical of residence time would correspond to a frequency. Specifically, the reciporical of residence time gives us the corresponding unbinding frequency. 

This can then be used to fit an appropriate distribution (geometric distribution would be one choice) and provide a characteristic unbinding frequency (or characteristic residence time as its reciporical). More over, it can give us a bound / confidence interval of this distribution as well.

We can then repeat this process for the all atom model. While CG is expected to have shorter residence times due to the notably lower membrane viscocity, we should still be able to see if the ranking and / or relative characteristic residence time / unbinding frequencies match (for each protein amino acid). If CG can rank amino acids in the correct order, based upon PIP2 residence times, then we can be confident that it is functioning well as a model for correctly predicting lipid binding sites.

Below, this residence time distribution data can be exctracted from the coarse grain simulation data files which list individual PIP2 residence time observations for each protein residue (amino acid)

in the Raw_PIP2_CG_residence_time_data directory, the .xvg files contain
the 'occupancy' of each amino acid at each output time step.
The occupancy is zero if there were no PIP2 lipids in contact
and non-zero if there was at least one PIP2 lipid in contact.
The first step is to extract these individual timeseries
into a joint table.

In [0]:
#os.listdir can be used to generate a list of all xvg files present
!cg_RawData_dir=baseDir+'Raw_PIP2_CG_residence_time_data/'
cg_RawData_dir=baseDir+'Raw_POPS_data/calc_CG_all_lysArg/'
cg_dataFile_list=[dataFileName for dataFileName in os.listdir(cg_RawData_dir) \
             if 'xvg' in dataFileName]
cg_dataFile_list

In [17]:
#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
cgDataTables=[]
for dataFileName in tqdm.tqdm_notebook(cg_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        cg_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    cgDataTables.append(copy.deepcopy(tempTable))
    
cg_occupancy_data=pd.concat(cgDataTables)
cg_occupancy_data.to_csv(baseDir+"Coarse_Grain_POPS_Occupancy_Data.csv",index=False)
print cg_occupancy_data.head()
print cg_occupancy_data.tail()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))


   ResID  SeqID  Frame    Time  Occupancy
0   3677   1969      0     0.0          0
1   3677   1969      1  1000.0          0
2   3677   1969      2  2000.0          0
3   3677   1969      3  3000.0          0
4   3677   1969      4  4000.0          0
       ResID  SeqID  Frame        Time  Occupancy
11996   1838   1201  11996  11996000.0          0
11997   1838   1201  11997  11997000.0          0
11998   1838   1201  11998  11998000.0          0
11999   1838   1201  11999  11999000.0          0
12000   1838   1201  12000  12000000.0          0


In [18]:
residtmp=pd.DataFrame(cg_occupancy_data.ResID.unique())
print residtmp.count()
print "......"
segidtmp=pd.DataFrame(cg_occupancy_data.SeqID.unique())
print segidtmp.count()
print residtmp.count()/segidtmp.count()


0    429
dtype: int64
......
0    143
dtype: int64
0    3.0
dtype: float64


In [24]:
#The coarse grain data table is huge, so we need to split it into chunks

os.system("split -l 100000 "+\
          baseDir+"Coarse_Grain_POPS_Occupancy_Data.csv "+\
          baseDir+"Raw_POPS_data/Coarse_Grain_POPS_Occupancy_Data.chunk.")
os.system("rm "+baseDir+"Coarse_Grain_POPS_Occupancy_Data.csv")

0

Next, lets collect the occupancy data for the all atom simulation into a single data frame.

In [25]:
aa_RawData_dir=baseDir+'Raw_POPS_data/calc_aa_190ns_lysArg/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]
aa_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
aaDataTables=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(copy.deepcopy(tempTable))
    
aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.to_csv(baseDir+"All_Atom_POPS_Occupancy_Data.190ns_0.12ns.csv",index=False)
print aa_occupancy_data.head()
print aa_occupancy_data.tail()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))


   ResID  SeqID  Frame  Time  Occupancy
0   3677   1969      0   0.0          0
1   3677   1969      1   1.0          0
2   3677   1969      2   2.0          0
3   3677   1969      3   3.0          0
4   3677   1969      4   4.0          0
      ResID  SeqID  Frame    Time  Occupancy
1587   1838   1201   1587  1587.0          1
1588   1838   1201   1588  1588.0          1
1589   1838   1201   1589  1589.0          1
1590   1838   1201   1590  1590.0          1
1591   1838   1201   1591  1591.0          1


In [20]:
aa_occupancy_data.Time.max()*0.12

190.92

In [21]:
residtmp=pd.DataFrame(aa_occupancy_data.ResID.unique())
print aa_occupancy_data.ResID.count()
print residtmp.count()
print "......"
segidtmp=pd.DataFrame(aa_occupancy_data.SeqID.unique())
print segidtmp.count()
print residtmp.count()/segidtmp.count()

682968
0    429
dtype: int64
......
0    143
dtype: int64
0    3.0
dtype: float64


In [26]:
aa_RawData_dir=baseDir+'stretch50ns_anton2_68mNm/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]
aa_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
aaDataTables=[]
tempTable=[]
aa_occupancy_data=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(copy.deepcopy(tempTable))
    
aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.to_csv(baseDir+"All_Atom_PIP2_Occupancy_Data.stretch50ns_0.12ns.csv",index=False)
print aa_occupancy_data.head()
print aa_occupancy_data.tail()
print aa_occupancy_data.Time.max()*0.12
residtmp=pd.DataFrame(aa_occupancy_data.ResID.unique())
print aa_occupancy_data.ResID.count()/aa_occupancy_data.Frame.max()
print residtmp.count()
print "......"
segidtmp=pd.DataFrame(aa_occupancy_data.SeqID.unique())
print segidtmp.count()
print residtmp.count()/segidtmp.count()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))


   ResID  SeqID  Frame  Time  Occupancy
0   3677   1969      0   0.0          0
1   3677   1969      1   1.0          0
2   3677   1969      2   2.0          0
3   3677   1969      3   3.0          0
4   3677   1969      4   4.0          0
     ResID  SeqID  Frame   Time  Occupancy
413   1838   1201    413  413.0          0
414   1838   1201    414  414.0          0
415   1838   1201    415  415.0          0
416   1838   1201    416  416.0          0
417   1838   1201    417  417.0          0
50.04
430
0    429
dtype: int64
......
0    143
dtype: int64
0    3.0
dtype: float64


In [27]:
aa_RawData_dir=baseDir+'Raw_PIP2_AA2us_residence_time_data/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]
aa_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
aaDataTables=[]
tempTable=[]
aa_occupancy_data=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(copy.deepcopy(tempTable))
    
aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.to_csv(baseDir+"All_Atom_PIP2_Occupancy_Data.2us_0.6ns.csv",index=False)
print aa_occupancy_data.head()
print aa_occupancy_data.tail()
print aa_occupancy_data.Time.max()*0.6
residtmp=pd.DataFrame(aa_occupancy_data.ResID.unique())
print aa_occupancy_data.ResID.count()/aa_occupancy_data.Frame.max()
print residtmp.count()
print "......"
segidtmp=pd.DataFrame(aa_occupancy_data.SeqID.unique())
print segidtmp.count()
print residtmp.count()/segidtmp.count()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))


   ResID  SeqID  Frame  Time  Occupancy
0   3677   1969      0   0.0          0
1   3677   1969      1   5.0          0
2   3677   1969      2  10.0          0
3   3677   1969      3  15.0          0
4   3677   1969      4  20.0          0
      ResID  SeqID  Frame     Time  Occupancy
3329   1838   1201   3329  16645.0          0
3330   1838   1201   3330  16650.0          0
3331   1838   1201   3331  16655.0          0
3332   1838   1201   3332  16660.0          0
3333   1838   1201   3333  16665.0          0
9999.0
429
0    429
dtype: int64
......
0    143
dtype: int64
0    3.0
dtype: float64




---


# **Becaues it's more convenient to have a single notebook file in google colab, below we start the Analyze_Occupancy_Data **

In [0]:
def extract_runs(x):
    return [len(list(gg)) for kk,gg in itertools.groupby(x, bool) if kk]

def extract_resDist(x):
    return(np.unique(extract_runs(x),return_counts=True))

def bin_runs(x,binWidth=1.000,frameRate=1.,center='right'):
    init_dist=extract_resDist(x)
    x_dist=(init_dist[0]*frameRate,init_dist[1])
    binMax=np.max(x_dist[0])
    nBins=np.ceil(binMax/binWidth)+1
    hbins=np.arange(nBins)*binWidth
    temp_dist=np.histogram(x_dist[0],weights=x_dist[1],bins=hbins)
    outDat=[temp_dist[1],temp_dist[0]]
    if center=='midpoint':
        outDat[0]=(temp_dist[1][1:]+temp_dist[1][:-1])/2.
    elif center=="left":
        outDat[0]=temp_dist[1][:-1]
    elif center=="right":
        outDat[0]=temp_dist[1][1:]
    return(outDat)

def expDist(x,l):
    return np.exp(-x/l)/l

def frequencyDistribution_mle_exp_params(bin_dist,bias_correction=True):
    Nv=np.sum(bin_dist[1])
    if bias_correction & (Nv>2):
        Nv=Nv-2
    return (np.sum(bin_dist[0]*bin_dist[1])/(1.*Nv))

        

def beta_dist(x,a,b):
    return (x**(a-1.)*(1.-x)**(b-1.))/\
        (sp.special.gamma(a)*sp.special.gamma(b)/sp.special.gamma(a+b))
def gamma_dist(x,t,k):
    return 1/(sp.special.gamma(k)*(t**k))*x**(k-1.)*np.exp(-x/t)

def frequencyDistribution_mle_gamma_params(bin_dist,bias_correction=True):
    xi,fi=bin_dist
    Nv=np.sum(fi)
    sum_f=np.sum(fi*xi)
    sum_lnf=np.sum(fi*np.log(xi))
    sum_flnf=np.sum(fi*xi*np.log(xi))
    k_est=(Nv*sum_f)/(Nv*sum_flnf-sum_lnf*sum_f)
    t_est=1./(Nv**2)*(Nv*sum_flnf-sum_lnf*sum_f)
    if bias_correction:
        t_est=Nv*t_est/(Nv-1)
        k_est=k_est-1./Nv*(3.*k_est-2./3.*(k_est/(1+k_est))-4./5.*(k_est/(1+k_est)**2))
    return [t_est,k_est]


def calc_GammaParams(x,fr=1.0,bw=4.8):
    return list(list(
        frequencyDistribution_mle_gamma_params(
             bin_runs(x,frameRate=fr,binWidth=bw),
             bias_correction=True))
        ) if np.sum(x)>0 else [np.nan,np.nan]

def calc_ExpParams(x,fr=1.0,bw=4.8):
    return frequencyDistribution_mle_exp_params(
                bin_runs(x,frameRate=fr,binWidth=bw)
            ) if np.sum(x)>0 else np.nan

def calc_GammaRMSE(x,fr=1.0,bw=4.8):
    if np.sum(x)>0:
        xDist=bin_runs(x,frameRate=fr,binWidth=bw)
        fitParams=calc_GammaParams(x)
        return np.sqrt(
            np.sum(
                (xDist[1]/np.sum(xDist[1])-\
                gamma_dist(xDist[0],*fitParams))**2)/\
            len(xDist[1]))
    else:
        return np.nan
        
def calc_ExpRMSE(x,fr=1.0,bw=4.8):
    if np.sum(x)>0:
        xDist=bin_runs(x,frameRate=fr,binWidth=bw)
        fitParams=[calc_ExpParams(x)]
        return np.sqrt(
            np.sum(
                (xDist[1]/np.sum(xDist[1])-\
                expDist(xDist[0],*fitParams))**2)/\
            len(xDist[1]))
    else:
        return np.nan


In [28]:
import glob
cg_occupancy_data_files=glob.glob(baseDir+"Raw_POPS_data/Coarse_Grain_POPS_Occupancy_Data.chunk*")
print "first cg data frame: '%s'"%np.sort(cg_occupancy_data_files)[0]

first cg data frame: 'Piezo_PIP2_binding_analysis/Raw_POPS_data/Coarse_Grain_POPS_Occupancy_Data.chunk.aa'


In [29]:
cg_frames=[]
for iFile,cg_file in tqdm.tqdm_notebook(enumerate(np.sort(cg_occupancy_data_files))):
    if iFile==0:
        cg_frames.append(pd.read_csv(cg_file))
        colNames=cg_frames[0].columns
    else:
        cg_frames.append(pd.read_csv(cg_file,names=colNames))
cg_occupancy_data=pd.concat(cg_frames)
cg_frames=[]
gc.collect()
aa_occupancy_data=pd.read_csv(baseDir+"All_Atom_POPS_Occupancy_Data.190ns_0.12ns.csv")

print cg_occupancy_data.head()
print aa_occupancy_data.head()
np.max(aa_occupancy_data.Time.unique())*.12

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


   ResID  SeqID  Frame    Time  Occupancy
0   3677   1969      0     0.0          0
1   3677   1969      1  1000.0          0
2   3677   1969      2  2000.0          0
3   3677   1969      3  3000.0          0
4   3677   1969      4  4000.0          0
   ResID  SeqID  Frame  Time  Occupancy
0   3677   1969      0   0.0          0
1   3677   1969      1   1.0          0
2   3677   1969      2   2.0          0
3   3677   1969      3   3.0          0
4   3677   1969      4   4.0          0


190.92

In [51]:
aaFrameRate=0.12 #time between frames in ns for all atom
aaBinWidth=4.0 #binning size in ns

aaKwds={'fr':aaFrameRate,'bw':aaBinWidth}
aaFitDat=aa_occupancy_data
aaFitDat['Wait']=aaFitDat.Occupancy.map(lambda x: not x)
print aaFitDat.head()
aaFitFrame=aaFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     
     "Wait": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     })

aaFitFrame.columns=aaFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  aaFitFrame[entryType+'.Gamma_Mean']=aaFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  aaFitFrame[entryType+'.Gamma_k']=aaFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  aaFitFrame=aaFitFrame.drop(columns=entryType+'.GammaDist_Params')

aaFitFrame['SimType']='AA'
aaFitFrame=aaFitFrame.reset_index()
aaFitFrame=aaFitFrame[["SimType","ResID", "SeqID","Occupancy.Total", 
                       "Occupancy.N","Occupancy.Max","Occupancy.Exp_Mean","Occupancy.Exp_RMSE",
                       "Occupancy.Gamma_Mean","Occupancy.Gamma_k","Occupancy.Gamma_RMSE",
                       "Wait.N","Wait.Max","Wait.Exp_Mean","Wait.Exp_RMSE",
                       "Wait.Gamma_Mean","Wait.Gamma_k","Wait.Gamma_RMSE"]]
aaFitFrame.head()

   ResID  SeqID  Frame  Time  Occupancy  Wait
0   3677   1969      0   0.0          0  True
1   3677   1969      1   1.0          0  True
2   3677   1969      2   2.0          0  True
3   3677   1969      3   3.0          0  True
4   3677   1969      4   4.0          0  True




Unnamed: 0,SimType,ResID,SeqID,Occupancy.Total,Occupancy.N,Occupancy.Max,Occupancy.Exp_Mean,Occupancy.Exp_RMSE,Occupancy.Gamma_Mean,Occupancy.Gamma_k,Occupancy.Gamma_RMSE,Wait.N,Wait.Max,Wait.Exp_Mean,Wait.Exp_RMSE,Wait.Gamma_Mean,Wait.Gamma_k,Wait.Gamma_RMSE
0,AA,1,782,0.0,0,0.0,,,,,,1,191.04,192.0,0.144258,,,
1,AA,15,796,0.0,0,0.0,,,,,,1,191.04,192.0,0.144258,,,
2,AA,21,802,20.04,39,2.88,4.216216,0.921266,,,0.880988,40,89.64,8.315789,0.010936,7.627288,0.685337,0.007966
3,AA,22,803,0.0,0,0.0,,,,,,1,191.04,192.0,0.144258,,,
4,AA,30,811,0.0,0,0.0,,,,,,1,191.04,192.0,0.144258,,,


In [52]:
cgFrameRate=1.0 #time between frames in ns for coarse grain
cgBinWidth=4.0

cgKwds={'fr':cgFrameRate,'bw':cgBinWidth}
cgFitDat=cg_occupancy_data 
cgFitDat['Wait']=cgFitDat.Occupancy.map(lambda x: not x)
print cgFitDat.head()
cgFitFrame=cgFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total":lambda x: cgFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*cgFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**cgKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**cgKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
        },
     
     "Wait": {
         "Total":lambda x: cgFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*cgFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**cgKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**cgKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
        },
     })

cgFitFrame.columns=cgFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  cgFitFrame[entryType+'.Gamma_Mean']=cgFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  cgFitFrame[entryType+'.Gamma_k']=cgFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  cgFitFrame=cgFitFrame.drop(columns=entryType+'.GammaDist_Params')
  
cgFitFrame['SimType']='CG'
cgFitFrame=cgFitFrame.reset_index()
cgFitFrame=cgFitFrame[["SimType","ResID", "SeqID","Occupancy.Total", 
                       "Occupancy.N","Occupancy.Max","Occupancy.Exp_Mean","Occupancy.Exp_RMSE",
                       "Occupancy.Gamma_Mean","Occupancy.Gamma_k","Occupancy.Gamma_RMSE",
                       "Wait.N","Wait.Max","Wait.Exp_Mean","Wait.Exp_RMSE",
                       "Wait.Gamma_Mean","Wait.Gamma_k","Wait.Gamma_RMSE"]]  
cgFitFrame.head()

   ResID  SeqID  Frame    Time  Occupancy  Wait
0   3677   1969      0     0.0          0  True
1   3677   1969      1  1000.0          0  True
2   3677   1969      2  2000.0          0  True
3   3677   1969      3  3000.0          0  True
4   3677   1969      4  4000.0          0  True




Unnamed: 0,SimType,ResID,SeqID,Occupancy.Total,Occupancy.N,Occupancy.Max,Occupancy.Exp_Mean,Occupancy.Exp_RMSE,Occupancy.Gamma_Mean,Occupancy.Gamma_k,Occupancy.Gamma_RMSE,Wait.N,Wait.Max,Wait.Exp_Mean,Wait.Exp_RMSE,Wait.Gamma_Mean,Wait.Gamma_k,Wait.Gamma_RMSE
0,CG,1,782,4468,928,50,7.365011,0.026641,7.335326,2.470362,0.037828,929,166.0,10.783172,0.013468,10.74357,0.864513,0.013375
1,CG,15,796,340,262,7,4.107692,0.069764,4.045232,77.641998,0.0,263,2053.0,47.187739,0.00217,46.634071,0.369983,0.001893
2,CG,21,802,4,4,1,8.0,0.931329,,,,5,6803.0,4004.0,0.000133,1573.667915,0.4161,0.000335
3,CG,22,803,0,0,0,,,,,,1,12001.0,12004.0,0.018254,,,
4,CG,30,811,0,0,0,,,,,,1,12001.0,12004.0,0.018254,,,


In [55]:
jointFitFrame=pd.concat([aaFitFrame,cgFitFrame])
jointFitMelt=jointFitFrame.melt(id_vars=["SimType","ResID","SeqID"],var_name="Fit_Param")
print jointFitMelt.head()

jointFitWide=jointFitMelt
jointFitWide["Measurement"]=jointFitWide.SimType+"."+jointFitWide.Fit_Param
jointFitWide=jointFitWide.drop(columns=["SimType","Fit_Param"])
jointFitWide=jointFitWide.reset_index()
jointFitWide=pd.pivot_table(index=["ResID","SeqID"],columns="Measurement",values="value",data=jointFitWide)
#jointFitWide.columns=jointFitWide.columns.map(lambda x: x[1])
jointFitWide=jointFitWide.reset_index()
#print jointFitWide.columns
jointFitWide.to_csv(baseDir+"POPS_aa_cg_joint_Fit_Data_wide.csv",index=False)
print jointFitWide.head()
jointFitWide.describe()

  SimType  ResID  SeqID        Fit_Param  value
0      AA      1    782  Occupancy.Total   0.00
1      AA     15    796  Occupancy.Total   0.00
2      AA     21    802  Occupancy.Total  20.04
3      AA     22    803  Occupancy.Total   0.00
4      AA     30    811  Occupancy.Total   0.00
Measurement  ResID  SeqID  ...  CG.Wait.Max  CG.Wait.N
0                1    782  ...        166.0      929.0
1               15    796  ...       2053.0      263.0
2               21    802  ...       6803.0        5.0
3               22    803  ...      12001.0        1.0
4               30    811  ...      12001.0        1.0

[5 rows x 32 columns]


Measurement,ResID,SeqID,AA.Occupancy.Exp_Mean,AA.Occupancy.Exp_RMSE,AA.Occupancy.Gamma_Mean,AA.Occupancy.Gamma_RMSE,AA.Occupancy.Gamma_k,AA.Occupancy.Max,AA.Occupancy.N,AA.Occupancy.Total,AA.Wait.Exp_Mean,AA.Wait.Exp_RMSE,AA.Wait.Gamma_Mean,AA.Wait.Gamma_RMSE,AA.Wait.Gamma_k,AA.Wait.Max,AA.Wait.N,CG.Occupancy.Exp_Mean,CG.Occupancy.Exp_RMSE,CG.Occupancy.Gamma_Mean,CG.Occupancy.Gamma_RMSE,CG.Occupancy.Gamma_k,CG.Occupancy.Max,CG.Occupancy.N,CG.Occupancy.Total,CG.Wait.Exp_Mean,CG.Wait.Exp_RMSE,CG.Wait.Gamma_Mean,CG.Wait.Gamma_RMSE,CG.Wait.Gamma_k,CG.Wait.Max,CG.Wait.N
count,429.0,429.0,90.0,90.0,50.0,74.0,50.0,429.0,429.0,429.0,429.0,429.0,88.0,90.0,88.0,429.0,429.0,136.0,136.0,99.0,95.0,99.0,429.0,429.0,429.0,429.0,429.0,136.0,136.0,136.0,429.0,429.0
mean,2137.230769,1671.146853,5.212009,0.431114,4.745129,0.295495,27.777384,2.702378,10.72028,10.939301,157.110942,0.122786,5.157396,1122434000.0,9.432348,164.742378,11.60373,6.55343,0.275923,6.11994,0.085483,16.16256,15.748252,134.300699,481.375291,8538.898975,0.013819,89.553802,5.283698e+120,0.047004,8936.545455,135.258741
std,1230.085,560.510031,1.710814,0.440063,1.188704,0.367946,47.220755,11.085029,28.679849,32.586798,70.494485,0.074162,21.639881,10648340000.0,22.301582,56.957761,28.452068,5.258705,0.390201,4.518699,0.086387,30.576604,73.349197,301.41071,1260.030537,5304.47456,0.006883,952.784462,6.161798e+121,4.129998,4771.51575,301.314202
min,1.0,782.0,4.0,0.008736,3.600751,0.006101,0.575373,0.0,0.0,0.0,4.068966,0.000591,-93.250865,0.002496697,-6.597627,3.24,1.0,4.0,0.005233,-15.777966,0.0,-0.114365,0.0,0.0,0.0,6.503294,5.8e-05,-5958.225613,0.0002640655,-45.462365,127.0,1.0
25%,1055.0,1154.0,4.217718,0.040691,4.023201,0.034967,3.430534,0.0,0.0,0.0,192.0,0.144258,4.220847,0.006399313,0.456592,191.04,1.0,4.255058,0.028494,4.309464,0.03113,2.320283,0.0,0.0,0.0,401.6,0.006891,12.706416,0.001231952,0.321404,3656.0,1.0
50%,2105.0,1671.0,4.53832,0.065521,4.238464,0.054802,10.442717,0.0,0.0,0.0,192.0,0.144258,5.360814,0.01601019,1.480661,191.04,1.0,4.913291,0.05014,4.973206,0.068793,4.9144,0.0,0.0,0.0,12004.0,0.018254,31.241628,0.003051639,0.443608,12001.0,1.0
75%,3209.0,2183.0,5.401289,0.915264,4.983823,0.732469,18.869306,0.0,0.0,0.0,192.0,0.144258,10.937788,0.04389535,5.771043,191.04,1.0,6.734937,0.909459,6.624123,0.115352,16.740905,3.0,31.0,59.0,12004.0,0.018254,141.575962,0.00540234,0.585101,12001.0,32.0
max,4254.0,2546.0,12.0,0.96473,8.59606,0.999873,230.799283,130.68,163.0,176.28,200.0,0.922961,29.468288,101019100000.0,135.55897,191.04,163.0,48.0,0.947398,28.180462,0.707107,177.934207,1153.0,1539.0,7231.0,12004.0,0.018254,1835.647122,7.18583e+122,1.742315,12001.0,1540.0




---


### Fit the occupancy/wait data for a cluster of consequtive residues 
(if one of 
the residue has occupany 1, the whole cluster has occupancy 1) 

In [56]:
print "making domain keys"
domainKey=aa_occupancy_data[['ResID','SeqID']].sort_values(['ResID','SeqID']).apply(lambda x: '.'.join(map(str,x)),axis=1).unique()
#domainKey
domainNames={domainKey[0]:domainKey[0]}
currentID=domainKey[0]
print "making domain names"
for iEntry,domainID in tqdm.tqdm_notebook(enumerate(domainKey[1:])):
  if int(domainID.split('.')[-1])-1!=int(domainKey[iEntry].split('.')[-1]):
    currentID=domainID
  domainNames[domainID]=currentID
domainNames

making domain keys
making domain names


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))




{'1.782': '1.782',
 '1007.2135': '1007.2135',
 '1038.2166': '1038.2166',
 '1041.2169': '1041.2169',
 '1045.2173': '1045.2173',
 '1046.2174': '1045.2173',
 '1051.2179': '1051.2179',
 '1054.2182': '1054.2182',
 '1055.2183': '1054.2182',
 '1056.2184': '1054.2182',
 '1057.2185': '1054.2182',
 '1060.2188': '1060.2188',
 '1086.2214': '1086.2214',
 '1103.2231': '1103.2231',
 '1167.2295': '1167.2295',
 '1173.2301': '1173.2301',
 '1177.2305': '1177.2305',
 '1190.2318': '1190.2318',
 '1197.2325': '1197.2325',
 '1201.2329': '1201.2329',
 '1211.2339': '1211.2339',
 '122.903': '122.903',
 '1223.2351': '1223.2351',
 '1224.2352': '1223.2351',
 '1233.2361': '1233.2361',
 '1246.2374': '1246.2374',
 '1249.2377': '1249.2377',
 '1260.2388': '1260.2388',
 '1274.2402': '1274.2402',
 '1278.2406': '1278.2406',
 '1279.2407': '1278.2406',
 '1294.2422': '1294.2422',
 '1310.2438': '1310.2438',
 '1324.2452': '1324.2452',
 '134.915': '134.915',
 '135.916': '134.915',
 '1351.2479': '1351.2479',
 '1354.2482': '1354.2

In [57]:
aa_occupancy_data['Domain']=aa_occupancy_data[['ResID','SeqID']].sort_values(
    ['ResID','SeqID']).apply(
        lambda x: domainNames['.'.join(map(str,x))],axis=1)
aa_occupancy_data.head()

Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy,Wait,Domain
0,3677,1969,0,0.0,0,True,3677.1969
1,3677,1969,1,1.0,0,True,3677.1969
2,3677,1969,2,2.0,0,True,3677.1969
3,3677,1969,3,3.0,0,True,3677.1969
4,3677,1969,4,4.0,0,True,3677.1969


In [58]:
aa_domain_data=aa_occupancy_data.groupby(['Domain','Frame','Time']).agg({
    "Occupancy":lambda x: np.sum(x)>0
})
aa_domain_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Occupancy
Domain,Frame,Time,Unnamed: 3_level_1
1.782,0,0.0,False
1.782,1,1.0,False
1.782,2,2.0,False
1.782,3,3.0,False
1.782,4,4.0,False


In [59]:
aa_domain_data=aa_domain_data.reset_index()
aa_domain_data.head()

Unnamed: 0,Domain,Frame,Time,Occupancy
0,1.782,0,0.0,False
1,1.782,1,1.0,False
2,1.782,2,2.0,False
3,1.782,3,3.0,False
4,1.782,4,4.0,False


In [60]:
aa_domain_data['Wait']=aa_domain_data['Occupancy'].map(lambda x: not x)
aa_domain_data.head()

Unnamed: 0,Domain,Frame,Time,Occupancy,Wait
0,1.782,0,0.0,False,True
1,1.782,1,1.0,False,True
2,1.782,2,2.0,False,True
3,1.782,3,3.0,False,True
4,1.782,4,4.0,False,True


In [120]:
aaFrameRate=0.12 #time between frames in ns for all atom
aaBinWidth=4.0 #binning size in ns
cgFrameRate=1.0 #time between frames in ns for coarse grain
cgBinWidth=4.0

aaKwds={'fr':aaFrameRate,'bw':aaBinWidth}
cgKwds={'fr':cgFrameRate,'bw':cgBinWidth}

aaFitDat=aa_domain_data #aa_occupancy_data
#aaFitDat['Wait']=aaFitDat.Occupancy.map(lambda x: not x)
print aaFitDat.head()
#aaFitFrame=aaFitDat.groupby(['ResID','SeqID']).agg()
aaFitFrame=aaFitDat.groupby('Domain').agg(
    {"Occupancy": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max_ResTime":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "GammaDist_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "ExpDist_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "ExpDist_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     
     "Wait": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max_ResTime":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "GammaDist_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "ExpDist_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "ExpDist_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     })
aaFitFrame.columns=aaFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  aaFitFrame[entryType+'.GammaDist_Mean']=aaFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  aaFitFrame[entryType+'.GammaDist_k']=aaFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  aaFitFrame=aaFitFrame.drop(columns=entryType+'.GammaDist_Params')
aaFitFrame.head()

  Domain  Frame  Time  Occupancy  Wait
0  1.782      0   0.0      False  True
1  1.782      1   1.0      False  True
2  1.782      2   2.0      False  True
3  1.782      3   3.0      False  True
4  1.782      4   4.0      False  True




Unnamed: 0_level_0,Occupancy.Max_ResTime,Occupancy.ExpDist_Mean,Occupancy.N,Occupancy.GammaDist_RMSE,Occupancy.Total,Occupancy.ExpDist_RMSE,Wait.Max_ResTime,Wait.ExpDist_Mean,Wait.N,Wait.GammaDist_RMSE,Wait.Total,Wait.ExpDist_RMSE,Wait.GammaDist_Mean,Wait.GammaDist_k,Occupancy.GammaDist_Mean,Occupancy.GammaDist_k
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1.782,0.0,,0,,0.0,,191.04,192.0,1,,191.04,0.144258,,,,
1007.2135,0.0,,0,,0.0,,191.04,192.0,1,,191.04,0.144258,,,,
1038.2166,0.0,,0,,0.0,,191.04,192.0,1,,191.04,0.144258,,,,
1041.2169,4.92,4.24,52,0.109851,20.28,0.067234,59.04,6.666667,53,0.012828,170.76,0.015219,6.217662,1.404077,3.917729,73.477145
1045.2173,0.48,5.0,10,,2.16,0.91443,155.28,24.888889,11,0.004441,188.88,0.003813,18.037186,0.346131,,


In [96]:
aaFitFrame.columns.map(lambda x: '.'.join(x))

Index([u'Occupancy.Max_ResTime', u'Occupancy.ExpDist_Mean',
       u'Occupancy.GammaDist_Params', u'Occupancy.N',
       u'Occupancy.GammaDist_RMSE', u'Occupancy.Total_Occupancy',
       u'Occupancy.ExpDist_RMSE', u'Wait.Max_ResTime', u'Wait.ExpDist_Mean',
       u'Wait.GammaDist_Params', u'Wait.N', u'Wait.GammaDist_RMSE',
       u'Wait.Total_Occupancy', u'Wait.ExpDist_RMSE'],
      dtype='object')

In [81]:
aaFrameRate=0.12 #time between frames in ns for all atom
aaBinWidth=4.0 #binning size in ns
cgFrameRate=1.0 #time between frames in ns for coarse grain
cgBinWidth=4.0

aaKwds={'fr':aaFrameRate,'bw':aaBinWidth}
cgKwds={'fr':cgFrameRate,'bw':cgBinWidth}

aaFitDat=aa_occupancy_data
aaFitDat['Wait']=aaFitDat.Occupancy.map(lambda x: not x)
aaFitFrame=aaFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total_Occupancy":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max_ResTime":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "GammaDist_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "ExpDist_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "ExpDist_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     "Wait": {
         "Total_Occupancy":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max_ResTime":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "GammaDist_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "ExpDist_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "ExpDist_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     })
aaFitFrame.columns=aaFitFrame.columns.map(lambda x: x[1])
aaFitFrame=aaFitFrame.reset_index()
aaFitFrame['GammaDist_Mean']=aaFitFrame.GammaDist_Params.map(np.product)
aaFitFrame['GammaDist_k']=aaFitFrame.GammaDist_Params.map(lambda x: x[1])
aaFitFrame=aaFitFrame.drop(columns='GammaDist_Params')
aaFitFrame['SimType']='All_Atom'
aaFitFrame=aaFitFrame[["SimType","ResID","SeqID","Total_Occupancy","N","Max_ResTime",
                       "ExpDist_Mean","ExpDist_RMSE","GammaDist_Mean","GammaDist_k","GammaDist_RMSE"]]
#aaFitFrame=aaFitFrame.dropna()
print aaFitFrame.head()

cgFitDat=cg_occupancy_data 
cgFitFrame=cgFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
        "Total_Occupancy":np.sum,
        "N":lambda x: len(extract_runs(x)),
        "Max_ResTime":lambda x: np.max(extract_runs(x)) if np.sum(x) > 0 else 0,
        "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), #lambda x: list(list(frequencyDistribution_mle_gamma_params(
         #bin_runs(x,frameRate=.12,binWidth=binwidth),bias_correction=True))) if np.sum(x)>0 else [np.nan,np.nan],
        "GammaDist_RMSE": lambda x: calc_GammaRMSE(x,**cgKwds),
        "ExpDist_Mean": lambda x: calc_ExpParams(x,**cgKwds), #lambda x: frequencyDistribution_mle_exp_params(
            #bin_runs(x,frameRate=.12,binWidth=binwidth)) if np.sum(x)>0 else np.nan}
        "ExpDist_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
    }})
cgFitFrame.columns=cgFitFrame.columns.map(lambda x: x[1])
cgFitFrame=cgFitFrame.reset_index()
cgFitFrame['GammaDist_Mean']=cgFitFrame.GammaDist_Params.map(np.product)
cgFitFrame['GammaDist_k']=cgFitFrame.GammaDist_Params.map(lambda x: x[1])
cgFitFrame=cgFitFrame.drop(columns='GammaDist_Params')
cgFitFrame['SimType']='Coarse_Grain'
cgFitFrame=cgFitFrame[["SimType","ResID","SeqID","Total_Occupancy","N","Max_ResTime",
                       "ExpDist_Mean","ExpDist_RMSE","GammaDist_Mean","GammaDist_k","GammaDist_RMSE"]]
#cgFitFrame=cgFitFrame.dropna()
print cgFitFrame.head()



    SimType  ResID  SeqID  ...  GammaDist_Mean  GammaDist_k  GammaDist_RMSE
0  All_Atom      1    782  ...             NaN          NaN             NaN
1  All_Atom     15    796  ...             NaN          NaN             NaN
2  All_Atom     21    802  ...             NaN          NaN        0.880988
3  All_Atom     22    803  ...             NaN          NaN             NaN
4  All_Atom     30    811  ...             NaN          NaN             NaN

[5 rows x 11 columns]




        SimType  ResID  SeqID  ...  GammaDist_Mean  GammaDist_k  GammaDist_RMSE
0  Coarse_Grain      1    782  ...        7.335326     2.470362        0.037828
1  Coarse_Grain     15    796  ...        4.045232    77.641998        0.000000
2  Coarse_Grain     21    802  ...             NaN          NaN             NaN
3  Coarse_Grain     22    803  ...             NaN          NaN             NaN
4  Coarse_Grain     30    811  ...             NaN          NaN             NaN

[5 rows x 11 columns]


In [82]:
jointFitFrame=pd.concat([aaFitFrame,cgFitFrame])
#jointFitFrame.to_csv(baseDir+"POPS_aa_cg_joint_distribution_fit_frame.csv",index=False)
print jointFitFrame.head()
print jointFitFrame.tail()

    SimType  ResID  SeqID  ...  GammaDist_Mean  GammaDist_k  GammaDist_RMSE
0  All_Atom      1    782  ...             NaN          NaN             NaN
1  All_Atom     15    796  ...             NaN          NaN             NaN
2  All_Atom     21    802  ...             NaN          NaN        0.880988
3  All_Atom     22    803  ...             NaN          NaN             NaN
4  All_Atom     30    811  ...             NaN          NaN             NaN

[5 rows x 11 columns]
          SimType  ResID  SeqID  ...  GammaDist_Mean  GammaDist_k  GammaDist_RMSE
424  Coarse_Grain   4236   2528  ...             NaN          NaN             NaN
425  Coarse_Grain   4242   2534  ...             NaN          NaN             NaN
426  Coarse_Grain   4249   2541  ...             NaN          NaN             NaN
427  Coarse_Grain   4252   2544  ...             NaN          NaN             NaN
428  Coarse_Grain   4254   2546  ...             NaN          NaN             NaN

[5 rows x 11 columns]


In [83]:
jointFitMelt=jointFitFrame.melt(id_vars=["SimType","ResID","SeqID"],var_name="Fit_Param")
print jointFitMelt.head()

jointFitWide=jointFitMelt
jointFitWide["Measurement"]=jointFitWide.SimType+"."+jointFitWide.Fit_Param
jointFitWide=jointFitWide.drop(columns=["SimType","Fit_Param"])
jointFitWide=jointFitWide.reset_index()
jointFitWide=pd.pivot_table(index=["ResID","SeqID"],columns="Measurement",values="value",data=jointFitWide)
#jointFitWide.columns=jointFitWide.columns.map(lambda x: x[1])
jointFitWide=jointFitWide.reset_index()
print jointFitWide.columns
jointFitWide.to_csv(baseDir+"POPS_aa_cg_joint_Fit_Data_wide.csv",index=False)
jointFitWide.head()
jointFitWide.describe()

    SimType  ResID  SeqID        Fit_Param  value
0  All_Atom      1    782  Total_Occupancy   0.00
1  All_Atom     15    796  Total_Occupancy   0.00
2  All_Atom     21    802  Total_Occupancy  20.04
3  All_Atom     22    803  Total_Occupancy   0.00
4  All_Atom     30    811  Total_Occupancy   0.00
Index([u'ResID', u'SeqID', u'All_Atom.ExpDist_Mean', u'All_Atom.ExpDist_RMSE',
       u'All_Atom.GammaDist_Mean', u'All_Atom.GammaDist_RMSE',
       u'All_Atom.GammaDist_k', u'All_Atom.Max_ResTime', u'All_Atom.N',
       u'All_Atom.Total_Occupancy', u'Coarse_Grain.ExpDist_Mean',
       u'Coarse_Grain.ExpDist_RMSE', u'Coarse_Grain.GammaDist_Mean',
       u'Coarse_Grain.GammaDist_RMSE', u'Coarse_Grain.GammaDist_k',
       u'Coarse_Grain.Max_ResTime', u'Coarse_Grain.N',
       u'Coarse_Grain.Total_Occupancy'],
      dtype='object', name=u'Measurement')


Measurement,ResID,SeqID,All_Atom.ExpDist_Mean,All_Atom.ExpDist_RMSE,All_Atom.GammaDist_Mean,All_Atom.GammaDist_RMSE,All_Atom.GammaDist_k,All_Atom.Max_ResTime,All_Atom.N,All_Atom.Total_Occupancy,Coarse_Grain.ExpDist_Mean,Coarse_Grain.ExpDist_RMSE,Coarse_Grain.GammaDist_Mean,Coarse_Grain.GammaDist_RMSE,Coarse_Grain.GammaDist_k,Coarse_Grain.Max_ResTime,Coarse_Grain.N,Coarse_Grain.Total_Occupancy
count,429.0,429.0,90.0,90.0,50.0,74.0,50.0,429.0,429.0,429.0,136.0,136.0,99.0,95.0,99.0,429.0,429.0,429.0
mean,2137.230769,1671.146853,5.212009,0.431114,4.745129,0.295495,27.777384,2.702378,10.72028,10.939301,6.55343,0.275923,6.11994,0.085483,16.16256,15.748252,134.300699,481.375291
std,1230.085,560.510031,1.710814,0.440063,1.188704,0.367946,47.220755,11.085029,28.679849,32.586798,5.258705,0.390201,4.518699,0.086387,30.576604,73.349197,301.41071,1260.030537
min,1.0,782.0,4.0,0.008736,3.600751,0.006101,0.575373,0.0,0.0,0.0,4.0,0.005233,-15.777966,0.0,-0.114365,0.0,0.0,0.0
25%,1055.0,1154.0,4.217718,0.040691,4.023201,0.034967,3.430534,0.0,0.0,0.0,4.255058,0.028494,4.309464,0.03113,2.320283,0.0,0.0,0.0
50%,2105.0,1671.0,4.53832,0.065521,4.238464,0.054802,10.442717,0.0,0.0,0.0,4.913291,0.05014,4.973206,0.068793,4.9144,0.0,0.0,0.0
75%,3209.0,2183.0,5.401289,0.915264,4.983823,0.732469,18.869306,0.0,0.0,0.0,6.734937,0.909459,6.624123,0.115352,16.740905,3.0,31.0,59.0
max,4254.0,2546.0,12.0,0.96473,8.59606,0.999873,230.799283,130.68,163.0,176.28,48.0,0.947398,28.180462,0.707107,177.934207,1153.0,1539.0,7231.0
