In [0]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import scipy as sp
from scipy import stats

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import os
import sys
import gc
import copy
import glob

import tqdm
import itertools

import subprocess

In [3]:
#baseDir='./'
!rm -r sample_data
#If you are viewing this in google colab, you will need to clone the repository first.
#To do so uncomment the two code cells below
!git clone https://github.com/wesleymsmith/Piezo_PIP2_binding_analysis.git
baseDir='Piezo_PIP2_binding_analysis/'

Cloning into 'Piezo_PIP2_binding_analysis'...
remote: Enumerating objects: 1537, done.[K
remote: Counting objects: 100% (1537/1537), done.[K
remote: Compressing objects: 100% (263/263), done.[K
remote: Total 5096 (delta 1254), reused 1520 (delta 1241), pack-reused 3559[K
Receiving objects: 100% (5096/5096), 172.91 MiB | 21.51 MiB/s, done.
Resolving deltas: 100% (3986/3986), done.
Checking out files: 100% (6646/6646), done.


# **1. Get a list of ResID and SeqID**

In [0]:
xcelData=pd.read_excel(baseDir+'Residue_ID_total_occupancy_10_1_2019.xlsx',
              sheet_name=None)
@interact
def show_data(sheet_name=xcelData.keys()):
    return xcelData[sheet_name]
#the above tables are
#aa - result summary for all atom simulation
#cg - result summary for coarse grain simulation
#tension_30ns - results of all atom simulation with membrane tension
#sheet 1 is apparently blank...
#resinfo_table - mapping between cryo-em structure sequence and all atom residue ids
resinfoDataSheet=xcelData['resinfo_table']
resinfoTable=resinfoDataSheet[
    resinfoDataSheet.columns[[0,3,5,7]]][2:]
resinfoTable.columns=['PDB_ID','Arm1_Resid','Arm2_Resid','Arm3_Resid']
resinfoTable.head()
#Before continuing, lets check to see if we have any missing values.
for colName in resinfoTable.columns:
    print '%s:'%colName,
    print resinfoTable[colName].isna().sum()

After our inspection above, we see there is a problem with trying to directly use the resinfo table above.
Specifically, the 'PDB_ID' column we want to use has missing values!

Fortunately, we know that these missing values are gaps in the pdb sequence for structures that could not be resolved in the cryo-em. The corresponding sequence should, therefore, increase linearly across these gaps.

This makes our imputation strategy relatively straight forward. We will iterate over the PDB_ID column. And keep track of the last value we see. When we find valid (integer) values, we just update the last value variable. If we see a missing value we simply increment that last value variable then set the missing entry to be equal to that value.

In [6]:
resinfoTableFilled=copy.deepcopy(resinfoTable)
print resinfoTableFilled.PDB_ID.isna().sum()
for iEntry in resinfoTableFilled.PDB_ID.index:
    entry=resinfoTableFilled.PDB_ID[iEntry]
    if np.isnan(entry):
        lastVal=lastVal+1
        resinfoTableFilled.PDB_ID[iEntry]=lastVal
    else:
        lastVal=entry
resinfoTableFilled.PDB_ID.isna().sum()

53


0

In [35]:
#the pdb residue id's are sequential, but have gaps corresponding
#to unresolved amino acids in the Cryo-EM structure.
#The easy solution is just to fill in linearly.
#The three arms in our simulation structure are identical, so
#we can generate our back-map by just repeating the pdb sequence 3 times
simResid_to_pdbResid=list(np.array(resinfoTableFilled.PDB_ID))*3
print simResid_to_pdbResid
np.savetxt(baseDir+'simResid_to_pdbResid.out', simResid_to_pdbResid, newline=" ", fmt='%s')
#print ', '.join(map(str,simResid_to_pdbResid))
simResid_to_pdbResid=list(
      np.loadtxt(baseDir+'simResid_to_pdbResid.out',dtype=int)
    )
print simResid_to_pdbResid

[782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981,

In [0]:
?np.loadtxt

While the above excel sheet provides a useful summary at a glance, we would like to have direct access to the distribution of residence times rather than just the mean max and cummulative sum over all lipids. 

Using the set of all individually measured residence times, we can fit a model distribution. More specifically, the reciporical of residence time would correspond to a frequency. Specifically, the reciporical of residence time gives us the corresponding unbinding frequency. 

This can then be used to fit an appropriate distribution (geometric distribution would be one choice) and provide a characteristic unbinding frequency (or characteristic residence time as its reciporical). More over, it can give us a bound / confidence interval of this distribution as well.

We can then repeat this process for the all atom model. While CG is expected to have shorter residence times due to the notably lower membrane viscocity, we should still be able to see if the ranking and / or relative characteristic residence time / unbinding frequencies match (for each protein amino acid). If CG can rank amino acids in the correct order, based upon PIP2 residence times, then we can be confident that it is functioning well as a model for correctly predicting lipid binding sites.

Below, this residence time distribution data can be exctracted from the coarse grain simulation data files which list individual PIP2 residence time observations for each protein residue (amino acid)

in the Raw_PIP2_CG_residence_time_data directory, the .xvg files contain
the 'occupancy' of each amino acid at each output time step.
The occupancy is zero if there were no PIP2 lipids in contact
and non-zero if there was at least one PIP2 lipid in contact.
The first step is to extract these individual timeseries
into a joint table.

# **2. Write raw occupancy data (a list of 1 and 0) to csv file**

In [0]:
#os.listdir can be used to generate a list of all xvg files present
cg_RawData_dir=baseDir+'Raw_PIP2_CG_residence_time_data/'
!cg_RawData_dir=baseDir+'Raw_POPS_data/calc_CG_all_lysArg/'
cg_dataFile_list=[dataFileName for dataFileName in os.listdir(cg_RawData_dir) \
             if 'xvg' in dataFileName]
cg_dataFile_list

#simResid_to_pdbResid=np.loadtxt(baseDir+"test.out", dtype=int)
#simResid_to_pdbResid=', '.join(map(str,simResid_to_pdbResid))
#print simResid_to_pdbResid

In [36]:
#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
cgDataTables=[]
for dataFileName in tqdm.tqdm_notebook(cg_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        cg_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    cgDataTables.append(copy.deepcopy(tempTable))
    
cg_occupancy_data=pd.concat(cgDataTables)
cg_occupancy_data.to_csv(baseDir+"Coarse_Grain_PIP2_Occupancy_Data.csv",index=False)
print cg_occupancy_data.head()
print cg_occupancy_data.tail()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))


   ResID  SeqID  Frame    Time  Occupancy
0   1997   1360      0     0.0          0
1   1997   1360      1  1000.0          0
2   1997   1360      2  2000.0          0
3   1997   1360      3  3000.0          0
4   1997   1360      4  4000.0          0
       ResID  SeqID  Frame        Time  Occupancy
11996    744   1728  11996  11996000.0          1
11997    744   1728  11997  11997000.0          1
11998    744   1728  11998  11998000.0          1
11999    744   1728  11999  11999000.0          1
12000    744   1728  12000  12000000.0          1


In [10]:
residtmp=pd.DataFrame(cg_occupancy_data.ResID.unique())
print residtmp.count()
print "......"
segidtmp=pd.DataFrame(cg_occupancy_data.SeqID.unique())
print segidtmp.count()
print residtmp.count()/segidtmp.count()


0    429
dtype: int64
......
0    143
dtype: int64
0    3.0
dtype: float64


In [11]:
#The coarse grain data table is huge, so we need to split it into chunks

os.system("split -l 100000 "+\
          baseDir+"Coarse_Grain_PIP2_Occupancy_Data.csv "+\
          baseDir+"Coarse_Grain_Occupancy_Data/Coarse_Grain_Occupancy_Data.chunk.")
os.system("rm "+baseDir+"Coarse_Grain_PIP2_Occupancy_Data.csv")

0

Next, lets collect the occupancy data for the all atom simulation into a single data frame.

In [0]:
aa_RawData_dir=baseDir+'Raw_POPS_data/calc_aa_190ns_lysArg/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]
aa_dataFile_list

#Each of the above files contains an 18 line header, followed by 2 column occupancy series
#the data can be easily extracted using pd.read_table
aaDataTables=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    resID=int(dataFileName.split('_')[1])
    seqID=simResid_to_pdbResid[resID-1]
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        delim_whitespace=True)
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(copy.deepcopy(tempTable))
    
aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.to_csv(baseDir+"All_Atom_POPS_Occupancy_Data.190ns_0.12ns.csv",index=False)
print aa_occupancy_data.head()
print aa_occupancy_data.tail()

In [0]:
aa_occupancy_data.Time.max()*0.12
residtmp=pd.DataFrame(aa_occupancy_data.ResID.unique())
print aa_occupancy_data.ResID.count()
print residtmp.count()
print "......"
segidtmp=pd.DataFrame(aa_occupancy_data.SeqID.unique())
print segidtmp.count()
print residtmp.count()/segidtmp.count()

# **3. Fit occupancy and waiting distributions**

In [0]:
def extract_runs(x):
    return [len(list(gg)) for kk,gg in itertools.groupby(x, bool) if kk]

def extract_resDist(x):
    return(np.unique(extract_runs(x),return_counts=True))

def bin_runs(x,binWidth=1.000,frameRate=1.,center='right'):
    init_dist=extract_resDist(x)
    x_dist=(init_dist[0]*frameRate,init_dist[1])
    binMax=np.max(x_dist[0])
    nBins=np.ceil(binMax/binWidth)+1
    hbins=np.arange(nBins)*binWidth
    temp_dist=np.histogram(x_dist[0],weights=x_dist[1],bins=hbins)
    outDat=[temp_dist[1],temp_dist[0]]
    if center=='midpoint':
        outDat[0]=(temp_dist[1][1:]+temp_dist[1][:-1])/2.
    elif center=="left":
        outDat[0]=temp_dist[1][:-1]
    elif center=="right":
        outDat[0]=temp_dist[1][1:]
    return(outDat)

def expDist(x,l):
    return np.exp(-x/l)/l

def frequencyDistribution_mle_exp_params(bin_dist,bias_correction=True):
    Nv=np.sum(bin_dist[1])
    if bias_correction & (Nv>2):
        Nv=Nv-2
    return (np.sum(bin_dist[0]*bin_dist[1])/(1.*Nv))

        

def beta_dist(x,a,b):
    return (x**(a-1.)*(1.-x)**(b-1.))/\
        (sp.special.gamma(a)*sp.special.gamma(b)/sp.special.gamma(a+b))
def gamma_dist(x,t,k):
    return 1/(sp.special.gamma(k)*(t**k))*x**(k-1.)*np.exp(-x/t)

def frequencyDistribution_mle_gamma_params(bin_dist,bias_correction=True):
    xi,fi=bin_dist
    Nv=np.sum(fi)
    sum_f=np.sum(fi*xi)
    sum_lnf=np.sum(fi*np.log(xi))
    sum_flnf=np.sum(fi*xi*np.log(xi))
    k_est=(Nv*sum_f)/(Nv*sum_flnf-sum_lnf*sum_f)
    t_est=1./(Nv**2)*(Nv*sum_flnf-sum_lnf*sum_f)
    if bias_correction:
        t_est=Nv*t_est/(Nv-1)
        k_est=k_est-1./Nv*(3.*k_est-2./3.*(k_est/(1+k_est))-4./5.*(k_est/(1+k_est)**2))
    return [t_est,k_est]


def calc_GammaParams(x,fr=1.0,bw=4.8):
    return list(list(
        frequencyDistribution_mle_gamma_params(
             bin_runs(x,frameRate=fr,binWidth=bw),
             bias_correction=True))
        ) if np.sum(x)>0 else [np.nan,np.nan]

def calc_ExpParams(x,fr=1.0,bw=4.8):
    return frequencyDistribution_mle_exp_params(
                bin_runs(x,frameRate=fr,binWidth=bw)
            ) if np.sum(x)>0 else np.nan

def calc_GammaRMSE(x,fr=1.0,bw=4.8):
    if np.sum(x)>0:
        xDist=bin_runs(x,frameRate=fr,binWidth=bw)
        fitParams=calc_GammaParams(x)
        return np.sqrt(
            np.sum(
                (xDist[1]/np.sum(xDist[1])-\
                gamma_dist(xDist[0],*fitParams))**2)/\
            len(xDist[1]))
    else:
        return np.nan
        
def calc_ExpRMSE(x,fr=1.0,bw=4.8):
    if np.sum(x)>0:
        xDist=bin_runs(x,frameRate=fr,binWidth=bw)
        fitParams=[calc_ExpParams(x)]
        return np.sqrt(
            np.sum(
                (xDist[1]/np.sum(xDist[1])-\
                expDist(xDist[0],*fitParams))**2)/\
            len(xDist[1]))
    else:
        return np.nan


In [138]:
cg_occupancy_data_files=glob.glob(baseDir+"Coarse_Grain_Occupancy_Data/Coarse_Grain_Occupancy_Data.chunk*")
print "first cg data frame: '%s'"%np.sort(cg_occupancy_data_files)[0]
cg_frames=[]
for iFile,cg_file in tqdm.tqdm_notebook(enumerate(np.sort(cg_occupancy_data_files))):
    if iFile==0:
        cg_frames.append(pd.read_csv(cg_file))
        colNames=cg_frames[0].columns
    else:
        cg_frames.append(pd.read_csv(cg_file,names=colNames))
cg_occupancy_data=pd.concat(cg_frames)
cg_frames=[]
gc.collect()

aa_occupancy_data=pd.read_csv(baseDir+"All_Atom_PIP2_Occupancy_Data.2us_0.6ns.csv")

print cg_occupancy_data.head()
print aa_occupancy_data.head()
np.max(aa_occupancy_data.Time.unique())*.12

first cg data frame: 'Piezo_PIP2_binding_analysis/Coarse_Grain_Occupancy_Data/Coarse_Grain_Occupancy_Data.chunk.aa'


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


   ResID  SeqID  Frame    Time  Occupancy
0   3677   1969      0     0.0          0
1   3677   1969      1  1000.0          0
2   3677   1969      2  2000.0          0
3   3677   1969      3  3000.0          0
4   3677   1969      4  4000.0          0
   ResID  SeqID  Frame  Time  Occupancy
0   3677   1969      0   0.0          0
1   3677   1969      1   5.0          0
2   3677   1969      2  10.0          0
3   3677   1969      3  15.0          0
4   3677   1969      4  20.0          0


1999.8

In [0]:
aaFrameRate=0.12 #time between frames in ns for all atom
aaBinWidth=4.0 #binning size in ns

aaKwds={'fr':aaFrameRate,'bw':aaBinWidth}
aaFitDat=aa_occupancy_data
aaFitDat['Wait']=aaFitDat.Occupancy.map(lambda x: not x)
print aaFitDat.head()
aaFitFrame=aaFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     
     "Wait": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     })

aaFitFrame.columns=aaFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  aaFitFrame[entryType+'.Gamma_Mean']=aaFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  aaFitFrame[entryType+'.Gamma_k']=aaFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  aaFitFrame=aaFitFrame.drop(columns=entryType+'.GammaDist_Params')

aaFitFrame['SimType']='AA'
aaFitFrame=aaFitFrame.reset_index()
aaFitFrame=aaFitFrame[["SimType","ResID", "SeqID","Occupancy.Total", 
                       "Occupancy.N","Occupancy.Max","Occupancy.Exp_Mean","Occupancy.Exp_RMSE",
                       "Occupancy.Gamma_Mean","Occupancy.Gamma_k","Occupancy.Gamma_RMSE",
                       "Wait.N","Wait.Max","Wait.Exp_Mean","Wait.Exp_RMSE",
                       "Wait.Gamma_Mean","Wait.Gamma_k","Wait.Gamma_RMSE"]]
aaFitFrame.head()

In [0]:
cgFrameRate=1.0 #time between frames in ns for coarse grain
cgBinWidth=4.0

cgKwds={'fr':cgFrameRate,'bw':cgBinWidth}
cgFitDat=cg_occupancy_data 
cgFitDat['Wait']=cgFitDat.Occupancy.map(lambda x: not x)
print cgFitDat.head()
cgFitFrame=cgFitDat.groupby(['ResID','SeqID']).agg(
    {"Occupancy": {
         "Total":lambda x: cgFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*cgFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**cgKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**cgKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
        },
     
     "Wait": {
         "Total":lambda x: cgFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*cgFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**cgKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**cgKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
        },
     })

cgFitFrame.columns=cgFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  cgFitFrame[entryType+'.Gamma_Mean']=cgFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  cgFitFrame[entryType+'.Gamma_k']=cgFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  cgFitFrame=cgFitFrame.drop(columns=entryType+'.GammaDist_Params')
  
cgFitFrame['SimType']='CG'
cgFitFrame=cgFitFrame.reset_index()
cgFitFrame=cgFitFrame[["SimType","ResID", "SeqID","Occupancy.Total", 
                       "Occupancy.N","Occupancy.Max","Occupancy.Exp_Mean","Occupancy.Exp_RMSE",
                       "Occupancy.Gamma_Mean","Occupancy.Gamma_k","Occupancy.Gamma_RMSE",
                       "Wait.N","Wait.Max","Wait.Exp_Mean","Wait.Exp_RMSE",
                       "Wait.Gamma_Mean","Wait.Gamma_k","Wait.Gamma_RMSE"]]  
cgFitFrame.head()

In [0]:
jointFitFrame=pd.concat([aaFitFrame,cgFitFrame])
jointFitMelt=jointFitFrame.melt(id_vars=["SimType","ResID","SeqID"],var_name="Fit_Param")
print jointFitMelt.head()

jointFitWide=jointFitMelt
jointFitWide["Measurement"]=jointFitWide.SimType+"."+jointFitWide.Fit_Param
jointFitWide=jointFitWide.drop(columns=["SimType","Fit_Param"])
jointFitWide=jointFitWide.reset_index()
jointFitWide=pd.pivot_table(index=["ResID","SeqID"],columns="Measurement",values="value",data=jointFitWide)
#jointFitWide.columns=jointFitWide.columns.map(lambda x: x[1])
jointFitWide=jointFitWide.reset_index()
#print jointFitWide.columns
jointFitWide.to_csv(baseDir+"POPS_aa_cg_joint_Fit_Data_wide.csv",index=False)
print jointFitWide.head()
jointFitWide.describe()

# **4. Fit the occupancy/wait data for a cluster of consequtive residues**
(if one of 
the residue has occupany 1, the whole cluster has occupancy 1) 

In [0]:
print "making domain keys"
domainKey=aa_occupancy_data[['ResID','SeqID']].sort_values(['ResID','SeqID']).apply(lambda x: '.'.join(map(str,x)),axis=1).unique()
#domainKey
domainNames={domainKey[0]:domainKey[0]}
currentID=domainKey[0]
print "making domain names"
for iEntry,domainID in tqdm.tqdm_notebook(enumerate(domainKey[1:])):
  if int(domainID.split('.')[-1])-1!=int(domainKey[iEntry].split('.')[-1]):
    currentID=domainID
  domainNames[domainID]=currentID
domainNames

In [92]:
aa_occupancy_data['Domain']=aa_occupancy_data[['ResID','SeqID']].sort_values(
    ['ResID','SeqID']).apply(
        lambda x: domainNames['.'.join(map(str,x))],axis=1)

aa_domain_data=aa_occupancy_data.groupby(['Domain','Frame','Time']).agg({
    "Occupancy":lambda x: np.sum(x)>0
})

aa_domain_data=aa_domain_data.reset_index()

aa_domain_data['Wait']=aa_domain_data['Occupancy'].map(lambda x: not x)
aa_domain_data.head()

Unnamed: 0,Domain,Frame,Time,Occupancy,Wait
0,1.782,0,0.0,False,True
1,1.782,1,5.0,True,False
2,1.782,2,10.0,False,True
3,1.782,3,15.0,True,False
4,1.782,4,20.0,True,False


In [95]:
aaFrameRate=0.12 #time between frames in ns for all atom
aaBinWidth=4.0 #binning size in ns
cgFrameRate=1.0 #time between frames in ns for coarse grain
cgBinWidth=4.0

aaKwds={'fr':aaFrameRate,'bw':aaBinWidth}
cgKwds={'fr':cgFrameRate,'bw':cgBinWidth}

aaFitDat=aa_domain_data 
aaFitFrame=aaFitDat.groupby('Domain').agg(
    {"Occupancy": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     
     "Wait": {
         "Total":lambda x: aaFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*aaFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**aaKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**aaKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**aaKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**aaKwds),
        },
     })

aaFitFrame.columns=aaFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  aaFitFrame[entryType+'.Gamma_Mean']=aaFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  aaFitFrame[entryType+'.Gamma_k']=aaFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  aaFitFrame=aaFitFrame.drop(columns=entryType+'.GammaDist_Params')

aaFitFrame['SimType']='AA'
aaFitFrame=aaFitFrame.reset_index()
aaFitFrame=aaFitFrame[["SimType","Domain","Occupancy.Total", 
                       "Occupancy.N","Occupancy.Max","Occupancy.Exp_Mean","Occupancy.Exp_RMSE",
                       "Occupancy.Gamma_Mean","Occupancy.Gamma_k","Occupancy.Gamma_RMSE",
                       "Wait.N","Wait.Max","Wait.Exp_Mean","Wait.Exp_RMSE",
                       "Wait.Gamma_Mean","Wait.Gamma_k","Wait.Gamma_RMSE"]]
aaFitFrame.head()
aaFitFrame.to_csv(baseDir+"PIP_aa_domain_fit.csv",index=False)



In [0]:
print "making domain keys"
domainKey=cg_occupancy_data[['ResID','SeqID']].sort_values(['ResID','SeqID']).apply(lambda x: '.'.join(map(str,x)),axis=1).unique()
#domainKey
domainNames={domainKey[0]:domainKey[0]}
currentID=domainKey[0]
print "making domain names"
for iEntry,domainID in tqdm.tqdm_notebook(enumerate(domainKey[1:])):
  if int(domainID.split('.')[-1])-1!=int(domainKey[iEntry].split('.')[-1]):
    currentID=domainID
  domainNames[domainID]=currentID
domainNames

In [57]:
tempDict={'1.1':'1.1','1.2':'1.1','1.4':'1.4'}
#tempDict=domainNames
tempFrame=pd.DataFrame({'IndexPair':np.sort(tempDict.keys()),
              'Domain':[tempDict[iKey] for iKey in np.sort(tempDict.keys())]})
print tempFrame
tempFrame=zip(tempFrame['IndexPair'],tempFrame['Domain'])
print tempFrame

  Domain IndexPair
0    1.1       1.1
1    1.1       1.2
2    1.4       1.4
[('1.1', '1.1'), ('1.2', '1.1'), ('1.4', '1.4')]


In [61]:
#tempDict={'1.1':'1.1','1.2':'1.1','1.4':'1.4'}
tempDict=domainNames
tempFrame=pd.DataFrame({'IndexPair':np.sort(tempDict.keys()),
              'Domain':[tempDict[iKey] for iKey in np.sort(tempDict.keys())]})
tempFrame.to_csv(baseDir+'domainNames.csv')
print tempFrame


        Domain  IndexPair
0        1.782      1.782
1    1007.2135  1007.2135
2    1038.2166  1038.2166
3    1041.2169  1041.2169
4    1045.2173  1045.2173
5    1045.2173  1046.2174
6    1051.2179  1051.2179
7    1054.2182  1054.2182
8    1054.2182  1055.2183
9    1054.2182  1056.2184
10   1054.2182  1057.2185
11   1060.2188  1060.2188
12   1086.2214  1086.2214
13   1103.2231  1103.2231
14   1167.2295  1167.2295
15   1173.2301  1173.2301
16   1177.2305  1177.2305
17   1190.2318  1190.2318
18   1197.2325  1197.2325
19   1201.2329  1201.2329
20   1211.2339  1211.2339
21     122.903    122.903
22   1223.2351  1223.2351
23   1223.2351  1224.2352
24   1233.2361  1233.2361
25   1246.2374  1246.2374
26   1249.2377  1249.2377
27   1260.2388  1260.2388
28   1274.2402  1274.2402
29   1278.2406  1278.2406
..         ...        ...
399   690.1674   690.1674
400   693.1677   693.1677
401   740.1724   740.1724
402   743.1727   743.1727
403   743.1727   744.1728
404   760.1744   760.1744
405   777.17

In [62]:
tempFrame=pd.read_csv(baseDir+'domainNames.csv')
tempFrame=zip(tempFrame['IndexPair'],tempFrame['Domain'])
print tempFrame

[(1.7819999999999998, 1.7819999999999998), (1007.2135, 1007.2135), (1038.2166, 1038.2166), (1041.2169, 1041.2169), (1045.2173, 1045.2173), (1046.2174, 1045.2173), (1051.2179, 1051.2179), (1054.2182, 1054.2182), (1055.2183, 1054.2182), (1056.2184, 1054.2182), (1057.2185, 1054.2182), (1060.2188, 1060.2188), (1086.2214, 1086.2214), (1103.2231, 1103.2231), (1167.2295, 1167.2295), (1173.2301, 1173.2301), (1177.2305, 1177.2305), (1190.2318, 1190.2318), (1197.2325, 1197.2325), (1201.2329, 1201.2329), (1211.2339, 1211.2339), (122.90299999999999, 122.90299999999999), (1223.2351, 1223.2351), (1224.2352, 1223.2351), (1233.2361, 1233.2361), (1246.2374, 1246.2374), (1249.2377, 1249.2377), (1260.2388, 1260.2388), (1274.2402, 1274.2402), (1278.2406, 1278.2406), (1279.2407, 1278.2406), (1294.2422, 1294.2422), (1310.2438, 1310.2438), (1324.2452, 1324.2452), (134.915, 134.915), (135.916, 134.915), (1351.2479, 1351.2479), (1354.2482, 1354.2482), (1374.2502, 1374.2502), (1377.2505, 1377.2505), (1386.2514,

In [63]:
cg_occupancy_data['Domain']=cg_occupancy_data[['ResID','SeqID']].sort_values(
    ['ResID','SeqID']).apply(
        lambda x: domainNames['.'.join(map(str,x))],axis=1)

cg_domain_data=cg_occupancy_data.groupby(['Domain','Frame','Time']).agg({
    "Occupancy":lambda x: np.sum(x)>0
})

cg_domain_data=cg_domain_data.reset_index()

cg_domain_data['Wait']=cg_domain_data['Occupancy'].map(lambda x: not x)
cg_domain_data.head()

Unnamed: 0,Domain,Frame,Time,Occupancy,Wait
0,1.782,0,0.0,False,True
1,1.782,1,1000.0,False,True
2,1.782,2,2000.0,False,True
3,1.782,3,3000.0,False,True
4,1.782,4,4000.0,False,True


In [43]:

cgFrameRate=1.0 #time between frames in ns for coarse grain
cgBinWidth=4.0
cgKwds={'fr':cgFrameRate,'bw':cgBinWidth}

cgFitDat=cg_domain_data 
cgFitFrame=cgFitDat.groupby('Domain').agg(
    {"Occupancy": {
        "Total":lambda x: cgFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*cgFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**cgKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**cgKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
        },
     
     "Wait": {
         "Total":lambda x: cgFrameRate*np.sum(x),
         "N":lambda x: len(extract_runs(x)),
         "Max":lambda x: np.max(extract_runs(x))*cgFrameRate if np.sum(x) > 0 else 0,
         "GammaDist_Params": lambda x: calc_GammaParams(x,**cgKwds), 
        "Gamma_RMSE":lambda x: calc_GammaRMSE(x,**cgKwds),
        "Exp_Mean": lambda x: calc_ExpParams(x,**cgKwds), 
        "Exp_RMSE": lambda x: calc_ExpRMSE(x,**cgKwds),
        },
     })

cgFitFrame.columns=cgFitFrame.columns.map(lambda x: '.'.join(x))
for entryType in ['Wait','Occupancy']:
  cgFitFrame[entryType+'.Gamma_Mean']=cgFitFrame[entryType+'.GammaDist_Params'].map(np.product)
  cgFitFrame[entryType+'.Gamma_k']=cgFitFrame[entryType+'.GammaDist_Params'].map(lambda x: x[1])
  cgFitFrame=cgFitFrame.drop(columns=entryType+'.GammaDist_Params')
  
cgFitFrame['SimType']='CG'
cgFitFrame=cgFitFrame.reset_index()
cgFitFrame=cgFitFrame[["SimType","Domain","Occupancy.Total", 
                       "Occupancy.N","Occupancy.Max","Occupancy.Exp_Mean","Occupancy.Exp_RMSE",
                       "Occupancy.Gamma_Mean","Occupancy.Gamma_k","Occupancy.Gamma_RMSE",
                       "Wait.N","Wait.Max","Wait.Exp_Mean","Wait.Exp_RMSE",
                       "Wait.Gamma_Mean","Wait.Gamma_k","Wait.Gamma_RMSE"]]  
#cgFitFrame.head()
cgFitFrame.to_csv(baseDir+"PIP2_cg_domain_fit.csv",index=False)

