In [89]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import scipy as sp

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import os
import sys
import gc
import copy

import tqdm
import itertools

import subprocess

In [8]:
# Root directory that the data paths in this notebook are built from. Paths are
# formed by plain string concatenation, so the value must end with a separator.
baseDir = './'

If you are viewing this in Google Colab, you will need to clone the repository first.
To do so, uncomment the two code cells below.

In [9]:
#!git clone https://github.com/wesleymsmith/Piezo_PIP2_binding_analysis.git

In [10]:
#baseDir='Piezo_PIP2_binding_analysis/'

In [119]:
#sheet_name=None asks pandas for every sheet in the workbook,
#returned as a dict keyed by sheet name.
summaryWorkbookPath=baseDir+'Residue_ID_total_occupancy_10_1_2019.xlsx'
xcelData=pd.read_excel(summaryWorkbookPath,sheet_name=None)

In [12]:
@interact
def show_data(sheet_name=xcelData.keys()):
    """Return the worksheet of ``xcelData`` stored under ``sheet_name``.

    The default value is the collection of sheet names, which ``interact``
    renders as a dropdown so a sheet can be picked interactively.
    """
    selectedSheet=xcelData[sheet_name]
    return selectedSheet

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUnc2hlZXRfbmFtZScsIG9wdGlvbnM9KHUnY2cnLCB1J1NoZWV0MScsIHUnYWEnLCB1J3RlbnNpb25fMzBucycsIHXigKY=


In [13]:
#Sheets in the workbook loaded above:
#  aa            - result summary for all atom simulation
#  cg            - result summary for coarse grain simulation
#  tension_30ns  - results of all atom simulation with membrane tension
#  sheet 1       - apparently blank...
#  resinfo_table - mapping between cryo-em structure sequence and all atom residue ids
resinfoDataSheet=xcelData['resinfo_table']
#keep columns 0, 3, 5 and 7 and drop the first two rows in one positional selection;
#the original row labels are kept, so the table's index starts at 2
resinfoTable=resinfoDataSheet.iloc[2:,[0,3,5,7]]
resinfoTable.columns=['PDB_ID','Arm1_Resid','Arm2_Resid','Arm3_Resid']
resinfoTable.head()

Unnamed: 0,PDB_ID,Arm1_Resid,Arm2_Resid,Arm3_Resid
2,782,1,1419,2837
3,783,2,1420,2838
4,784,3,1421,2839
5,785,4,1422,2840
6,786,5,1423,2841


Before continuing, let's check to see if we have any missing values.

In [121]:
#Report the number of missing entries in each column of the residue map.
#print() is called with a single pre-formatted string so the cell runs (and
#prints the same 'name: count' lines) under both Python 2 and Python 3.
for colName in resinfoTable.columns:
    print('%s: %s'%(colName,resinfoTable[colName].isna().sum()))

PDB_ID: 53
Arm1_Resid: 0
Arm2_Resid: 0
Arm3_Resid: 0


In [101]:
#show the row whose arm 1 simulation residue id is 1329
resinfoTable.loc[resinfoTable['Arm1_Resid']==1329]

Unnamed: 0,PDB_ID,Arm1_Resid,Arm2_Resid,Arm3_Resid
1330,,1329,2747,4165


In [115]:
#Spot check a single entry (row label 97) of the raw table for a missing PDB_ID.
#This reads resinfoTable rather than resinfoTableFilled: the filled table is only
#created in a later cell, so referencing it here raises NameError on a fresh kernel.
#pd.isna is used because it also accepts non-float (object) entries.
pd.isna(resinfoTable.loc[97,'PDB_ID'])

True

After our inspection above, we see there is a problem with trying to directly use the resinfo table above.
Specifically, the 'PDB_ID' column we want to use has missing values!

Fortunately, we know that these missing values correspond to gaps in the PDB sequence: residues that could not be resolved in the cryo-EM structure. The sequence numbering should, therefore, increase linearly across these gaps.

This makes our imputation strategy relatively straightforward. We will iterate over the PDB_ID column and keep track of the last value we saw. When we find a valid (integer) value, we just update the last-value variable. When we find a missing value, we increment the last-value variable and then set the missing entry equal to it.

In [118]:
def _fillSequenceGaps(idSeries):
    """Return a copy of ``idSeries`` with missing entries filled sequentially.

    The series is walked in index order. Every valid entry becomes the new
    'last value'; every missing entry is replaced by the last value plus one
    (which then becomes the new last value).

    Parameters
    ----------
    idSeries : pandas.Series
        Residue ids, with missing entries marked as NaN/None.

    Returns
    -------
    pandas.Series
        A new series with the same index and no missing entries.

    Raises
    ------
    ValueError
        If the series starts with a missing entry, since there is then no
        previous value to count up from.
    """
    filled=idSeries.copy()
    lastVal=None
    for iEntry in filled.index:
        entry=filled.loc[iEntry]
        if pd.isna(entry):
            if lastVal is None:
                raise ValueError(
                    'cannot fill a gap at the start of the sequence: '
                    'no preceding value to increment')
            lastVal=lastVal+1
            #assign through .loc on the series itself; chained assignment
            #(frame.column[label]=value) may silently write to a temporary
            filled.loc[iEntry]=lastVal
        else:
            lastVal=entry
    return filled

#work on a copy so the raw table stays available for inspection
resinfoTableFilled=resinfoTable.copy()
#number of missing PDB ids before filling
print(resinfoTableFilled.PDB_ID.isna().sum())
resinfoTableFilled['PDB_ID']=_fillSequenceGaps(resinfoTableFilled['PDB_ID'])
#number of missing PDB ids after filling (displayed as the cell output)
resinfoTableFilled.PDB_ID.isna().sum()

53


0

In [122]:
#the pdb residue id's are sequential, but have gaps corresponding
#to unresolved amino acids in the Cryo-EM structure.
#The easy solution is just to fill in linearly.
#The three arms in our simulation structure are identical, so
#we can generate our back-map by just repeating the pdb sequence 3 times
simResid_to_pdbResid=list(np.array(resinfoTableFilled.PDB_ID))*3
#print() with a single argument behaves the same under Python 2 and Python 3
print(', '.join(map(str,simResid_to_pdbResid)))

782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 880, 881, 882, 883, 884, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 

While the above Excel sheet provides a useful summary at a glance, we would like to have direct access to the distribution of residence times rather than just the mean, max, and cumulative sum over all lipids.

Using the set of all individually measured residence times, we can fit a model distribution. More specifically, the reciprocal of a residence time is a frequency: it gives us the corresponding unbinding frequency.

This can then be used to fit an appropriate distribution (a geometric distribution would be one choice) and provide a characteristic unbinding frequency (or a characteristic residence time, as its reciprocal). Moreover, it can give us a bound / confidence interval for this distribution as well.

We can then repeat this process for the all-atom model. While CG is expected to have shorter residence times due to the notably lower membrane viscosity, we should still be able to see whether the ranking and / or relative characteristic residence times / unbinding frequencies match (for each protein amino acid). If CG can rank amino acids in the correct order, based upon PIP2 residence times, then we can be confident that it is functioning well as a model for correctly predicting lipid binding sites.

Below, this residence time distribution data is extracted from the coarse grain simulation data files, which list individual PIP2 residence time observations for each protein residue (amino acid).

In the Raw_PIP2_CG_residence_time_data directory, the .xvg files contain
the 'occupancy' of each amino acid at each output time step.
The occupancy is zero if there were no PIP2 lipids in contact
and non-zero if there was at least one PIP2 lipid in contact.
The first step is to extract these individual time series
into a joint table.

In [123]:
#collect the name of every entry in the coarse grain raw data directory
#whose file name contains 'xvg'
cg_RawData_dir=baseDir+'Raw_PIP2_CG_residence_time_data/'
cg_dataFile_list=[]
for dataFileName in os.listdir(cg_RawData_dir):
    if 'xvg' in dataFileName:
        cg_dataFile_list.append(dataFileName)
cg_dataFile_list

['id_4236_mask.xvg',
 'id_3365_mask.xvg',
 'id_2250_mask.xvg',
 'id_3630_mask.xvg',
 'id_3456_mask.xvg',
 'id_843_mask.xvg',
 'id_3516_mask.xvg',
 'id_548_mask.xvg',
 'id_186_mask.xvg',
 'id_3043_mask.xvg',
 'id_1988_mask.xvg',
 'id_3679_mask.xvg',
 'id_4110_mask.xvg',
 'id_2218_mask.xvg',
 'id_373_mask.xvg',
 'id_1483_mask.xvg',
 'id_1813_mask.xvg',
 'id_3034_mask.xvg',
 'id_1354_mask.xvg',
 'id_2857_mask.xvg',
 'id_162_mask.xvg',
 'id_3794_mask.xvg',
 'id_577_mask.xvg',
 'id_244_mask.xvg',
 'id_4026_mask.xvg',
 'id_245_mask.xvg',
 'id_3529_mask.xvg',
 'id_2772_mask.xvg',
 'id_782_mask.xvg',
 'id_2521_mask.xvg',
 'id_1046_mask.xvg',
 'id_2388_mask.xvg',
 'id_3415_mask.xvg',
 'id_1983_mask.xvg',
 'id_378_mask.xvg',
 'id_2212_mask.xvg',
 'id_1580_mask.xvg',
 'id_4013_mask.xvg',
 'id_800_mask.xvg',
 'id_788_mask.xvg',
 'id_475_mask.xvg',
 'id_613_mask.xvg',
 'id_2325_mask.xvg',
 'id_3523_mask.xvg',
 'id_1642_mask.xvg',
 'id_680_mask.xvg',
 'id_2416_mask.xvg',
 'id_3350_mask.xvg',
 'id_17

In [124]:
#Each of the above files contains a header followed by a 2 column
#(time, occupancy) series, which can be extracted using pd.read_csv.
#NOTE(review): this comment originally described an 18 line header, but only
#17 rows are skipped below - confirm against the raw files.
cgDataTables=[]
for dataFileName in tqdm.tqdm_notebook(cg_dataFile_list):
    #the second '_' separated field of the file name is the simulation residue id
    resID=int(dataFileName.split('_')[1])
    #simulation residue ids are 1-based, the back-map list is 0-based
    seqID=simResid_to_pdbResid[resID-1]
    #sep=r'\s+' splits on any run of whitespace; it replaces the
    #delim_whitespace=True flag, which newer pandas releases deprecate
    tempTable=pd.read_csv(
        cg_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        sep=r'\s+')
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    #column selection already yields a new frame, so no extra deep copy is needed
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    cgDataTables.append(tempTable)

cg_occupancy_data=pd.concat(cgDataTables)
cg_occupancy_data.to_csv(baseDir+"Coarse_Grain_Occupancy_Data.csv",index=False)
cg_occupancy_data.head()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))




Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy
0,4236,2528,0,0.0,0
1,4236,2528,1,1000.0,0
2,4236,2528,2,2000.0,0
3,4236,2528,3,3000.0,0
4,4236,2528,4,4000.0,0


In [125]:
#The coarse grain data table is huge, so we need to split it into chunks
#of 100000 lines each. The csv header row is written once, so it ends up
#in the first chunk only.
cgOccupancyCsv=baseDir+"Coarse_Grain_Occupancy_Data.csv"
#An argument list (no shell) keeps paths containing spaces intact, and
#check_call raises CalledProcessError if split fails, so the full csv is
#only removed after the chunks were written successfully.
subprocess.check_call(["split","-l","100000",
                       cgOccupancyCsv,
                       baseDir+"Coarse_Grain_Occupancy_Data.chunk."])
os.remove(cgOccupancyCsv)

0

Next, let's collect the occupancy data for the all-atom simulation into a single data frame.

In [126]:
#collect the name of every entry in the all atom raw data directory
#whose file name contains 'xvg'
aa_RawData_dir=baseDir+'Raw_PIP2_AA_residence_time_data/'
aa_dataFile_list=[dataFileName for dataFileName in os.listdir(aa_RawData_dir) \
             if 'xvg' in dataFileName]

#Each of the above files contains a header followed by a 2 column
#(time, occupancy) series, which can be extracted using pd.read_csv.
#NOTE(review): this comment originally described an 18 line header, but only
#17 rows are skipped below - confirm against the raw files.
aaDataTables=[]
for dataFileName in tqdm.tqdm_notebook(aa_dataFile_list):
    #the second '_' separated field of the file name is the simulation residue id
    resID=int(dataFileName.split('_')[1])
    #simulation residue ids are 1-based, the back-map list is 0-based
    seqID=simResid_to_pdbResid[resID-1]
    #sep=r'\s+' splits on any run of whitespace; it replaces the
    #delim_whitespace=True flag, which newer pandas releases deprecate
    tempTable=pd.read_csv(
        aa_RawData_dir+dataFileName,
        skiprows=17,names=['Time','Occupancy'],
        sep=r'\s+')
    tempTable['ResID']=resID
    tempTable['SeqID']=seqID
    tempTable['Frame']=np.arange(len(tempTable))
    #column selection already yields a new frame, so no extra deep copy is needed
    tempTable=tempTable[['ResID','SeqID','Frame','Time','Occupancy']]
    aaDataTables.append(tempTable)

aa_occupancy_data=pd.concat(aaDataTables)
aa_occupancy_data.to_csv(baseDir+"All_Atom_Occupancy_Data.csv",index=False)
aa_occupancy_data.head()

HBox(children=(IntProgress(value=0, max=429), HTML(value=u'')))




Unnamed: 0,ResID,SeqID,Frame,Time,Occupancy
0,4236,2528,0,0.0,0
1,4236,2528,1,1.0,0
2,4236,2528,2,2.0,0
3,4236,2528,3,3.0,0
4,4236,2528,4,4.0,0
