In [1]:
#!git clone https://github.com/wesleymsmith/Piezo_PIP2_binding_analysis.git
#!pip install bokeh
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import scipy as sp
from scipy import stats

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import os
import sys
import gc
import copy
import glob

import tqdm
import itertools

import bokeh
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, CDSView, GroupFilter, HoverTool
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6

# Load Occupancy Data

If you are just trying to view pre-generated convergence data, then skip this section and go to the 'Load Convergence Data' Section

In [64]:
baseDir='.'

aaOccupancyFile="All_Atom_Occupancy_Data.2us.csv"
aaFrameRate=.6

print 'reading all atom data'
aa_occupancy_data=pd.read_csv('/'.join([baseDir,aaOccupancyFile]))
aa_occupancy_data.Time=aa_occupancy_data.Frame*aaFrameRate
aa_occupancy_data['SimType']='AA'
aa_occupancy_data['Rep']=0

print aa_occupancy_data.head()

cgOccupancyFileBase="Coarse_Grain_Occupancy_Data.chunk"
cgFrameRate=1.0

cgOccupancyFileList=[fileName for fileName in \
                     os.listdir(baseDir) if \
                     cgOccupancyFileBase in fileName]
#print cgOccupancyFileList
cgOccupancyChunks=[]
print 'Reading coarse grain data chunks'
cgColumns=[]
for iFile,fileName in tqdm.tqdm_notebook(enumerate(cgOccupancyFileList)):
    if 'chunk.aa' in fileName:
        cgOccupancyChunks.append(pd.read_csv('/'.join([baseDir,fileName])))
        print "cg occupancy file header:"
        cgColumns=cgOccupancyChunks[iFile].columns
        print cgColumns
    else:
        cgOccupancyChunks.append(pd.read_csv('/'.join([baseDir,fileName]),
            header=None))
for cgFrame in cgOccupancyChunks:
    cgFrame.columns=cgColumns
cg_occupancy_data=pd.concat(cgOccupancyChunks)
del(cgOccupancyChunks)
cg_occupancy_data.Time=cg_occupancy_data.Frame*cgFrameRate
cg_occupancy_data['Rep']=0
cg_occupancy_data['SimType']='CG'
print cg_occupancy_data.head()
gc.collect()

joint_occupancy_data=pd.concat([aa_occupancy_data,cg_occupancy_data])
joint_occupancy_data.head()

reading all atom data
   ResID  SeqID  Frame  Time  Occupancy SimType  Rep
0   4236   2528      0   0.0          0      AA    0
1   4236   2528      1   0.6          0      AA    0
2   4236   2528      2   1.2          0      AA    0
3   4236   2528      3   1.8          0      AA    0
4   4236   2528      4   2.4          0      AA    0
Reading coarse grain data chunks


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

cg occupancy file header:
Index([u'ResID', u'SeqID', u'Frame', u'Time', u'Occupancy'], dtype='object')

   ResID  SeqID  Frame    Time  Occupancy  Rep SimType
0    589   1497   3866  3866.0          0    0      CG
1    589   1497   3867  3867.0          1    0      CG
2    589   1497   3868  3868.0          1    0      CG
3    589   1497   3869  3869.0          1    0      CG
4    589   1497   3870  3870.0          1    0      CG


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Frame,Occupancy,Rep,ResID,SeqID,SimType,Time
0,0,0,0,4236,2528,AA,0.0
1,1,0,0,4236,2528,AA,0.6
2,2,0,0,4236,2528,AA,1.2
3,3,0,0,4236,2528,AA,1.8
4,4,0,0,4236,2528,AA,2.4


In [124]:
groupingCols=['ResID','SeqID','Rep','SimType']
with pd.option_context('mode.use_inf_as_null',True):
    analysisData=joint_occupancy_data
    runType='Occupancy'
    analysisData[runType+'.Accumulation']=analysisData.groupby(
        groupingCols)[runType].cumsum()
    analysisData[runType+'.Shifted']=analysisData[runType].shift(1)
    analysisData[runType+'.Shifted']=analysisData[runType+'.Shifted'].map(
        lambda x: x if np.isfinite(x) else 0)
    analysisData[runType+'.delta']=analysisData[runType] - \
        analysisData[runType+'.Shifted']
    analysisData[runType+'.delta']=analysisData[runType+'.delta'].map(
        lambda x: x if np.isfinite(x) else 0)
    analysisData[runType+'.delta']=analysisData[runType+'.delta']>0
    #analysisData=analysisData.dropna()
    analysisData[runType+'.Nruns']=analysisData.groupby(
        groupingCols)[runType+'.delta'].cumsum()
    analysisData[runType+'.Nruns']=analysisData[runType+'.Nruns'].map(
        lambda x: np.max([x-2,1.0]))
    analysisData[runType+'.CumulativeMean']=aaFrameRate * \
        analysisData[runType+'.Accumulation'] / \
        analysisData[runType+'.Nruns']
    
    runType='Vacancy'
    analysisData[runType]=analysisData.Occupancy==0
    analysisData[runType+'.Accumulation']=analysisData.groupby(
        groupingCols)[runType].cumsum()
    analysisData[runType+'.Shifted']=analysisData[runType].shift(1)
    analysisData[runType+'.Shifted']=analysisData[runType+'.Shifted'].map(
        lambda x: x if np.isfinite(x) else 0)
    analysisData[runType+'.delta']=analysisData[runType] - \
        analysisData[runType+'.Shifted']
    analysisData[runType+'.delta']=analysisData[runType+'.delta'].map(
        lambda x: x if np.isfinite(x) else 0)
    analysisData[runType+'.delta']=analysisData[runType+'.delta']>0
    #analysisData=analysisData.dropna()
    analysisData[runType+'.Nruns']=analysisData.groupby(
        groupingCols)[runType+'.delta'].cumsum()
    analysisData[runType+'.Nruns']=analysisData[runType+'.Nruns'].map(
        lambda x: np.max([x-2,1.0]))
    analysisData[runType+'.CumulativeMean']=aaFrameRate * \
        analysisData[runType+'.Accumulation'] / \
        analysisData[runType+'.Nruns']
analysisData.head()

Unnamed: 0,Frame,Occupancy,Rep,ResID,SeqID,SimType,Time,CumulativeOccupancy,CumulativeMean,ShiftedOcc,...,Occupancy.Shifted,Occupancy.delta,Occupancy.Nruns,Occupancy.CumulativeMean,Vacancy,Vacancy.Accumulation,Vacancy.Shifted,Vacancy.delta,Vacancy.Nruns,Vacancy.CumulativeMean
0,0,0,0,4236,2528,AA,0.0,0,0.0,,...,0.0,False,1.0,0.0,True,1.0,0,True,1.0,0.6
1,1,0,0,4236,2528,AA,0.6,0,0.0,0.0,...,0.0,False,1.0,0.0,True,2.0,True,False,1.0,1.2
2,2,0,0,4236,2528,AA,1.2,0,0.0,0.0,...,0.0,False,1.0,0.0,True,3.0,True,False,1.0,1.8
3,3,0,0,4236,2528,AA,1.8,0,0.0,0.0,...,0.0,False,1.0,0.0,True,4.0,True,False,1.0,2.4
4,4,0,0,4236,2528,AA,2.4,0,0.0,0.0,...,0.0,False,1.0,0.0,True,5.0,True,False,1.0,3.0


In [134]:
analysisData.to_csv(baseDir+'/'+'Convergence_Analysis_Data.csv',index=False)

In [136]:
#The convergence data table is huge, so we need to split it into chunks
os.system("split -l 100000 "+\
          baseDir+'/'+"Convergence_Analysis_Data.csv "+\
          baseDir+'/'+"Convergence_Analysis_Data.chunk.")
os.system("rm "+baseDir+'/'+"Convergence_Analysis_Data.csv")

0

# Load Convergence Data

You can start here if you only need to view pre-generated convergence data

In [2]:
baseDir="."
ConvergenceFileBase="Convergence_Analysis_Data.chunk."
convergenceFrameRate=1.0

ConvergenceFileList=[fileName for fileName in \
                     os.listdir(baseDir) if \
                     (ConvergenceFileBase in fileName) & \
                     (not ('swp' in fileName))]
#print ConvergenceFileList
ConvergenceChunks=[]
print 'Reading convergence data chunks'
convergenceColumns=[]
with tqdm.tqdm_notebook(enumerate(ConvergenceFileList)) as pbar:
    for iFile,fileName in enumerate(ConvergenceFileList):
        pbar.set_description(fileName)
        if 'chunk.aa' in fileName:
            ConvergenceChunks.append(pd.read_csv('/'.join([baseDir,fileName])))
            print "convergence occupancy file header:"
            convergenceColumns=ConvergenceChunks[iFile].columns
            print convergenceColumns
        else:
            ConvergenceChunks.append(pd.read_csv('/'.join([baseDir,fileName]),
                header=None))
        pbar.update()
for convergenceFrame in ConvergenceChunks:
    convergenceFrame.columns=convergenceColumns
convergence_data=pd.concat(ConvergenceChunks)
del(ConvergenceChunks)
print convergence_data.head()
gc.collect()

Reading convergence data chunks


HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

  interactivity=interactivity, compiler=compiler, result=result)


convergence occupancy file header:
Index([u'Frame', u'Occupancy', u'Rep', u'ResID', u'SeqID', u'SimType', u'Time',
       u'CumulativeOccupancy', u'CumulativeMean', u'ShiftedOcc', u'deltaOcc',
       u'Nruns', u'Occupancy.Accumulation', u'Occupancy.Shifted',
       u'Occupancy.delta', u'Occupancy.Nruns', u'Occupancy.CumulativeMean',
       u'Vacancy', u'Vacancy.Accumulation', u'Vacancy.Shifted',
       u'Vacancy.delta', u'Vacancy.Nruns', u'Vacancy.CumulativeMean'],
      dtype='object')

   Frame  Occupancy  Rep  ResID  SeqID SimType    Time  CumulativeOccupancy  \
0   9482          0    0   2330   2040      CG  9482.0                 4391   
1   9483          0    0   2330   2040      CG  9483.0                 4391   
2   9484          1    0   2330   2040      CG  9484.0                 4392   
3   9485          1    0   2330   2040      CG  9485.0                 4393   
4   9486          0    0   2330   2040      CG  9486.0                 4393   

   CumulativeMean  ShiftedOcc  .

0

In [3]:
analysisData=convergence_data
print analysisData.Rep.unique()
analysisData.head()

[0]


Unnamed: 0,Frame,Occupancy,Rep,ResID,SeqID,SimType,Time,CumulativeOccupancy,CumulativeMean,ShiftedOcc,...,Occupancy.Shifted,Occupancy.delta,Occupancy.Nruns,Occupancy.CumulativeMean,Vacancy,Vacancy.Accumulation,Vacancy.Shifted,Vacancy.delta,Vacancy.Nruns,Vacancy.CumulativeMean
0,9482,0,0,2330,2040,CG,9482.0,4391,1.855352,1.0,...,1.0,False,1420.0,1.855352,True,5092.0,False,True,1420.0,2.151549
1,9483,0,0,2330,2040,CG,9483.0,4391,1.855352,0.0,...,0.0,False,1420.0,1.855352,True,5093.0,True,False,1420.0,2.151972
2,9484,1,0,2330,2040,CG,9484.0,4392,1.855775,0.0,...,0.0,True,1421.0,1.854469,False,5093.0,True,False,1420.0,2.151972
3,9485,1,0,2330,2040,CG,9485.0,4393,1.856197,1.0,...,1.0,False,1421.0,1.854891,False,5093.0,False,False,1420.0,2.151972
4,9486,0,0,2330,2040,CG,9486.0,4393,1.854891,1.0,...,1.0,False,1421.0,1.854891,True,5094.0,False,True,1421.0,2.15088


# Visualize Convergence Data

In [4]:
@interact
def plot_cumMeans(resid=analysisData[analysisData['Occupancy.Accumulation']>0].ResID.sort_values().unique(),
                  simType=analysisData.SimType.unique(),
                  show_occ_mean=widgets.ToggleButton(description='Occupancy_Mean',
                                                     value=True),
                  show_occ_nruns=widgets.ToggleButton(description='Occupancy_Runs'),
                  show_occ_accrued=widgets.ToggleButton(description='Occupancy_Accrued'),
                  show_vac_mean=widgets.ToggleButton(description='Vacancy_Mean'),
                  show_vac_nruns=widgets.ToggleButton(description='Vacancy_Runs'),
                  show_vac_accrued=widgets.ToggleButton(description='Vacancy_Accrued'),
                 ):
    tempData=analysisData[(analysisData.ResID==resid) & \
                          (analysisData.SimType==simType)]
    plt.figure(figsize=(12,9))
    for rep in tempData.Rep.unique():
        plotData=tempData[tempData.Rep==rep]
        if show_occ_mean:
            plt.plot(plotData.Time,
                     plotData['Occupancy.CumulativeMean'],
                 label='occupancy_mean',c='green')
        if show_occ_accrued:
            plt.plot(plotData.Time,
                        np.array(plotData['Occupancy.Accumulation'],dtype=float),
                     label='occupancy_occ',c='yellow')
        if show_occ_nruns:
            plt.plot(plotData.Time,
                     plotData['Occupancy.Nruns'],c='cyan',
                     label='occupancy_nruns')
        if show_vac_mean:
            plt.plot(plotData.Time,
                     plotData['Vacancy.CumulativeMean'],
                 label='vacancy_mean',c='magenta')
        if show_vac_accrued:
            plt.plot(plotData.Time,
                        np.array(plotData['Vacancy.Accumulation'],dtype=float),
                     label='vacancy_occ',c='red')
        if show_vac_nruns:
            plt.plot(plotData.Time,
                     plotData['Vacancy.Nruns'],c='blue',
                     label='vacancy_nruns')
    plt.legend()
    plt.show()

aW50ZXJhY3RpdmUoY2hpbGRyZW49KERyb3Bkb3duKGRlc2NyaXB0aW9uPXUncmVzaWQnLCBvcHRpb25zPSgxLCAxNSwgMjEsIDIyLCAzMCwgNjMsIDY1LCAxNjEsIDE2MiwgMTY3LCAxNjgsIDHigKY=


# Ranking Convergence

In [5]:
rankingData=analysisData[np.concatenate([
    ['SimType','Rep','Time','ResID','SeqID'],
    [colName for colName in analysisData.columns \
     if ('.Accumulation' in colName) | \
        ('.Nruns' in colName) | \
        ('.CumulativeMean' in colName)]])]
rankingData.head()

Unnamed: 0,SimType,Rep,Time,ResID,SeqID,Occupancy.Accumulation,Occupancy.Nruns,Occupancy.CumulativeMean,Vacancy.Accumulation,Vacancy.Nruns,Vacancy.CumulativeMean
0,CG,0,9482.0,2330,2040,4391,1420.0,1.855352,5092.0,1420.0,2.151549
1,CG,0,9483.0,2330,2040,4391,1420.0,1.855352,5093.0,1420.0,2.151972
2,CG,0,9484.0,2330,2040,4392,1421.0,1.854469,5093.0,1420.0,2.151972
3,CG,0,9485.0,2330,2040,4393,1421.0,1.854891,5093.0,1420.0,2.151972
4,CG,0,9486.0,2330,2040,4393,1421.0,1.854891,5094.0,1421.0,2.15088


In [149]:
testDat=rankingData[(rankingData.Time>100.0) & (rankingData.Time<110.0)]
testDat.shape

(11154, 11)

In [164]:
rankKeyCols=['SimType','Rep','Time']
rankValueCols=['Occupancy.CumulativeMean','Vacancy.CumulativeMean']
rankDataCols=np.concatenate([rankKeyCols,rankValueCols])
rankFrame=testDat[rankDataCols].groupby(rankKeyCols).rank(axis=1,ascending=False)
rankFrame.columns=rankFrame.columns.map(lambda x: x+'.Rank')
testDat[testDat.SimType=='AA'][
        ['SimType','Rep','Time','ResID','SeqID','Occupancy.CumulativeMean']
    ].join(other=rankFrame).sort_values(['SeqID','Time','Rep','SimType']).head(n=15)

Unnamed: 0,SimType,Rep,Time,ResID,SeqID,Occupancy.CumulativeMean,Occupancy.CumulativeMean.Rank,Vacancy.CumulativeMean.Rank
186871,AA,0,100.2,2837,782,0.0,254.5,175.5
903681,AA,0,100.2,1419,782,0.0,254.5,175.5
1177069,AA,0,100.2,1,782,4.2,56.5,372.0
186872,AA,0,100.8,2837,782,0.0,254.5,175.5
903682,AA,0,100.8,1419,782,0.0,254.5,175.5
1177070,AA,0,100.8,1,782,4.2,56.5,372.0
186873,AA,0,101.4,2837,782,0.0,254.5,175.5
903683,AA,0,101.4,1419,782,0.0,254.5,175.5
1177071,AA,0,101.4,1,782,3.75,58.0,372.0
186874,AA,0,102.0,2837,782,0.0,254.5,175.5


In [7]:
rankKeyCols=['SimType','Rep','Time']
rankValueCols=['Occupancy.CumulativeMean','Vacancy.CumulativeMean']
rankFrameCols=np.concatenate([rankKeyCols,rankValueCols])

rankDataCols=np.concatenate([rankKeyCols,['ResID','SeqID'],rankValueCols])
rankData=rankingData[rankDataCols]
print 'computing ranking'
rankFrame=rankData[rankFrameCols].groupby(rankKeyCols).rank(axis=1,ascending=False,method='dense')
rankFrame.columns=rankFrame.columns.map(lambda x: x+'.Rank')
print 'merging data'
rankData=rankData[rankData.SimType=='AA'][
        ['SimType','Rep','Time','ResID','SeqID','Occupancy.CumulativeMean']
    ].join(other=rankFrame).sort_values(['SeqID','Time','Rep','SimType'])
rankData.head()

computing ranking
merging data


Unnamed: 0,SimType,Rep,Time,ResID,SeqID,Occupancy.CumulativeMean,Occupancy.CumulativeMean.Rank,Vacancy.CumulativeMean.Rank
3515,AA,0,0.0,1419,782,0.0,74.0,23.0
3515,AA,0,0.0,1419,782,0.0,102.0,18.0
3515,AA,0,0.0,1419,782,0.0,107.0,10.0
3515,AA,0,0.0,1419,782,0.0,79.0,22.0
3515,AA,0,0.0,1419,782,0.0,39.0,63.0


In [None]:
@interact
def plot_rank_fluctuation(
    seqID=rankData.SeqID.unique(),
    simType=rankData.SimType.unique(),
    rankProperty=[colName for colName in rankData.columns if 'Rank' in colName]):
    
    tempData=rankData[
        (rankData.SeqID==seqID) & \
        (rankData.SimType==simType)]
    
    plt.figure(figsize=(12,12))
    for resid in tempData.ResID.unique():
        plotData=tempData[tempData.ResID==resid]
        plt.plot(plotData['Time'],plotData[rankProperty],
                 label='Resid=%s'%resid)
    plt.title('Ranking convergence for %s of %s SeqID=%s'%(
                rankProperty,simType,seqID))
    plt.legend()
    plt.show()