In [11]:
import numpy as np
import scipy as sp
#import gridData as gd
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
#import pytraj as pt
import py3Dmol as pmol

import sys
import os
import tqdm
import gc
import copy

from collections import defaultdict
import itertools

import plotly as ply
ply.io.renderers.default="notebook"

## Testing py3Dmol for molecule viewing

In [2]:
p = pmol.view(query='pdb:1ycr')
p.setStyle({'cartoon': {'color':'spectrum'}})
p.show()

## testing plotly for interactive 3D rendering

In [13]:
#Toy test system to test rendering of 3d scatter plots with plotly
pointsList=np.array([
    np.array([0.,0.,10.])+np.random.random(3),
    np.array([0.,0.,-10.])+np.random.random(3),
    np.array([1.,0.,0.])*5.+np.random.random(3),
    np.array([np.cos(120.*np.pi/180.),np.sin(120.*np.pi/180.),0])*5.+np.random.random(3),
    np.array([np.cos(-120*np.pi/180.),np.sin(-120.*np.pi/180.),0])*5.+np.random.random(3)
])
pointsFrame=pd.DataFrame(
    {'X':pointsList[:,0],
     'Y':pointsList[:,1],
     'Z':pointsList[:,2]})
pointsFrame

Unnamed: 0,X,Y,Z
0,0.324507,0.018762,10.064975
1,0.511473,0.171533,-9.8027
2,5.025049,0.497257,0.569166
3,-1.554803,5.141253,0.080946
4,-2.376186,-3.565475,0.697159


In [14]:
import plotly as ply

In [15]:
ply.io.renderers.default="notebook"

In [16]:
go=ply.graph_objs
fig=go.Figure(data=[go.Scatter3d(x=pointsFrame['X'],y=pointsFrame['Y'],z=pointsFrame['Z'],
                                mode='markers')])
fig.show()

# Load Coordinate data sets

In [7]:
dataDir="simulationData"

potFilePattern="pot_"
potDataFiles=[fileName for fileName in os.listdir(dataDir) \
              if potFilePattern in fileName]
print("potassium data file list:",potDataFiles)

print()
alphaFilePattern="atomCA_"
alphaDataFiles=[fileName for fileName in os.listdir(dataDir) \
                if alphaFilePattern in fileName] 
print("alpha carbon control atom data file list:",alphaDataFiles)

potassium data file list: ['pot_165552.dat', 'pot_165627.dat', 'pot_165623.dat', 'pot_165554.dat', 'pot_165581.dat', 'pot_165557.dat', 'pot_165530.dat', 'pot_165872.dat', 'pot_165711.dat', 'pot_165891.dat', 'pot_166052.dat', 'pot_165570.dat', 'pot_166069.dat', 'pot_166136.dat', 'pot_166095.dat', 'pot_165774.dat']

alpha carbon control atom data file list: ['atomCA_resid2377.dat', 'atomCA_resid1178.dat', 'atomCA_resid3796.dat', 'atomCA_resid958.dat', 'atomCA_resid1366.dat']


In [9]:
atomDataFrames=[]
for potFile in tqdm.tqdm_notebook(potDataFiles,desc='Loading Potassium Data'):
    potID=potFile.split('.')[0].split('_')[-1]
    potFilePath='/'.join([dataDir,potFile])
    tempFrame=pd.read_csv(potFilePath,delim_whitespace=True,skiprows=1,
                          names=['Frame','X','Y','Z','Unused1','Unused2','Unused3'])
    tempFrame=tempFrame.drop(columns=[colName for colName in tempFrame.columns \
                                      if 'Unused' in colName])
    tempFrame['AtomID']=potID
    tempFrame['AtomType']='POT'
    tempFrame=tempFrame[['AtomID','AtomType','Frame','X','Y','Z']]
    atomDataFrames.append(tempFrame.copy())
    
for alphaFile in tqdm.tqdm_notebook(alphaDataFiles,'Loading Alpha Carbon Data'):
    alphaID=alphaFile.split('.')[0].split('_')[-1].replace('resid','')
    alphaFilePath='/'.join([dataDir,alphaFile])
    tempFrame=pd.read_csv(alphaFilePath,delim_whitespace=True,skiprows=1,
                          names=['Frame','X','Y','Z','Unused1','Unused2','Unused3'])
    tempFrame=tempFrame.drop(columns=[colName for colName in tempFrame.columns \
                                      if 'Unused' in colName])
    tempFrame['AtomID']=alphaID
    tempFrame['AtomType']='CA'
    tempFrame=tempFrame[['AtomID','AtomType','Frame','X','Y','Z']]
    atomDataFrames.append(tempFrame.copy())
    
atomDataFrame=pd.concat(atomDataFrames)
atomDataFrames=[]
del(atomDataFrames)
atomDataFrame.head()

HBox(children=(IntProgress(value=0, description='Loading Potassium Data', max=16, style=ProgressStyle(descript…




HBox(children=(IntProgress(value=0, description='Loading Alpha Carbon Data', max=5, style=ProgressStyle(descri…




Unnamed: 0,AtomID,AtomType,Frame,X,Y,Z
0,165552,POT,1,109.3891,81.3629,25.0487
1,165552,POT,2,105.4785,241.4059,92.7848
2,165552,POT,3,106.965,240.9525,91.4489
3,165552,POT,4,106.4484,240.0916,91.492
4,165552,POT,5,105.1581,240.6961,91.8088


In [10]:
atomDataFrame.AtomType.unique()

array(['POT', 'CA'], dtype=object)

In [11]:
atomDataFrame.to_csv('/'.join([dataDir,'atom_coordinate_data_table.csv']),
                     index=False)

## Visualize Loaded Atom Data as 3D scatter plot

In [12]:
atomDataFrame=pd.read_csv('/'.join([dataDir,'atom_coordinate_data_table.csv']))
atomDataFrame.head()

Unnamed: 0,AtomID,AtomType,Frame,X,Y,Z
0,165552,POT,1,109.3891,81.3629,25.0487
1,165552,POT,2,105.4785,241.4059,92.7848
2,165552,POT,3,106.965,240.9525,91.4489
3,165552,POT,4,106.4484,240.0916,91.492
4,165552,POT,5,105.1581,240.6961,91.8088


In [13]:
go=ply.graph_objs
fig=go.Figure(
    data=[
            go.Scatter3d(x=atomDataFrame.query('AtomType == "POT"')['X'],
                         y=atomDataFrame.query('AtomType == "POT"')['Y'],
                         z=atomDataFrame.query('AtomType == "POT"')['Z'],
                         mode='markers',
                         marker=dict(size=2,
                                     color='blue',
                                     colorscale='Viridis',
                                     opacity=.125)),
            go.Scatter3d(x=atomDataFrame.query('AtomType == "CA"')['X'],
                         y=atomDataFrame.query('AtomType == "CA"')['Y'],
                         z=atomDataFrame.query('AtomType == "CA"')['Z'],
                         mode='markers',
                         marker=dict(size=12,color='red',opacity=.75))
         ],
    layout=dict(width=800,height=600)
)
fig.show()

# Compute Potassium to Alpha Carbon Distances

In [14]:
tempFrames=[]
baseFrame=atomDataFrame.set_index('Frame')
for groupName,groupData in \
    tqdm.tqdm_notebook(atomDataFrame.query('AtomType == "CA"').groupby('AtomID'),
                       desc='Computing Distances'):
    tempFrame=baseFrame.join(groupData.set_index('Frame')[['X','Y','Z']],
                             rsuffix='_ref',how='outer')
    tempFrame['RefAtomID']=groupName
    tempFrame['Distance']=tempFrame[['X','Y','Z','X_ref','Y_ref','Z_ref']].apply(
        lambda x: np.sqrt(np.sum(
            [(x[0]-x[3])**2,
             (x[1]-x[4])**2,
             (x[2]-x[5])**2])),
        axis=1)
    tempFrames.append(tempFrame.copy())

diffData=pd.concat(tempFrames)
tempFrames=[]
del(tempFrames)
gc.collect()
diffData.head()

HBox(children=(IntProgress(value=0, description='Computing Distances', max=5, style=ProgressStyle(description_…




Unnamed: 0_level_0,AtomID,AtomType,X,Y,Z,X_ref,Y_ref,Z_ref,RefAtomID,Distance
Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,165552,POT,109.3891,81.3629,25.0487,123.2928,110.994,58.3107,958,46.665572
1,165627,POT,32.3418,165.33,37.5258,123.2928,110.994,58.3107,958,107.96526
1,165623,POT,187.0743,90.5251,102.1417,123.2928,110.994,58.3107,958,80.05131
1,165554,POT,42.9653,78.8911,51.0144,123.2928,110.994,58.3107,958,86.812093
1,165581,POT,57.8748,122.7727,107.9835,123.2928,110.994,58.3107,958,82.979754


In [15]:
diffCoords=diffData.drop(
        columns=[colName for colName in diffData.columns if 'ref' in colName]
    ).reset_index().pivot_table(index=['Frame','AtomID','AtomType','X','Y','Z'],
                                columns='RefAtomID',values='Distance')
diffCoords.columns=np.array(diffCoords.columns.map(lambda x: 'D_%g'%(x)))
diffCoords=diffCoords.reset_index()
diffCoords.head()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,D_958,D_1178,D_1366,D_2377,D_3796
0,1,958,CA,123.2928,110.994,58.3107,0.0,92.177611,41.136629,70.165424,63.56818
1,1,1178,CA,76.6708,159.4728,121.3417,92.177611,0.0,96.855696,79.014331,72.235503
2,1,1366,CA,95.1964,126.0866,32.3294,41.136629,96.855696,0.0,51.106721,51.621998
3,1,2377,CA,53.3092,115.8309,59.7553,70.165424,79.014331,51.106721,0.0,75.214944
4,1,3796,CA,103.7853,171.428,55.4642,63.56818,72.235503,51.621998,75.214944,0.0


In [16]:
diffCoords.to_csv('/'.join([dataDir,'potassium_distance_data.csv']),
                  index=False)

# Visualize Potassium to Alpha Carbon Distances

We will use an ipython widget to allow the user to select which alpha carbon to use for colormapping the
potassium to alpha carbon distances.

In [17]:
diffCoords=pd.read_csv('/'.join([dataDir,'potassium_distance_data.csv']))
diffCoords.head()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,D_958,D_1178,D_1366,D_2377,D_3796
0,1,958,CA,123.2928,110.994,58.3107,0.0,92.177611,41.136629,70.165424,63.56818
1,1,1178,CA,76.6708,159.4728,121.3417,92.177611,0.0,96.855696,79.014331,72.235503
2,1,1366,CA,95.1964,126.0866,32.3294,41.136629,96.855696,0.0,51.106721,51.621998
3,1,2377,CA,53.3092,115.8309,59.7553,70.165424,79.014331,51.106721,0.0,75.214944
4,1,3796,CA,103.7853,171.428,55.4642,63.56818,72.235503,51.621998,75.214944,0.0


In [18]:
diffCoords.sort_values(['AtomID','Frame']).tail()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,D_958,D_1178,D_1366,D_2377,D_3796
43616,2077,166136,POT,80.8816,223.3779,20.3468,127.960128,112.182235,96.202377,112.633095,62.610885
43637,2078,166136,POT,88.1827,216.8769,20.8253,119.425785,108.561632,88.875757,108.786181,54.407042
43658,2079,166136,POT,85.9101,213.5211,22.8924,116.520063,105.470168,85.208274,104.386001,51.314128
43679,2080,166136,POT,87.5495,208.5846,24.3888,110.868936,102.451812,80.666047,99.734712,47.000275
43700,2081,166136,POT,91.2438,200.8141,23.9601,103.45252,99.975768,72.538005,94.789927,40.659838


In [19]:
import ipywidgets as widgets
from ipywidgets import interact

In [20]:
@interact
def color_by_distance(distanceColumn=[colName for colName in diffCoords.columns \
                                      if 'D_' in colName]):
    go=ply.graph_objs
    fig=go.Figure(
        data=[
                go.Scatter3d(x=diffCoords.query('AtomType == "POT"')['X'],
                             y=diffCoords.query('AtomType == "POT"')['Y'],
                             z=diffCoords.query('AtomType == "POT"')['Z'],
                             mode='markers',
                             marker=dict(size=2,
                                         color=diffCoords.query(
                                                 'AtomType == "POT"'
                                             )[distanceColumn],
                                         colorscale='RdBu',
                                         opacity=.125)),
                go.Scatter3d(
                    x=diffCoords.query(
                            '(AtomType == "CA") and (AtomID != %s)'%(distanceColumn.split('_')[-1])
                        )['X'],
                    y=diffCoords.query(
                            '(AtomType == "CA") and (AtomID != %s)'%(distanceColumn.split('_')[-1])
                        )['Y'],
                    z=diffCoords.query(
                            '(AtomType == "CA") and (AtomID != %s)'%(distanceColumn.split('_')[-1])
                        )['Z'],
                    mode='markers',
                    marker=dict(size=12,
                                color='black',
                                opacity=.25)),
                go.Scatter3d(x=diffCoords.query('AtomID == %s'%(distanceColumn.split('_')[-1]))['X'],
                             y=diffCoords.query('AtomID == %s'%(distanceColumn.split('_')[-1]))['Y'],
                             z=diffCoords.query('AtomID == %s'%(distanceColumn.split('_')[-1]))['Z'],
                             mode='markers',
                             marker=dict(size=12,color='red',opacity=.75))
             ],
        layout=dict(width=800,height=600)
    )
    fig.show()

interactive(children=(Dropdown(description='distanceColumn', options=('D_958', 'D_1178', 'D_1366', 'D_2377', '…

# Compute PCA projection

In [21]:
import sklearn as skl
from sklearn.decomposition import PCA

In [22]:
Xdata=diffCoords.query('AtomType == "POT"')[diffCoords.columns[-5:]]
Xdata.head()

Unnamed: 0,D_958,D_1178,D_1366,D_2377,D_3796
5,132.703255,158.73605,161.974605,148.917295,185.519797
6,46.665572,128.234104,47.483162,74.414691,95.227241
7,86.812093,112.140092,72.832906,39.343976,110.823964
8,85.844916,99.317973,92.829412,60.282709,120.622686
9,64.929022,102.972238,24.265077,47.409354,58.21416


In [23]:
pca=PCA()

pca.fit(Xdata)

pca_coords=pca.transform(diffCoords[diffCoords.columns[-5:]])
pca_coords[:10]

array([[ -92.36958027,  -43.23515844,  -20.98241854,  -39.53352024,
          -8.62474089],
       [ -61.67236156,   80.78070756,   34.81190576,  -24.19951003,
          15.48067544],
       [-109.6038469 ,  -46.11865923,  -16.529022  ,   14.96258879,
          16.22040298],
       ...,
       [ -26.22338032,  -30.64675317,   42.40147382,   20.22039947,
          -7.77121906],
       [  -8.09234265,  -12.6624656 ,   41.36420923,   -0.71243343,
          -4.90427829],
       [ -84.66814926,  -33.20334896,   -3.63380097,   27.72520521,
           4.74670254]])

In [24]:
pcaData=diffCoords.join(
    pd.DataFrame(pca_coords,
                 columns=['PCA_%g'%ii for ii in np.arange(pca_coords.shape[1])]))
pcaData.head()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,D_958,D_1178,D_1366,D_2377,D_3796,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4
0,1,958,CA,123.2928,110.994,58.3107,0.0,92.177611,41.136629,70.165424,63.56818,-92.36958,-43.235158,-20.982419,-39.53352,-8.624741
1,1,1178,CA,76.6708,159.4728,121.3417,92.177611,0.0,96.855696,79.014331,72.235503,-61.672362,80.780708,34.811906,-24.19951,15.480675
2,1,1366,CA,95.1964,126.0866,32.3294,41.136629,96.855696,0.0,51.106721,51.621998,-109.603847,-46.118659,-16.529022,14.962589,16.220403
3,1,2377,CA,53.3092,115.8309,59.7553,70.165424,79.014331,51.106721,0.0,75.214944,-92.192533,-20.807768,52.877672,25.542441,-16.465812
4,1,3796,CA,103.7853,171.428,55.4642,63.56818,72.235503,51.621998,75.214944,0.0,-101.597084,25.930154,-42.318357,17.236239,-16.867316


In [25]:
pcaData.to_csv('/'.join([dataDir,'pca_projection_data.csv']),
                  index=False)

# Visualize PCA projection data

Lets see how well the first 3 principal components do at reconstructing our data set.

In [26]:
import ipywidgets as widgets
from ipywidgets import interact_manual

In [27]:
pcaData=pd.read_csv('/'.join([dataDir,'pca_projection_data.csv']))
pcaData.head()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,D_958,D_1178,D_1366,D_2377,D_3796,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4
0,1,958,CA,123.2928,110.994,58.3107,0.0,92.177611,41.136629,70.165424,63.56818,-92.36958,-43.235158,-20.982419,-39.53352,-8.624741
1,1,1178,CA,76.6708,159.4728,121.3417,92.177611,0.0,96.855696,79.014331,72.235503,-61.672362,80.780708,34.811906,-24.19951,15.480675
2,1,1366,CA,95.1964,126.0866,32.3294,41.136629,96.855696,0.0,51.106721,51.621998,-109.603847,-46.118659,-16.529022,14.962589,16.220403
3,1,2377,CA,53.3092,115.8309,59.7553,70.165424,79.014331,51.106721,0.0,75.214944,-92.192533,-20.807768,52.877672,25.542441,-16.465812
4,1,3796,CA,103.7853,171.428,55.4642,63.56818,72.235503,51.621998,75.214944,0.0,-101.597084,25.930154,-42.318357,17.236239,-16.867316


In [28]:
@interact_manual
def color_by_distance(colorColumn=widgets.Dropdown(
                          options=[colName for colName in pcaData.columns \
                                   if ('PCA_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='D_958'),
                      xCol=widgets.Dropdown(
                          options=[colName for colName in pcaData.columns \
                                   if ('PCA_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='PCA_0'),
                      yCol=widgets.Dropdown(
                          options=[colName for colName in pcaData.columns \
                                   if ('PCA_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='PCA_1'),
                      zCol=widgets.Dropdown(
                          options=[colName for colName in pcaData.columns \
                                   if ('PCA_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='PCA_2'),
                     ):
    go=ply.graph_objs
    fig=go.Figure(
        data=[
                go.Scatter3d(x=pcaData.query('AtomType == "POT"')[xCol],
                             y=pcaData.query('AtomType == "POT"')[yCol],
                             z=pcaData.query('AtomType == "POT"')[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color=pcaData.query(
                                                 'AtomType == "POT"'
                                             )[colorColumn],
                                         colorscale='RdBu',
                                         opacity=.125)),
                go.Scatter3d(
                    x=pcaData.query(
                            '(AtomType == "CA")'
                        )[xCol],
                    y=pcaData.query(
                            '(AtomType == "CA")'
                        )[yCol],
                    z=pcaData.query(
                            '(AtomType == "CA")'
                        )[zCol],
                    mode='markers',
                    marker=dict(size=12,
                                color='black',
                                opacity=.25))
             ],
        layout=dict(
            width=800,height=600)
    )
    fig.show()

interactive(children=(Dropdown(description='colorColumn', index=3, options=('X', 'Y', 'Z', 'D_958', 'D_1178', …

In [29]:
@interact_manual
def testingFun( #'D_958', 'D_1178', 'D_1366', 'D_2377', 'D_3796'
    X_bound=widgets.FloatRangeSlider(     min=pcaData.X.min(),max=pcaData.X.max(),
                                   value=(pcaData.X.min(),pcaData.X.max())),
    
    Y_bound=widgets.FloatRangeSlider(     min=pcaData.Y.min(),max=pcaData.Y.max(),
                                   value=(pcaData.Y.min(),pcaData.Y.max())),
    
    Z_bound=widgets.FloatRangeSlider(     min=pcaData.Z.min(),max=pcaData.Z.max(),
                                   value=(pcaData.Z.min(),pcaData.Z.max())),
    
    D_958_bound=widgets.FloatRangeSlider( min=pcaData.D_958.min(),max=pcaData.D_958.max(),
                                   value=(pcaData.D_958.min(),pcaData.D_958.max())),
    
    D_1178_bound=widgets.FloatRangeSlider(min=pcaData.D_1178.min(),max=pcaData.D_1178.max(),
                                   value=(pcaData.D_1178.min(),pcaData.D_1178.max())),
    
    D_1366_bound=widgets.FloatRangeSlider(min=pcaData.D_1366.min(),max=pcaData.D_1366.max(),
                                   value=(pcaData.D_1366.min(),pcaData.D_1366.max())),
    
    D_2377_bound=widgets.FloatRangeSlider(min=pcaData.D_2377.min(),max=pcaData.D_2377.max(),
                                   value=(pcaData.D_2377.min(),pcaData.D_2377.max())),
    
    D_3796_bound=widgets.FloatRangeSlider(min=pcaData.D_3796.min(),max=pcaData.D_3796.max(),
                                   value=(pcaData.D_3796.min(),pcaData.D_3796.max())),
    
    PCA_0_bound=widgets.FloatRangeSlider( min=pcaData.PCA_0.min(),max=pcaData.PCA_0.max(),
                                   value=(pcaData.PCA_0.min(),pcaData.PCA_0.max())),
    
    PCA_1_bound=widgets.FloatRangeSlider( min=pcaData.PCA_1.min(),max=pcaData.PCA_1.max(),
                                   value=(pcaData.PCA_1.min(),pcaData.PCA_1.max())),
    
    PCA_2_bound=widgets.FloatRangeSlider( min=pcaData.PCA_1.min(),max=pcaData.PCA_1.max(),
                                   value=(pcaData.PCA_1.min(),pcaData.PCA_1.max())),
    
    PCA_3_bound=widgets.FloatRangeSlider( min=pcaData.PCA_3.min(),max=pcaData.PCA_3.max(),
                                   value=(pcaData.PCA_3.min(),pcaData.PCA_3.max())),
    
    PCA_4_bound=widgets.FloatRangeSlider( min=pcaData.PCA_4.min(),max=pcaData.PCA_4.max(),
                                   value=(pcaData.PCA_4.min(),pcaData.PCA_4.max())),
    xCol=widgets.Dropdown(options=pcaData.columns[3:],value='X'),
    yCol=widgets.Dropdown(options=pcaData.columns[3:],value='Y'),
    zCol=widgets.Dropdown(options=pcaData.columns[3:],value='Z'),
    ):
    kwargDict=dict(locals())
    print(kwargDict.keys(),kwargDict.values())
    kwargDict=dict(locals())
    kwargKeys=np.array(list(kwargDict.keys()))[:13]
    for kwargKey in kwargKeys:
        print(kwargKey,end="")
        print(len(kwargDict[kwargKey]),kwargDict[kwargKey][0],kwargDict[kwargKey][1])
    queryStr=' and '.join(
        ['({factorKey:s} > {factorMin:f}) and ({factorKey:s} < {factorMax:f})'.format(
            factorKey='_'.join(kwargKey.split('_')[:-1]),
            factorMin=kwargDict[kwargKey][0],
            factorMax=kwargDict[kwargKey][1]) \
         for kwargKey in kwargKeys])
    print("query:",queryStr)
    print("selection shape:",pcaData.query(queryStr).shape)

interactive(children=(FloatRangeSlider(value=(0.0063, 196.48), description='X_bound', max=196.48, min=0.0063),…

In [30]:
@interact_manual
def explore_milestone_bounds( #'D_958', 'D_1178', 'D_1366', 'D_2377', 'D_3796'
    X_bound=widgets.FloatRangeSlider(     min=pcaData.X.min()-.1,max=pcaData.X.max()+.1,
                                   value=(pcaData.X.min()-.1,pcaData.X.max()+.1)),
    
    Y_bound=widgets.FloatRangeSlider(     min=pcaData.Y.min()-.1,max=pcaData.Y.max()+.1,
                                   value=(pcaData.Y.min()-.1,pcaData.Y.max()+.1)),
    
    Z_bound=widgets.FloatRangeSlider(     min=pcaData.Z.min()-.1,max=pcaData.Z.max()+.1,
                                   value=(pcaData.Z.min()-.1,pcaData.Z.max()+.1)),
    
    D_958_bound=widgets.FloatRangeSlider( min=pcaData.D_958.min()-.1,max=pcaData.D_958.max()+.1,
                                   value=(pcaData.D_958.min()-.1,pcaData.D_958.max()+.1)),
    
    D_1178_bound=widgets.FloatRangeSlider(min=pcaData.D_1178.min()-.1,max=pcaData.D_1178.max()+.1,
                                   value=(pcaData.D_1178.min()-.1,pcaData.D_1178.max()+.1)),
    
    D_1366_bound=widgets.FloatRangeSlider(min=pcaData.D_1366.min()-.1,max=pcaData.D_1366.max()+.1,
                                   value=(pcaData.D_1366.min()-.1,pcaData.D_1366.max()+.1)),
    
    D_2377_bound=widgets.FloatRangeSlider(min=pcaData.D_2377.min()-.1,max=pcaData.D_2377.max()+.1,
                                   value=(pcaData.D_2377.min()-.1,pcaData.D_2377.max()+.1)),
    
    D_3796_bound=widgets.FloatRangeSlider(min=pcaData.D_3796.min()-.1,max=pcaData.D_3796.max()+.1,
                                   value=(pcaData.D_3796.min()-.1,pcaData.D_3796.max()+.1)),
    
    PCA_0_bound=widgets.FloatRangeSlider( min=pcaData.PCA_0.min()-.1,max=pcaData.PCA_0.max()+.1,
                                   value=(pcaData.PCA_0.min()-.1,pcaData.PCA_0.max()+.1)),
    
    PCA_1_bound=widgets.FloatRangeSlider( min=pcaData.PCA_1.min()-.1,max=pcaData.PCA_1.max()+.1,
                                   value=(pcaData.PCA_1.min()-.1,pcaData.PCA_1.max()+.1)),
    
    PCA_2_bound=widgets.FloatRangeSlider( min=pcaData.PCA_1.min()-.1,max=pcaData.PCA_1.max()+.1,
                                   value=(pcaData.PCA_1.min()-.1,pcaData.PCA_1.max()+.1)),
    
    PCA_3_bound=widgets.FloatRangeSlider( min=pcaData.PCA_3.min()-.1,max=pcaData.PCA_3.max()+.1,
                                   value=(pcaData.PCA_3.min()-.1,pcaData.PCA_3.max()+.1)),
    
    PCA_4_bound=widgets.FloatRangeSlider( min=pcaData.PCA_4.min(),max=pcaData.PCA_4.max(),
                                   value=(pcaData.PCA_4.min(),pcaData.PCA_4.max())),
    xCol=widgets.Dropdown(options=pcaData.columns[3:],value='X'),
    yCol=widgets.Dropdown(options=pcaData.columns[3:],value='Y'),
    zCol=widgets.Dropdown(options=pcaData.columns[3:],value='Z'),
    verbose=widgets.ToggleButton(value=False)
    ):
    kwargDict=dict(locals())
    if verbose:
        print(kwargDict.keys(),kwargDict.values())
    kwargDict=dict(locals())
    kwargKeys=np.array(list(kwargDict.keys()))[:13]
    if verbose:
        for kwargKey in kwargKeys:
            print(kwargKey,end="")
            print(len(kwargDict[kwargKey]),kwargDict[kwargKey][0],kwargDict[kwargKey][1])
    queryStr=' and '.join(
        ['({factorKey:s} >= {factorMin:f}) and ({factorKey:s} <= {factorMax:f})'.format(
            factorKey='_'.join(kwargKey.split('_')[:-1]),
            factorMin=kwargDict[kwargKey][0],
            factorMax=kwargDict[kwargKey][1]) \
         for kwargKey in kwargKeys])
    if verbose:
        print("query:",queryStr)
        print("selection shape:",pcaData.query(queryStr).shape)
    go=ply.graph_objs
    fig=go.Figure(
        data=[
                go.Scatter3d(x=pcaData.query('AtomType == "POT"')[xCol],
                             y=pcaData.query('AtomType == "POT"')[yCol],
                             z=pcaData.query('AtomType == "POT"')[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color='grey',
                                         opacity=.0625)),
                go.Scatter3d(x=pcaData.query(queryStr)[xCol],
                             y=pcaData.query(queryStr)[yCol],
                             z=pcaData.query(queryStr)[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color='blue',
                                         opacity=.25)),
                go.Scatter3d(
                    x=pcaData.query(
                            '(AtomType == "CA")'
                        )[xCol],
                    y=pcaData.query(
                            '(AtomType == "CA")'
                        )[yCol],
                    z=pcaData.query(
                            '(AtomType == "CA")'
                        )[zCol],
                    mode='markers',
                    marker=dict(size=12,
                                color='black',
                                opacity=.25))
             ],
        layout=dict(
            width=800,height=600)
    )
    fig.show()

interactive(children=(FloatRangeSlider(value=(-0.0937, 196.57999999999998), description='X_bound', max=196.579…

# Compute Cylindrical Coordinates

In [31]:
import math
tqdm.tqdm.pandas(tqdm.tqdm_notebook)

In [32]:
diffCoords.query('AtomType=="CA"').head()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,D_958,D_1178,D_1366,D_2377,D_3796
0,1,958,CA,123.2928,110.994,58.3107,0.0,92.177611,41.136629,70.165424,63.56818
1,1,1178,CA,76.6708,159.4728,121.3417,92.177611,0.0,96.855696,79.014331,72.235503
2,1,1366,CA,95.1964,126.0866,32.3294,41.136629,96.855696,0.0,51.106721,51.621998
3,1,2377,CA,53.3092,115.8309,59.7553,70.165424,79.014331,51.106721,0.0,75.214944
4,1,3796,CA,103.7853,171.428,55.4642,63.56818,72.235503,51.621998,75.214944,0.0


In [33]:
tqdm.tqdm.pandas(tqdm.tqdm_notebook)
cylCenters=diffCoords.query('AtomType=="CA"')[['Frame','X','Y','Z']].groupby(
    ['Frame']).progress_aggregate(np.mean)
diffCyl=diffCoords.set_index(['Frame']).join(
    cylCenters,rsuffix='_Center',how='outer')
diffCyl=diffCyl.reset_index()
diffCyl['R']=np.sqrt(
        (diffCyl['X']-diffCyl['X_Center'])**2 + \
        (diffCyl['Y']-diffCyl['Y_Center'])**2)
diffCyl['Theta']=np.arctan2(diffCyl['Y']-diffCyl['Y_Center'],diffCyl['X']-diffCyl['X_Center'])
diffCyl=diffCyl[np.concatenate([
    diffCyl.columns[:6],
    [colName for colName in diffCyl.columns if '_Center' in colName],
    ['R','Theta'],
    [colName for colName in diffCyl.columns if 'D_' in colName],
])]
print(diffCyl.shape)
diffCyl.head()

100%|██████████| 2081/2081 [00:00<00:00, 20806.05it/s]

(43701, 16)





Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,X_Center,Y_Center,Z_Center,R,Theta,D_958,D_1178,D_1366,D_2377,D_3796
0,1,958,CA,123.2928,110.994,58.3107,90.4509,136.76246,65.44026,41.744508,-0.665293,0.0,92.177611,41.136629,70.165424,63.56818
1,1,1178,CA,76.6708,159.4728,121.3417,90.4509,136.76246,65.44026,26.564087,2.116184,92.177611,0.0,96.855696,79.014331,72.235503
2,1,1366,CA,95.1964,126.0866,32.3294,90.4509,136.76246,65.44026,11.683054,-1.152519,41.136629,96.855696,0.0,51.106721,51.621998
3,1,2377,CA,53.3092,115.8309,59.7553,90.4509,136.76246,65.44026,42.633743,-2.628399,70.165424,79.014331,51.106721,0.0,75.214944
4,1,3796,CA,103.7853,171.428,55.4642,90.4509,136.76246,65.44026,37.1417,1.203585,63.56818,72.235503,51.621998,75.214944,0.0


## Explore radial cutoff

In [34]:
@interact_manual
def explore_milestone_bounds( #'D_958', 'D_1178', 'D_1366', 'D_2377', 'D_3796'
    R_bound=widgets.FloatRangeSlider(     min=0,max=diffCyl.R.max()+.1,
                                   value=(0,diffCyl.R.max()+.1)),
    
    Theta_bound=widgets.FloatRangeSlider(     min=-np.pi,max=np.pi,
                                   value=(-np.pi,np.pi)),
    
    Z_bound=widgets.FloatRangeSlider(     min=diffCyl.Z.min()-.1,max=diffCyl.Z.max()+.1,
                                   value=(diffCyl.Z.min()-.1,diffCyl.Z.max()+.1)),
    xCol=widgets.Dropdown(options=diffCyl.columns[3:],value='X'),
    yCol=widgets.Dropdown(options=diffCyl.columns[3:],value='Y'),
    zCol=widgets.Dropdown(options=diffCyl.columns[3:],value='Z'),
    showCenter=widgets.ToggleButton(value=True),
    verbose=widgets.ToggleButton(value=False)
    ):
    kwargDict=dict(locals())
    if verbose:
        print(kwargDict.keys(),kwargDict.values())
    kwargDict=dict(locals())
    kwargKeys=np.array(list(kwargDict.keys()))[:3]
    if verbose:
        for kwargKey in kwargKeys:
            print(kwargKey,end="")
            print(len(kwargDict[kwargKey]),kwargDict[kwargKey][0],kwargDict[kwargKey][1])
    queryStr=' and '.join(
        ['({factorKey:s} >= {factorMin:f}) and ({factorKey:s} <= {factorMax:f})'.format(
            factorKey='_'.join(kwargKey.split('_')[:-1]),
            factorMin=kwargDict[kwargKey][0],
            factorMax=kwargDict[kwargKey][1]) \
         for kwargKey in kwargKeys])
    if verbose:
        print("query:",queryStr)
        print("selection shape:",diffCyl.query(queryStr).shape)
        print(diffCyl.query(queryStr).head())
    go=ply.graph_objs
    plotData=[
                go.Scatter3d(x=diffCyl.query(
                                    'AtomType == "POT"'
                                 ).query("not ("+queryStr+")")[xCol],
                             y=diffCyl.query(
                                     'AtomType == "POT"'
                                 ).query("not ("+queryStr+")")[yCol],
                             z=diffCyl.query(
                                     'AtomType == "POT"'
                                 ).query("not ("+queryStr+")")[zCol],
                             mode='markers',
                             marker=dict(size=24,
                                         color='grey',
                                         opacity=.0625)),
                go.Scatter3d(x=diffCyl.query(queryStr)[xCol],
                             y=diffCyl.query(queryStr)[yCol],
                             z=diffCyl.query(queryStr)[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color='blue',
                                         opacity=.25)),
                go.Scatter3d(
                    x=diffCyl.query(
                            '(AtomType == "CA")'
                        )[xCol],
                    y=diffCyl.query(
                            '(AtomType == "CA")'
                        )[yCol],
                    z=diffCyl.query(
                            '(AtomType == "CA")'
                        )[zCol],
                    mode='markers',
                    marker=dict(size=12,
                                color='black',
                                opacity=.25))
             ]
    if showCenter:
        plotData.append(
                go.Scatter3d(x=diffCyl.query('AtomType == "CA"')['X_Center'],
                             y=diffCyl.query('AtomType == "CA"')['Y_Center'],
                             z=diffCyl.query('AtomType == "CA"')['Z_Center'],
                             mode='markers',
                             marker=dict(size=12,
                                         color='red',
                                         opacity=.5)))
    fig=go.Figure(
        data=plotData,
        layout=dict(
            width=800,height=600)
    )
    fig.show()

interactive(children=(FloatRangeSlider(value=(0.0, 171.15737440875327), description='R_bound', max=171.1573744…

## Apply Cylindrical Filter

In [35]:
queryEntry="(R >= 0.000000) and (R <= 171.000000) and (Theta >= -3.141593) and (Theta <= 3.141593) and (Z >= 0.000000) and (Z <= 145)"
cylData=diffCyl.query(queryEntry)
print(cylData.shape,diffCoords.shape)
display(diffCyl.query("not ("+queryEntry+")"))
cylData.head()

(43700, 16) (43701, 11)


Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,X_Center,Y_Center,Z_Center,R,Theta,D_958,D_1178,D_1366,D_2377,D_3796
509,25,165530,POT,194.4881,1.2067,12.1098,92.17352,138.2919,64.04378,171.057374,-0.92963,136.709467,226.560729,159.858488,187.306998,198.938532


Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,X_Center,Y_Center,Z_Center,R,Theta,D_958,D_1178,D_1366,D_2377,D_3796
0,1,958,CA,123.2928,110.994,58.3107,90.4509,136.76246,65.44026,41.744508,-0.665293,0.0,92.177611,41.136629,70.165424,63.56818
1,1,1178,CA,76.6708,159.4728,121.3417,90.4509,136.76246,65.44026,26.564087,2.116184,92.177611,0.0,96.855696,79.014331,72.235503
2,1,1366,CA,95.1964,126.0866,32.3294,90.4509,136.76246,65.44026,11.683054,-1.152519,41.136629,96.855696,0.0,51.106721,51.621998
3,1,2377,CA,53.3092,115.8309,59.7553,90.4509,136.76246,65.44026,42.633743,-2.628399,70.165424,79.014331,51.106721,0.0,75.214944
4,1,3796,CA,103.7853,171.428,55.4642,90.4509,136.76246,65.44026,37.1417,1.203585,63.56818,72.235503,51.621998,75.214944,0.0


## Compute Filtered PCA projection

In [36]:
import sklearn as skl
from sklearn.decomposition import PCA

In [37]:
queryEntry="(R >= 0.000000) and (R <= 80.000000) and (Theta >= -3.141593) and (Theta <= 3.141593) and (Z >= 0.000000) and (Z <= 140)"
cylData=diffCyl.query(queryEntry)

XCylData=cylData.query('AtomType == "POT"')[cylData.columns[-5:]]

pcaCyl=PCA()
pcaCyl.fit(XCylData)
pcaCyl_coords=pcaCyl.transform(cylData[cylData.columns[-5:]])

pcaCylData=cylData.copy()
for ii in np.arange(pcaCyl_coords.shape[1]):
    pcaCylData['PCAcyl_%g'%ii]=pcaCyl_coords[:,ii]
    
pcaCylData.head()

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,X_Center,Y_Center,Z_Center,R,...,D_958,D_1178,D_1366,D_2377,D_3796,PCAcyl_0,PCAcyl_1,PCAcyl_2,PCAcyl_3,PCAcyl_4
0,1,958,CA,123.2928,110.994,58.3107,90.4509,136.76246,65.44026,41.744508,...,0.0,92.177611,41.136629,70.165424,63.56818,-37.793808,27.388747,-19.905732,-45.029068,-8.578721
1,1,1178,CA,76.6708,159.4728,121.3417,90.4509,136.76246,65.44026,26.564087,...,92.177611,0.0,96.855696,79.014331,72.235503,35.590037,-82.218968,-5.244114,-0.797064,17.517078
2,1,1366,CA,95.1964,126.0866,32.3294,90.4509,136.76246,65.44026,11.683054,...,41.136629,96.855696,0.0,51.106721,51.621998,-61.828181,31.87486,0.939633,3.330112,14.68839
3,1,2377,CA,53.3092,115.8309,59.7553,90.4509,136.76246,65.44026,42.633743,...,70.165424,79.014331,51.106721,0.0,75.214944,-31.627981,-9.635206,58.098393,6.779326,-17.362311
4,1,3796,CA,103.7853,171.428,55.4642,90.4509,136.76246,65.44026,37.1417,...,63.56818,72.235503,51.621998,75.214944,0.0,-32.761493,-14.019092,-45.214532,33.42976,-17.449956


In [38]:
pcaCylData.to_csv('/'.join([dataDir,'pca_projection_data.cylindricalFilter.csv']),
                  index=False)

# Visualize Cylindrical Filtered PCA projection

In [5]:
dataDir='simulationData'
pcaCylData=pd.read_csv('/'.join([dataDir,'pca_projection_data.cylindricalFilter.csv']))
display(pcaCylData.head())
display(pcaCylData[pcaCylData.columns[3:]].agg(['min','mean','max','std']).T)

Unnamed: 0,Frame,AtomID,AtomType,X,Y,Z,X_Center,Y_Center,Z_Center,R,...,D_958,D_1178,D_1366,D_2377,D_3796,PCAcyl_0,PCAcyl_1,PCAcyl_2,PCAcyl_3,PCAcyl_4
0,1,958,CA,123.2928,110.994,58.3107,90.4509,136.76246,65.44026,41.744508,...,0.0,92.177611,41.136629,70.165424,63.56818,-37.793808,27.388747,-19.905732,-45.029068,-8.578721
1,1,1178,CA,76.6708,159.4728,121.3417,90.4509,136.76246,65.44026,26.564087,...,92.177611,0.0,96.855696,79.014331,72.235503,35.590037,-82.218968,-5.244114,-0.797064,17.517078
2,1,1366,CA,95.1964,126.0866,32.3294,90.4509,136.76246,65.44026,11.683054,...,41.136629,96.855696,0.0,51.106721,51.621998,-61.828181,31.87486,0.939633,3.330112,14.68839
3,1,2377,CA,53.3092,115.8309,59.7553,90.4509,136.76246,65.44026,42.633743,...,70.165424,79.014331,51.106721,0.0,75.214944,-31.627981,-9.635206,58.098393,6.779326,-17.362311
4,1,3796,CA,103.7853,171.428,55.4642,90.4509,136.76246,65.44026,37.1417,...,63.56818,72.235503,51.621998,75.214944,0.0,-32.761493,-14.019092,-45.214532,33.42976,-17.449956


Unnamed: 0,min,mean,max,std
X,12.8977,95.850923,172.9234,28.714918
Y,59.5238,135.896598,219.7266,27.864356
Z,0.0018,65.804663,139.9682,34.760019
X_Center,90.4509,92.723626,93.81732,0.347674
Y_Center,136.76246,139.457834,140.84524,0.508592
Z_Center,61.63296,62.758913,65.44026,0.388478
R,0.084237,34.853375,79.996104,20.166349
Theta,-3.140898,-0.245226,3.141472,1.724236
D_958,0.0,60.451554,146.410816,27.344729
D_1178,0.0,74.06138,158.201029,31.873921


In [40]:
@interact_manual
def color_by_distance(colorColumn=widgets.Dropdown(
                          options=[colName for colName in pcaCylData.columns \
                                   if ('PCAcyl_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='PCAcyl_0'),
                      xCol=widgets.Dropdown(
                          options=[colName for colName in pcaCylData.columns \
                                   if ('PCAcyl_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='X'),
                      yCol=widgets.Dropdown(
                          options=[colName for colName in pcaCylData.columns \
                                   if ('PCAcyl_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='Y'),
                      zCol=widgets.Dropdown(
                          options=[colName for colName in pcaCylData.columns \
                                   if ('PCAcyl_' in colName) | ('D_' in colName) | \
                                      (colName=='X') | (colName=='Y') | (colName=='Z')],
                          value='Z'),
                     ):
    go=ply.graph_objs
    fig=go.Figure(
        data=[
                go.Scatter3d(x=pcaCylData.query('AtomType == "POT"')[xCol],
                             y=pcaCylData.query('AtomType == "POT"')[yCol],
                             z=pcaCylData.query('AtomType == "POT"')[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color=pcaCylData.query(
                                                 'AtomType == "POT"'
                                             )[colorColumn],
                                         colorscale='RdBu',
                                         opacity=.125)),
                go.Scatter3d(
                    x=pcaCylData.query(
                            '(AtomType == "CA")'
                        )[xCol],
                    y=pcaCylData.query(
                            '(AtomType == "CA")'
                        )[yCol],
                    z=pcaCylData.query(
                            '(AtomType == "CA")'
                        )[zCol],
                    mode='markers',
                    marker=dict(size=12,
                                color='black',
                                opacity=.25))
             ],
        layout=dict(
            width=800,height=600)
    )
    fig.show()

interactive(children=(Dropdown(description='colorColumn', index=8, options=('X', 'Y', 'Z', 'D_958', 'D_1178', …

In [60]:
@interact_manual
def explore_milestone_bounds( #'D_958', 'D_1178', 'D_1366', 'D_2377', 'D_3796'
    X_bound=widgets.FloatRangeSlider(     min=pcaCylData.X.min()-1.,max=pcaCylData.X.max()+1.,
                                   value=(pcaCylData.X.min()-1.,pcaCylData.X.max()+1.)),
    
    Y_bound=widgets.FloatRangeSlider(     min=pcaCylData.Y.min()-1.,max=pcaCylData.Y.max()+1.,
                                   value=(pcaCylData.Y.min()-1.,pcaCylData.Y.max()+1.)),
    
    Z_bound=widgets.FloatRangeSlider(     min=pcaCylData.Z.min()-1.,max=pcaCylData.Z.max()+1.,
                                   value=(pcaCylData.Z.min()-1.,pcaCylData.Z.max()+1.)),
    
    R_bound=widgets.FloatRangeSlider(     min=0,max=pcaCylData.R.max()+1.,
                                   value=(0,pcaCylData.R.max()+1.)),
    
    Theta_bound=widgets.FloatRangeSlider(     min=-np.pi,max=np.pi,
                                   value=(-np.pi,np.pi)),
    
    D_958_bound=widgets.FloatRangeSlider( min=pcaCylData.D_958.min()-1.,max=pcaCylData.D_958.max()+1.,
                                   value=(pcaCylData.D_958.min()-1.,pcaCylData.D_958.max()+1.)),
    
    D_1178_bound=widgets.FloatRangeSlider(min=pcaCylData.D_1178.min()-1.,max=pcaCylData.D_1178.max()+1.,
                                   value=(pcaCylData.D_1178.min()-1.,pcaCylData.D_1178.max()+1.)),
    
    D_1366_bound=widgets.FloatRangeSlider(min=pcaCylData.D_1366.min()-1.,max=pcaCylData.D_1366.max()+1.,
                                   value=(pcaCylData.D_1366.min()-1.,pcaCylData.D_1366.max()+1.)),
    
    D_2377_bound=widgets.FloatRangeSlider(min=pcaCylData.D_2377.min()-1.,max=pcaCylData.D_2377.max()+1.,
                                   value=(pcaCylData.D_2377.min()-1.,pcaCylData.D_2377.max()+1.)),
    
    D_3796_bound=widgets.FloatRangeSlider(min=pcaCylData.D_3796.min()-1.,max=pcaCylData.D_3796.max()+1.,
                                   value=(pcaCylData.D_3796.min()-1.,pcaCylData.D_3796.max()+1.)),
    
    PCAcyl_0_bound=widgets.FloatRangeSlider( min=pcaCylData.PCAcyl_0.min()-1.,max=pcaCylData.PCAcyl_0.max()+1.,
                                   value=(pcaCylData.PCAcyl_0.min()-1.,pcaCylData.PCAcyl_0.max()+1.)),
    
    PCAcyl_1_bound=widgets.FloatRangeSlider( min=pcaCylData.PCAcyl_1.min()-1.,max=pcaCylData.PCAcyl_1.max()+1.,
                                   value=(pcaCylData.PCAcyl_1.min()-1.,pcaCylData.PCAcyl_1.max()+1.)),
    
    PCAcyl_2_bound=widgets.FloatRangeSlider( min=pcaCylData.PCAcyl_1.min()-1.,max=pcaCylData.PCAcyl_1.max()+1.,
                                   value=(pcaCylData.PCAcyl_1.min()-1.,pcaCylData.PCAcyl_1.max()+1.)),
    
    PCAcyl_3_bound=widgets.FloatRangeSlider( min=pcaCylData.PCAcyl_3.min()-1.,max=pcaCylData.PCAcyl_3.max()+1.,
                                   value=(pcaCylData.PCAcyl_3.min()-1.,pcaCylData.PCAcyl_3.max()+1.)),
    
    PCAcyl_4_bound=widgets.FloatRangeSlider( min=pcaCylData.PCAcyl_4.min(),max=pcaCylData.PCAcyl_4.max(),
                                   value=(pcaCylData.PCAcyl_4.min(),pcaCylData.PCAcyl_4.max())),
    
    xCol=widgets.Dropdown(options=pcaCylData.columns[3:],value='X'),
    yCol=widgets.Dropdown(options=pcaCylData.columns[3:],value='Y'),
    zCol=widgets.Dropdown(options=pcaCylData.columns[3:],value='Z'),
    verbose=widgets.ToggleButton(value=False),
    showCutRegions=widgets.ToggleButton(value=False),
    ):
    kwargDict=dict(locals())
    if verbose:
        print(kwargDict.keys(),kwargDict.values())
    kwargDict=dict(locals())
    kwargKeys=np.array(list(kwargDict.keys()))[:15]
    if verbose:
        for kwargKey in kwargKeys:
            print(kwargKey,end="")
            print(len(kwargDict[kwargKey]),kwargDict[kwargKey][0],kwargDict[kwargKey][1])
    queryStr=' and '.join(
        ['({factorKey:s} >= {factorMin:f}) and ({factorKey:s} <= {factorMax:f})'.format(
            factorKey='_'.join(kwargKey.split('_')[:-1]),
            factorMin=kwargDict[kwargKey][0],
            factorMax=kwargDict[kwargKey][1]) \
         for kwargKey in kwargKeys])
    if verbose:
        print("query:",queryStr)
        print("selection shape:",pcaCylData.query(queryStr).shape)
    go=ply.graph_objs
    fig=go.Figure(
        data=[
                go.Scatter3d(x=pcaCylData.query('AtomType == "POT"')[xCol],
                             y=pcaCylData.query('AtomType == "POT"')[yCol],
                             z=pcaCylData.query('AtomType == "POT"')[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color='grey',
                                         opacity=.0625)),
                go.Scatter3d(x=pcaCylData.query(queryStr)[xCol],
                             y=pcaCylData.query(queryStr)[yCol],
                             z=pcaCylData.query(queryStr)[zCol],
                             mode='markers',
                             marker=dict(size=2,
                                         color='blue',
                                         opacity=.25)),
                go.Scatter3d(
                    x=pcaCylData.query(
                            '(AtomType == "CA")'
                        )[xCol],
                    y=pcaCylData.query(
                            '(AtomType == "CA")'
                        )[yCol],
                    z=pcaCylData.query(
                            '(AtomType == "CA")'
                        )[zCol],
                    mode='markers',
                    marker=dict(size=12,
                                color='black',
                                opacity=.25))
             ],
        layout=dict(
            width=900,height=675)
    )
    fig.show()
    if showCutRegions:
        aggCols=['_'.join(keyName.split('_')[:-1]) for keyName in kwargKeys]
        summaryData=pcaCylData.query(
                "AtomType == 'POT'"
            ).query(
                queryStr
            )[aggCols].agg(['min','max','mean'])
        summaryData=summaryData.T
        #print(summaryData.columns)
        summaryData['Interval']=summaryData['min'].map(str) + \
            ' - '+summaryData['max'].map(str)
        display(summaryData)

interactive(children=(FloatRangeSlider(value=(11.8977, 173.9234), description='X_bound', max=173.9234, min=11.…

# Explore using clustering to find milestone centers

In [19]:
import sklearn as skl
from sklearn.cluster import OPTICS, cluster_optics_dbscan

In [20]:
clust=skl.cluster.AgglomerativeClustering(n_clusters=12)

clustDat=pcaCylData.query('AtomType == "POT"')

Xvals=clustDat[
    #[colName for colName in pcaCylData if 'PCAcyl' in colName]
        ['PCAcyl_2','PCAcyl_3','PCAcyl_1']
        #['X','Y','Z']
    ]
dataInds=pcaCylData.query('AtomType == "POT"').index

clust.fit(Xvals)

xinds=np.arange(len(Xvals))

#reachability=clust.reachability_[clust.ordering_]
#labels=clust.labels_[clust.ordering_]
labels=clust.labels_

In [21]:
plotData=clustDat
plotData['Cluster']=labels
xCol='X'#'PCAcyl_2'
yCol='Y'#'PCAcyl_3'
zCol='Z'#'PCAcyl_1'
nLabels=len(np.unique(labels))
colorpalette=sns.palettes.color_palette('colorblind',nLabels)
go=ply.graph_objs
scatterPlots=[]
for iGroup,plotGroup in enumerate(plotData.groupby('Cluster')):
    kClass,groupData=plotGroup
    scatterPlots.append(
        go.Scatter3d(
                 x=groupData[xCol],
                 y=groupData[yCol],
                 z=groupData[zCol],
                 mode='markers',
                 marker=dict(size=2,
                             color=colorpalette.as_hex()[iGroup],
                             opacity=.25)))

fig=go.Figure(
        data=scatterPlots,
        layout=dict(
            width=900,height=675))
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
clusterSummary=plotData.groupby('Cluster').agg('mean')
clusterSummary

Unnamed: 0_level_0,Frame,AtomID,X,Y,Z,X_Center,Y_Center,Z_Center,R,Theta,D_958,D_1178,D_1366,D_2377,D_3796,PCAcyl_0,PCAcyl_1,PCAcyl_2,PCAcyl_3,PCAcyl_4
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1370.419913,165802.050024,74.03575,143.279038,29.585023,92.720109,139.611108,62.667386,21.632019,0.366639,67.553352,92.043959,30.51206,45.93066,53.668518,-33.238758,13.891688,11.838837,18.685019,0.989985
1,1095.19717,165764.091509,108.359214,128.208116,86.818671,92.746764,139.482479,62.737338,23.344872,-0.856825,47.695562,63.308559,60.591258,64.706587,63.007481,-11.207366,-12.559195,-5.843688,-13.846379,-1.937451
2,1036.371414,165719.01632,107.970852,109.229309,57.427304,92.741738,139.548263,62.741881,36.616778,-1.194485,46.342603,97.16256,39.812655,64.035685,76.359719,-16.894957,26.784474,5.562048,-11.634567,4.631466
3,915.40342,165755.0095,119.038245,168.074076,99.552645,92.771541,139.512699,62.741911,43.622701,0.717972,75.508455,56.99047,85.98112,95.095849,58.50146,28.037464,-24.578725,-25.210124,4.47542,0.623574
4,1007.145765,165776.160616,63.725931,146.20446,95.502143,92.708228,139.344688,62.826606,34.446036,0.461061,83.973821,44.009977,78.114737,53.908219,71.116865,11.694084,-41.089517,14.978137,6.066537,1.13797
5,1049.253846,165847.484615,88.63282,88.518726,100.759968,92.754367,139.470825,62.753787,54.040825,-1.617269,64.62339,82.744023,83.677281,66.443444,102.98858,30.640207,3.550155,21.580211,-19.732598,-3.081853
6,756.93508,165712.468109,146.098543,110.544828,17.619708,92.664775,139.201696,62.933076,64.484547,-0.475766,51.799051,133.792978,59.435307,102.58574,87.035444,23.702665,62.961191,-13.730121,-10.381165,-0.468159
7,884.970634,165749.258114,75.96893,85.931045,13.589444,92.774404,139.392826,62.70665,61.673119,-1.847707,75.807729,132.899806,56.728534,64.363647,105.610865,21.960611,55.281661,32.774695,2.495232,-0.388921
8,1301.240719,165855.736659,129.257989,145.285657,54.693985,92.665104,139.427399,62.700317,37.941271,0.201737,50.544888,90.56059,48.561208,84.410949,49.661636,-12.36481,16.643777,-25.168035,1.406812,-0.655298
9,1477.519118,165857.713235,114.438574,189.650648,14.203173,92.677851,139.582124,62.650825,63.519131,1.125598,95.951531,116.210131,73.888335,107.106133,54.682203,37.211955,31.145176,-25.738692,36.283116,-4.726514


In [196]:
clusterSummary.to_csv('Agglomerative_clustering_centers.csv')

In [6]:
clusterSummary=pd.read_csv('Agglomerative_clustering_centers.csv')
clusterSummary.head()

Unnamed: 0,Cluster,Frame,AtomID,X,Y,Z,X_Center,Y_Center,Z_Center,R,...,D_958,D_1178,D_1366,D_2377,D_3796,PCAcyl_0,PCAcyl_1,PCAcyl_2,PCAcyl_3,PCAcyl_4
0,0,1370.419913,165802.050024,74.03575,143.279038,29.585023,92.720109,139.611108,62.667386,21.632019,...,67.553352,92.043959,30.51206,45.93066,53.668518,-33.238758,13.891688,11.838837,18.685019,0.989985
1,1,1095.19717,165764.091509,108.359214,128.208116,86.818671,92.746764,139.482479,62.737338,23.344872,...,47.695562,63.308559,60.591258,64.706587,63.007481,-11.207366,-12.559195,-5.843688,-13.846379,-1.937451
2,2,1036.371414,165719.01632,107.970852,109.229309,57.427304,92.741738,139.548263,62.741881,36.616778,...,46.342603,97.16256,39.812655,64.035685,76.359719,-16.894957,26.784474,5.562048,-11.634567,4.631466
3,3,915.40342,165755.0095,119.038245,168.074076,99.552645,92.771541,139.512699,62.741911,43.622701,...,75.508455,56.99047,85.98112,95.095849,58.50146,28.037464,-24.578725,-25.210124,4.47542,0.623574
4,4,1007.145765,165776.160616,63.725931,146.20446,95.502143,92.708228,139.344688,62.826606,34.446036,...,83.973821,44.009977,78.114737,53.908219,71.116865,11.694084,-41.089517,14.978137,6.066537,1.13797


In [9]:
tessellationClusters=[0,1,2,3,4,8,10]
tessellationCoordinateColumns=['PCAcyl_2','PCAcyl_3','PCAcyl_1']

clusterCenteroids=clusterSummary[
    clusterSummary.index.map(lambda x: x in (tessellationClusters))
    ][tessellationCoordinateColumns]
clusterCenteroids

Unnamed: 0,PCAcyl_2,PCAcyl_3,PCAcyl_1
0,11.838837,18.685019,13.891688
1,-5.843688,-13.846379,-12.559195
2,5.562048,-11.634567,26.784474
3,-25.210124,4.47542,-24.578725
4,14.978137,6.066537,-41.089517
8,-25.168035,1.406812,16.643777
10,-4.184592,-3.752873,-38.234956


In [10]:
clusterVoronoi=sp.spatial.Voronoi(clusterCenteroids)
clusterDelaunay=sp.spatial.Delaunay(clusterCenteroids)

In [12]:
clusterDelaunay.vertices

array([[0, 1, 4, 3],
       [2, 0, 1, 4],
       [6, 1, 4, 3],
       [5, 0, 1, 3],
       [5, 2, 0, 1]], dtype=int32)

In [14]:
neiList=defaultdict(set)
for p in clusterDelaunay.vertices:
    for i,j in itertools.combinations(p,2):
        neiList[i].add(j)
        neiList[j].add(i)

for key in sorted(neiList.keys()):
    print("%d:%s" % (key,','.join([str(i) for i in neiList[key]])))

0:1,2,3,4,5
1:0,2,3,4,5,6
2:0,1,4,5
3:0,1,4,5,6
4:0,1,2,3,6
5:0,1,2,3
6:1,3,4


In [28]:
plotData=clusterCenteroids
plotData['Cluster']=plotData.index
xCol='PCAcyl_2'
yCol='PCAcyl_3'
zCol='PCAcyl_1'
nLabels=len(np.unique(labels))
colorpalette=sns.palettes.color_palette('colorblind',nLabels)
go=ply.graph_objs
scatterPlots=[]
for iGroup,plotGroup in enumerate(plotData.groupby('Cluster')):
    kClass,groupData=plotGroup
    scatterPlots.append(
        go.Scatter3d(
                 x=groupData[xCol],
                 y=groupData[yCol],
                 z=groupData[zCol],
                 mode='markers',
                 marker=dict(size=8,
                             symbol='circle-open',
                             color=colorpalette.as_hex()[kClass],
                             opacity=.75)))
    
    neighborPoints=neiList[iGroup]
    for nbPoint in neighborPoints:
        scatterPlots.append(
            go.Scatter3d(
                x=plotData[xCol].to_numpy()[[iGroup,nbPoint]],
                y=plotData[yCol].to_numpy()[[iGroup,nbPoint]],
                z=plotData[zCol].to_numpy()[[iGroup,nbPoint]],
                mode='lines',
                line=dict(color='black')
            )
        )

plotData=clustDat
plotData['Cluster']=labels
xCol='PCAcyl_2'
yCol='PCAcyl_3'
zCol='PCAcyl_1'
nLabels=len(np.unique(labels))
colorpalette=sns.palettes.color_palette('colorblind',nLabels)
go=ply.graph_objs
#scatterPlots=[]
for iGroup,plotGroup in enumerate(plotData.groupby('Cluster')):
    kClass,groupData=plotGroup
    scatterPlots.append(
        go.Scatter3d(
                 x=groupData[xCol],
                 y=groupData[yCol],
                 z=groupData[zCol],
                 mode='markers',
                 marker=dict(size=2,
                             color=colorpalette.as_hex()[iGroup],
                             opacity=.125)))

fig=go.Figure(
        data=scatterPlots,
        layout=dict(
            width=900,height=675))
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [75]:
clusterVoronoi.regions

[[-1, 2],
 [-1, 0, 1, 2],
 [-1, 3, 4],
 [],
 [-1, 0, 1, 3, 4],
 [-1, 0, 2, 3],
 [-1, 1, 4],
 [-1, 0, 1, 2, 3, 4]]