In [2]:
import pytraj as pt
import pytraj.utils.progress
import numpy as np
import scipy as sp
import matplotlib
from matplotlib import pyplot as plt
import collections
import sys
import gc
import os
import tqdm
import nglview as nv
import ipywidgets
import copy

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Load trajectories into a joint dictionary

In [32]:
# Names of the simulated systems; each has its own sub-directory of trajBaseDir
systems=['POPC','PIP2','POPS']
trajFileName='memb_prot.dcd'
topFileName='topology.memb_prot.nochamber.parm7'
trajBaseDir='trajData'
dataFileDir='dataFiles'

# Maps system name -> trajectory object returned by pt.iterload
trajDict={}
# The expected number of steps has to be given as 'total'. Assigning to
# pbar.n only moves the running counter, which leaves the bar without a
# known length and makes it over-count once update() is called.
with tqdm.tqdm_notebook(total=len(systems)) as pbar:
    for system in systems:
        pbar.set_description_str('loading %s'%system)
        trajDir='/'.join([trajBaseDir,system])
        trajFilePath='/'.join([trajDir,trajFileName])
        topFilePath='/'.join([trajDir,topFileName])
        trajDict[system]=pt.iterload(trajFilePath,top=topFilePath)
        pbar.update()

trajDict

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))




{'PIP2': pytraj.TrajectoryIterator, 1592 frames: 
 Size: 8.000000 (GB)
 <Topology: 243780 atoms, 5544 residues, 1302 mols, PBC with box type = ortho>
            , 'POPC': pytraj.TrajectoryIterator, 2001 frames: 
 Size: 10.000000 (GB)
 <Topology: 227880 atoms, 5430 residues, 1188 mols, PBC with box type = ortho>
            , 'POPS': pytraj.TrajectoryIterator, 1592 frames: 
 Size: 8.000000 (GB)
 <Topology: 241272 atoms, 5536 residues, 1294 mols, PBC with box type = ortho>
            }

# Setup lipid headgroup center calculation

There are at most 2 components in each lipid simulation, POPC and up to one additional charged lipid. These charged lipids come last, so we can look at the last residue of each trajectory. Using NGLVIEW, we can quickly visualize the lipids to investigate the atom names and come up with the needed atom mask for use in pytraj.

In [33]:
# Build one NGL viewer per system, showing only the last residue of the
# topology as licorice so its atom names can be inspected.
ngViewDict={}
for trajKey,traj in trajDict.items():
    print(trajKey)
    view=nv.show_pytraj(traj)
    view.clear_representations()
    # the selection string is the (1-based) number of the last residue
    lastResidueSelection='%g'%traj.topology.n_residues
    view.add_representation('licorice',selection=lastResidueSelection)
    ngViewDict[trajKey]=view

POPC
POPS
PIP2


In [30]:
# Display the POPC viewer widget (licorice view of the last residue)
ngViewDict['POPC']

NGLWidget(count=2001)

In [31]:
# Display the PIP2 viewer widget (licorice view of the last residue)
ngViewDict['PIP2']

NGLWidget(count=1592)

In [34]:
# Display the POPS viewer widget (licorice view of the last residue)
ngViewDict['POPS']

NGLWidget(count=1592)

We note that the phosphate linker group, as shown in the viewer panels above, is the same for all lipids we are working with. This makes it a good choice for the headgroup center coordinates. Let's compute the needed center of mass coordinates.

In [41]:
# Atom names of the phosphate linker group used as the headgroup center
headgroupAtoms='P,O11,O12,O13,O14'
# mask template: ':<resid>@<comma separated atom names>'
maskStr=':%g@%s'
# Resid of the first residue treated as a lipid. Hard-coded for these
# topologies -- presumably everything before it is not lipid; verify
# before reusing with other systems.
firstLipidResid=4255

# Maps system name -> array of headgroup centers, one entry per lipid residue
comDataDict={}
for trajKey,tempTraj in trajDict.items():
    print('--- %s ---'%trajKey)
    print(tempTraj)
    print('Computing headgroup centers')
    lipidResids=np.arange(firstLipidResid,tempTraj.topology.n_residues+1)
    # one 'vector center' command per lipid residue
    commands=[]
    for resid in lipidResids:
        commands.append('vector center %s'%(maskStr%(resid,headgroupAtoms)))
    pbar=pt.utils.progress.ProgressBarTrajectory(tempTraj,style='tqdm')
    centerData=pt.compute(commands,pbar)
    # NOTE: relies on .values() returning a list (Python 2 behaviour)
    comDataDict[trajKey]=np.array(centerData.values())

comDataDict

--- POPC ---
pytraj.TrajectoryIterator, 2001 frames: 
Size: 10.000000 (GB)
<Topology: 227880 atoms, 5430 residues, 1188 mols, PBC with box type = ortho>
           
Computing headgroup centers


HBox(children=(IntProgress(value=0, max=2001), HTML(value=u'')))


--- POPS ---
pytraj.TrajectoryIterator, 1592 frames: 
Size: 8.000000 (GB)
<Topology: 241272 atoms, 5536 residues, 1294 mols, PBC with box type = ortho>
           
Computing headgroup centers


HBox(children=(IntProgress(value=0, max=1592), HTML(value=u'')))


--- PIP2 ---
pytraj.TrajectoryIterator, 1592 frames: 
Size: 8.000000 (GB)
<Topology: 243780 atoms, 5544 residues, 1302 mols, PBC with box type = ortho>
           
Computing headgroup centers


HBox(children=(IntProgress(value=0, max=1592), HTML(value=u'')))




{'PIP2': array([[[158.80795501, 163.60398941, 102.04238811],
         [161.70295966, 164.75003812, 102.70277894],
         [162.02592931, 164.7040303 , 105.42301153],
         ...,
         [169.42432859, 179.06665936, 102.01596918],
         [168.93647262, 176.59928379, 102.01822497],
         [168.67553989, 176.44350305, 101.55110012]],
 
        [[ 26.7998961 , 166.67130667, 115.62012125],
         [ 26.88017206, 167.92777029, 116.10853092],
         [ 26.96510424, 166.47299445, 114.53795037],
         ...,
         [ 32.45077133, 141.06500285, 103.81752566],
         [ 32.44769554, 141.51805088, 104.62883462],
         [ 32.55496749, 141.20013276, 105.79062446]],
 
        [[208.30535165,  50.78301659, 121.78023518],
         [207.94849656,  50.32583826, 122.06760598],
         [206.19372102,  48.60183835, 122.17740175],
         ...,
         [181.41097325,  56.88239062, 109.61539056],
         [178.93938514,  55.43089415, 109.52019529],
         [178.79314434,  55.91658588, 109.1

These coordinate sets are quite large. In order to keep the file size within the required 20 MB limit for GitHub, we will need to save them in chunks. For instance, the coordinates for POPC end up taking about 54 MB to store as a single file. NumPy has functions 'array_split' and 'concatenate' that can help with this, though we still need to coordinate saving / loading each chunk into a different file. Below we define a pair of functions to do so. Let's have a look at the dimensions of one of those trajectories as well.
By virtue of the way we calculated the coordinates, the second axis (axis=1) is actually the frame number, while the first axis is the lipid number. The last axis is the coordinate dimension.

In [45]:
# Report the array dimensions of every system; axes are (resid, frame, coord)
print("             (resid frame coord)")
for trajKey,comData in comDataDict.items():
    print('%s shape:  %s'%(trajKey,comData.shape))

             (resid frame coord)
POPC shape:  (1176, 2001, 3)
POPS shape:  (1282, 1592, 3)
PIP2 shape:  (1290, 1592, 3)


In [46]:
def saveArrayChunks(pathBase,arr,nChunks,axis=0,
                    pbar=None):
    """
    Split an array into chunks and save every chunk to its own .npy file.

        pathBase: the prefix of the file path to save each chunk to.
                    files will be named pathBase.chunk_#.npy, where # is
                    a zero padded integer (to make loading, sorting, etc easier)
        arr: the array to be saved
        nChunks: the number of chunks (files) to split the array into.
                    np.array_split is used, so the axis length does not
                    need to be divisible by nChunks
        axis: the axis along which to split the array
        pbar: optional tqdm style progress bar. Its total is extended (if
                    needed) to make room for nChunks more steps and it is
                    advanced by one step per saved chunk
    """
    arrayChunks=np.array_split(arr,nChunks,axis=axis)
    # number of digits needed for the largest chunk index (nChunks-1)
    ndigits=int(np.ceil(np.log10(nChunks)))
    if pbar is not None:
        # 'total' is the expected number of steps while 'n' is the running
        # counter; only the former may be adjusted here, otherwise the
        # counter is corrupted and the bar over-counts.
        stepsNeeded=getattr(pbar,'n',0)+len(arrayChunks)
        currentTotal=getattr(pbar,'total',None)
        if currentTotal is None or currentTotal<stepsNeeded:
            pbar.total=stepsNeeded
        pbar.refresh()
    for iChunk,arrayChunk in enumerate(arrayChunks):
        # integer formatting ('d') never falls back to scientific notation
        outPath='%s.chunk_%0*d.npy'%(pathBase,ndigits,iChunk)
        np.save(outPath,arrayChunk)
        if pbar is not None:
            pbar.update()
            
def loadArrayChunks(pathBase,nChunks,axis=0,
                    pbar=None):
    """
    Load the chunk files written by saveArrayChunks and join them back
    into a single array.

        pathBase: the prefix of the chunk file paths. files are expected
                    to be named pathBase.chunk_#.npy, where # is a zero
                    padded integer running from 0 to nChunks-1
        nChunks: the number of chunk files to load (must be at least 1)
        axis: the axis along which the chunks are concatenated; use the
                    same axis that was used when saving
        pbar: optional tqdm style progress bar. Its total is extended (if
                    needed) to make room for nChunks more steps and it is
                    advanced by one step per loaded chunk

        returns: the concatenated array
        raises: ValueError if nChunks is smaller than 1
    """
    if nChunks<1:
        raise ValueError('nChunks must be at least 1, got %r'%(nChunks,))
    # number of digits needed for the largest chunk index (nChunks-1)
    ndigits=int(np.ceil(np.log10(nChunks)))
    if pbar is not None:
        # 'total' is the expected number of steps while 'n' is the running
        # counter; only the former may be adjusted here.
        stepsNeeded=getattr(pbar,'n',0)+nChunks
        currentTotal=getattr(pbar,'total',None)
        if currentTotal is None or currentTotal<stepsNeeded:
            pbar.total=stepsNeeded
        pbar.refresh()
    arrayChunks=[]
    for iChunk in np.arange(nChunks):
        # integer formatting ('d') never falls back to scientific notation
        dataPath='%s.chunk_%0*d.npy'%(pathBase,ndigits,iChunk)
        arrayChunks.append(np.load(dataPath))
        if pbar is not None:
            pbar.update()
    return np.concatenate(arrayChunks,axis=axis)

We will save these trajectories into the 'dataFiles/headgroupCoords' directory in 4 chunks each.

In [49]:
dataDir='dataFiles/headgroupCoords'
# np.save does not create missing directories, so make sure the output
# directory exists before writing any chunk (e.g. on a fresh checkout)
if not os.path.isdir(dataDir):
    os.makedirs(dataDir)
with tqdm.tqdm_notebook() as pbar:
    for coordKey in comDataDict:
        print('saving %s center of mass data'%coordKey)
        dataFileBase='/'.join([dataDir,
                               '%s.headgroup_COM_coords'%coordKey])
        # split along axis 1 (the frame axis) into 4 files per system
        saveArrayChunks(dataFileBase,comDataDict[coordKey],nChunks=4,axis=1,
                        pbar=pbar)
print('done')

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

saving POPC center of mass data
saving POPS center of mass data
saving PIP2 center of mass data

done


Let's test loading them to make sure this worked correctly.

In [51]:
# Reload the POPC chunks with the same chunk count and axis used for saving
with tqdm.tqdm_notebook() as pbar:
    tempCOMdata=loadArrayChunks(
        'dataFiles/headgroupCoords/POPC.headgroup_COM_coords',
        nChunks=4,axis=1,pbar=pbar)

# Verify the complete round trip. The printed sample below only covers four
# lipids of the first frame and would not reveal a problem at a chunk boundary.
assert tempCOMdata.shape==comDataDict['POPC'].shape
assert np.array_equal(tempCOMdata,comDataDict['POPC'])

print('Loaded data')
print(tempCOMdata[:4,0,:])
print('---')
print('Original data')
print(comDataDict['POPC'][:4,0,:])

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))


Loaded data
[[175.92526853 107.26068401 116.07219356]
 [172.64456347 120.53817621 110.09801014]
 [141.58276495 135.7406374   56.84068142]
 [175.62751097 200.4206751  126.43119004]]
---
Original data
[[175.92526853 107.26068401 116.07219356]
 [172.64456347 120.53817621 110.09801014]
 [141.58276495 135.7406374   56.84068142]
 [175.62751097 200.4206751  126.43119004]]


We now have the center of mass coordinates of each lipid saved in nice manageable chunks.