## Google Colab specific initialization cell
#For users running this notebook on Google Colab: this cell can be
#converted into a code cell and run. It will have Colab install the needed
#packages and, most importantly, it will clone this repository and link its
#'dataFiles' folder so that you can access it here.
#Use Ctrl+M followed by Y to convert this cell to a code cell in Google Colab.

!git clone https://github.com/wesleymsmith/lipidMapVisualization.git
!ln -s lipidMapVisualization/dataFiles ./dataFiles

In [1]:
import numpy as np
import scipy as sp
import matplotlib
from matplotlib import pyplot as plt
import collections
import sys
import gc
import os
import sklearn as skl
from sklearn import decomposition
from sklearn import metrics
from sklearn import discriminant_analysis
from sklearn import cluster
import tqdm
import ipywidgets
import copy

from ipywidgets import interact, interactive, fixed, interact_manual, interactive_output
import ipywidgets as widgets

In [2]:
def saveArrayChunks(pathBase,arr,nChunks,axis=0,
                    pbar=None):
    """
    Split an array into chunks and save each chunk to its own .npy file.

        pathBase: the prefix of the file path to save each chunk to.
                    files will be named pathBase.chunk_#.npy, where # is
                    a zero padded integer (to make loading, sorting, etc easier)
        arr: the array to be saved
        nChunks: the number of chunks to split the array into (handed to
                    np.array_split, so the chunks need not be equally sized)
        axis: the axis along which to split the array
        pbar: optional tqdm-style progress bar. Its counter is reset to
                    zero and then advanced by one for every chunk written.
    """
    arrayChunks=np.array_split(arr,nChunks,axis=axis)
    #number of digits needed to write the largest chunk index (nChunks-1)
    ndigits=int(np.ceil(np.log10(nChunks)))
    digitStr='%'+'0%g'%ndigits+'g'
    if not pbar is None:
        #start counting from zero (as loadArrayChunks does); starting at
        #len(arrayChunks) would make the bar run from nChunks to 2*nChunks
        pbar.n=0
        pbar.refresh()
        pbar.clear()
    for iChunk,arrayChunk in enumerate(arrayChunks):
        outPath='.'.join([pathBase,'chunk_%s'%(digitStr%iChunk),'npy'])
        np.save(outPath,arrayChunk)
        if not pbar is None:
            pbar.update()
            
def loadArrayChunks(pathBase,nChunks,axis=0,
                    pbar=None):
    """
    Load a set of chunk files and join them back into a single array.

        pathBase: the prefix of the chunk file paths. Chunks are read from
                    pathBase.chunk_#.npy, where # is a zero padded integer
        nChunks: the number of chunk files to read (indices 0 to nChunks-1)
        axis: the axis along which the loaded chunks are concatenated
        pbar: optional progress bar. Its counter is set to zero and
                    update() is then called once per chunk read.

        returns: the concatenation of all loaded chunks along axis
    """
    #width of the zero padded chunk index used in the file names
    padWidth=int(np.ceil(np.log10(nChunks)))
    indexFormat='%'+'0%g'%padWidth+'g'
    useBar=pbar is not None
    if useBar:
        pbar.n=0
        pbar.refresh()
    loadedChunks=[]
    for chunkIndex in np.arange(nChunks):
        chunkTag='chunk_%s'%(indexFormat%chunkIndex)
        chunkPath='.'.join((pathBase,chunkTag,'npy'))
        loadedChunks.append(np.load(chunkPath))
        if useBar:
            pbar.update()
    return np.concatenate(loadedChunks,axis=axis)

In [3]:
dataFileDir='dataFiles'
comDataDir='/'.join([dataFileDir,'headgroupCoords'])
leafletClusteringDir='/'.join([dataFileDir,'leafletClustering'])

comFileTypeName='headgroup_COM_coords'

systems=['POPC','POPS','PIP2']

nChunks=4

comDataDict={}
print 'Loading data sets ',
with tqdm.tqdm_notebook() as pbar:
    for system in systems:
        print system,
        pbar.set_description_str(system)
        comFileNameBase='.'.join([system,comFileTypeName])
        comFilePathBase='/'.join([comDataDir,comFileNameBase])
        comDataDict[system]=loadArrayChunks(comFilePathBase,nChunks=nChunks,axis=1,
                                            pbar=pbar)
        gc.collect()
    print ''
print 'done loading data'
print '--- --- --- ---'

for setKey in comDataDict:
    print setKey,
    print comDataDict[setKey].shape

Loading data sets 

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

 POPC POPS PIP2 

done loading data
--- --- --- ---
POPC (1176, 2001, 3)
POPS (1282, 1592, 3)
PIP2 (1290, 1592, 3)


In [4]:
clusterDataDir='dataFiles/leafletClustering/'
clusterFileTypeName='leaflet_clustering_array'

systems=['POPC','PIP2','POPS']

nChunks=3

clusterDataDict={}
print 'Loading clustering data sets ',
with tqdm.tqdm_notebook() as pbar:
    for system in systems:
        print system,
        pbar.set_description_str(system)
        clusterFileNameBase='.'.join([system,clusterFileTypeName])
        clusterFilePathBase='/'.join([clusterDataDir,clusterFileNameBase])
        clusterDataDict[system]=loadArrayChunks(clusterFilePathBase,nChunks=nChunks,axis=0,
                                            pbar=pbar)
        gc.collect()
    print ''
print 'done loading data'
print '--- --- --- ---'

for setKey in clusterDataDict:
    print setKey,
    print clusterDataDict[setKey].shape

Loading clustering data sets 

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

 POPC PIP2 POPS 

done loading data
--- --- --- ---
POPC (2001, 1176)
POPS (1592, 1282)
PIP2 (1592, 1290)


In [5]:
#Compute an XY grid for each system and save it to disk.
#only needs to be run once unless new systems are being analyzed.
outDir=dataFileDir
gridSpacing=1.0 #1 Å grid
for system in comDataDict:
    print '--- %s ---'%system
    comData=comDataDict[system]
    gridBounds=np.array([
            [np.min(comData[:,0]),np.min(comData[:,1])],
            [np.max(comData[:,0]),np.max(comData[:,1])]])
    centerX=np.mean(gridBounds[:,0])
    centerY=np.mean(gridBounds[:,1])
    nX=int(np.ceil((gridBounds[1,0]-gridBounds[0,0])/gridSpacing))+1
    nY=int(np.ceil((gridBounds[1,1]-gridBounds[0,1])/gridSpacing))+1
    pointsX=np.linspace(centerX-(nX-1)*gridSpacing/2,
                        centerX+(nX-1)*gridSpacing/2,
                        nX)
    pointsY=np.linspace(centerY-(nY-1)*gridSpacing/2,
                        centerY+(nY-1)*gridSpacing/2,
                        nY)
    gridX,gridY=np.meshgrid(pointsX,pointsY)
    print '%s Grid shape: '%system,
    print gridX.shape
    print '%s Grid Bounds: '%system,
    print gridBounds
    print 'saving to disk...'
    outFileName='.'.join([system,'gridX.npy'])
    outFilePath='/'.join([outDir,outFileName])
    np.save(outFilePath,arr=gridX)
    outFileName='.'.join([system,'gridY.npy'])
    outFilePath='/'.join([outDir,outFileName])
    np.save(outFilePath,arr=gridY)

--- POPC ---
POPC Grid shape:  (230, 229)
POPC Grid Bounds:  [[ -7.08916131  -5.28010502]
 [220.81656324 222.76967284]]
saving to disk...
--- POPS ---
POPS Grid shape:  (232, 233)
POPS Grid Bounds:  [[ -7.18940158  -4.86527905]
 [224.4948777  225.75348258]]
saving to disk...
--- PIP2 ---
PIP2 Grid shape:  (235, 233)
PIP2 Grid Bounds:  [[ -2.96129944  -3.34464858]
 [229.03306636 229.90644927]]
saving to disk...


In [6]:
gridXdict={}
gridYdict={}
gridBoundsDict={}
for system in comDataDict:
    print '--- %s ---'%system
    gridXfileName='.'.join([system,'gridX.npy'])
    gridXfilePath='/'.join([dataFileDir,gridXfileName])
    gridXdict[system]=np.load(gridXfilePath)
    gridYfileName='.'.join([system,'gridY.npy'])
    gridYfilePath='/'.join([dataFileDir,gridYfileName])
    gridYdict[system]=np.load(gridYfilePath)
    print '%s Grid shape: '%system,
    print gridXdict[system].shape
    gridBoundsDict[system]=[
        [np.min(gridXdict[system]),np.min(gridYdict[system])],
        [np.max(gridXdict[system]),np.max(gridYdict[system])]
    ]
    print '%s Grid Bounds: '%system,
    print gridBoundsDict[system]

--- POPC ---
POPC Grid shape:  (230, 229)
POPC Grid Bounds:  [[-7.136299035217178, -5.755216087030789], [220.86370096478282, 223.2447839129692]]
--- POPS ---
POPS Grid shape:  (232, 233)
POPS Grid Bounds:  [[-7.34726194302722, -5.0558982344096535], [224.6527380569728, 225.94410176559035]]
--- PIP2 ---
PIP2 Grid shape:  (235, 233)
PIP2 Grid Bounds:  [[-2.9641165389713535, -3.7190996543211696], [229.03588346102865, 230.28090034567884]]
