In [1]:
import sys
import os
import numpy as np

# handling different types of data
import pandas as pd
import pickle as pk
import shelve
import h5py
from joblib import Parallel, delayed  # conda install -c anaconda joblib=0.9.4
from cyflann import *
import argparse
import scipy.sparse as sp
from scipy import *

In [2]:
def loadSubjectGraph(fn):
    """
    Load the subject graphs from an HDF5 file.

    Inputs:
    - fn: filename to load from 

    Returns:
    - graphs: loaded subject graphs
    """
    print "Loading the subject graph..."
    fn = fn + ".h5"
    with h5py.File(fn, 'r') as hf:
        # print("List of arrays in this file: \n" + str(hf.keys()))
        metadata = hf.get('metadata').shape
        print metadata
        graph = []
        for j in xrange(metadata[0]):
            # get the name of the group
            dsName = str(j).zfill(4)
            # extract the group and the items from the groups
            g = hf.get(dsName)
            nodes = g.get("nodes")
            dists = g.get("dists")
            # put the items into the data structure
            temp = {
                "nodes": np.array(nodes),
                "dists": np.array(dists)
            }
            graph.append(temp)
    print "Graph loaded!"
    return graph


In [3]:
def loadPickledData():     

    # pickleFn =  '%(pickleRootFolder)s/%(pickleSettingName)s/%(pickleSettingName)s.data.p'%\
        # {'pickleSettingName':pickleSettingName, 'pickleRootFolder':pickleRootFolder}

    # On Bridges for job
    # pickleFn =  '/pylon1/ms4s88p/jms565/COPDGene_pickleFiles/histFHOG_largeRange_setting1.data.p'
    # shelveFn = '/pylon1/ms4s88p/jms565/COPDGene_pickleFiles/histFHOG_largeRange_setting1.shelve'

    # desktop or laptop or home
    pickleFn = "COPDGene_pickleFiles/histFHOG_largeRange_setting1.data.p"
    shelveFn = 'COPDGene_pickleFiles/histFHOG_largeRange_setting1.shelve'
    print "pickleFn : ", pickleFn
    print "shelveFn :", shelveFn
    
    # reading pickle and shelve files
    print "Reading the shelve file ..."
    fid = shelve.open(shelveFn,'r')
    metaVoxelDict = fid['metaVoxelDict']
    subjList = fid['subjList']
    phenotypeDB_clean = fid['phenotypeDB_clean']
    fid.close()
    print "Done !"
    print "Sample of the metadata: "
    print "IDs of a few subjects : " , metaVoxelDict[0]['id']
    print "labelIndex of the meta data (a few elements): " , metaVoxelDict[0]['labelIndex'][1:10]   
    print "Reading pickle file ...."
    fid = open(pickleFn,'rb')
    data = pk.load(open(pickleFn,'rb'))
    fid.close()
    print "Done !"
    return metaVoxelDict,subjList, phenotypeDB_clean, data


In [7]:
# graph file of the first DB subject (index 0, zero-padded to four digits)
fn = str(0).zfill(4)
subjGraph = loadSubjectGraph("individualSubjectGraphs/" + fn)

Loading the subject graph...
(7292,)
Graph loaded!


In [8]:
# read the shelve metadata and the pickled data from the original files
# (paths are hard-coded inside loadPickledData)
metaVoxelDict, subjList, phenotypeDB_clean, data = loadPickledData()

pickleFn :  COPDGene_pickleFiles/histFHOG_largeRange_setting1.data.p
shelveFn : COPDGene_pickleFiles/histFHOG_largeRange_setting1.shelve
Reading the shelve file ...
Done !
Sample of the metadata: 
IDs of a few subjects :  10002K
labelIndex of the meta data (a few elements):  [132 133 154 163 170 180 181 182 184]
Reading pickle file ....
Done !


In [9]:
# look at size of each file
numSubjSuperPixels = [ len(s['I']) for s in data ]
print len(numSubjSuperPixels)
totalSuperPixels = sum(numSubjSuperPixels)
print totalSuperPixels
# create list/dictionary for storing sizes of each subject, start index, and end index?
superPixelIndexingStart = np.zeros(len(numSubjSuperPixels))
superPixelIndexingEnd = np.zeros(len(numSubjSuperPixels))
# subj1: 0 - len(subj1)-
# subj2: len(subj1) - len(subj1)+len(subj2)-1
for i in xrange(len(numSubjSuperPixels)):
    if i == 0 :
        superPixelIndexingStart[i] = 0
        superPixelIndexingEnd[i] = numSubjSuperPixels[i]-1
    else:
        superPixelIndexingStart[i] = numSubjSuperPixels[i-1] + superPixelIndexingStart[i-1]
        superPixelIndexingEnd[i] = numSubjSuperPixels[i] + superPixelIndexingEnd[i-1]

# return the list/dictionary
superMetaData = {
    "totalSuperPixels": totalSuperPixels,
    "subjectSuperPixels": numSubjSuperPixels,
    # add both start and end and figure out which one to use later
    "superPixelIndexingStart": superPixelIndexingStart,
    "superPixelIndexingEnd": superPixelIndexingEnd
}

print superPixelIndexingStart[0:10]
print superPixelIndexingEnd[0:10]
print numSubjSuperPixels[0:10]

7292
1573110
[    0.   180.   442.   673.   877.  1144.  1352.  1549.  1822.  2082.]
[  179.   441.   672.   876.  1143.  1351.  1548.  1821.  2081.  2289.]
[180, 262, 231, 204, 267, 208, 197, 273, 260, 208]


In [10]:
# Uses the superMetaData, subjIdx, numSimNodes
subjIdx = 0
numSimNodes = 5
# set up matrix: number of elements in DB subject x total number of elements in query subjects
numSubjPix = superMetaData["subjectSuperPixels"][subjIdx]

rows = np.matrix([[i] * numSimNodes for i in xrange(superMetaData["subjectSuperPixels"][0])])
# set up initial sparse matrix
subjJShape = (superMetaData["subjectSuperPixels"][0], numSubjPix)
# get 3 closest distances 
cols = subjGraph[0]["nodes"][:, 0:numSimNodes] 
dists = subjGraph[0]["dists"][:, 0:numSimNodes]
# make sparse matrix here
sparseSubj = sp.csr_matrix( (list(dists.flat),(list(rows.flat), list(cols.flat))), shape=subjJShape)

# for each query subject in the h5 file for one db subject
for j in xrange(len(subjGraph)-1):
    subjJShape = (superMetaData["subjectSuperPixels"][j+1], numSubjPix)
    # get 3 closest distances 
    cols = subjGraph[j+1]["nodes"][:, 0:numSimNodes] 
    dists = subjGraph[j+1]["dists"][:, 0:numSimNodes]
    rows = np.matrix([[i] * numSimNodes for i in xrange(superMetaData["subjectSuperPixels"][j+1])])
    # make sparse matrix here
    sparseJ = sp.csr_matrix( (list(dists.flat),(list(rows.flat), list(cols.flat))), shape=subjJShape)
    # concatenate w/ row matrix?
    sparseSubj = sp.vstack((sparseSubj, sparseJ), format='csr')
    
print "Finished building this graph"
A = sparseSubj.todense()
print A.shape

Finished building this graph
(1573110, 180)


In [11]:
A = sparseSubj.todense()
print A.shape
B = sparseJ.todense()
print B.shape
subjI = sp.vstack((sparseSubj, sparseJ), format='csr')

(1573110, 180)
(201, 180)


In [12]:
def compileGraphSingleSubj(subjGraph, superMetaData, subjIdx, numSimNodes=5):
    """
    Compile the graph of a single DB subject into one sparse matrix.

    Every entry of subjGraph describes one query subject: row r of its
    "nodes"/"dists" arrays lists the DB-subject nodes assigned to query
    element r and the corresponding distances. The blocks of all query
    subjects are stacked vertically, so the result has
    (total # elements over all query subjects) rows and
    (# elements in the DB subject) columns.

    Inputs:
    - subjGraph: data loaded from the h5 file of the DB subject; a sequence of
                 dicts with 2-D "nodes" and "dists" arrays
    - superMetaData: dict whose "subjectSuperPixels" entry lists the number of
                     elements of every subject
    - subjIdx: index of the current (DB) subject
    - numSimNodes (opt): how many of the leading nodes per query element are
                         placed in the matrix (fewer if the arrays have fewer
                         columns)

    Returns:
    - sparseSubj: scipy.sparse CSR matrix holding the kept distances
    """
    subjSizes = superMetaData["subjectSuperPixels"]
    # number of columns: elements of the DB subject
    numSubjPix = subjSizes[subjIdx]

    # one sparse block per query subject in the h5 file of this DB subject
    blocks = []
    for j, queryGraph in enumerate(subjGraph):
        # keep the first numSimNodes nodes / distances of every query element
        cols = np.asarray(queryGraph["nodes"])[:, 0:numSimNodes]
        dists = np.asarray(queryGraph["dists"])[:, 0:numSimNodes]
        # row index of every kept entry; repeat by the number of columns that
        # are actually present so rows always lines up with cols/dists
        rows = np.repeat(np.arange(subjSizes[j]), cols.shape[1])
        blocks.append(sp.csr_matrix((dists.ravel(), (rows, cols.ravel())),
                                    shape=(subjSizes[j], numSubjPix)))

    # stack once at the end instead of re-copying the growing matrix per subject
    sparseSubj = sp.vstack(blocks, format='csr')

    print("Finished building single graph for DB subject " + str(subjIdx) + "!")
    return sparseSubj

In [13]:
# set up initial graph:
fn = str(0).zfill(4)
subjGraph = loadSubjectGraph("individualSubjectGraphs/"+fn)
# compileGraphSingleSubj()
sparseGraph = compileGraphSingleSubj(subjGraph, superMetaData, 0, numSimNodes=3)

# for each subject
for s in xrange(len(superMetaData["subjectSuperPixels"])-1):
    fn = str(s+1).zfill(4)
    subjGraph = loadSubjectGraph("individualSubjectGraphs/"+fn)
    # compileGraphSingleSubj()
    sparseSubjI = compileGraphSingleSubj(subjGraph, superMetaData, s+1, numSimNodes=3)
    sparseGraph = sp.hstack((sparseGraph, sparseSubjI), format='csr')
# return the massive joint graph matrix
print "Finished compiling complete sparse graph!"

Loading the subject graph...
(7292,)
Graph loaded!
Finished building single graph for DB subject 0!
Loading the subject graph...
(7292,)
Graph loaded!
Finished building single graph for DB subject 1!
Loading the subject graph...


IOError: Unable to open file (Unable to open file: name = 'individualsubjectgraphs/0002.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [14]:
D = sparseGraph.todense()
print D.shape
print str(sum(superMetaData["subjectSuperPixels"][0:2]))
print type(D)

(1573110, 442)
442
<class 'numpy.matrixlib.defmatrix.matrix'>


In [15]:
fn = "0000"
subjGraph = loadSubjectGraph("individualSubjectGraphs/"+fn)

# Uses the superMetaData, subjIdx, numSimNodes
subjIdx = 0
numSimNodes = 5
# set up matrix: number of elements in DB subject x total number of elements in query subjects
numSubjPix = superMetaData["subjectSuperPixels"][subjIdx]
singleSubjGraph = np.zeros((superMetaData["totalSuperPixels"], numSubjPix))
# for each query subject in the h5 file
for j in xrange(len(superMetaData["subjectSuperPixels"])):
    for k in xrange(len(subjGraph[j]["nodes"])):
        # get 3 closest distances 
        nodes = subjGraph[j]["nodes"][k][0:numSimNodes]
        dists = subjGraph[j]["dists"][k][0:numSimNodes]
        # put dists at the location (db subj node, query subj nodes)
        shiftedK = int(superMetaData["superPixelIndexingStart"][j]+k)
        for i in xrange(numSimNodes):
            singleSubjGraph[shiftedK][nodes[i]] = dists[i]
# * make sure to adjust the query subj nodes wrt the offset from the prev subjs
print "Finished building this graph!"

Loading the subject graph...
(7292,)
Graph loaded!
Finished building this graph!


In [16]:
print singleSubjGraph.shape
print A.shape
print np.asarray(A).all()==singleSubjGraph.all()

(1573110, 180)
(1573110, 180)
True


In [17]:
def saveSparseGraph(graph, fn):
    """
    Save a sparse graph with numpy.savez as its CSR components
    (data, indices, indptr, shape).

    Inputs:
    - graph: the matrix to save; converted to CSR first, so other sparse
             formats (e.g. COO) are accepted as well as csr_matrix
    - fn: the filename base (no extensions); numpy.savez appends ".npz"

    Returns: none
    """
    # data/indices/indptr only describe the matrix in CSR layout
    csrGraph = sp.csr_matrix(graph)
    np.savez(fn, data=csrGraph.data, indices=csrGraph.indices,
             indptr=csrGraph.indptr, shape=csrGraph.shape)
    print("Saved the files")
    
    
def loadSparseGraph(fn):
    """
    Load a sparse graph stored as CSR components (data, indices, indptr,
    shape) in "<fn>.npz".

    Inputs:
    - fn: the file name/path base (no extensions)

    Returns:
    - the loaded sparse matrix (csr_matrix)
    """
    # the with-block closes the underlying .npz archive once the arrays are read
    with np.load(fn + ".npz") as loader:
        graph = sp.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            # stored as an array; hand scipy a plain tuple of ints
            shape=tuple(int(dim) for dim in loader['shape']))
    print("Sparse graph loaded")
    return graph


fn = "./graph-test"
saveSparseGraph(sparseGraph, fn)
loadedSparseGraph = loadSparseGraph(fn)

E = loadedSparseGraph.todense()
print D.all()==E.all()
    

Saved the files
Sparse graph loaded
True
