In [1]:
import sys
import os
import numpy as np

# handling different types of data
import pandas as pd
import pickle as pk
import shelve
import h5py
from joblib import Parallel, delayed  # conda install -c anaconda joblib=0.9.4
from pyflann import *

In [2]:
pickleFn =  'COPDGene_pickleFiles/histFHOG_largeRange_setting1.data.p'
# pickleFn =  '%(pickleRootFolder)s/%(pickleSettingName)s/%(pickleSettingName)s.data.p'%\
        # {'pickleSettingName':pickleSettingName, 'pickleRootFolder':pickleRootFolder}
shelveFn = 'COPDGene_pickleFiles/histFHOG_largeRange_setting1.shelve'
print "pickleFn : ", pickleFn
print "shelveFn :", shelveFn

# reading pickle and shelve files
print "Reading the shelve file ..."
fid = shelve.open(shelveFn,'r')
metaVoxelDict = fid['metaVoxelDict']
subjList = fid['subjList']
phenotypeDB_clean = fid['phenotypeDB_clean']
fid.close()
print "Done !"

print "Sample of the metadata: "
print "IDs of a few subjects : " , metaVoxelDict[0]['id']
print "labelIndex of the meta data (a few elements): " , metaVoxelDict[0]['labelIndex'][1:10]   

print "Reading pickle file ...."
fid = open(pickleFn,'rb')
data = pk.load(open(pickleFn,'rb'))
fid.close()
print "Done !"

pickleFn :  COPDGene_pickleFiles/histFHOG_largeRange_setting1.data.p
shelveFn : COPDGene_pickleFiles/histFHOG_largeRange_setting1.shelve
Reading the shelve file ...
Done !
Sample of the metadata: 
IDs of a few subjects :  10002K
labelIndex of the meta data (a few elements):  [132 133 154 163 170 180 181 182 184]
Reading pickle file ....
Done !


In [4]:
def buildSubjectTrees(subjects, data, neighbors=5, maxSubjects=20):
    """
    Run a FLANN nearest-neighbour search for every ordered pair of subjects.

    For each pair (i, j) this calls
    flann.nn(data[i]['I'], data[j]['I'], neighbors, algorithm='kmeans')
    and stores the two returned arrays.

    Inputs:
    - subjects: list of subjects; currently unused by the computation and
        kept only so existing callers keep working
    - data: indexable collection; data[k]['I'] is passed to FLANN
    - neighbors: the number of nearest nodes to save per query
    - maxSubjects: upper bound on how many entries of data are compared
        (default 20, the value that used to be hard-coded for testing);
        pass None to use every entry of data

    Returns:
    - subjDBs: list of lists of dictionaries
        - subjDBs[i][j] holds the result for the pair (i, j)
        - "nodes": first value returned by flann.nn (neighbour indices)
        - "dists": second value returned by flann.nn (distances)
    """
    flann = FLANN()

    # Never index past the end of data: fewer than maxSubjects entries used
    # to raise IndexError because the loop bound was a fixed 20.
    numSubjects = len(data)
    if maxSubjects is not None:
        numSubjects = min(numSubjects, maxSubjects)

    print("Now building subject-subject mini databases...")
    subjDBs = []
    for i in range(numSubjects):
        results = []
        for j in range(numSubjects):
            # NOTE(review): per the pyflann API this presumably indexes
            # data[i]['I'] and queries it with data[j]['I'] -- confirm.
            nodes, dists = flann.nn(data[i]['I'], data[j]['I'], neighbors, algorithm='kmeans')
            results.append({
                "nodes": nodes,
                "dists": dists
            })
        subjDBs.append(results)

    print("Subject level databases complete!")
    # NOTE(review): this dumps the whole nested structure to stdout; kept so
    # the function's visible output is unchanged.
    print(subjDBs)
    return subjDBs

# Serial run. `neighbors` is defined here and reused by the parallel run in
# the following cell; `subjList` and `data` come from the loading cell above.
neighbors = 5
subjTrees = buildSubjectTrees(subjList, data, neighbors)

Now building subject-subject mini databases...
Subject level databases complete!
[[{'nodes': array([[  0,   2,   4,   5, 177],
       [  1,   7,   9,  49, 147],
       [  2,   0,  10,   5, 153],
       [  3,   9, 127, 152,   1],
       [  4,   5,   0, 177, 156],
       [  5,   4,   0,  14,  18],
       [  6,  12,   7, 152, 127],
       [  7,   1,   9,  49,  29],
       [  8,  11,  12,   2,   0],
       [  9,  28,  49, 111,  16],
       [ 10,   2,   0, 142, 153],
       [ 11,   8,  12,   4,   2],
       [ 12,  44,   7, 153,   4],
       [ 13,  77,  53, 111,  34],
       [ 14,  29,   5,  20,  96],
       [ 15, 108,  50,  32, 125],
       [ 16,  54,  98,  64,  38],
       [ 17, 151, 154, 130, 121],
       [ 18,  20,   5, 156,   4],
       [ 19, 103,  33, 119,  45],
       [ 20,  18, 156,  14,  96],
       [ 21,  23,  29,   5,  14],
       [ 22,  90, 108,  36,  15],
       [ 23, 172,  49, 169,  28],
       [ 24, 143,  78, 105, 159],
       [ 25,  84,  78,  58,  37],
       [ 26,  64,  53, 

In [5]:
def buildSubjectTreesParallel(subjects, data, neighbors=5, numJobs=4, maxSubjects=8):
    """
    Parallel variant of the subject-tree construction: one joblib task per
    source subject, each task running buildBranches() for that subject.

    Inputs:
    - subjects: list of subjects; forwarded unchanged to buildBranches()
    - data: indexable collection; data[k]['I'] is what FLANN is run on
    - neighbors: the number of nearest nodes to save per query
    - numJobs: value passed to joblib.Parallel as n_jobs (default 4, the
        value that used to be hard-coded)
    - maxSubjects: upper bound on how many source subjects are processed
        (default 8, the value that used to be hard-coded for testing);
        pass None to process every entry of data

    Returns:
    - subjDBs: list with one entry per processed source subject, each entry
        being the list of {"nodes": ..., "dists": ...} dictionaries returned
        by buildBranches() for that subject
    """
    flann = FLANN()

    # Clamp the outer index range to the data that exists so that short
    # inputs do not make a worker index past the end of data.
    numSubjects = len(data)
    if maxSubjects is not None:
        numSubjects = min(numSubjects, maxSubjects)

    print("Now building subject-subject mini databases...")
    # The same FLANN object is handed to every task, as before.
    subjDBs = Parallel(n_jobs=numJobs)(
        delayed(buildBranches)(i, subjects, data, neighbors, flann)
        for i in range(numSubjects)
    )

    print("Subject level databases complete!")
    # NOTE(review): this dumps the whole nested structure to stdout; kept so
    # the function's visible output is unchanged.
    print(subjDBs)
    return subjDBs

def buildBranches(i, subjects, data, neighbors, flann, maxTargets=3):
    """
    Inner loop for buildSubjectTrees(): compare subject i against the first
    few subjects in data.

    Inputs:
    - i: index of the subject whose data[i]['I'] is the first argument to
        flann.nn
    - subjects: list of subjects; currently unused, kept for the callers
    - data: indexable collection; data[k]['I'] is what FLANN is run on
    - neighbors: the number of nearest nodes to save per query
    - flann: object providing nn(pts, qpts, num_neighbors, algorithm=...)
    - maxTargets: upper bound on how many entries of data subject i is
        compared against (default 3, the value that used to be hard-coded
        for testing); pass None to compare against every entry of data

    Returns:
    - results: list of {"nodes": ..., "dists": ...} dictionaries, one per
        compared subject j, holding the two values returned by flann.nn
    """
    # Never index past the end of data: fewer than maxTargets entries used to
    # raise IndexError because the loop bound was a fixed 3.
    numTargets = len(data)
    if maxTargets is not None:
        numTargets = min(numTargets, maxTargets)

    results = []
    for j in range(numTargets):
        nodes, dists = flann.nn(data[i]['I'], data[j]['I'], neighbors, algorithm='kmeans')
        results.append({
            "nodes": nodes,
            "dists": dists
        })
    return results

# Parallel run over the same inputs as the serial run; relies on `neighbors`
# having been set by the previous cell.
parallelTrees = buildSubjectTreesParallel(subjList, data, neighbors)

Now building subject-subject mini databases...
Subject level databases complete!
[[{'nodes': array([[  0,   2,   4,   5, 177],
       [  1,   7,   9,  49, 147],
       [  2,   0,  10,   5, 153],
       [  3,   9, 127, 152,   1],
       [  4,   5,   0, 177, 156],
       [  5,   4,   0,  14,  18],
       [  6,  12,   7, 152, 127],
       [  7,   1,   9,  49,  29],
       [  8,  11,  12,   2,   0],
       [  9,  28,  49, 111,   7],
       [ 10,   2,   0, 142, 153],
       [ 11,   8,  12,   4,   2],
       [ 12,  44,   7, 153,   4],
       [ 13,  77,  53,  89,  30],
       [ 14,  29,   5,  20,  96],
       [ 15, 108,  50,  32, 125],
       [ 16,  54,  98,  64,  38],
       [ 17, 151, 154, 130, 121],
       [ 18,  20,   5, 156,   4],
       [ 19, 103,  33, 119,  45],
       [ 20,  18, 156,  14,  96],
       [ 21,  23, 168,  29,   5],
       [ 22,  90, 108,  36,  15],
       [ 23, 172,  49, 169,  28],
       [ 24, 143,  78, 105, 159],
       [ 25,  84,  78,  58,  37],
       [ 26,  64,  53, 

In [6]:
def saveSubjectTrees(trees, fn):
    """
    Write the subject trees to the HDF5 file "<fn>.h5".

    File layout:
    - "metadata": dataset whose shape is [len(trees), len(trees[0])]; only
        its shape is meaningful
    - one group per (i, j) pair named "<i>_<j>" (both zero-padded to four
        digits), containing the datasets "nodes" and "dists"

    Inputs:
    - trees: subject trees, indexed as trees[i][j]['nodes' | 'dists']
    - fn: filename to save to, without the ".h5" extension

    Returns:
    nothing
    """
    outPath = fn + ".h5"
    gzipArgs = {"compression": 'gzip', "compression_opts": 7}
    with h5py.File(outPath, 'w') as hf:
        # the grid size is recorded as the shape of an otherwise unused dataset
        numRows = len(trees)
        numCols = len(trees[0])
        hf.create_dataset("metadata", [numRows, numCols], **gzipArgs)
        for i in xrange(numRows):
            for j in xrange(numCols):
                group = hf.create_group("%04d_%04d" % (i, j))
                entry = trees[i][j]
                for key in ("nodes", "dists"):
                    group.create_dataset(key, data=entry[key], **gzipArgs)
    print("Saved the data to a HDF5 file")
                

def loadSubjectTrees(fn):
    """
    Read subject trees back from the HDF5 file "<fn>.h5".

    The grid size is taken from the shape of the "metadata" dataset; every
    cell (i, j) is then read from the group "<i>_<j>" (both zero-padded to
    four digits) and its "nodes" and "dists" datasets are copied into numpy
    arrays.

    Inputs:
    - fn: filename to load from, without the ".h5" extension

    Returns:
    - trees: list of lists of {"nodes": array, "dists": array} dictionaries
    """
    inPath = fn + ".h5"
    print("Loading data from HDF5 file...")

    trees = []
    with h5py.File(inPath, 'r') as hf:
        print("List of arrays in this file: \n" + str(hf.keys()))
        gridShape = hf.get('metadata').shape
        print(gridShape)

        def readCell(i, j):
            # materialise both datasets while the file is still open
            group = hf.get("%04d_%04d" % (i, j))
            return {
                "nodes": np.array(group.get("nodes")),
                "dists": np.array(group.get("dists"))
            }

        for i in xrange(gridShape[0]):
            trees.append([readCell(i, j) for j in xrange(gridShape[1])])
    print("Finished loading data!")
    return trees

# Round-trip the serial trees through "subjTrees.h5" (both helpers append the
# ".h5" extension themselves).
fn = "subjTrees"
saveSubjectTrees(subjTrees, fn)
# NOTE(review): this rebinds `data`, which until here held the unpickled
# feature data that the tree-building cells index via data[i]['I'].
# Re-running those cells after this one would operate on the loaded trees
# instead. Consider a separate name -- TODO confirm no later cell relies on
# `data` being the loaded trees.
data = loadSubjectTrees(fn)

List of arrays in this file: 
[u'0000_0000', u'0000_0001', u'0000_0002', u'0000_0003', u'0000_0004', u'0000_0005', u'0000_0006', u'0000_0007', u'0000_0008', u'0000_0009', u'0000_0010', u'0000_0011', u'0000_0012', u'0000_0013', u'0000_0014', u'0000_0015', u'0000_0016', u'0000_0017', u'0000_0018', u'0000_0019', u'0001_0000', u'0001_0001', u'0001_0002', u'0001_0003', u'0001_0004', u'0001_0005', u'0001_0006', u'0001_0007', u'0001_0008', u'0001_0009', u'0001_0010', u'0001_0011', u'0001_0012', u'0001_0013', u'0001_0014', u'0001_0015', u'0001_0016', u'0001_0017', u'0001_0018', u'0001_0019', u'0002_0000', u'0002_0001', u'0002_0002', u'0002_0003', u'0002_0004', u'0002_0005', u'0002_0006', u'0002_0007', u'0002_0008', u'0002_0009', u'0002_0010', u'0002_0011', u'0002_0012', u'0002_0013', u'0002_0014', u'0002_0015', u'0002_0016', u'0002_0017', u'0002_0018', u'0002_0019', u'0003_0000', u'0003_0001', u'0003_0002', u'0003_0003', u'0003_0004', u'0003_0005', u'0003_0006', u'0003_0007', u'0003_0008', u'0