In [1]:
import os
import bcolz
import json

import tensorflow as tf
import numpy as np
import pandas as ps

%reload_ext autoreload
%autoreload 2

%aimport dataset_interval_reader

### Write some dummy test data

In [2]:
NUM_SEQ_CHARS = 4
SEQ_LEN_CHR = int(1e4)
NUM_CHRS = 5
NUM_DATASETS = 3

NUM_INTERVALS = 500
INTERVAL_LENGTH = 100
NUM_TASKS = 6

DATA_DIR = 'test-data'
FA_DIRS = [os.path.join(DATA_DIR, 'seq-{}'.format(i)) for i in range(NUM_DATASETS)]
BW_DIRS = [os.path.join(DATA_DIR, 'bgw-{}'.format(i)) for i in range(NUM_DATASETS)]

INTERVALS_FILE = os.path.join(DATA_DIR, 'intervals_file.json')
INPUTS_FILE = os.path.join(DATA_DIR, 'inputs_file.json')

TASK_NAMES = ['task-name-{}'.format(i) for i in range(NUM_TASKS)]
DATASET_NAMES = ['dataset-name-{}'.format(i) for i in range(NUM_DATASETS)]

BLOSC_CPARAMS = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')

if not os.path.isdir(DATA_DIR):
    os.mkdir(DATA_DIR)

for FA_DIR in FA_DIRS:
    if not os.path.isdir(FA_DIR):
        os.mkdir(FA_DIR)

for BW_DIR in BW_DIRS:
    if not os.path.isdir(BW_DIR):
        os.mkdir(BW_DIR)

def random_fasta_seq():
    seq_idxs = np.random.randint(0, NUM_SEQ_CHARS, SEQ_LEN_CHR)
    seq_arr = np.zeros((NUM_SEQ_CHARS, SEQ_LEN_CHR))
    seq_arr[seq_idxs, np.arange(SEQ_LEN_CHR, dtype=int)] = 1
    return seq_arr

def random_bw_data():
    # Just use low-frequency wave function for now
    bw_data = np.sin(np.arange(SEQ_LEN_CHR) / 1e-3)
    return bw_data

def random_labels():
    # Just random labels for now, as ints
    labels = np.random.randint(0, 3, size=(NUM_INTERVALS, NUM_TASKS))
    return labels

def random_intervals():
    interval_starts = np.random.randint(0, SEQ_LEN_CHR - INTERVAL_LENGTH, size=NUM_INTERVALS)
    interval_ends = interval_starts + INTERVAL_LENGTH
    interval_chrs = np.random.randint(0, NUM_CHRS, size=NUM_INTERVALS)
    interval_chrs = np.array(list(map(lambda x: 'chr{}'.format(x), interval_chrs)))
    intervals = ps.DataFrame([interval_chrs, interval_starts, interval_ends]).T
    return intervals
    

seq_arrs = {'chr{}'.format(i): random_fasta_seq() for i in range(NUM_CHRS)}
bw_arrs = {'chr{}'.format(i): random_bw_data() for i in range(NUM_CHRS)}

def dump_to_disk(chr_key, arr, base_dir):
    target_fname = os.path.join(base_dir, chr_key)
    c_arr = bcolz.carray(arr, cparams=BLOSC_CPARAMS, rootdir=target_fname, mode='w')
    c_arr.flush()

def write_metadata(base_dir):
    # Check the first file to get the shape
    arr_shape = bcolz.carray(rootdir=os.path.join(base_dir, 'chr0'), mode='r').shape
    chr_shapes = {'chr{}'.format(i): arr_shape for i in range(NUM_CHRS)}
    metadata = {'type': 'array_bcolz', 'file_shapes': chr_shapes}
    with open(os.path.join(base_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp)

for FA_DIR in FA_DIRS:
    for chr_key, arr in seq_arrs.items():
        dump_to_disk(chr_key, arr, FA_DIR)
    write_metadata(FA_DIR)

for BW_DIR in BW_DIRS:
    for chr_key, arr in bw_arrs.items():
        dump_to_disk(chr_key, arr, BW_DIR)
    write_metadata(BW_DIR)
    
intervals_file_dict = {'task_names': TASK_NAMES}
inputs_file_dict = {}

for dataset_idx, dataset_name in enumerate(DATASET_NAMES):
    labels_file = os.path.join(DATA_DIR, 'labels{}.npy'.format(dataset_idx))
    intervals_file = os.path.join(DATA_DIR, 'intervals{}.bed'.format(dataset_idx))
    
    labels = random_labels()
    np.save(labels_file, labels)
    
    intervals = random_intervals()
    intervals.to_csv(intervals_file, sep='\t', header=False, index=False)
    
    intervals_file_dict[dataset_name] = {'regions': intervals_file, 'labels': labels_file}
    inputs_file_dict[dataset_name] = {'dnase_data_dir': BW_DIRS[dataset_idx], 'genome_data_dir': FA_DIRS[dataset_idx]}

with open(INTERVALS_FILE, 'w') as fp:
    json.dump(intervals_file_dict, fp)

with open(INPUTS_FILE, 'w') as fp:
    json.dump(inputs_file_dict, fp)


### Set up the readers

The data we want to read is in `test-data/`.

We just need to use `dataset_interval_reader.get_readers` to create readers for all the datasets

In [3]:
readers = dataset_interval_reader.get_readers(INPUTS_FILE, INTERVALS_FILE)

TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.

In [4]:
s = tf.InteractiveSession()

In [5]:
s.run(tf.global_variables_initializer())
s.run(tf.local_variables_initializer())

# Note that you *must* start queue runners before fetching any of the dequeues.
queue_runner_threads = tf.train.start_queue_runners(s)

In [6]:
# Note that readers is a dictionary of `examples_queue`s
readers

{u'dataset-name-0': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x115ad0d90>,
 u'dataset-name-1': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x115008790>,
 u'dataset-name-2': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x115008710>}

In [7]:
# We can fetch from any of the readers like this
s.run(readers['dataset-name-0'].dequeue())

{'data/dnase_data_dir': array([-0.09652851,  1.10265791,  1.31425726,  0.35306901, -0.93963468,
        -1.4324255 , -0.6939922 ,  0.6293574 ,  1.37937236,  0.89960825,
        -0.39002532, -1.36078715, -1.16302574,  0.03016974,  1.17446482,
         1.26832438,  0.22959876, -1.03257596, -1.41349173, -0.57975501,
         0.73891282,  1.38835859,  0.80016017, -0.51086664, -1.39725626,
        -1.08320332,  0.15641975,  1.23664308,  1.21200991,  0.10408031,
        -1.11743939, -1.38342404, -0.46107277,  0.84233403,  1.38600016,
         0.69408625, -0.62781566, -1.42272186, -0.99489695,  0.28120866,
         1.28869402,  1.14576566, -0.02247933, -1.19354415, -1.34246385,
        -0.33889771,  0.93879122,  1.37231624,  0.58223784, -0.73993403,
        -1.43697941, -0.89881486,  0.40353531,  1.33019984,  1.0701232 ,
        -0.14906482, -1.26027966, -1.29093981, -0.21420996,  1.02751064,
         1.34741628,  0.4655121 , -0.84632242, -1.43991482, -0.79572821,
         0.52241838,  1.3608

In [8]:
# Or fetch a batch like this
_ = s.run(readers['dataset-name-0'].dequeue_many(128))

In [9]:
# We can also check how fast it is,
# but note that this is slower because the full result is returned to python

def fetch():
    _ = s.run(readers['dataset-name-1'].dequeue_many(128))

%timeit fetch()

100 loops, best of 3: 27.1 ms per loop


### Try parallel output queue

In [None]:
shared_queue = tf.FIFOQueue()