In [1]:
import os
import bcolz
import json

import tensorflow as tf
import numpy as np
import pandas as ps

%reload_ext autoreload
%autoreload 2

%aimport dataset_interval_reader

### Write some dummy test data

In [2]:
NUM_SEQ_CHARS = 4
SEQ_LEN_CHR = int(1e4)
NUM_CHRS = 5
NUM_DATASETS = 3

NUM_INTERVALS = 500
INTERVAL_LENGTH = 100
NUM_TASKS = 6

DATA_DIR = 'test-data'
FA_DIRS = [os.path.join(DATA_DIR, 'seq-{}'.format(i)) for i in range(NUM_DATASETS)]
BW_DIRS = [os.path.join(DATA_DIR, 'bgw-{}'.format(i)) for i in range(NUM_DATASETS)]

INTERVALS_FILE = os.path.join(DATA_DIR, 'intervals_file.json')
INPUTS_FILE = os.path.join(DATA_DIR, 'inputs_file.json')

TASK_NAMES = ['task-name-{}'.format(i) for i in range(NUM_TASKS)]
DATASET_NAMES = ['dataset-name-{}'.format(i) for i in range(NUM_DATASETS)]

BLOSC_CPARAMS = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')

if not os.path.isdir(DATA_DIR):
    os.mkdir(DATA_DIR)

for FA_DIR in FA_DIRS:
    if not os.path.isdir(FA_DIR):
        os.mkdir(FA_DIR)

for BW_DIR in BW_DIRS:
    if not os.path.isdir(BW_DIR):
        os.mkdir(BW_DIR)

def random_fasta_seq():
    seq_idxs = np.random.randint(0, NUM_SEQ_CHARS, SEQ_LEN_CHR)
    seq_arr = np.zeros((NUM_SEQ_CHARS, SEQ_LEN_CHR))
    seq_arr[seq_idxs, np.arange(SEQ_LEN_CHR, dtype=int)] = 1
    return seq_arr

def random_bw_data():
    # Just use low-frequency wave function for now
    bw_data = np.sin(np.arange(SEQ_LEN_CHR) / 1e-3)
    return bw_data

def random_labels():
    # Just random labels for now, as ints
    labels = np.random.randint(0, 3, size=(NUM_INTERVALS, NUM_TASKS))
    return labels

def random_intervals():
    interval_starts = np.random.randint(0, SEQ_LEN_CHR - INTERVAL_LENGTH, size=NUM_INTERVALS)
    interval_ends = interval_starts + INTERVAL_LENGTH
    interval_chrs = np.random.randint(0, NUM_CHRS, size=NUM_INTERVALS)
    interval_chrs = np.array(list(map(lambda x: 'chr{}'.format(x), interval_chrs)))
    intervals = ps.DataFrame([interval_chrs, interval_starts, interval_ends]).T
    return intervals
    

seq_arrs = {'chr{}'.format(i): random_fasta_seq() for i in range(NUM_CHRS)}
bw_arrs = {'chr{}'.format(i): random_bw_data() for i in range(NUM_CHRS)}

def dump_to_disk(chr_key, arr, base_dir):
    target_fname = os.path.join(base_dir, chr_key)
    c_arr = bcolz.carray(arr, cparams=BLOSC_CPARAMS, rootdir=target_fname, mode='w')
    c_arr.flush()

def write_metadata(base_dir):
    # Check the first file to get the shape
    arr_shape = bcolz.carray(rootdir=os.path.join(base_dir, 'chr0'), mode='r').shape
    chr_shapes = {'chr{}'.format(i): arr_shape for i in range(NUM_CHRS)}
    metadata = {'type': 'array_bcolz', 'file_shapes': chr_shapes}
    with open(os.path.join(base_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp)

for FA_DIR in FA_DIRS:
    for chr_key, arr in seq_arrs.items():
        dump_to_disk(chr_key, arr, FA_DIR)
    write_metadata(FA_DIR)

for BW_DIR in BW_DIRS:
    for chr_key, arr in bw_arrs.items():
        dump_to_disk(chr_key, arr, BW_DIR)
    write_metadata(BW_DIR)
    
intervals_file_dict = {'task_names': TASK_NAMES}
inputs_file_dict = {}

for dataset_idx, dataset_name in enumerate(DATASET_NAMES):
    labels_file = os.path.join(DATA_DIR, 'labels{}.npy'.format(dataset_idx))
    intervals_file = os.path.join(DATA_DIR, 'intervals{}.bed'.format(dataset_idx))
    
    labels = random_labels()
    np.save(labels_file, labels)
    
    intervals = random_intervals()
    intervals.to_csv(intervals_file, sep='\t', header=False, index=False)
    
    intervals_file_dict[dataset_name] = {'regions': intervals_file, 'labels': labels_file}
    inputs_file_dict[dataset_name] = {'dnase_data_dir': BW_DIRS[dataset_idx], 'genome_data_dir': FA_DIRS[dataset_idx]}

with open(INTERVALS_FILE, 'w') as fp:
    json.dump(intervals_file_dict, fp)

with open(INPUTS_FILE, 'w') as fp:
    json.dump(inputs_file_dict, fp)


### The data we want to read is now in test-data/

In [3]:
readers = dataset_interval_reader.get_readers(INPUTS_FILE, INTERVALS_FILE)

>>>DATA SHAPE [10000]
>>>DATA SHAPE [4, 10000]
>>>DATA SHAPE [10000]
>>>DATA SHAPE [4, 10000]
>>>DATA SHAPE [10000]
>>>DATA SHAPE [4, 10000]


In [4]:
s = tf.InteractiveSession()

In [5]:
s.run(tf.initialize_all_variables())
tf.train.start_queue_runners(s)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[<Thread(Thread-4, started daemon 123145338359808)>,
 <Thread(Thread-5, started daemon 123145342566400)>,
 <Thread(Thread-6, started daemon 123145346772992)>,
 <Thread(Thread-7, started daemon 123145350979584)>,
 <Thread(Thread-8, started daemon 123145355186176)>,
 <Thread(Thread-9, started daemon 123145359392768)>,
 <Thread(Thread-10, started daemon 123145363599360)>,
 <Thread(Thread-11, started daemon 123145367805952)>,
 <Thread(Thread-12, started daemon 123145372012544)>,
 <Thread(Thread-13, started daemon 123145376219136)>,
 <Thread(Thread-14, started daemon 123145380425728)>,
 <Thread(Thread-15, started daemon 123145384632320)>,
 <Thread(Thread-16, started daemon 123145388838912)>,
 <Thread(Thread-17, started daemon 123145393045504)>,
 <Thread(Thread-18, started daemon 123145397252096)>,
 <Thread(Thread-19, started daemon 123145401458688)>,
 <Thread(Thread-20, started daemon 123145405665280)>,
 <Thread(Thread-21, started daemon 123145409871872)>,
 <Thread(Thread-22, started daemon

>>>OUTPUT SHAPE [128, 10000, 100]
>>>OUTPUT SHAPE [128, 100]
ERROR:tensorflow:Exception in QueueRunner: Failed to run py callback pyfunc_1: see error log.
	 [[Node: dataset-name-1/genome_data_dir-bcolz-reader/genome_data_dir-bcolz-reader = PyFunc[Tin=[DT_STRING, DT_INT64, DT_INT64], Tout=[DT_FLOAT], token="pyfunc_1", _device="/job:localhost/replica:0/task:0/cpu:0"](dataset-name-1/interval-queue/chrom-buffer_DequeueMany, dataset-name-1/interval-queue/start-buffer_DequeueMany, dataset-name-1/interval-queue/end-buffer_DequeueMany)]]

Caused by op u'dataset-name-1/genome_data_dir-bcolz-reader/genome_data_dir-bcolz-reader', defined at:
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, 

Exception in thread Thread-13:
Traceback (most recent call last):
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/site-packages/tensorflow/python/training/queue_runner_impl.py", line 234, in _run
    sess.run(enqueue_op)
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 766, in run
    run_metadata_ptr)
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 964, in _run
    feed_dict_string, options, run_metadata)
  File "/Users/chris/anaconda3/envs/tflow/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1014, in _do_run
    target_list, options, run_metadata)
  File "/Users/chris/anaconda

In [6]:
readers

{u'dataset-name-0': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x10587c5d0>,
 u'dataset-name-1': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x10587c850>,
 u'dataset-name-2': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x114ddca50>}

In [9]:
s.run(readers['dataset-name-0'].dequeue())

{'data/dnase_data_dir': array([-0.23548365,  1.01358044,  1.35452569,  0.48894244, -0.82557446,
        -1.43850493, -0.8133865 ,  0.50265092,  1.3577565 ,  1.00350583,
        -0.25004598, -1.30573785, -1.23958421, -0.10948545,  1.09544885,
         1.32060957,  0.3689267 , -0.92664707, -1.43217146, -0.70519024,
         0.61801207,  1.37931347,  0.91239125, -0.37408495, -1.35413706,
        -1.16998267,  0.01719869,  1.16833627,  1.27590609,  0.24575873,
        -1.02047777, -1.41454029, -0.59152883,  0.72822261,  1.38961232,
         0.81376427, -0.49531516, -1.3918649 , -1.091187  ,  0.14355244,
         1.23165798,  1.22077405,  0.12042672, -1.10631406, -1.38575327,
        -0.4733142 ,  0.83239847,  1.38857019,  0.70841646, -0.612764  ,
        -1.41861868, -1.0038296 ,  0.26856211,  1.28490615,  1.15565574,
        -0.00606384, -1.18346691, -1.34604108, -0.35149467,  0.92970383,
         1.37619579,  0.59719265, -0.7254892 , -1.43418348, -0.90861118,
         0.39122477,  1.3276

In [17]:
tf.py_func?

In [23]:
bw_arrs['chr0'][0:10,...].shape

(10,)

In [7]:
seq_arrs['chr0'].shape

(4, 10000)