In [1]:
import os
import bcolz
import json

import tensorflow as tf
import numpy as np
import pandas as ps

%reload_ext autoreload
%autoreload 2

%aimport dataset_interval_reader

from shared_examples_queue import SharedExamplesQueue
from bcolz_reader_op import bcolz_interval_reader
from datasets import parse_inputs_and_intervals
from interval_queue import interval_queue

### Write some dummy test data

In [2]:
NUM_SEQ_CHARS = 4
SEQ_LEN_CHR = int(1e5)
NUM_CHRS = 5
NUM_DATASETS = 3

NUM_INTERVALS = 500
INTERVAL_LENGTH = 1000
NUM_TASKS = 6

DATA_DIR = 'test-data'
FA_DIRS = [os.path.join(DATA_DIR, 'seq-{}'.format(i)) for i in range(NUM_DATASETS)]
BW_DIRS = [os.path.join(DATA_DIR, 'bgw-{}'.format(i)) for i in range(NUM_DATASETS)]

INTERVALS_FILE = os.path.join(DATA_DIR, 'intervals_file.json')
INPUTS_FILE = os.path.join(DATA_DIR, 'inputs_file.json')

TASK_NAMES = ['task-name-{}'.format(i) for i in range(NUM_TASKS)]
DATASET_NAMES = ['dataset-name-{}'.format(i) for i in range(NUM_DATASETS)]

BLOSC_CPARAMS = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')

if not os.path.isdir(DATA_DIR):
    os.mkdir(DATA_DIR)

for FA_DIR in FA_DIRS:
    if not os.path.isdir(FA_DIR):
        os.mkdir(FA_DIR)

for BW_DIR in BW_DIRS:
    if not os.path.isdir(BW_DIR):
        os.mkdir(BW_DIR)

def random_fasta_seq():
    seq_idxs = np.random.randint(0, NUM_SEQ_CHARS, SEQ_LEN_CHR)
    seq_arr = np.zeros((NUM_SEQ_CHARS, SEQ_LEN_CHR))
    seq_arr[seq_idxs, np.arange(SEQ_LEN_CHR, dtype=int)] = 1
    return seq_arr

def random_bw_data():
    # Just use low-frequency wave function for now
    bw_data = np.sin(np.arange(SEQ_LEN_CHR) / 1e-3)
    return bw_data

def random_labels():
    # Just random labels for now, as ints
    labels = np.random.randint(-1, 2, size=(NUM_INTERVALS, NUM_TASKS))
    return labels

def random_intervals():
    interval_starts = np.random.randint(0, SEQ_LEN_CHR - INTERVAL_LENGTH, size=NUM_INTERVALS)
    interval_ends = interval_starts + INTERVAL_LENGTH
    interval_chrs = np.random.randint(0, NUM_CHRS, size=NUM_INTERVALS)
    interval_chrs = np.array(list(map(lambda x: 'chr{}'.format(x), interval_chrs)))
    intervals = ps.DataFrame([interval_chrs, interval_starts, interval_ends]).T
    return intervals
    

seq_arrs = {'chr{}'.format(i): random_fasta_seq() for i in range(NUM_CHRS)}
bw_arrs = {'chr{}'.format(i): random_bw_data() for i in range(NUM_CHRS)}

def dump_to_disk(chr_key, arr, base_dir):
    target_fname = os.path.join(base_dir, chr_key)
    c_arr = bcolz.carray(arr, cparams=BLOSC_CPARAMS, rootdir=target_fname, mode='w')
    c_arr.flush()

def write_metadata(base_dir):
    # Check the first file to get the shape
    arr_shape = bcolz.carray(rootdir=os.path.join(base_dir, 'chr0'), mode='r').shape
    chr_shapes = {'chr{}'.format(i): arr_shape for i in range(NUM_CHRS)}
    metadata = {'type': 'array_bcolz', 'file_shapes': chr_shapes}
    with open(os.path.join(base_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp)

for FA_DIR in FA_DIRS:
    for chr_key, arr in seq_arrs.items():
        dump_to_disk(chr_key, arr, FA_DIR)
    write_metadata(FA_DIR)

for BW_DIR in BW_DIRS:
    for chr_key, arr in bw_arrs.items():
        dump_to_disk(chr_key, arr, BW_DIR)
    write_metadata(BW_DIR)
    
intervals_file_dict = {'task_names': TASK_NAMES}
inputs_file_dict = {}

for dataset_idx, dataset_name in enumerate(DATASET_NAMES):
    labels_file = os.path.join(DATA_DIR, 'labels{}.npy'.format(dataset_idx))
    intervals_file = os.path.join(DATA_DIR, 'intervals{}.bed'.format(dataset_idx))
    
    labels = random_labels()
    np.save(labels_file, labels)
    
    intervals = random_intervals()
    intervals.to_csv(intervals_file, sep='\t', header=False, index=False)
    
    intervals_file_dict[dataset_name] = {'regions': intervals_file, 'labels': labels_file}
    inputs_file_dict[dataset_name] = {'dnase_data_dir': BW_DIRS[dataset_idx], 'genome_data_dir': FA_DIRS[dataset_idx]}

with open(INTERVALS_FILE, 'w') as fp:
    json.dump(intervals_file_dict, fp)

with open(INPUTS_FILE, 'w') as fp:
    json.dump(inputs_file_dict, fp)


### Set up the readers

The data we want to read is in `test-data/`.

We just need to use `dataset_interval_reader.get_readers` to create readers for all the datasets

In [3]:
INPUTS_FILE = "/users/jisraeli/src/tf-dragonn_tensorflow/tfdragonn/tensorflow/examples/processed_inputs_example.json"
INTERVALS_FILE = "/users/jisraeli/src/tf-dragonn_tensorflow/tfdragonn/tensorflow/examples/myc_conservative_dnase_regions_and_labels_stride200_flank400.json"

readers, task_names = dataset_interval_reader.get_readers_and_tasks(INPUTS_FILE, INTERVALS_FILE)

In [4]:
dataset = parse_inputs_and_intervals(INPUTS_FILE, INTERVALS_FILE)

intervals = dataset['A549']['intervals']
dnase_data_dir = dataset['A549']['inputs']['dnase_data_dir']
genome_data_dir = dataset['A549']['inputs']['genome_data_dir']

In [5]:
interval_q = interval_queue(intervals)

dnase_interval_reader = bcolz_interval_reader(interval_q, dnase_data_dir)
genome_interval_reader = bcolz_interval_reader(interval_q, genome_data_dir)

In [6]:
s = tf.InteractiveSession()

s.run(tf.initialize_all_variables())

# Note that you *must* start queue runners before fetching any of the dequeues.
queue_runner_threads = tf.train.start_queue_runners(s)

In [7]:
# Note that readers is a dictionary of `examples_queue`s
readers

{u'A549': <tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x7fb76a41b350>}

In [8]:
# We can fetch from any of the readers like this
s.run(readers['A549'].dequeue())

{'data/dnase_data_dir': array([ -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7.35034168e-01,
         -7.35034168e-01,  -7.35034168e-01,  -7

In [10]:
# Or fetch a batch like this

BATCH_SIZE = 32

_ = s.run(readers['A549'].dequeue_many(BATCH_SIZE))

In [14]:
# We can also check how fast it is,
# but note that this is slower because the full result is copied and returned to python

BATCH_SIZE = 32

def fetch():
    _ = s.run(readers['A549'].dequeue_many(BATCH_SIZE))

%timeit fetch()

10 loops, best of 3: 167 ms per loop


### Set up a parallel examples queue
This takes the set of readers as input, and provides a shuffled buffer of examples from all datasets in a defined batch size

In [10]:
shared_queue = SharedExamplesQueue(readers)

In [11]:
s.run(tf.global_variables_initializer())
s.run(tf.local_variables_initializer())

# Note that you *must* start queue runners before fetching.
queue_runner_threads = tf.train.start_queue_runners(s)

In [15]:
# The shared queue provides outputs shuffled across all the datasets:

shared_queue.outputs

{'data/dnase_data_dir': <tf.Tensor 'shared-examples-queue/shuffle_batch:0' shape=(128, 100) dtype=float32>,
 'data/genome_data_dir': <tf.Tensor 'shared-examples-queue/shuffle_batch:1' shape=(128, 4, 100) dtype=float32>,
 'dataset': <tf.Tensor 'shared-examples-queue/shuffle_batch:2' shape=(128,) dtype=string>,
 'dataset-index': <tf.Tensor 'shared-examples-queue/shuffle_batch:3' shape=(128,) dtype=int32>,
 'intervals/chrom': <tf.Tensor 'shared-examples-queue/shuffle_batch:4' shape=(128,) dtype=string>,
 'intervals/end': <tf.Tensor 'shared-examples-queue/shuffle_batch:5' shape=(128,) dtype=int64>,
 'intervals/start': <tf.Tensor 'shared-examples-queue/shuffle_batch:6' shape=(128,) dtype=int64>,
 'labels': <tf.Tensor 'shared-examples-queue/shuffle_batch:7' shape=(128, 6) dtype=int64>}

In [18]:
# It also provides the dataset (and the `dataest-index`) that each example came from

# This is intended to make it easy to analyze performance by dataset

print(shared_queue.outputs['dataset'].eval()[:10])

print(shared_queue.outputs['dataset-index'].eval()[:10])

['dataset-name-2' 'dataset-name-1' 'dataset-name-0' 'dataset-name-2'
 'dataset-name-1' 'dataset-name-1' 'dataset-name-0' 'dataset-name-2'
 'dataset-name-2' 'dataset-name-0']
[2 1 0 2 2 0 2 2 2 0]


In [6]:
queue_runner_threads

[<Thread(Thread-4, started daemon 140338869991168)>,
 <Thread(Thread-5, started daemon 140338861598464)>,
 <Thread(Thread-6, started daemon 140338853205760)>,
 <Thread(Thread-7, started daemon 140338844813056)>,
 <Thread(Thread-8, started daemon 140338836420352)>]

In [7]:
t = queue_runner_threads[0]

In [16]:
[t.is_alive() for t in queue_runner_threads]

[True, True, True, True, True]

In [23]:
readers['A549'].size().eval()

639

In [7]:
dataset

{u'A549': {'inputs': {'dnase_data_dir': u'/srv/scratch/jisraeli/bcolz_data/DNASE.A549.fc.signal.bigwig',
   'genome_data_dir': u'/srv/scratch/jisraeli/bcolz_data/hg19.genome.fa'},
  'intervals': {'chrom': array(['chr1', 'chr1', 'chr1', ..., 'chrY', 'chrY', 'chrY'], dtype=object),
   'end': array([  714481,   714681,   714881, ..., 59002228, 59002428, 59002628]),
   'start': array([  713481,   713681,   713881, ..., 59001228, 59001428, 59001628])},
  'labels': array([[0],
         [0],
         [0],
         ..., 
         [0],
         [0],
         [0]]),
  'task_names': [u'MYC']}}

In [11]:
intervals

{'chrom': array(['chr1', 'chr1', 'chr1', ..., 'chrY', 'chrY', 'chrY'], dtype=object),
 'end': array([  714481,   714681,   714881, ..., 59002228, 59002428, 59002628]),
 'start': array([  713481,   713681,   713881, ..., 59001228, 59001428, 59001628])}

In [19]:
%timeit s.run(interval_q)

1000 loops, best of 3: 998 µs per loop
