In [1]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import json
from pybedtools import BedTool, Interval
import tensorflow as tf
from tfdragonn.tensorflow.dataset_interval_reader import get_readers_and_tasks

Load the DNase peaks and manually count the peaks that fall within windows of size [1000.0, 5000.0, 10000.0, 50000.0, 100000.0] centered on each of 3 representative intervals.

In [2]:
# Load the DNase peaks. Sorting and then merging collapses overlapping
# records (e.g. multi-summit peaks) so each region is counted only once.
dnase_bed = "/mnt/lab_data/kundaje/jisraeli/projects/TF_Challenge/data/DNASE/peaks/conservative/DNASE.A549.conservative.narrowPeak.gz"
dnase_bedtool = BedTool(dnase_bed).sort().merge()

# Create the 3 test intervals (the two chrX intervals are offset by only 13 bp).
test_intervals = [Interval("chr1", 1177002, 1178002), Interval("chrX", 177002, 178002), Interval("chrX", 177015, 178015)]
test_bedtool = BedTool(test_intervals).set_chromsizes("hg19")
# All test intervals have the same length, so the first one is representative.
interval_length = test_bedtool[0].stop - test_bedtool[0].start

# Window sizes in which peaks are counted; listed once so the result array
# and the loop below cannot drift apart.
WINDOW_SIZES = [1000.0, 5000.0, 10000.0, 50000.0, 100000.0]
# One row per test interval, one column per window size.
peak_counts = np.zeros((len(test_intervals), len(WINDOW_SIZES)))

# Get peak counts within each window using bedtools slop + intersect.
for i, window_size in enumerate(WINDOW_SIZES):
    # Extend each interval equally on both sides until it spans window_size.
    slop_size = 1. * (window_size - interval_length) / 2
    slopped_test_bedtool = test_bedtool.slop(b=slop_size)
    # c=True appends the overlap count as the last field of each record;
    # F=0.5 requires at least half of a peak to overlap the window.
    peak_counts[:, i] = [int(result[-1]) for result in slopped_test_bedtool.intersect(dnase_bedtool, c=True, F=0.5)]
print(peak_counts)

[[ 0.  1.  1.  3.  8.]
 [ 0.  3.  4.  8.  8.]
 [ 1.  3.  4.  8.  8.]]


Pass the pre-extracted count data through our TensorFlow readers and check that we get the same results as the manual counts above.

In [3]:
# Inputs config: points the "test" split at the pre-extracted DNase peak
# counts (per the original note, "count_window_sizes": [1e3, 5e3, 1e4, 5e4, 1e5]).
dnase_peaks_counts_data_dir = "/mnt/data/memmap/bcolz_data/DNASE.A549.conservative_peaks_counts_data_dir_dec19"
INPUTS_FILE = "./inputs.json"
inputs_file_dict = {
    "test": {"dnase_peaks_counts_data_dir": dnase_peaks_counts_data_dir},
}
with open(INPUTS_FILE, 'w') as inputs_handle:
    inputs_handle.write(json.dumps(inputs_file_dict))

# Intervals config: save the test intervals held in test_bedtool to a BED
# file and reference that file as the "test" regions.
INTERVALS_BED_FILE = "test.bed"
INTERVALS_FILE = "./intervals.json"
test_bedtool.saveas(INTERVALS_BED_FILE)
intervals_file_dict = {
    "test": {"regions": INTERVALS_BED_FILE},
    "task_names": ["dummy_task"],
}
with open(INTERVALS_FILE, 'w') as intervals_handle:
    intervals_handle.write(json.dumps(intervals_file_dict))

get data readers

In [4]:
# Build the readers (and task names) from the inputs and intervals JSON
# config files; in_memory=False is passed through to the reader factory.
(readers, task_names) = get_readers_and_tasks(
    INPUTS_FILE,
    INTERVALS_FILE,
    in_memory=False,
)

initialize a session and start the readers

In [5]:
# Open the session used by every subsequent cell.
s = tf.InteractiveSession()
# tf.initialize_all_variables() is deprecated; TensorFlow's own warning
# directs callers to tf.global_variables_initializer() instead.
s.run(tf.global_variables_initializer())
# Start the queue-runner threads so that dequeue() calls on the readers
# have data to return.
queue_runner_threads = tf.train.start_queue_runners(s)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


check first interval

In [6]:
# Pull one example off the 'test' reader; the bare last expression makes
# the notebook display the resulting dict.
first_example_op = readers['test'].dequeue()
s.run(first_example_op)

{'data/dnase_peaks_counts_data_dir': array([[ -1.52271245e-16],
        [  1.00000000e+00],
        [  1.00000000e+00],
        [  3.00000000e+00],
        [  8.00000000e+00]], dtype=float32),
 'intervals/chrom': 'chr1',
 'intervals/end': 1178002,
 'intervals/start': 1177002}

matches the [ 0.  1.  1.  3.  8.] above

In [7]:
# Pull the next example off the 'test' reader and display it.
second_example_op = readers['test'].dequeue()
s.run(second_example_op)

{'data/dnase_peaks_counts_data_dir': array([[ -1.02907642e-15],
        [  3.00000000e+00],
        [  4.00000000e+00],
        [  8.00000000e+00],
        [  8.00000000e+00]], dtype=float32),
 'intervals/chrom': 'chrX',
 'intervals/end': 178002,
 'intervals/start': 177002}

matches the [ 0.  3.  4.  8.  8.] above

In [8]:
# Pull the next example off the 'test' reader and display it.
third_example_op = readers['test'].dequeue()
s.run(third_example_op)

{'data/dnase_peaks_counts_data_dir': array([[ 1.],
        [ 3.],
        [ 4.],
        [ 8.],
        [ 8.]], dtype=float32),
 'intervals/chrom': 'chrX',
 'intervals/end': 178015,
 'intervals/start': 177015}

matches the [ 1.  3.  4.  8.  8.] above (this interval and the previous one were intentionally selected to be very close to each other). The data seems to check out.