In [1]:
%reload_ext autoreload
%autoreload 2

from __future__ import division
from tfdragonn.tensorflow.extractors import write_tss_expression_bedgraph
from pybedtools import BedTool
from genomeflow.io.interval_queue import IntervalQueue
from genomeflow.io.example_queue import ExampleQueue
from genomeflow.io.data_source import DataSource
from genomeflow.io.readers.bedgraph_reader import BedGraphReader
import tensorflow as tf

INFO:2017-01-30 21:33:42,469:genomedatalayer.config] No options file found. Using defaults.
INFO:2017-01-30 21:33:42,472:genomedatalayer] Using generic Layer


We start from raw data (a TSS GFF file and expression TSVs) and write a TSS expression bedGraph, with the mean expression (TPM) in the 4th column.

In [2]:
# Input files for this walkthrough (absolute paths on the lab filesystem).
# GENCODE v19 TSS annotation (gzipped GFF).
tss_bed = "/mnt/data/annotations/by_release/hg19.GRCh37/GENCODE_ann/gencodeTSS/v19/gencode.v19.annotation_capped_sites_nr_with_confidence.gff.gz"
# A549 intervals (BED) used further down to build the interval queue.
test_bed = "/mnt/lab_data/kundaje/jisraeli/projects/TF_Challenge/tfdragonn_0.1.1_data/regions_and_labels/all_TFs/all_TFs_on_relaxed_DNASE_peaks_bin200_flank400_stride50.A549.intervals.bed"
# A549 gene expression tables: biological replicate 1 and replicate 2.
datafile = "/mnt/lab_data/kundaje/jisraeli/projects/TF_Challenge/data/RNAseq/gene_expression.A549.biorep1.tsv"
replicate_datafile = "/mnt/lab_data/kundaje/jisraeli/projects/TF_Challenge/data/RNAseq/gene_expression.A549.biorep2.tsv"

In [3]:
# Write the TSS expression bedGraph to the working directory, passing the
# second biological replicate as `expression_tsv2` and replacing any
# existing output file.
write_tss_expression_bedgraph(
    tss_bed, datafile, "tss_tpms.bedGraph",
    expression_tsv2=replicate_datafile, overwrite=True)



Define a genomeflow data source backed by the bedGraph we just wrote.

In [4]:
# One bedGraph-backed source, configured with five window half-widths (in bp).
data_source = DataSource(
    "tss_tpms.bedGraph",
    reader='bedgraph',
    options={'window_half_widths': [1e3, 5e3, 1e4, 5e4, 1e5]})
# Data sources are handed to the example queue as a name -> source mapping.
data_sources = dict(tss_tpms=data_source)

Create an interval queue from our test BED file (DNase-based bins in A549).

In [5]:
bt = BedTool(test_bed)
# Parse the BED file into a DataFrame once; calling bt.to_dataframe() inside
# the comprehension would re-read the whole file for every column.
intervals_df = bt.to_dataframe()
# `.values` yields the same NumPy array as the deprecated `Series.as_matrix()`
# (removed in pandas 1.0).
intervals_dict = {k: intervals_df[k].values for k in ['chrom', 'start', 'end']}
interval_q = IntervalQueue(intervals_dict)

create an example queue using the interval queue and our data source

In [6]:
# Pair the interval queue with the named data sources; dequeued results are
# read back below under the key 'data/tss_tpms'.
ex_queue = ExampleQueue(interval_q, data_sources)

start a tensorflow session

In [7]:
# Open the session that the remaining cells use to run ops.
s = tf.InteractiveSession()
# tf.initialize_all_variables() is deprecated; TensorFlow's own warning says
# to use tf.global_variables_initializer() instead.
s.run(tf.global_variables_initializer())
# Start the queue-runner threads; keep the handles so they stay referenced.
queue_runner_threads = tf.train.start_queue_runners(s)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


Stream the mean TSS expression for window half-widths of [1e3, 5e3, 1e4, 5e4, 1e5] bp over 100,000 intervals, and time it.

In [9]:
%timeit rv = s.run(ex_queue.dequeue_many(100000))

1 loop, best of 3: 3.29 s per loop


That's the runtime for ~1000 batches' worth of data (i.e. at ~100 examples per batch).

In [10]:
# Shape check: expect 100000 rows (one per dequeued interval) and 5 columns
# (presumably one per configured window half-width).
rv['data/tss_tpms'].shape

(100000, 5)

In [11]:
# Peek at the first 30 rows rather than dumping the full array.
rv['data/tss_tpms'][:30]

array([[ 0.19376981,  0.2582714 ,  0.2582714 ,  0.2454464 ,  0.27618313],
       [ 0.19376981,  0.2582714 ,  0.2582714 ,  0.2454464 ,  0.27618313],
       [ 0.19376981,  0.2582714 ,  0.2582714 ,  0.2454464 ,  0.27618313],
       [ 0.26391789,  0.2582714 ,  0.2582714 ,  0.2454464 ,  0.27618313],
       [ 0.22445352,  0.2626546 ,  0.2582714 ,  0.2454464 ,  0.27618313],
       [ 0.37277722,  0.37277722,  0.37153444,  0.33829579,  0.3358877 ],
       [ 0.37277722,  0.37277722,  0.37153444,  0.34079304,  0.3358877 ],
       [ 0.37277722,  0.37277722,  0.37153444,  0.34079304,  0.3358877 ],
       [ 0.37277722,  0.37277722,  0.37153444,  0.34079304,  0.3358877 ],
       [ 0.37277722,  0.37277722,  0.37153444,  0.34079304,  0.3358877 ],
       [ 0.37277722,  0.37277722,  0.37153444,  0.34079304,  0.3358877 ],
       [ 0.37277722,  0.37200049,  0.37153444,  0.34079304,  0.3358877 ],
       [ 0.36381236,  0.36987385,  0.34813738,  0.34502089,  0.32836786],
       [ 0.36381236,  0.36987385,  0.3