In [31]:
import os
import bcolz
import json

import tensorflow as tf
import numpy as np
import pandas as ps

%reload_ext autoreload
%autoreload 1

import dataset_interval_reader

### Write some dummy test data

In [38]:
# ---- Synthetic genome layout ----
NUM_SEQ_CHARS = 4        # rows of each one-hot sequence array
SEQ_LEN_CHR = int(1e5)   # positions per chromosome
NUM_CHRS = 5             # chromosomes are keyed 'chr0', 'chr1', ...
NUM_DATASETS = 3         # one seq-* and one bgw-* directory per dataset

# ---- Intervals and labels generated per dataset ----
NUM_INTERVALS = 5000
INTERVAL_LENGTH = 1000
NUM_TASKS = 6            # label columns per interval

# Seed NumPy's global RNG so that Restart & Run All regenerates exactly the
# same dummy sequences, labels and intervals every time.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# ---- Output locations (everything lives under DATA_DIR) ----
DATA_DIR = 'test-data'
FA_DIRS = [os.path.join(DATA_DIR, 'seq-{}'.format(i)) for i in range(NUM_DATASETS)]
BW_DIRS = [os.path.join(DATA_DIR, 'bgw-{}'.format(i)) for i in range(NUM_DATASETS)]

# JSON index files that point at the per-dataset files and directories
INTERVALS_FILE = os.path.join(DATA_DIR, 'intervals_file.json')
INPUTS_FILE = os.path.join(DATA_DIR, 'inputs_file.json')

TASK_NAMES = ['task-name-{}'.format(i) for i in range(NUM_TASKS)]
DATASET_NAMES = ['dataset-name-{}'.format(i) for i in range(NUM_DATASETS)]

# Compression settings shared by every bcolz array written below
BLOSC_CPARAMS = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')

# Create the output tree: the root first, then one directory per dataset for
# the sequence arrays and one per dataset for the signal arrays. Directories
# that already exist are left alone.
for directory in [DATA_DIR] + FA_DIRS + BW_DIRS:
    if os.path.isdir(directory):
        continue
    os.mkdir(directory)

def random_fasta_seq():
    """Build a random one-hot encoded sequence for one chromosome.

    Returns
    -------
    numpy.ndarray
        Array of shape (NUM_SEQ_CHARS, SEQ_LEN_CHR) in which every column
        contains a single 1 (at a uniformly random row) and zeros elsewhere.
    """
    positions = np.arange(SEQ_LEN_CHR, dtype=int)
    # One random row index per position selects which character is "on"
    hot_rows = np.random.randint(0, NUM_SEQ_CHARS, SEQ_LEN_CHR)
    one_hot = np.zeros((NUM_SEQ_CHARS, SEQ_LEN_CHR))
    one_hot[hot_rows, positions] = 1
    return one_hot

def random_bw_data(seq_len=None):
    """Return a deterministic low-frequency sine wave used as the dummy signal track.

    Parameters
    ----------
    seq_len : int, optional
        Number of positions to generate. Defaults to the module-level
        SEQ_LEN_CHR when omitted.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``seq_len`` holding ``sin(position * 1e-3)``.
    """
    if seq_len is None:
        seq_len = SEQ_LEN_CHR
    # Scale positions DOWN by 1e-3 (one cycle spans roughly 6283 positions).
    # Dividing by 1e-3 instead multiplies the phase by 1000, which yields an
    # aliased, effectively high-frequency signal rather than a slow wave.
    bw_data = np.sin(np.arange(seq_len) * 1e-3)
    return bw_data

def random_labels():
    """Draw placeholder integer labels for every interval and task.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (NUM_INTERVALS, NUM_TASKS) with values drawn
        uniformly from {0, 1, 2}.
    """
    label_shape = (NUM_INTERVALS, NUM_TASKS)
    # Purely random for now; the upper bound 3 is exclusive
    return np.random.randint(0, 3, size=label_shape)

def random_intervals():
    """Generate NUM_INTERVALS random fixed-length intervals.

    Returns
    -------
    pandas.DataFrame
        One row per interval with three columns: chromosome name
        ('chr<k>'), start position and end position, where
        end = start + INTERVAL_LENGTH.
    """
    # Starts are capped so that every interval ends inside the chromosome
    starts = np.random.randint(0, SEQ_LEN_CHR - INTERVAL_LENGTH, size=NUM_INTERVALS)
    ends = starts + INTERVAL_LENGTH
    chr_numbers = np.random.randint(0, NUM_CHRS, size=NUM_INTERVALS)
    chr_names = np.array(['chr{}'.format(number) for number in chr_numbers])
    # Stack the three arrays as rows, then transpose to one interval per row
    rows = ps.DataFrame([chr_names, starts, ends])
    return rows.T
    

# Generate one sequence array and one signal array per chromosome, keyed by
# chromosome name ('chr0', 'chr1', ...).
seq_arrs = {}
for chr_idx in range(NUM_CHRS):
    seq_arrs['chr{}'.format(chr_idx)] = random_fasta_seq()

bw_arrs = {}
for chr_idx in range(NUM_CHRS):
    bw_arrs['chr{}'.format(chr_idx)] = random_bw_data()

def dump_to_disk(chr_key, arr, base_dir):
    """Write one chromosome's array to disk as a bcolz carray.

    Parameters
    ----------
    chr_key : str
        Chromosome name; used as the carray's directory name.
    arr : array-like
        Data to store.
    base_dir : str
        Directory under which the carray directory is created.
    """
    carray_dir = os.path.join(base_dir, chr_key)
    # Opened with mode='w' and the shared Blosc compression parameters
    on_disk = bcolz.carray(arr, cparams=BLOSC_CPARAMS, rootdir=carray_dir, mode='w')
    on_disk.flush()

def write_metadata(base_dir):
    """Write ``metadata.json`` describing the bcolz arrays stored in ``base_dir``.

    Only the 'chr0' array is opened; its shape is recorded for every
    chromosome, so all arrays in ``base_dir`` are taken to share that shape.

    Parameters
    ----------
    base_dir : str
        Directory that holds the per-chromosome carrays and receives the
        metadata file.
    """
    first_chr_dir = os.path.join(base_dir, 'chr0')
    shape = bcolz.carray(rootdir=first_chr_dir, mode='r').shape

    file_shapes = {}
    for chr_idx in range(NUM_CHRS):
        file_shapes['chr{}'.format(chr_idx)] = shape

    payload = {'type': 'array_bcolz', 'file_shapes': file_shapes}
    metadata_path = os.path.join(base_dir, 'metadata.json')
    with open(metadata_path, 'w') as fp:
        json.dump(payload, fp)

# Write every chromosome array into each dataset directory, followed by that
# directory's metadata file. Sequence directories are processed first, then
# the signal directories; each dataset receives the same arrays.
for target_dirs, chr_arrays in ((FA_DIRS, seq_arrs), (BW_DIRS, bw_arrs)):
    for target_dir in target_dirs:
        for chr_key, arr in chr_arrays.items():
            dump_to_disk(chr_key, arr, target_dir)
        write_metadata(target_dir)
    
# Index dictionaries that are later serialised to INTERVALS_FILE / INPUTS_FILE
intervals_file_dict = {'task_names': TASK_NAMES}
inputs_file_dict = {}

dataset_dirs = zip(DATASET_NAMES, BW_DIRS, FA_DIRS)
for dataset_idx, (dataset_name, bw_dir, fa_dir) in enumerate(dataset_dirs):
    labels_file = os.path.join(DATA_DIR, 'labels{}.npy'.format(dataset_idx))
    intervals_file = os.path.join(DATA_DIR, 'intervals{}.bed'.format(dataset_idx))

    # Labels are drawn before intervals for each dataset
    labels = random_labels()
    np.save(labels_file, labels)

    # Tab-separated, no header and no index column
    intervals = random_intervals()
    intervals.to_csv(intervals_file, sep='\t', header=False, index=False)

    intervals_file_dict[dataset_name] = {
        'regions': intervals_file,
        'labels': labels_file,
    }
    inputs_file_dict[dataset_name] = {
        'dnase_data_dir': bw_dir,
        'genome_data_dir': fa_dir,
    }

# Serialise both index dictionaries: the intervals index first, then the inputs index
for json_path, payload in ((INTERVALS_FILE, intervals_file_dict),
                           (INPUTS_FILE, inputs_file_dict)):
    with open(json_path, 'w') as fp:
        json.dump(payload, fp)


### The data we want to read is now in test-data/

{"dataset-name-1": {"regions": "data2/intervals1.bed", "labels": "data2/labels1.npy"}, "task_names": ["task-name-0", "task-name-1", "task-name-2", "task-name-3", "task-name-4", "task-name-5"], "dataset-name-0": {"regions": "data2/intervals0.bed", "labels": "data2/labels0.npy"}, "dataset-name-2": {"regions": "data2/intervals2.bed", "labels": "data2/labels2.npy"}}

In [41]:
! head data2/intervals0.bed

chr4	37070	38070
chr3	64737	65737
chr2	70491	71491
chr2	32290	33290
chr1	59849	60849
chr0	98491	99491
chr4	58996	59996
chr4	29531	30531
chr0	28951	29951
chr4	49048	50048


In [19]:
# Scratch array of random 0/1 values. randint's upper bound is exclusive, so
# (0, 1) would only ever produce zeros; (0, 2) yields both values.
x = np.random.randint(0, 2, 100)

In [20]:
# Leftover interactive help lookup (IPython `?` syntax) for the array's tofile
# method; it only displays documentation and produces no data.
x.tofile?

In [42]:
# Print the current working directory via the shell.
# NOTE(review): the saved output exposes an absolute local path; consider
# clearing it before sharing the notebook.
!pwd

/Users/chris/dev/bcolz-reader-local-dev
