In [6]:
import pandas as pd
import h5py
import numpy as np

# One hot encode

DeepSTARR sequences sometimes contain the ambiguous base `N`; it is encoded as an all-zero vector.

In [7]:
# Map each nucleotide to its one-hot row; the column order is A, C, G, T.
nucleotide_dict = {'A': [1.0, 0, 0, 0],
                   'C': [0, 1.0, 0, 0],
                   'G': [0, 0, 1.0, 0],
                   'T': [0, 0, 0, 1.0],
                   'N': [0, 0, 0, 0]} # ambiguous base -> all-zero row


def one_hot_encode(seq):
    """
    One-hot encode a DNA sequence.

    Parameters
    ----------
    seq : str or iterable of single-character str
        Nucleotides drawn from A/C/G/T/N. Lower-case letters (e.g. soft-masked
        bases) are accepted and encoded like their upper-case counterparts.

    Returns
    -------
    list of list
        One 4-element row per base, in A, C, G, T column order; N maps to
        all zeros. An empty sequence gives an empty list.

    Raises
    ------
    KeyError
        If a base is not one of A/C/G/T/N (case-insensitive).
    """
    # Upper-case per base (not per sequence) so any iterable of characters still works.
    return [nucleotide_dict[base.upper()] for base in seq]

# DeepSTARR data
- All: `wget https://data.starklab.org/almeida/DeepSTARR/Tutorial/Sequences_activity_all.txt`
- Subset: `wget https://data.starklab.org/almeida/DeepSTARR/Tutorial/Sequences_activity_subset.txt`

## Load subset

In [21]:
# Read the subset table (pd.read_table defaults to tab-separated) and preview the first rows.
subset = pd.read_table('../data/DeepSTARR/Sequences_activity_subset.txt')
subset.head()

Unnamed: 0,seqnames,start,end,ID,set,Sequence,Dev_log2_enrichment,Hk_log2_enrichment
0,chr3R,21360001,21360249,chr3R_21360001_21360249_+_negative,Train,TGGGTCAGCTCGGCGTAGTCCGAAATCTATTCTTTCAATTATTAAT...,0.438053,-1.102117
1,chr3L,4121751,4121999,chr3L_4121751_4121999_-_positive_peaks,Train,TTGTCAAGATTTTATCTTCGCGCGCCAAATGCCAAAAATTAGCCAA...,5.796507,2.271401
2,chrX,17616495,17616743,chrX_17616495_17616743_+_peak_849bp_region,Train,GTTCTATTGCTCGACTGTGTGTGCGGCAATCTATAATATAAGATGT...,1.271845,0.089503
3,chr3R,23774097,23774345,chr3R_23774097_23774345_+_peak_849bp_region,Train,TACATGAAAAGATACTAATTTGTTTCAAATATAAATCATATATCTA...,-1.425885,-1.103772
4,chr3L,17300157,17300405,chr3L_17300157_17300405_-_peak_849bp_region,Train,GGTCCGCAAACAAACACACTCAATTACATGCAGTAAAATTTGTTTT...,-0.964305,-1.241142


In [22]:
# Count how many sequences fall into each of the Train/Val/Test splits of the subset.
subset['set'].value_counts()

set
Train    50000
Test     41186
Val      40570
Name: count, dtype: int64

## Load all

In [23]:
# Read the full table and preview the first rows.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of the
# session; later cells refer to this DataFrame by that name, so renaming it must be
# done across the whole notebook at once.
all = pd.read_table("../data/DeepSTARR/Sequences_activity_all.txt")
all.head()

Unnamed: 0,seqnames,start,end,ID,set,Sequence,Dev_log2_enrichment,Hk_log2_enrichment
0,chr2L,5587,5835,chr2L_5587_5835_+_positive_peaks,Train,ATTCAGATTGCCTCTCATTGTCTCACCCATATTATGGGAACCAAAT...,5.711541,1.362522
1,chr2L,5778,6026,chr2L_5778_6026_+_positive_peaks,Train,AAATGGCCGCTCAAGAAAAGGCTCGAATATATATTGCCTGCCTCTC...,5.153053,1.671419
2,chr2L,14226,14474,chr2L_14226_14474_+_positive_peaks,Train,ATAAGGATCAAAAAGTCCTGATTTCCGAAATGGCGGTTCTCCTTCA...,2.537589,0.290201
3,chr2L,18618,18866,chr2L_18618_18866_+_positive_peaks,Train,TTTCCATGACTGACTGGAATGGGTGGAGAACATCGCTTTGGGAGTG...,1.60888,4.097828
4,chr2L,34121,34369,chr2L_34121_34369_+_positive_peaks,Train,TCTATCGACCCATAGCCGTAGTCGCTAGACCCGCCCTTCGGAGCAT...,2.767123,0.393657


In [24]:
# Count how many sequences fall into each of the Train/Val/Test splits of the full table.
all['set'].value_counts()

set
Train    402296
Test      41186
Val       40570
Name: count, dtype: int64

## Parse data and write to h5 file

In [25]:
def parse_to_h5(data, h5_path, hierarchical = False):
    """
    One-hot encode every split of a DeepSTARR table and write it to an h5 file.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'set', 'Sequence', 'Dev_log2_enrichment'
        and 'Hk_log2_enrichment'.
    h5_path : str
        Output file; opened in 'w' mode, so an existing file is overwritten.
    hierarchical : bool, default False
        If True, write one group per split (named after the value in 'set')
        holding datasets 'X', 'y' and 'idx'. If False, write root-level
        datasets named 'X_<split>', 'y_<split>' and 'idx_<split>'.

    Notes
    -----
    Prints each split name as it is processed. For every split, 'X' is the
    one-hot encoded sequences, 'y' the two enrichment columns, and 'idx' the
    DataFrame index values of the rows belonging to that split.
    """
    target_cols = ['Dev_log2_enrichment', 'Hk_log2_enrichment']

    # The context manager closes the file even if encoding or writing raises;
    # a leaked handle would otherwise block re-opening the same path in 'w' mode.
    with h5py.File(h5_path, 'w') as hf:
        for split in data['set'].unique():
            print(split)

            # Filter once and reuse the rows for sequences, targets and indices.
            split_rows = data[data['set'] == split]

            # get one hot encoded seqs
            one_hot_seqs = np.array(split_rows['Sequence'].apply(one_hot_encode).to_list())

            # get target values
            targets = split_rows[target_cols].values

            # get indices of samples in this set
            set_idx = split_rows.index.values

            if hierarchical:
                grp = hf.create_group(split)
                grp.create_dataset('X', data = one_hot_seqs)
                grp.create_dataset('y', data = targets)
                grp.create_dataset('idx', data = set_idx)
            else:
                hf.create_dataset('X_' + split, data = one_hot_seqs)
                hf.create_dataset('y_' + split, data = targets)
                hf.create_dataset('idx_' + split, data = set_idx)


In [30]:
# Write the group-per-split (hierarchical) h5 files for the full table and for the subset.
parse_to_h5(all, "../data/DeepSTARR/Sequences_activity_all_hierarchical.h5", hierarchical=True)
parse_to_h5(subset, "../data/DeepSTARR/Sequences_activity_subset_hierarchical.h5", hierarchical=True)

Train
Val
Test


## Check saved data in h5 files

### Subset

In [40]:
# Open the hierarchical subset file read-only and list its top-level groups.
# The commented-out line opens a differently named file (no "_hierarchical" suffix) instead.
# hf = h5py.File('../data/DeepSTARR/Sequences_activity_subset.h5', 'r')
hf = h5py.File('../data/DeepSTARR/Sequences_activity_subset_hierarchical.h5', 'r')
hf.keys()

<KeysViewHDF5 ['Test', 'Train', 'Val']>

In [41]:
# List the datasets stored inside the Test group.
hf['Test'].keys()

<KeysViewHDF5 ['X', 'idx', 'y']>

In [16]:
# The hierarchical file stores the one-hot sequences as dataset 'X' inside each split
# group; there is no root-level 'one_hot_seqs_Train' key in this file.
np.array(hf['Train']['X']).astype(np.float32)

array([[[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.]],

       ...,

       [[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [0., 1.

In [17]:
# Shape of a single encoded training sequence. Indexing the h5py dataset reads only
# that sample instead of loading the whole array first.
hf['Train']['X'][0].astype(np.float32).shape

(249, 4)

In [10]:
# Release the read handle on the subset file.
hf.close()

# Add data for distillation
- mean of the ensemble predictions on `X_train`
- standard deviation of the ensemble predictions for the train, test, and validation sets

In [31]:
def load_DeepSTARR_data(file, get_idx=False):
    '''
    Load Train/Test/Val data from a flat-layout DeepSTARR h5 file.

    The file must contain root-level datasets named 'X_<split>' and
    'y_<split>' (plus 'idx_<split>' when get_idx is True) for the splits
    'Train', 'Test' and 'Val'.

    Parameters
    ----------
    file : str
        Path of the h5 file to read.
    get_idx : bool, default False
        Also return the per-split index arrays.

    Returns
    -------
    tuple of numpy.ndarray
        (X_train, y_train, X_test, y_test, X_val, y_val), followed by
        (idx_train, idx_test, idx_val) when get_idx is True.

    Raises
    ------
    KeyError
        If an expected dataset is missing from the file.
    '''
    splits = ('Train', 'Test', 'Val')

    # The context manager closes the file even when a dataset lookup fails.
    with h5py.File(file, 'r') as data:
        arrays = []
        # X/y pairs in Train, Test, Val order; np.array copies the data into
        # memory so it stays usable after the file is closed.
        for split in splits:
            arrays.append(np.array(data['X_' + split]))
            arrays.append(np.array(data['y_' + split]))

        # Index arrays are appended after all the X/y pairs.
        if get_idx:
            for split in splits:
                arrays.append(np.array(data['idx_' + split]))

    return tuple(arrays)

In [32]:
# Load every split (plus row indices) from the flat-layout file for the full data set.
# NOTE(review): load_DeepSTARR_data reads root-level 'X_Train'-style datasets, i.e. the
# layout parse_to_h5 writes with hierarchical=False. The parse_to_h5 calls shown in this
# notebook only write the "*_hierarchical.h5" files, so this file must already exist on
# disk — confirm, or generate it here.
X_train, y_train, X_test, y_test, X_val, y_val, idx_train, idx_test, idx_val = load_DeepSTARR_data("../data/DeepSTARR/Sequences_activity_all.h5", get_idx=True)

In [33]:
# Alternative input paths (outside evoaug/), currently disabled:
# # ensemble mean
# ensemble_mean = np.load("../results/DeepSTARR_lr-decay/distilled_y_train.npy")

# # stdev
# std_train = np.load("../data/DeepSTARR/ensemble_std_train.npy")
# std_test = np.load("../data/DeepSTARR/ensemble_std_test.npy")
# std_val = np.load("../data/DeepSTARR/ensemble_std_val.npy")

# ensemble mean for the training set, loaded from the evoaug/ directory
ensemble_mean = np.load("../data/DeepSTARR/evoaug/ensemble_mean_y_train.npy")

# ensemble standard deviation arrays, one per split (train/test/val)
std_train = np.load("../data/DeepSTARR/evoaug/ensemble_std_train.npy")
std_test = np.load("../data/DeepSTARR/evoaug/ensemble_std_test.npy")
std_val = np.load("../data/DeepSTARR/evoaug/ensemble_std_val.npy")

## Write to h5 file - hierarchical structure
- Train/Test/Val groups 
- respective X, y, idx, etc. datasets

In [34]:
# Output path for the hierarchical file that bundles X/y/idx with the ensemble metrics.
h5_path = "../data/DeepSTARR/evoaug/all_data_with_ensemble_metrics_hierarchical.h5"

In [35]:
# Datasets to store in each split group. Only Train carries the ensemble mean;
# every split carries its ensemble standard deviation.
split_arrays = {
    "Train": {"X": X_train, "y": y_train, "idx": idx_train,
              "ensemble_mean": ensemble_mean, "std": std_train},
    "Test": {"X": X_test, "y": y_test, "idx": idx_test, "std": std_test},
    "Val": {"X": X_val, "y": y_val, "idx": idx_val, "std": std_val},
}

# The context manager closes the file even if a write fails, so no manual
# hf.close() is needed before re-running this cell.
with h5py.File(h5_path, 'w') as hf:
    for split_name, arrays in split_arrays.items():
        grp = hf.create_group(split_name)
        for dset_name, values in arrays.items():
            grp.create_dataset(dset_name, data=values)



## Examine file

In [40]:
# Re-open the hierarchical file written above. h5_path is reused because the previous
# literal pointed outside evoaug/, at a file the visible cells never write.
f = h5py.File(h5_path, 'r')

In [41]:
# Top-level groups of the file: one per split.
list(f.keys())

['Test', 'Train', 'Val']

In [43]:
# Every split's one-hot array must have 4 channels on its last axis.
# The shape is read from the h5py dataset itself, so the arrays are not loaded into memory.
(f['Train']['X'].shape[-1] == 4) and (f['Test']['X'].shape[-1] == 4) and (f['Val']['X'].shape[-1] == 4)

True

In [38]:
# Load the training one-hot array into memory and show its shape.
X = np.array(f['Train']['X'])
X.shape

(402296, 249, 4)

In [80]:
# Test-set targets as stored in the file.
y = np.array(f['Test']['y'])

In [81]:
# Test-set ensemble standard deviations as stored in the file.
std = np.array(f['Test']['std'])

In [82]:
# Place the std columns next to the target columns (column-wise concatenation).
# `y` is rebuilt from the file rather than from its previous value, so re-running
# this cell does not keep appending more columns.
y = np.concatenate([np.array(f['Test']['y']), std], axis=1)

In [83]:
# The column count should be the target columns plus the std columns.
y.shape

(41186, 4)

In [84]:
# Inspect the combined array: target columns first, std columns last.
y

array([[ 3.41830648,  1.98308121,  0.48917714,  0.43232328],
       [ 2.21154471, -0.37998202,  0.37549123,  0.22830618],
       [ 3.48383142,  1.43486312,  0.62010232,  0.40132043],
       ...,
       [ 0.68102999, -2.15150483,  0.16026792,  0.39154715],
       [ 1.14443051, -1.87732987,  0.18699398,  0.32178818],
       [ 0.61593496, -1.20595262,  0.21096289,  0.20294738]])

In [75]:
# Inspect the std array on its own; it should match the trailing columns of `y`.
std

array([[0.48917714, 0.43232328],
       [0.37549123, 0.22830618],
       [0.62010232, 0.40132043],
       ...,
       [0.16026792, 0.39154715],
       [0.18699398, 0.32178818],
       [0.21096289, 0.20294738]])

In [39]:
# Release the read handle on the examined file.
f.close()

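## Write to h5 - flat structure
- all datasets are stored at the file root, with the split encoded in the dataset name (e.g. `X_Train`, `std_Val`)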

In [86]:
# NOTE(review): the ensemble arrays used below are loaded from the evoaug/ directory,
# while this output path is outside evoaug/ — confirm that is the intended destination.
h5_path = "../data/DeepSTARR/all_data_with_ensemble_metrics.h5"

# Root-level datasets, in write order. "ensemble_mean" belongs to the training
# split and keeps its original un-suffixed name.
flat_datasets = {
    # train data
    "X_Train": X_train,
    "y_Train": y_train,
    "idx_Train": idx_train,
    "ensemble_mean": ensemble_mean,
    "std_Train": std_train,
    # test data
    "X_Test": X_test,
    "y_Test": y_test,
    "idx_Test": idx_test,
    "std_Test": std_test,
    # val data
    "X_Val": X_val,
    "y_Val": y_val,
    "idx_Val": idx_val,
    "std_Val": std_val,
}

# The context manager closes the file even if a write fails, so no manual
# hf.close() is needed before re-running this cell.
with h5py.File(h5_path, 'w') as hf:
    for dset_name, values in flat_datasets.items():
        hf.create_dataset(dset_name, data=values)



In [88]:
# Scratch check: substring test on a file name.
# NOTE(review): presumably a prototype for detecting the file layout from its path —
# confirm, or remove this cell.
'hierarchical' in "../data/DeepSTARR/all_data_with_ensemble_metrics_hierarchical.h5"

True