In [1]:
import pandas as pd
import h5py
import numpy as np

# One hot encode

DeepSTARR sequences sometimes contain ambiguous bases (`N`), so the encoding below maps `N` to an all-zero vector.

In [2]:
# Map each nucleotide to its one-hot row (column order: A, C, G, T).
# 'N' maps to an all-zero row because Ns are sometimes present in the sequences.
nucleotide_dict = {'A': [1.0, 0, 0, 0],
                   'C': [0, 1.0, 0, 0],
                   'G': [0, 0, 1.0, 0],
                   'T': [0, 0, 0, 1.0],
                   'N': [0, 0, 0, 0]}


def one_hot_encode(seq):
    """One-hot encode a DNA sequence.

    Parameters
    ----------
    seq : str or iterable of str
        Nucleotide sequence. When a ``str`` is given it is upper-cased first,
        so lower-case (e.g. soft-masked) bases are encoded like their
        upper-case counterparts. Other iterables are looked up as-is.

    Returns
    -------
    list of list
        One 4-element row per base, in A, C, G, T column order. The rows are
        the same list objects stored in ``nucleotide_dict``; do not mutate
        them in place.

    Raises
    ------
    KeyError
        If a base is not one of A, C, G, T or N.
    """
    bases = seq.upper() if isinstance(seq, str) else seq
    return [nucleotide_dict[base] for base in bases]

# DeepSTARR data
- All sequences: `wget https://data.starklab.org/almeida/DeepSTARR/Tutorial/Sequences_activity_all.txt`
- Subset: `wget https://data.starklab.org/almeida/DeepSTARR/Tutorial/Sequences_activity_subset.txt`

## Load subset

In [3]:
# Read the DeepSTARR subset table into a DataFrame.
subset = pd.read_table("../data/DeepSTARR/Sequences_activity_subset.txt")
# Preview the first rows to check the columns that were parsed.
subset.head()

Unnamed: 0,seqnames,start,end,ID,set,Sequence,Dev_log2_enrichment,Hk_log2_enrichment
0,chr3R,21360001,21360249,chr3R_21360001_21360249_+_negative,Train,TGGGTCAGCTCGGCGTAGTCCGAAATCTATTCTTTCAATTATTAAT...,0.438053,-1.102117
1,chr3L,4121751,4121999,chr3L_4121751_4121999_-_positive_peaks,Train,TTGTCAAGATTTTATCTTCGCGCGCCAAATGCCAAAAATTAGCCAA...,5.796507,2.271401
2,chrX,17616495,17616743,chrX_17616495_17616743_+_peak_849bp_region,Train,GTTCTATTGCTCGACTGTGTGTGCGGCAATCTATAATATAAGATGT...,1.271845,0.089503
3,chr3R,23774097,23774345,chr3R_23774097_23774345_+_peak_849bp_region,Train,TACATGAAAAGATACTAATTTGTTTCAAATATAAATCATATATCTA...,-1.425885,-1.103772
4,chr3L,17300157,17300405,chr3L_17300157_17300405_-_peak_849bp_region,Train,GGTCCGCAAACAAACACACTCAATTACATGCAGTAAAATTTGTTTT...,-0.964305,-1.241142


In [4]:
# Count how many sequences belong to each split (the values of the 'set' column).
subset['set'].value_counts()

set
Train    50000
Test     41186
Val      40570
Name: count, dtype: int64

## Load all

In [5]:
# Read the full DeepSTARR table into a DataFrame.
# NOTE(review): the name `all` shadows Python's builtin `all()` from here on;
# it is left unchanged because later cells refer to the table by this name.
all = pd.read_table("../data/DeepSTARR/Sequences_activity_all.txt")
# Preview the first rows to check the columns that were parsed.
all.head()

Unnamed: 0,seqnames,start,end,ID,set,Sequence,Dev_log2_enrichment,Hk_log2_enrichment
0,chr2L,5587,5835,chr2L_5587_5835_+_positive_peaks,Train,ATTCAGATTGCCTCTCATTGTCTCACCCATATTATGGGAACCAAAT...,5.711541,1.362522
1,chr2L,5778,6026,chr2L_5778_6026_+_positive_peaks,Train,AAATGGCCGCTCAAGAAAAGGCTCGAATATATATTGCCTGCCTCTC...,5.153053,1.671419
2,chr2L,14226,14474,chr2L_14226_14474_+_positive_peaks,Train,ATAAGGATCAAAAAGTCCTGATTTCCGAAATGGCGGTTCTCCTTCA...,2.537589,0.290201
3,chr2L,18618,18866,chr2L_18618_18866_+_positive_peaks,Train,TTTCCATGACTGACTGGAATGGGTGGAGAACATCGCTTTGGGAGTG...,1.60888,4.097828
4,chr2L,34121,34369,chr2L_34121_34369_+_positive_peaks,Train,TCTATCGACCCATAGCCGTAGTCGCTAGACCCGCCCTTCGGAGCAT...,2.767123,0.393657


In [6]:
# Count how many sequences belong to each split (the values of the 'set' column).
all['set'].value_counts()

set
Train    402296
Test      41186
Val       40570
Name: count, dtype: int64

## Parse data and write to h5 file

In [7]:
def parse_to_h5(data, h5_path):
    """Write one-hot encoded sequences, targets and row indices to an HDF5 file.

    For every distinct value in ``data['set']``, three datasets are created
    in the file at ``h5_path``:

    * ``X_<set>``   -- one-hot encoded ``Sequence`` column
    * ``y_<set>``   -- the ``Dev_log2_enrichment`` and ``Hk_log2_enrichment`` columns
    * ``idx_<set>`` -- index labels of the rows of ``data`` that belong to the set

    The name of each set is printed as it is processed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``set``, ``Sequence``, ``Dev_log2_enrichment``
        and ``Hk_log2_enrichment``.
    h5_path : str or path-like
        Destination file. It is opened in ``'w'`` mode, so an existing file at
        this path is overwritten.
    """
    target_cols = ['Dev_log2_enrichment', 'Hk_log2_enrichment']

    # The context manager closes the file even if encoding or writing fails.
    with h5py.File(h5_path, 'w') as hf:
        for split in data['set'].unique():
            print(split)

            # Select this split's rows once and reuse the selection below.
            split_rows = data[data['set'] == split]

            # get one hot encoded seqs
            one_hot_seqs = np.array([one_hot_encode(seq) for seq in split_rows['Sequence']])

            # get target values
            targets = split_rows[target_cols].values

            # get indices of samples in this set
            set_idx = split_rows.index.values

            hf.create_dataset('X_' + split, data=one_hot_seqs)
            hf.create_dataset('y_' + split, data=targets)
            hf.create_dataset('idx_' + split, data=set_idx)


In [8]:
# Write one HDF5 file per table: the full dataset first, then the subset.
# Each call prints the name of every split it processes.
parse_to_h5(all, "../data/DeepSTARR/Sequences_activity_all.h5")
parse_to_h5(subset, "../data/DeepSTARR/Sequences_activity_subset.h5")

Train
Val
Test
Train
Val
Test


## Check saved data in h5 files

### Subset

In [9]:
# Reopen the subset file read-only; the handle stays open for the cells below
# and is closed by the final `hf.close()` cell.
hf = h5py.File('../data/DeepSTARR/Sequences_activity_subset.h5', 'r')
# List the dataset names stored in the file.
hf.keys()

<KeysViewHDF5 ['X_Test', 'X_Train', 'X_Val', 'idx_Test', 'idx_Train', 'idx_Val', 'y_Test', 'y_Train', 'y_Val']>

In [16]:
# The encoded training sequences are stored under 'X_Train' (parse_to_h5 writes
# 'X_' + set name); the file contains no 'one_hot_seqs_Train' dataset.
np.array(hf['X_Train']).astype(np.float32)

array([[[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.]],

       ...,

       [[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [0., 1.

In [17]:
# Shape of a single encoded training sequence. Indexing the dataset reads only
# the first entry instead of loading the whole array; the data lives under
# 'X_Train' (the file contains no 'one_hot_seqs_Train' dataset).
hf['X_Train'][0].astype(np.float32).shape

(249, 4)

In [10]:
# Release the HDF5 file handle opened for inspection above.
hf.close()