Use EvoAug to generate OOD mutated sequences given a reference sequence (DeepSTARR).

Mutations:
- random shuffle: apply `tf.random.shuffle(seqs)` to seqs
- mutagenesis: apply `RandomMutation(mutate_frac=0.25)` to seqs
- evoaug: apply list of augmentations with `max_augs_per_seq=2` to seqs

For each reference sequence from the DeepSTARR test set, we will generate 5 OOD sequences.

OOD sequences will be saved as (5,N,L,4) numpy arrays in an h5 file, where N is the number of DeepSTARR test seqs and L is 249 (DeepSTARR input sequence length). 

In [2]:
import tensorflow as tf
# import tensorflow.keras as keras
import evoaug_tf
from evoaug_tf import evoaug, augment 
import keras
from keras.models import load_model
import numpy as np
import sys
import yaml
import h5py
sys.path.append('../code')
from utils import load_DeepSTARR_data
from model_zoo import DeepSTARR
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

2024-08-18 17:19:51.484976: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-18 17:19:52.213997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/lib
2024-08-18 17:19:52.214062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/lib


In [3]:
# set seeds
tf.random.set_seed(1)
np.random.seed(1)

# Functions

In [4]:
@tf.function
def apply_augment(x, augment_list, hard_aug=True, max_augs_per_seq=2):

    if len(x.shape)==2:
        x = tf.reshape(x, (1, x.shape[0], x.shape[1]))
    """Apply augmentations to each sequence in batch, x."""
    # number of augmentations per sequence
    if hard_aug:
        batch_num_aug = tf.constant(max_augs_per_seq, dtype=tf.int32)
    else:
        batch_num_aug = tf.random.uniform(shape=[], minval=1, maxval=max_augs_per_seq+1, dtype=tf.int32)

    max_num_aug = len(augment_list)
    insert_max = _augment_max_len(augment_list)

    # randomly choose which subset of augmentations from augment_list
    aug_indices = tf.sort(tf.random.shuffle(tf.range(max_num_aug))[:batch_num_aug])
    # apply augmentation combination to sequences
    insert_status = True
    ind = 0
    for augment in augment_list:
        augment_condition = tf.reduce_any(tf.equal(tf.constant(ind), aug_indices))
        x = tf.cond(augment_condition, lambda: augment(x), lambda: x)
        if augment_condition and hasattr(augment, 'insert_max'):
            insert_status = False
        ind += 1
    if insert_status:
        if insert_max:
            x = _pad_end(x, insert_max)
    return x

@tf.function
def _pad_end(x, insert_max):
        """Add random DNA padding of length insert_max to the end of each sequence in batch."""

        N = tf.shape(x)[0]
        L = tf.shape(x)[1]
        A = tf.cast(tf.shape(x)[2], dtype = tf.float32)
        p = tf.ones((A,)) / A
        padding = tf.transpose(tf.gather(tf.eye(A), tf.random.categorical(tf.math.log([p] * insert_max), N)), perm=[1,0,2])

        half = int(insert_max/2)
        x_padded = tf.concat([padding[:,:half,:], x, padding[:,half:,:]], axis=1)
        return x_padded
    
def _augment_max_len(augment_list):
    """
    Determine whether insertions are applied to determine the insert_max,
    which will be applied to pad other sequences with random DNA.
    Parameters
    ----------
    augment_list : list
        List of augmentations.
    Returns
    -------
    int
        Value for insert max.
    """
    insert_max = 0
    for augment in augment_list:
        if hasattr(augment, 'insert_max'):
            insert_max = augment.insert_max
    return insert_max

def shuffle_channels(tensor):
    # Get the shape of the input tensor
    N, L, C = tf.shape(tensor)[0], tf.shape(tensor)[1], tf.shape(tensor)[2]
    
    # Create indices for shuffling
    indices = tf.range(C)
    
    # Tile and reshape indices to match the shape of the input tensor
    tiled_indices = tf.tile(tf.expand_dims(indices, 0), [N * L, 1])
    
    # Shuffle the tiled indices
    shuffled_indices = tf.random.shuffle(tiled_indices)
    
    # Reshape the shuffled indices to match the input tensor shape
    shuffled_indices = tf.reshape(shuffled_indices, [N, L, C])
    
    # Use tf.gather with the shuffled indices to reorder the channel axis
    shuffled_tensor = tf.gather(tensor, shuffled_indices, batch_dims=2)
    
    return shuffled_tensor


# Load data

In [5]:
data = '../data/DeepSTARR_ensemble_NEW/all_data_with_ensemble_metrics_hierarchical.h5'
X_train, y_train, X_test, y_test, X_val, y_val = load_DeepSTARR_data(data, std=True)

loading data from h5 file with hierarchical structure


In [7]:
# cast as tensor 
X_test_tf = tf.cast(X_test, tf.float32)

2024-08-18 17:29:18.541939: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-18 17:29:19.094422: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 141 MB memory:  -> device: 0, name: NVIDIA RTX A4000, pci bus id: 0000:41:00.0, compute capability: 8.6
2024-08-18 17:29:29.239049: W tensorflow/tsl/framework/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 312.97MiB (rounded to 328170240)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
20

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

# Apply random shuffle

In [6]:
X_test_random = np.stack([np.random.permutation(X_test, axis=1) for _ in range(5)])

TypeError: RandomState.permutation() takes no keyword arguments

In [None]:
X_test_random.shape

(5, 41186, 249, 4)

# Apply mutagenesis

In [8]:
mutagenesis = augment.RandomMutation(mutate_frac=0.25)

In [9]:
X_test_mutagenesis = np.stack([mutagenesis(X_test_tf).numpy() for _ in range(5)])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [10]:
X_test_mutagenesis.shape

(5, 41186, 249, 4)

# Apply EvoAug

Arguments:
- `hard_aug=True`
- `max_augs_per_seq=2`

In [11]:
augment_list = [augment.RandomDeletion(delete_min=0, delete_max=20),
                augment.RandomTranslocationBatch(shift_min=0, shift_max=20),
                # augment.RandomNoise(noise_mean=0, noise_std=0.2),
                augment.RandomMutation(mutate_frac=0.05)]

# X_test_evoaug = np.stack([tf.map_fn(lambda x: apply_augment(x, augment_list), X_test_tf).numpy()[:,0,:,:] for _ in range(5)])

In [12]:
configfile = '../results/DeepSTARR_ensemble_NEW/config.yaml'
config  = yaml.safe_load(open(configfile, 'r'))
model = evoaug.RobustModel(DeepSTARR, config=config, input_shape=X_test[0].shape, augment_list=augment_list, max_augs_per_seq=2, hard_aug=True)
model.compile(keras.optimizers.Adam(learning_rate=0.001, decay=1e-6), loss='mse') 

# Create a dataset
dataset = tf.data.Dataset.from_tensor_slices(X_test_tf)

# Batch the dataset
batch_size = 1024
batched_dataset = dataset.batch(batch_size)



In [13]:
# Iterate over batches
X_test_evoaug_list = []
for i in range(5):
    x_mut_list = []
    for batch in batched_dataset:
        x_mut = model._apply_augment(batch)
        x_mut_list.append(x_mut)
    X_test_evoaug_list.append(tf.concat(x_mut_list, axis=0))

X_test_evoaug = tf.stack(X_test_evoaug_list).numpy()

In [14]:
X_test_evoaug.shape

(5, 41186, 249, 4)

# Save data to h5

In [18]:
hf = h5py.File('../data/DeepSTARR/ood_seqs.h5', 'w')

hf.create_dataset('random', data=X_test_random)
hf.create_dataset('mutagenesis', data=X_test_mutagenesis)
hf.create_dataset('evoaug', data=X_test_evoaug)

hf.close()
