In [1]:
import numpy as np
import sys
import umap
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
def Run_UMAP(data, neighbors, latentDim, name):
    reducer = umap.UMAP(n_neighbors = neighbors)
    x = reducer.fit_transform(data)
    outputFile = open("UMAP."+str(neighbors)+"."+str(latentDim)+"."+name+".txt", "w+")
    for entry in x:
        print(entry[0], "\t", entry[1], file=outputFile)
    outputFile.close()

In [3]:
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [4]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]
    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.binary_crossentropy(data, reconstruction)
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [5]:
def One_Hot_Encode(seq):
    baseDict = {"A":0, "C":1, "G":2, "T":3}
    oneHotArray = []
    seq = seq.upper()
    seqArray = np.array(list(seq))
    for base in seqArray:
        code = [0, 0, 0, 0]
        if base != "N":
            code[baseDict[base]] = 1
        else:
            code = [0.25, 0.25, 0.25, 0.25]
        oneHotArray.append(code)
    return oneHotArray

In [6]:
def Process_Sequences(fasta, seqLength):
    elementList = [next(fasta).lstrip(">").rstrip()]
    inputData = []
    seq=''
    for line in fasta:
        if ">" not in line:
            seq += line.rstrip()
        else:
            inputData.append(One_Hot_Encode(seq))
            elementList.append(line.lstrip(">"))
            seq=''
    inputData.append(One_Hot_Encode(seq))
    inputData = np.array(inputData)
    inputData = inputData.reshape(inputData.shape[0], seqLength, 4)
    return elementList, inputData

In [42]:
latentDim = 10 #int(sys.argv[1])
seqLength = 300
fasta=open("/home/moorej3/Lab/ENCODE/Encyclopedia/V7/Registry/V7-hg38/CORVUS/Input-Sequences/HCT116.PLS-ELS.fa")
#fasta=open(sys.argv[2])
ccres, input = Process_Sequences(fasta, seqLength)
fasta.close()

In [43]:
latentDim = 10 #int(sys.argv[1])

encoder_inputs = keras.Input(shape=(seqLength, 4))
x = layers.Conv1D(filters=16, kernel_size=10, strides=1, activation="relu", padding="same")(encoder_inputs)
x = layers.MaxPooling1D(pool_size=2)(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latentDim, name="z_mean")(x)
z_log_var = layers.Dense(latentDim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
print(encoder.summary())

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           [(None, 300, 4)]     0                                            
__________________________________________________________________________________________________
conv1d_15 (Conv1D)              (None, 300, 16)      656         input_22[0][0]                   
__________________________________________________________________________________________________
max_pooling1d_15 (MaxPooling1D) (None, 150, 16)      0           conv1d_15[0][0]                  
__________________________________________________________________________________________________
flatten_9 (Flatten)             (None, 2400)         0           max_pooling1d_15[0][0]           
____________________________________________________________________________________________

In [44]:
latent_inputs = keras.Input(shape=(latentDim,))
x = layers.Dense(16, activation="relu")(latent_inputs)
x = layers.Dense(2400, activation="relu")(latent_inputs)
x = layers.Reshape((150, 16))(x)
x = layers.UpSampling1D(size=2)(x)
x = layers.Conv1DTranspose(filters=16, kernel_size=10, activation="relu", strides=1, padding="same")(x)
decoder_outputs = layers.Conv1DTranspose(4, 3, activation="sigmoid", padding="same")(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
print(decoder.summary())

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        [(None, 10)]              0         
_________________________________________________________________
dense_21 (Dense)             (None, 2400)              26400     
_________________________________________________________________
reshape_11 (Reshape)         (None, 150, 16)           0         
_________________________________________________________________
up_sampling1d_13 (UpSampling (None, 300, 16)           0         
_________________________________________________________________
conv1d_transpose_24 (Conv1DT (None, 300, 16)           2576      
_________________________________________________________________
conv1d_transpose_25 (Conv1DT (None, 300, 4)            196       
Total params: 29,172
Trainable params: 29,172
Non-trainable params: 0
_______________________________________________________

In [45]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())

In [46]:
vae.fit(input, epochs=25)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fa413192d60>

In [47]:
z_mean, _, _ = vae.encoder.predict(input)
Run_UMAP(z_mean, 100, latentDim, "HCT116-16CNN-16Dense")