In [1]:
import numpy as np
import sys
import umap
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

  from .autonotebook import tqdm as notebook_tqdm
2025-09-05 12:19:47.580051: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-05 12:19:48.493004: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def Run_UMAP(data, neighbors, latentDim, name):
    """Embed `data` with UMAP and write two coordinates per point to a text file.

    Args:
        data: Samples to embed; passed unchanged to ``fit_transform`` of a
            ``umap.UMAP`` reducer.
        neighbors: Value used for the reducer's ``n_neighbors`` argument.
        latentDim: Only used as a label inside the output file name.
        name: Free-form tag appended to the output file name.

    Side effects:
        Writes ``UMAP.<neighbors>.<latentDim>.<name>.txt`` in the current
        working directory, one line per embedded point, containing the first
        two coordinates of that point.

    Returns:
        None.
    """
    reducer = umap.UMAP(n_neighbors=neighbors)
    embedding = reducer.fit_transform(data)
    outputPath = "UMAP." + str(neighbors) + "." + str(latentDim) + "." + name + ".txt"
    # The context manager closes the file even when a write fails part-way,
    # which the previous open()/close() pair did not guarantee.
    with open(outputPath, "w+") as outputFile:
        for entry in embedding:
            # print() joins its arguments with single spaces, so the columns are
            # separated by " \t ". Kept as-is so existing output stays identical.
            print(entry[0], "\t", entry[1], file=outputFile)

In [3]:
class Sampling(layers.Layer):
    """Reparameterised sampling layer for a VAE.

    Takes a ``(z_mean, z_log_var)`` pair and returns
    ``z_mean + exp(0.5 * z_log_var) * epsilon``, where ``epsilon`` is drawn
    with ``random_normal`` using the same (batch, dim) shape as ``z_mean``.
    """
    def call(self, inputs):
        mean, log_var = inputs
        # Read the dynamic shape so the layer works for any batch size.
        mean_shape = tf.shape(mean)
        n_rows = mean_shape[0]
        n_cols = mean_shape[1]
        noise = tf.keras.backend.random_normal(shape=(n_rows, n_cols))
        # exp(0.5 * log_var) is the standard deviation.
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise

In [27]:
class VAE(keras.Model):
    """Variational autoencoder trained with a custom ``train_step``.

    The loss is ``reconstruction_loss + beta * kl_loss``:

    * reconstruction: categorical cross-entropy between the input and the
      decoder output, summed over axis 1 and averaged over the batch;
    * KL: ``-0.5 * (1 + z_log_var - z_mean**2 - exp(z_log_var))`` summed over
      axis 1 and averaged over the batch.

    Args:
        encoder: Model called on the batch; must return
            ``(z_mean, z_log_var, z)``.
        decoder: Model called on ``z``; its output is compared against the
            input batch.
        beta: Weight applied to the KL term. Defaults to 0.001, the value that
            was previously hard-coded in ``train_step``.
        **kwargs: Forwarded to ``keras.Model``.
    """
    def __init__(self, encoder, decoder, beta=0.001, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.beta = beta
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Running-mean trackers for the total, reconstruction and KL losses."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step on `data` and return the running losses.

        `data` serves as both the encoder input and the reconstruction target.
        """
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # Cross-entropy per position, summed along axis 1, then batch mean.
            cce = tf.keras.losses.categorical_crossentropy(data, reconstruction)
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(cce, axis=1))

            # Per-dimension KL term, summed along axis 1, then batch mean.
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + self.beta * kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [19]:
def One_Hot_Encode(seq):
    """Turn a nucleotide string into a list of 4-element rows (A, C, G, T order).

    The sequence is upper-cased first. A, C, G and T each give a row with a
    single 1 in their column; N gives a uniform row of 0.25. Any other
    character raises KeyError.
    """
    column_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    rows = []
    for symbol in seq.upper():
        if symbol == "N":
            rows.append([0.25, 0.25, 0.25, 0.25])
            continue
        row = [0, 0, 0, 0]
        row[column_of[symbol]] = 1
        rows.append(row)
    return rows

In [20]:
def Process_Sequences(fasta, seqLength):
    """Parse FASTA-formatted lines into record names and a one-hot array.

    Args:
        fasta: Iterable of text lines, e.g. an open file handle. The first
            line is taken as the first header; after that, any line containing
            ">" starts a new record and every other line is sequence data.
        seqLength: Length every sequence must have.

    Returns:
        ``(elementList, inputData)`` where ``elementList`` holds one header per
        record (leading ">" and trailing whitespace removed) and ``inputData``
        is a numpy array of shape ``(n_records, seqLength, 4)`` built from
        ``One_Hot_Encode``. Empty input yields ``([], <array of shape
        (0, seqLength, 4)>)``.

    Raises:
        ValueError: If a record's sequence length differs from ``seqLength``.
    """
    lines = iter(fasta)
    firstLine = next(lines, None)
    if firstLine is None:
        # Empty input: no records, but keep the same array rank for callers.
        return [], np.zeros((0, seqLength, 4))

    elementList = [firstLine.lstrip(">").rstrip()]
    encodedRecords = []
    seqChunks = []

    def _encode_pending():
        # Encode the sequence gathered for the most recent header and reset.
        encoded = One_Hot_Encode("".join(seqChunks))
        if len(encoded) != seqLength:
            raise ValueError(
                "Sequence for record %r has length %d, expected %d"
                % (elementList[-1], len(encoded), seqLength)
            )
        encodedRecords.append(encoded)
        seqChunks.clear()

    for line in lines:
        if ">" not in line:
            seqChunks.append(line.rstrip())
        else:
            _encode_pending()
            # Strip the trailing newline so every header is cleaned the same
            # way as the first one.
            elementList.append(line.lstrip(">").rstrip())
    _encode_pending()

    inputData = np.array(encodedRecords)
    inputData = inputData.reshape(inputData.shape[0], seqLength, 4)
    return elementList, inputData

In [21]:
# Run configuration. To drive this from the command line instead, use
# int(sys.argv[1]) for latentDim and sys.argv[2] for the FASTA path.
latentDim = 10
seqLength = 300
fastaPath = "/home/moorej3/Lab/ENCODE/Encyclopedia/V7/Registry/V7-hg38/CORVUS/Input-Sequences/K562.PLS-ELS.fa"

# The context manager closes the file even if parsing raises.
# NOTE: `input` shadows the Python builtin; the name is kept because the
# training and prediction cells refer to it.
with open(fastaPath) as fasta:
    ccres, input = Process_Sequences(fasta, seqLength)

In [28]:
# Encoder: Conv1D -> MaxPooling1D -> Flatten -> Dense, followed by two Dense
# heads (z_mean, z_log_var) and the Sampling layer that draws z from them.
# latentDim and seqLength come from the configuration cell; latentDim is not
# re-assigned here so there is a single place to change it.
encoder_inputs = keras.Input(shape=(seqLength, 4))
x = layers.Conv1D(filters=16, kernel_size=10, strides=1, activation="relu", padding="same")(encoder_inputs)
x = layers.MaxPooling1D(pool_size=2)(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latentDim, name="z_mean")(x)
z_log_var = layers.Dense(latentDim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
encoder.summary()

None


In [29]:
# Decoder: Dense -> Dense -> Reshape -> UpSampling1D -> Conv1DTranspose, ending
# in a 4-channel softmax per position.
# The encoder halves the sequence length with MaxPooling1D(pool_size=2) and
# UpSampling1D(size=2) doubles it back, so the intermediate length is derived
# from seqLength instead of being hard-coded (150 and 2400 for seqLength=300).
if seqLength % 2 != 0:
    raise ValueError("seqLength must be even so that upsampling by 2 restores it")
pooledLength = seqLength // 2

latent_inputs = keras.Input(shape=(latentDim,))
x = layers.Dense(16, activation="relu")(latent_inputs)
x = layers.Dense(pooledLength * 16, activation="relu")(x)
x = layers.Reshape((pooledLength, 16))(x)
x = layers.UpSampling1D(size=2)(x)
x = layers.Conv1DTranspose(filters=16, kernel_size=10, activation="relu", strides=1, padding="same")(x)
decoder_outputs = layers.Conv1DTranspose(4, 3, activation="softmax", padding="same")(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
decoder.summary()

None


In [30]:
# Combine the encoder and decoder into the trainable VAE and attach an Adam
# optimizer with its default settings.
vae = VAE(encoder=encoder, decoder=decoder)
vae.compile(
    optimizer=keras.optimizers.Adam(),
)

In [31]:
# Keep the returned History object so the per-epoch losses can be inspected
# later; assigning it also stops the cell from echoing the object's repr.
history = vae.fit(input, epochs=25)


Epoch 1/25
[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 29ms/step - kl_loss: 54.0184 - loss: 403.7706 - reconstruction_loss: 403.7168 
Epoch 2/25
[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 29ms/step - kl_loss: 57.4909 - loss: 397.9042 - reconstruction_loss: 397.8468 
Epoch 3/25
[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 29ms/step - kl_loss: 50.7111 - loss: 397.6177 - reconstruction_loss: 397.5670 
Epoch 4/25
[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 29ms/step - kl_loss: 48.1009 - loss: 397.3908 - reconstruction_loss: 397.3427 
Epoch 5/25
[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 29ms/step - kl_loss: 46.5429 - loss: 397.3634 - reconstruction_loss: 397.3165 
Epoch 6/25
[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 29ms/step - kl_loss: 44.8973 - loss: 397.3999 - reconstruction_loss: 397.3553 
Epoch 7/25
[1m3226/3226[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x74eb0a8f3680>

In [32]:
# The encoder returns [z_mean, z_log_var, z]; only the means are embedded.
# Index the result instead of unpacking into `_`, which would overwrite
# IPython's last-output variable for the rest of the session.
z_mean = vae.encoder.predict(input)[0]
Run_UMAP(z_mean, 100, latentDim, "K562-16CNN-16Dense-New3")

[1m3226/3226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step


