<a href="https://colab.research.google.com/github/tiffany1016/NYCU_iGEM2023/blob/web_crawler/2023_5_19_GAN_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Fri May 19 14:15:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    30W /  70W |    679MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape, Conv1D, Flatten
import numpy as np


In [None]:
# Define the generator network
def make_generator_model(sequences):
    max_seq_length = sequences.shape[1]
    num_unique_chars = sequences.shape[2]

    model = tf.keras.Sequential()

    # Input layer
    model.add(layers.Dense(256, input_dim=NOISE_DIM))
    model.add(layers.LeakyReLU(alpha=0.2))
    model.add(layers.BatchNormalization())

    # Hidden layers
    model.add(layers.Dense(512))
    model.add(layers.LeakyReLU(alpha=0.2))
    model.add(layers.BatchNormalization())

    model.add(layers.Dense(1024))
    model.add(layers.LeakyReLU(alpha=0.2))
    model.add(layers.BatchNormalization())

    # Output layer
    model.add(layers.Dense(max_seq_length * num_unique_chars, activation='sigmoid'))
    model.add(layers.Reshape((max_seq_length, num_unique_chars)))

    return model

# Define the discriminator network
def make_discriminator_model(sequences):
    max_seq_length = sequences.shape[1]
    num_unique_chars = sequences.shape[2]

    model = tf.keras.Sequential()

    # Input layer
    model.add(layers.Flatten(input_shape=(max_seq_length, num_unique_chars)))

    # Hidden layers
    model.add(layers.Dense(512))
    model.add(layers.LeakyReLU(alpha=0.2))

    model.add(layers.Dense(256))
    model.add(layers.LeakyReLU(alpha=0.2))

    # Output layer
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

In [None]:
# Define the loss functions for the generator and discriminator
def generator_loss(fake_output):
    # Compute generator loss
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(tf.ones_like(fake_output), fake_output)
    return loss

def discriminator_loss(real_output, fake_output):
    # Compute discriminator loss
    real_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(tf.ones_like(real_output), real_output)
    fake_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(tf.zeros_like(fake_output), fake_output)
    loss = real_loss + fake_loss
    return loss

In [None]:
# Define the optimizer for the generator and discriminator
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

In [None]:
# Define the training loop
def train_step(real_sequences):
    # Generate random noise as input to the generator
    noise = tf.random.normal([real_sequences.shape[0], NOISE_DIM])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        # Generate a sequence from the generator using the random noise
        generated_sequences = generator(noise, training=True)

        # Evaluate the discriminator on real and generated sequences
        real_output = discriminator(real_sequences, training=True)
        fake_output = discriminator(generated_sequences, training=True)

        # Compute the generator and discriminator losses
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    # Compute the gradients and update the generator and discriminator weights
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))



In [None]:
# Define the main training function
def train(dataset, epochs):
    for epoch in range(epochs):
        for batch in dataset:
            real_sequences = batch
            train_step(real_sequences)

        print('Epoch {} complete'.format(epoch + 1))


In [None]:
# Load the dataset
dataset = np.loadtxt("/content/drive/MyDrive/Colab Notebooks/iGEM2023_model/2023_5_16_GAN_1_dataset.txt", dtype=np.str_)  # Update "your_dataset.txt" with your actual dataset file name

# Convert the dataset to one-hot encoding
def one_hot_encode(sequences):
    unique_characters = list(set("".join(sequences)))
    print(unique_characters)
    char_to_idx = {char: idx for idx, char in enumerate(unique_characters)}

    num_sequences = len(sequences)
    max_seq_length = max(len(seq) for seq in sequences)
    num_unique_chars = len(unique_characters)

    encoded_sequences = np.zeros((num_sequences, max_seq_length, num_unique_chars), dtype=np.float32)

    for i, seq in enumerate(sequences):
        for j, char in enumerate(seq):
            encoded_sequences[i, j, char_to_idx[char]] = 1.0

    return encoded_sequences


# One-hot encode the dataset
dataset = one_hot_encode(dataset)

['T', 'A', 'C', 'G']


In [None]:

# num_sequences, max_seq_length, num_unique_chars
dataset.shape

(16, 108, 4)

In [None]:
for data in dataset:
  print(data)

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1.

In [None]:
# Specify the number of training epochs
num_epochs = 10  # Update with the desired number of epochs

# Set other hyperparameters
BATCH_SIZE = 64  # Update with your desired batch size
NOISE_DIM = 100  # Update with the dimensionality of your noise input

# Create instances of the generator and discriminator
generator = make_generator_model(dataset)
discriminator = make_discriminator_model(dataset)
dataset = tf.data.Dataset.from_tensor_slices(dataset).batch(BATCH_SIZE)

In [None]:
# Run the training
train(dataset, num_epochs)

  output, from_logits = _get_logits(


Epoch 1 complete
Epoch 2 complete
Epoch 3 complete
Epoch 4 complete
Epoch 5 complete
Epoch 6 complete
Epoch 7 complete
Epoch 8 complete
Epoch 9 complete
Epoch 10 complete


In [68]:
# decode_sequences
def decode_sequences(encoded_sequences, encoding_mapping):
    decoded_sequences = []
    for encoded_sequence in encoded_sequences:
        decoded_sequence = ''
        for encoded_character in encoded_sequence:
            for character, binary_vector in encoding_mapping.items():
                if np.array_equal(encoded_character, binary_vector):
                    decoded_sequence += character
                    # print("ckp")
                    # print(character)
                    break
        decoded_sequences.append(decoded_sequence)
    return decoded_sequences

# Example usage
encoded_sequences = [[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
encoding_mapping = {'A': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [1, 0, 0, 0], 'T': [0, 0, 0, 1]}

decoded_sequences = decode_sequences(encoded_sequences, encoding_mapping)
print(decoded_sequences)

['', '', '']


In [69]:
# Generate new DNA sequences using the trained generator
def generate_proteins(num_samples):
    # Generate random noise as input to the generator
    noise = tf.random.normal([num_samples, NOISE_DIM])

    # Generate sequences using the generator
    generated_sequences = generator(noise, training=False)
    print(generated_sequences)

    modify_seq = np.array(generated_sequences.numpy())

    for seq in modify_seq:
      for char in seq:
        max_index=0
        for i in range(0, 4):
          if char[i]>char[max_index]: 
            max_index=i
        for i in range(0, 4):
          if i==max_index:
            char[i]=1
          else:
            char[i]=0


    print(modify_seq)

    # Decode the generated sequences from one-hot encoding to protein sequences
    decoded_sequences = decode_sequences(modify_seq, encoding_mapping)  # Use the correct decoding function and provide the encoding_mapping

    return decoded_sequences

In [70]:
# Generate 10 new protein sequences
generated_proteins = generate_proteins(10)

tf.Tensor(
[[[0.4904123  0.46170533 0.3813923  0.6405524 ]
  [0.48111856 0.53359056 0.5221405  0.5460419 ]
  [0.5740205  0.4199112  0.5802223  0.53056234]
  ...
  [0.5222903  0.48116904 0.5222149  0.51832813]
  [0.5459612  0.49907938 0.4960175  0.5433233 ]
  [0.47552213 0.49396735 0.5411267  0.50307566]]

 [[0.5014918  0.4504532  0.5113118  0.5869162 ]
  [0.46731058 0.47168243 0.53909314 0.59291023]
  [0.53233486 0.45684937 0.5353138  0.457286  ]
  ...
  [0.48833972 0.49047044 0.5754148  0.48968056]
  [0.5432372  0.4454365  0.46736774 0.46809036]
  [0.5290691  0.5069443  0.41220903 0.45127878]]

 [[0.61193603 0.51141727 0.45765668 0.59295833]
  [0.5240039  0.47698066 0.52997094 0.41792515]
  [0.5512287  0.39912066 0.5548228  0.5535164 ]
  ...
  [0.45505017 0.5125292  0.49688527 0.37315944]
  [0.537227   0.55983204 0.4672026  0.4549263 ]
  [0.5399435  0.5504036  0.47967646 0.5194895 ]]

 ...

 [[0.6337008  0.59891003 0.47712073 0.5652614 ]
  [0.5234755  0.443153   0.5347197  0.56054693]

In [71]:
# Print the generated protein sequences
for protein in generated_proteins:
    print(protein)

TTCTACCTAGGCAATGTCTCTGGAGTATGTTTACTTCAATATCGGGTTCCGTATTAGCATTTCGAGCGACAGTCTGTTGAGGGGTAATCGTCGCGCAATTAGATAGGC
TTCGATATTAGCTGAATTTCAGTAACTCTACTTCGGCTCTAGCTGATGGGCTAAATCATGGCCGATCGCCGGCCTAAGCGTGTCTATACGCCAGGCATTTGCATACGG
GCCGATCTGGACGCTGTCTAGAGCGAAGTTTGACCTCAAGTGTATGTTAAGTAATTAGGGGTCATTCGTACGCGTAAAAAACGGTGAGGATCGGCTGACTGTCAGAAA
ATCCCTGGGAGACCTGGTTTAGTGAGCGTTATCCAGAGGGCATTTATGACCAATCTAAGTTTCATCCAACAAAATGACCCAGTGTGCGGGTCTATTATTTCACTGGGT
AGCGACCGGACCCGTATTTCGGGTGCAGGTTTGCAACAATTGCATCGGATGTAAAACTGCGTCGAACGCCAACCTTACCGTGTGCGTAAAACGGATCGGTGTATATGT
GAGTACAAGAAAGATGATTCAGGCCCAAGTTGTCAGTTCGTCGGTTTTTAGAAACATCAGACCATACCTCTAGCAATTCTTGAGTGATAGTCCGCCAATACACAGGGC
TATGATACGGGCCATATTTTAGATTCACCATTCGCGCTCTTTCATAGGCGCCCTAGCCGTAGCTAGCGGGAATGTATAAGTGTCTGACTGTCTATAGATGGTGTCGCG
GTCACTAGGACGAGTGATTCAGGCAATGCATAGTAGATCGTGCGTGTCTTGCGATAAAAGATCAAGAGCCCACCTGATCGTGTGTGCAGGTCGGATAATTGGCTAGAC
ACCGATGGGAGCAATGTTTAGGTATACGCATTCTGTCGACTTCTTATTGGGCCAGGCTGGTCCGGGCGTAGGCCTAACCCCCGGGGGGAGGTGTCAAATGTACTGATT
TGCCACCGAAGCTATGTTT