<a href="https://colab.research.google.com/github/tiffany1016/NYCU_iGEM2023/blob/web_crawler/2023_5_19_GAN_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Tue May 23 10:35:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    13W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape, Conv1D, Flatten
import numpy as np


In [3]:
# Define the generator network
def make_generator_model(sequences):
    max_seq_length = sequences.shape[1]
    num_unique_chars = sequences.shape[2]

    model = tf.keras.Sequential()

    # Input layer
    model.add(layers.Dense(256, input_dim=NOISE_DIM))
    model.add(layers.LeakyReLU(alpha=0.2))
    model.add(layers.BatchNormalization())

    # Hidden layers
    model.add(layers.Dense(512))
    model.add(layers.LeakyReLU(alpha=0.2))
    model.add(layers.BatchNormalization())

    model.add(layers.Dense(1024))
    model.add(layers.LeakyReLU(alpha=0.2))
    model.add(layers.BatchNormalization())

    # Output layer
    model.add(layers.Dense(max_seq_length * num_unique_chars, activation='sigmoid'))
    model.add(layers.Reshape((max_seq_length, num_unique_chars)))

    return model

# Define the discriminator network
def make_discriminator_model(sequences):
    max_seq_length = sequences.shape[1]
    num_unique_chars = sequences.shape[2]

    model = tf.keras.Sequential()

    # Input layer
    model.add(layers.Flatten(input_shape=(max_seq_length, num_unique_chars)))

    # Hidden layers
    model.add(layers.Dense(512))
    model.add(layers.LeakyReLU(alpha=0.2))

    model.add(layers.Dense(256))
    model.add(layers.LeakyReLU(alpha=0.2))

    # Output layer
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

In [4]:
# Define the loss functions for the generator and discriminator
def generator_loss(fake_output):
    # Compute generator loss
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(tf.ones_like(fake_output), fake_output)
    return loss

def discriminator_loss(real_output, fake_output):
    # Compute discriminator loss
    real_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(tf.ones_like(real_output), real_output)
    fake_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(tf.zeros_like(fake_output), fake_output)
    loss = real_loss + fake_loss
    return loss

In [5]:
# Define the optimizer for the generator and discriminator
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

In [6]:
# Define the training loop
def train_step(real_sequences):
    # Generate random noise as input to the generator
    noise = tf.random.normal([real_sequences.shape[0], NOISE_DIM])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        # Generate a sequence from the generator using the random noise
        generated_sequences = generator(noise, training=True)

        # Evaluate the discriminator on real and generated sequences
        real_output = discriminator(real_sequences, training=True)
        fake_output = discriminator(generated_sequences, training=True)

        # Compute the generator and discriminator losses
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    # Compute the gradients and update the generator and discriminator weights
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))



In [7]:
# Define the main training function
def train(dataset, epochs):
    for epoch in range(epochs):
        for batch in dataset:
            real_sequences = batch
            train_step(real_sequences)

        print('Epoch {} complete'.format(epoch + 1))


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Load the dataset
dataset = np.loadtxt("/content/drive/MyDrive/Colab Notebooks/iGEM2023_model/random_protein_input.txt", dtype=np.str_)  # Update "your_dataset.txt" with your actual dataset file name

encoding_mapping={}
# Convert the dataset to one-hot encoding
def one_hot_encode(sequences):
    unique_characters = list(set("".join(sequences)))
    print(unique_characters)
    char_to_idx = {char: idx for idx, char in enumerate(unique_characters)}

    num_sequences = len(sequences)
    max_seq_length = max(len(seq) for seq in sequences)
    num_unique_chars = len(unique_characters)

    encoded_sequences = np.zeros((num_sequences, max_seq_length, num_unique_chars), dtype=np.float32)

    for i, seq in enumerate(sequences):
        for j, char in enumerate(seq):
            encoded_sequences[i, j, char_to_idx[char]] = 1.0

    for char in unique_characters:
      encode_list=np.zeros(len(unique_characters))
      encode_list[char_to_idx[char]]=1
      encoding_mapping[char]=encode_list
    
    
    print(encoding_mapping)

    return encoded_sequences


# One-hot encode the dataset
dataset = one_hot_encode(dataset)

['L', 'W', 'P', 'Q', 'R', 'I', 'C', 'K', 'V', 'H', 'Y', 'N', 'E', 'F', 'S', 'M', 'T', 'G', 'D', 'A']
{'L': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'W': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'P': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'Q': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'R': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'I': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'C': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'K': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), 'V': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.,

In [10]:
# num_sequences, max_seq_length, num_unique_chars
dataset.shape

(10, 93, 20)

In [11]:
for data in dataset:
  print(data)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ...

In [12]:
# Specify the number of training epochs
num_epochs = 10  # Update with the desired number of epochs

# Set other hyperparameters
BATCH_SIZE = 64  # Update with your desired batch size
NOISE_DIM = 100  # Update with the dimensionality of your noise input

# Create instances of the generator and discriminator
generator = make_generator_model(dataset)
discriminator = make_discriminator_model(dataset)
dataset = tf.data.Dataset.from_tensor_slices(dataset).batch(BATCH_SIZE)

In [13]:
# Run the training
train(dataset, num_epochs)

  output, from_logits = _get_logits(


Epoch 1 complete
Epoch 2 complete
Epoch 3 complete
Epoch 4 complete
Epoch 5 complete
Epoch 6 complete
Epoch 7 complete
Epoch 8 complete
Epoch 9 complete
Epoch 10 complete


In [14]:
# decode_sequences
def decode_sequences(encoded_sequences, encoding_mapping):
    decoded_sequences = []
    for encoded_sequence in encoded_sequences:
        decoded_sequence = ''
        for encoded_character in encoded_sequence:
            for character, binary_vector in encoding_mapping.items():
                if np.array_equal(encoded_character, binary_vector):
                    decoded_sequence += character
                    # print("ckp")
                    # print(character)
                    break
        decoded_sequences.append(decoded_sequence)
    return decoded_sequences

# Example usage
# encoded_sequences = [[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
# encoding_mapping = {'A': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [1, 0, 0, 0], 'T': [0, 0, 0, 1]}

# decoded_sequences = decode_sequences(encoded_sequences, encoding_mapping)
# print(decoded_sequences)

In [15]:
# Generate new DNA sequences using the trained generator
def generate_proteins(num_samples):
    # Generate random noise as input to the generator
    noise = tf.random.normal([num_samples, NOISE_DIM])

    # Generate sequences using the generator
    generated_sequences = generator(noise, training=False)
    print(generated_sequences)

    modify_seq = np.array(generated_sequences.numpy())

    for seq in modify_seq:
      for char in seq:
        max_index=0
        for i in range(0, generated_sequences.shape[2]):
          if char[i]>char[max_index]: 
            max_index=i
        for i in range(0, generated_sequences.shape[2]):
          if i==max_index:
            char[i]=1
          else:
            char[i]=0


    print(modify_seq)

    # Decode the generated sequences from one-hot encoding to protein sequences
    decoded_sequences = decode_sequences(modify_seq, encoding_mapping)  # Use the correct decoding function and provide the encoding_mapping

    return decoded_sequences

In [16]:
# Generate 10 new protein sequences
generated_proteins = generate_proteins(10)

tf.Tensor(
[[[0.5166759  0.46842536 0.5366596  ... 0.50497913 0.47429183 0.510549  ]
  [0.49059826 0.49786198 0.46004006 ... 0.5030415  0.51095456 0.54538804]
  [0.49469882 0.48425227 0.49530977 ... 0.5677939  0.48007748 0.4298949 ]
  ...
  [0.52343476 0.50395167 0.5049937  ... 0.5172256  0.5210657  0.39149347]
  [0.53176767 0.5180647  0.53126085 ... 0.55271393 0.51085544 0.58967966]
  [0.43254054 0.55665636 0.5370724  ... 0.4652047  0.46956617 0.5807766 ]]

 [[0.47497588 0.507018   0.5010536  ... 0.53040445 0.47548428 0.47224352]
  [0.48599228 0.5160231  0.53956276 ... 0.4817179  0.54571575 0.5263266 ]
  [0.50018024 0.52093613 0.51878625 ... 0.5182923  0.5168199  0.4938498 ]
  ...
  [0.512412   0.4790662  0.5009404  ... 0.52493614 0.42207953 0.52922446]
  [0.47363263 0.49989244 0.5428209  ... 0.51431555 0.43763772 0.56967205]
  [0.52182555 0.49965346 0.45969626 ... 0.4841601  0.48732218 0.5084871 ]]

 [[0.45484185 0.50359994 0.5402182  ... 0.5075404  0.45111936 0.5620997 ]
  [0.489551

In [17]:
# Print the generated protein sequences
for protein in generated_proteins:
    print(protein)

QTCFPIVGMCKRHELPFPDKKICQKVGYRDFCAKIEWTNNGNFAYHGRDAESLRVDFGGQILRSNIMNISPDDDLEKQCAKQEGLRIQATVVA
ESTELWKKMTKVKRLDSMENHKRRFWVTWIMLFVIMVYEGSQANGGRRAFEVNVLLTVCPMLHKHKAVPHELRAYNFIVPPSLWGDVWQIFAK
YKNWTADWLTYYGQAPAGDYKNEQSMFFVYFYMFIHHMAFCTPKWDQDHVKGTWHDAGDPDGHSHRHCTMNHQMYHAISMSLEPHKVQIPAYT
QDWFTARMVPKFHKPPFYDKNNRQFKGYQIFRASIPDMNKIFDVGCGCCTESYGLDPEPWFLKHQFAVPRPFHMDPLQCNPLTRGLSPQIKVG
GTHGAQHVYYGFLDEHLARNLMAQSKSVNFTAEWVWRVNKSIPFLCCWVNNRIQLLYGPQTGRDHLWVPILMWRLSRIWFNEHRYPVFPIQKL
QAKVCAWVYRKDDDLDWCRIHNVATNPYRIGEFGIETWNKRIAGGHWRRSNRHKLFPEVFVHMHGFAKQMWPSCYSRVCSKTDWMTVSQIKMP
GADCWIWAYNGVCVLPIMTNWLDQFGFYIFGCLTIWWHRNVCAVGLWAELPQRKLDPDCFIFKFMLNNQMGVWCYLLKKDSQEGRHAAMIIHK
QINCWICIWMKADLLDYMSLKNLRGIHYDTFTADIEWETARWPAHTRSDAYGTRFWTGPQGLHSHRQSKNVRYRLHFEVYKLNIPDWVAIQVA
AQDCFICGWDKKTQPHNVRYKNRNRMQMWTFCAWIFGMADMCNGKKADRFNSHNYCTSPVIQHLDRHKKHDHWNYQKQWYKHNLITLRIIQYE
WAHCPPCIYVKTDYPRRVTQPMRRGADHIFHTLVRPREFMRQLGQATRAESTVPVGEHPETLHNTIAAKLDRIKDQFIKDKMMQKTWFQAWVE
