# Tuning RNN model - amino acid sequences

In [1]:
# Preprocessing and encoding variables
import pandas as pd
import numpy as np

# Using scikit-learn to split data into training and testing sets
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# For character-level one-hot encoding or label encoding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

# For creating the RNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Activation
import tensorflow.keras.utils as utils

# For plotting
import matplotlib.pyplot as plt
import seaborn as sn

# Tuning with keras tuner
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner import HyperParameters

### Uploading genomes

In [2]:
# Load the translated genome table; .iloc[:, 1:] discards the first column of the CSV.
G = pd.read_csv('../input/genomes/G3_translated.csv').iloc[:, 1:]

In [3]:
# Remove every row that contains a missing value (G is rebound to the cleaned frame).
G = G.dropna()

In [4]:
# Preview the last 10 rows of the cleaned table.
G.tail(10)

Unnamed: 0,Type,Sequence,Translated
8617,CDS,ATGCTGCGCGACAGACTGCGTTTTTCTCGCCGTCTGCATTGTGTTA...,MLRDRLRFSRRLHCVKKVKNPDAQQAIFQEMAKEIDQAAGKVLLRE...
8618,CDS,ATGAATAAAATTTATTCACTGAAATATAGTCATATTACAGGTGGAT...,MNKIYSLKYSHITGGLVAVSELTRKVSVGTSRKKVILGIILSSIYG...
8619,CDS,ATGATGGAAATTCTGCGTGGTTCGCCTGCACTGTCGGCATTCCGAA...,MMEILRGSPALSAFRINKLLARFQAARLPVHNIYAEYVHFADLNAP...
8620,CDS,TTGAGCCAGGAATACACTGAAGACAAAGAAGTCACATTGACAAAGT...,LSQEYTEDKEVTLTKLSSGRRLLEALLILIVLFAVWLMAALLSFNP...
8621,CDS,ATGGTTTACTCCTATACCGAGAAAAAACGTATTCGTAAGGATTTTG...,MVYSYTEKKRIRKDFGKRPQVLDVPYLLSIQLDSFQKFIEQDPEGQ...
8622,CDS,ATGAACAAAATATATTATCTTAAGTATTGCCATATAACCAAAAGCC...,MNKIYYLKYCHITKSLIAVSELARRVTCKSHRRLSRRVILTSVAAL...
8623,CDS,GTGAATAAAGTTTATTCTCTTAAATATTGCCCCGTCACCGGGGGGC...,VNKVYSLKYCPVTGGLIAVSELARRVIKKTCRRLTHILLAGIPAIC...
8624,CDS,GTGAAAGATTTATTAAAGTTTCTGAAAGCGCAGACTAAAACCGAAG...,VKDLLKFLKAQTKTEEFDAIKIALASPDMIRSWSFGEVKKPETINY...
8625,CDS,ATGATTGAACGCGGTAAATTTCGCTCACTGACGCTGATTAACTGGA...,MIERGKFRSLTLINWNGFFARTFDLDELVTTLSGGNGAGKSTTMAA...
8626,CDS,ATGCGAGGCGCGCGTATGACACGCAAACCCCGTCGCCACGCTCTTT...,MRGARMTRKPRRHALSVPVRSGSEVGFPQSLGEVHDMLYDKSLERD...


In [5]:
# Report the dimensions of the dataframe after rows with missing values were dropped.
print('The shape of our dataframe is:', G.shape)
print('Rows:', G.shape[0])
print('Columns:', G.shape[1])

The shape of our dataframe is: (8627, 3)
Rows: 8627
Columns: 3


### Encoding data

The class labels are encoded with sklearn's LabelEncoder (OneHotEncoder is imported but not used below).

#### Labels

In [6]:
def encode_feature(array):
    """Map the distinct values of a categorical array to integer codes.

    A fresh LabelEncoder is fitted on ``array`` and immediately applied to it,
    so the codes only reflect the categories present in this one array.

    :param array: array-like of categorical values to encode
    :return: numerical array with one integer code per input element
    """
    # fit_transform learns the classes and encodes the same data in one call
    return preprocessing.LabelEncoder().fit_transform(array)

In [7]:
# Human-readable class names; the encoded labels shown further down map CDS -> 0 and LORF -> 1.
class_names = ['CDS', 'LORF']
# Raw string labels, one per row of G.
labels = G["Type"].values
print(labels)

['LORF' 'LORF' 'LORF' ... 'CDS' 'CDS' 'CDS']


In [8]:
# Replace the string labels with integer codes (see the output below: LORF -> 1, CDS -> 0).
labels = encode_feature(labels)
labels

array([1, 1, 1, ..., 0, 0, 0])

#### Sequences

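The sequences are tokenized at character level with the Keras Tokenizer and then padded to a common length. The resulting integer sequences are passed to an Embedding layer (no explicit one-hot encoding is applied).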

In [9]:
# Extract the translated (amino-acid) sequences as a plain Python list
samples = G['Translated'].values
samples = list(samples)
# Show the first five sequences as a sanity check
samples[0:5]

['LRAGGRLRRPAAFLDPVPALLHPAFAPVPSACAQCTSPPSVPEYDAALSSVAGHDRYHQRSL*',
 'MAIINIKCTSIATRIRLRQTFFFFYDLFIIE*',
 'MPEPRQPGGCKSGCGRI*',
 'MPTVTNDCIGLKTMNYSE*',
 'VIKSRMTRECHVRFREQPESETLSGGVPIAVANMRRCCTR*']

In [10]:
# Character-level tokenizer: each distinct character of a sequence becomes its own token.
# No num_words limit is passed, so the vocabulary is not capped.
tokenizer = Tokenizer(char_level=True)

# This builds the character index from all sequences
tokenizer.fit_on_texts(samples)

# This turns strings into lists of integer indices.
seq_of_int = tokenizer.texts_to_sequences(samples)

In [11]:
# Inspect the integer encoding of the first sequence.
seq_of_int[0]

[1,
 5,
 2,
 4,
 4,
 5,
 1,
 5,
 5,
 11,
 2,
 2,
 14,
 1,
 10,
 11,
 3,
 11,
 2,
 1,
 1,
 18,
 11,
 2,
 14,
 2,
 11,
 3,
 11,
 6,
 2,
 20,
 2,
 12,
 20,
 8,
 6,
 11,
 11,
 6,
 3,
 11,
 9,
 16,
 10,
 2,
 2,
 1,
 6,
 6,
 3,
 2,
 4,
 18,
 10,
 5,
 16,
 18,
 12,
 5,
 6,
 1,
 21]

In [12]:
# Take the maximum over all sequences rather than assuming the last row of the
# table happens to hold the longest one.
print('Longest amino acid sequence is:', max(len(s) for s in samples))

Longest amino acid sequence is: 1523


In [13]:
# Pad every sequence to the length of the longest tokenized sequence. Deriving
# max_len from the data (instead of from the last row) guarantees that
# pad_sequences never truncates a sequence that is longer than the last one.
max_len = max(map(len, seq_of_int))
# Padding value 0 is the index the tokenizer never assigns to a character;
# an integer literal matches the int32 dtype of the padded array.
seq_padded = sequence.pad_sequences(seq_of_int, maxlen=max_len, dtype='int32', value=0)

In [14]:
# Sanity check: one row per sequence, max_len columns.
np.shape(seq_padded)

(8627, 1523)

### Create training and test data

In [15]:
# Split the data into training and testing sets -> x = features and y = labels/targets
# 20% of the rows are held out for testing; random_state fixes the shuffle.
# NOTE(review): no stratify= is passed, so class proportions may differ between the splits.
train_x, test_x, train_y, test_y = train_test_split(seq_padded, labels, test_size = 0.2, random_state = 42)

In [16]:
# Report how many sequences ended up in each split.
print('Input data:')
print(len(train_x), 'train sequences')
print(len(test_x), 'test sequences')

Input data:
6901 train sequences
1726 test sequences


In [17]:
# Vocabulary size for the Embedding layer: one index per character learned by
# the tokenizer, plus index 0 which is reserved for padding. Deriving it from
# the fitted tokenizer keeps the embedding table large enough even if the data
# contains characters beyond the expected amino-acid letters and stop symbol.
input_dim = len(tokenizer.word_index) + 1

# Rule-of-thumb embedding width: half the vocabulary size (rounded up), capped at 50.
embedding_size = min((input_dim + 1) // 2, 50)

output_dim = embedding_size
input_length = max_len

### Tuning

#### Keras tuner

In [18]:
def model_builder(hp):
    """Build and compile the LSTM classifier for a single tuner trial.

    The following hyperparameters are sampled from ``hp``:

    * ``units``: size of the LSTM layer, 32 to 512 in steps of 32
    * ``learning_rate``: RMSprop learning rate, one of 1e-1 ... 1e-5
    * ``loss``: 'binary_crossentropy', 'hinge' or 'squared_hinge'
    * ``dropout``: dropout rate, ten evenly spaced values from 0 to 0.9

    The embedding dimensions are read from the notebook-level variables
    ``input_dim`` and ``output_dim``.

    :param hp: hyperparameter container handed in by the tuner
    :return: compiled keras Sequential model
    """
    # Search space (registered in this order: units, learning_rate, loss, dropout)
    lstm_units = hp.Int('units', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5])
    loss_name = hp.Choice('loss', values=['binary_crossentropy', 'hinge', 'squared_hinge'])
    dropout_rate = hp.Choice('dropout', values=list(np.linspace(0, 0.9, num=10)))

    # Embedding -> LSTM -> Dropout -> single sigmoid unit for the binary decision
    model = keras.Sequential([
        Embedding(input_dim, output_dim),
        LSTM(units=lstm_units),
        Dropout(rate=dropout_rate),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=keras.optimizers.RMSprop(learning_rate=learning_rate),
        loss=loss_name,
        metrics=['accuracy'],
    )
    return model

In [19]:
# Hyperband search over the space defined in model_builder; trials are ranked by
# validation accuracy, with max_epochs=10 and a reduction factor of 3.
# NOTE(review): no seed, directory or project_name is given, so results may not be
# reproducible and earlier trial results may be reused on re-run - confirm intended.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3)

2022-03-24 13:57:12.043670: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-24 13:57:12.131885: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-24 13:57:12.132603: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-24 13:57:12.133697: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [20]:
# Stop a training run once val_loss has not improved for 8 consecutive epochs.
# NOTE(review): patience=8 is close to the tuner's max_epochs=10, so this callback
# may rarely trigger during the search - confirm the intended patience.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8)

In [21]:
# Run the hyperparameter search; 20% of the training data is held out for validation.
# NOTE(review): Hyperband presumably sets the per-trial epoch budget itself, in which
# case epochs=20 has no effect - verify against the keras_tuner documentation.
tuner.search(train_x, train_y, epochs=20, validation_split=0.2, callbacks=[stop_early])

Trial 30 Complete [00h 02m 24s]
val_accuracy: 0.49456915259361267

Best val_accuracy So Far: 0.8218682408332825
Total elapsed time: 00h 37m 23s


In [22]:
# Get the optimal hyperparameters (a single lookup; the extra call whose result
# was discarded has been removed)
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

# Summarise the winning configuration of the search
print(f"""The hyperparameter search is complete!\n 
The optimal number of units in the first LSTM layer is: {best_hps.get('units')}.\n 
The optimal learning rate for the optimizer is: {best_hps.get('learning_rate')}.\n 
The optimal dropout rate is: {best_hps.get('dropout')}.\n 
The optimal loss function is: {best_hps.get('loss')}.\n""")

The hyperparameter search is complete!
 
The optimal number of units in the first LSTM layer is: 128.
 
The optimal learning rate for the optimizer is: 0.001.
 
The optimal dropout rate is: 0.2.
 
The optimal loss function is: binary_crossentropy.



#### Tune number of epochs

In [23]:
# Build the model with the optimal hyperparameters and train it on the data for 30 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_x, train_y, epochs=30, validation_split=0.2)

# Pick the epoch with the highest validation accuracy. index() returns the first
# occurrence of the maximum; +1 converts the 0-based list position to an epoch number.
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Best epoch: 13


#### Train model again with best number of epochs:

In [24]:
# Start from a freshly initialised model built with the best hyperparameters
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model for the number of epochs found above
# NOTE(review): validation_split=0.2 still withholds 20% of train_x from fitting - confirm intended.
hypermodel.fit(train_x, train_y, epochs=best_epoch, validation_split=0.2)

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<keras.callbacks.History at 0x7f8a68f82b50>

#### Evaluate the model on the test data:

In [25]:
# Evaluate the retrained model on the held-out test split; the result lists the
# loss followed by the accuracy metric configured in model.compile.
eval_result = hypermodel.evaluate(test_x, test_y)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.4201381504535675, 0.820393979549408]


**Next:**
- Check the results of the hypertuning. Are there parameters that sit at the maximum or minimum of their search range? If so, tune these again with a wider range.
- Try rmsprop optimizer instead of adam
- Try tuning number of layers 
- Try early stopping
- Tune batch size and number of epochs