<a href="https://colab.research.google.com/github/yukontaf/HeadRepo/blob/main/novozymesEnzymeStabilityPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib
import tensorflow as tf
from sklearn.model_selection import train_test_split

import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.optimizers import Adagrad, Adadelta, Adam, Adamax, Ftrl, Nadam, SGD, RMSprop 
from tqdm import tqdm
matplotlib.style.use("seaborn-whitegrid")
pd.set_option("display.width", 5000)
pd.set_option("display.max_columns", 60)
plt.rcParams["figure.figsize"] = (15, 10)

%config InlineBackend.figure_format = 'retina'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.simplefilter('ignore')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
AUTO = tf.data.experimental.AUTOTUNE

In [2]:
%%capture
!pip install optuna
!pip install ipdb

In [20]:
os.environ["TF_MIN_GPU_MULTIPROCESSOR_COUNT"] = "1"
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
print ('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 1


In [4]:
import optuna
import ipdb
from optuna.trial import TrialState

In [5]:
!gdown 12ceraaz41xJ503VhZlU-WnBB5pR8QWWf
!gdown 1mocZNvYWzWL9U-kygm9QU4ejuJoO0jyo

Downloading...
From: https://drive.google.com/uc?id=12ceraaz41xJ503VhZlU-WnBB5pR8QWWf
To: /content/train.feather
100% 12.2M/12.2M [00:00<00:00, 224MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mocZNvYWzWL9U-kygm9QU4ejuJoO0jyo
To: /content/test.feather
100% 45.8k/45.8k [00:00<00:00, 63.2MB/s]


In [21]:
df = pd.read_feather('/content/train.feather')
df = df.drop('data_source', axis=1)
df = df.fillna(0)
train = df.sample(frac = 0.8)
val = df.drop(train.index, axis=0)

In [22]:
def split_seq(df):
    sentences = np.array([
    df[['protein_sequence']].to_numpy()[:, 0],
])
    splitted= []
    for i in sentences[0, :]:
        splitted.append(list(i))

    return np.array(splitted)

train_seq, test_seq = split_seq(train), split_seq(val)

train_ph, val_ph = train['pH'].to_numpy().reshape((len(train), 1)), val['pH'].to_numpy().reshape((len(val), 1))
train_tm, val_tm = train['tm'].to_numpy().reshape((len(train), 1)), val['tm'].to_numpy().reshape((len(val), 1))

In [8]:
train['protein_sequence'].apply(lambda x: len(x)).quantile(0.99)

2223.0

In [23]:
max_length = 2255
trunc_type='post'
embedding_dim = 64

In [24]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_seq)
sequences = tokenizer.texts_to_sequences(train_seq)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
vocab_size = len(tokenizer.word_index)
train = tf.data.Dataset.from_tensor_slices(np.append(np.append(padded, train_ph, 1), train_tm, 1))

testing_sequences = tokenizer.texts_to_sequences(test_seq)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)
val = tf.data.Dataset.from_tensor_slices(np.append(np.append(testing_padded, val_ph, 1), val_tm, 1))

In [69]:
BATCH_SIZE = 256
def preprocess(data):
    sequence = data[:-1]
    ph = tf.reshape(data[-2], (-1, ))
    tm = data[-1]
    return sequence, ph, tm
def get_training_dataset(dataset):
    dataset = dataset.map(preprocess).shuffle(len(sequences)).batch(BATCH_SIZE)
    return dataset
def get_validation_dataset(valid):
  valid = valid.map(preprocess).shuffle(len(sequences)).batch(BATCH_SIZE)
  return valid
def concat(input):
  return tf.concat([input[0], input[1]], -1)
  # return tf.keras.layers.concatenate([input[0], input[1]], axis=1)

In [70]:
# %pdb
class MyModel(tf.keras.Model):
  def __init__(self, lstm_layers, emb_dim, lstm_units, dropout_rate):
    super(MyModel, self).__init__()
    self.lstm_layers, self.emb_dim, self.lstm_units, self.dropout_rate = lstm_layers, emb_dim, lstm_units, dropout_rate
    self.input_ph = tf.keras.layers.Input((1,))
    self.input_seq = tf.keras.layers.Input((500, ))
    seq_layers =  [] 
    seq_layers.extend([tf.keras.layers.Embedding(21, emb_dim, name='embedding'),
                        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
                        tf.keras.layers.Dropout(dropout_rate),  
                        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
                        tf.keras.layers.Dropout(dropout_rate),
                        *[tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(i, return_sequences=True, name=f'LSTM_{i}')) for j in range(lstm_layers-1) for i in lstm_units],
                        ])
    for i in range(len(seq_layers)):
      vars(self)[f'SEQ_LAYER_{i}'] = seq_layers[i]

    self.ph_layers = [
                      tf.keras.layers.Dense(512, name='dense1_ph', activation='relu'),
                      tf.keras.layers.Dropout(dropout_rate), 
                      tf.keras.layers.Dense(256, name='dense2_ph', activation='relu'),
                      tf.keras.layers.Dropout(dropout_rate), 
                      tf.keras.layers.Dense(128, name='dense3_ph', activation='relu'),
                      tf.keras.layers.Dropout(dropout_rate),  
                      tf.keras.layers.Dense(1, name='output_ph')
                      ]

    self.lambda_layer = tf.keras.layers.Lambda(function=concat, name='lambda_layer')
    self.flatten = tf.keras.layers.Flatten(name='flatten')
    self.dense_combined = tf.keras.layers.Dense(64, activation='relu', name='dense_combined')
    self.lambda_helper = tf.keras.layers.Lambda(lambda x: tf.reshape(x, (-1, 64, 1)))
    self.lstm_dense = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512))
    self.last_dense = tf.keras.layers.Dense(1, name='output')

  def call(self, inputs):
    seq, ph = inputs
    SEQ_LAYER_0 = vars(self)['SEQ_LAYER_0']
    x = SEQ_LAYER_0(seq)
    for i in range(1, self.lstm_layers):
        SEQ_LAYER_i = vars(self)[f'SEQ_LAYER_{i}']
        x = SEQ_LAYER_i(x)
    for ind, layer in enumerate(self.ph_layers):
      ph = layer(ph)
    x = self.lambda_layer([seq, ph])
    x = self.flatten(x)
    x = self.dense_combined(x)
    x = self.lambda_helper(x)
    x = self.lstm_dense(x)
    x = self.last_dense(x)
    return x
    

In [71]:
TRAIN_STEPS = 15
PRUNING_INTERVAL_STEPS = 50
def objective(trial):    
  lstm_layers = trial.suggest_int('lstm_layers', 1, 7)
  emb_dim = trial.suggest_int('emb_dim', 256, 1024)
  lstm_units = []
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-7, 1)
  for i in range(lstm_layers):
    lstm_units.append(trial.suggest_int(f'lstm_units_l{i}', 16, 512))
  dropout_rate = trial.suggest_uniform('dropout_rate', 0, 1)
  optimizer = trial.suggest_categorical('optimizer', [Adagrad, Adadelta, Adam, Adamax, Ftrl, Nadam, SGD, RMSprop])
  regressor = MyModel(lstm_layers, emb_dim, lstm_units, dropout_rate)
  loss_obj = tf.keras.metrics.MeanAbsoluteError(name='loss_obj')
  regressor.compile(loss=loss_obj, optimizer=optimizer(learning_rate=learning_rate))
  for epoch in range(10):
    print(f'Epoch # {epoch} started')
    for batch in tqdm(get_training_dataset(train)):
      predictions = regressor([batch[0], batch[1]], training=True)
      loss = loss_obj(batch[2], predictions)
    print(f'Training Loss {loss}')
    for val_batch in tqdm(get_validation_dataset(val)):
      predictions = regressor([val_batch[0], val_batch[1]], training=False)
      val_loss = loss_obj(val_batch[2], predictions)
    print(f'Validation Loss {val_loss}')
  return val_loss

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

[32m[I 2022-10-02 04:26:08,944][0m A new study created in memory with name: no-name-4ca143c8-15c6-476a-88e5-5cec432e4e27[0m


Epoch # 0 started


100%|██████████| 99/99 [01:14<00:00,  1.33it/s]


Training Loss 49.242759704589844


100%|██████████| 25/25 [00:17<00:00,  1.44it/s]


Validation Loss 49.19940948486328
Epoch # 1 started


  7%|▋         | 7/99 [00:05<01:09,  1.32it/s]

In [None]:
print("Best trial:")
trial = study.best_trial

trial

'Best trial:'


FrozenTrial(number=0, values=[9.231255531311035], datetime_start=datetime.datetime(2022, 10, 2, 2, 45, 57, 884555), datetime_complete=datetime.datetime(2022, 10, 2, 2, 50, 0, 948796), params={'lstm_layers': 5, 'emb_dim': 516, 'optimizer': <class 'keras.optimizer_v2.adamax.Adamax'>, 'learning_rate': 0.0003657022606919054, 'lstm_units_l0': 144, 'lstm_units_l1': 487, 'lstm_units_l2': 180, 'lstm_units_l3': 435, 'lstm_units_l4': 282, 'dropout_rate': 0.6139876507248502}, distributions={'lstm_layers': IntDistribution(high=7, log=False, low=1, step=1), 'emb_dim': IntDistribution(high=1024, log=False, low=8, step=1), 'optimizer': CategoricalDistribution(choices=(<class 'keras.optimizer_v2.adagrad.Adagrad'>, <class 'keras.optimizer_v2.adadelta.Adadelta'>, <class 'keras.optimizer_v2.adam.Adam'>, <class 'keras.optimizer_v2.adamax.Adamax'>, <class 'keras.optimizer_v2.ftrl.Ftrl'>, <class 'keras.optimizer_v2.nadam.Nadam'>, <class 'keras.optimizer_v2.gradient_descent.SGD'>, <class 'keras.optimizer_v2.