In [27]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib
import tensorflow as tf
from sklearn.model_selection import train_test_split

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
from tensorflow import keras
tf.config.run_functions_eagerly(True)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import MeanAbsoluteError, MeanSquaredError
from tensorflow.keras.optimizers import Adagrad, Adadelta, Adam, Adamax, Ftrl, Nadam, SGD, RMSprop 
from tensorflow.keras.layers import Bidirectional, Dense, LSTM, GRU, Conv1D, GlobalAveragePooling1D, Lambda, Flatten, Dropout, Embedding, Input
from tqdm import tqdm
matplotlib.style.use("seaborn-whitegrid")
pd.set_option("display.width", 5000)
pd.set_option("display.max_columns", 60)
plt.rcParams["figure.figsize"] = (15, 10)

%config InlineBackend.figure_format = 'retina'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.simplefilter('ignore')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# AUTO = tf.data.experimental.AUTOTUNE

In [None]:
%%capture
# !pip install optuna
# !pip install ipdb
# !apt install --allow-change-held-packages libcudnn8=8.4.1.50-1+cuda11.6

In [3]:
import optuna
import ipdb
from optuna.trial import TrialState

In [4]:
df = pd.read_feather('train.feather')
df = df.drop('data_source', axis=1)
df = df.fillna(0)
train = df.sample(frac = 0.8)
val = df.drop(train.index, axis=0)

In [5]:
def split_seq(df):
    sentences = np.array([
    df[['protein_sequence']].to_numpy()[:, 0],
])
    splitted= []
    for i in sentences[0, :]:
        splitted.append(list(i))

    return np.array(splitted)

train_seq, test_seq = split_seq(train), split_seq(val)

train_ph, val_ph = train['pH'].to_numpy().reshape((len(train), 1)), val['pH'].to_numpy().reshape((len(val), 1))
train_tm, val_tm = train['tm'].to_numpy().reshape((len(train), 1)), val['tm'].to_numpy().reshape((len(val), 1))

In [6]:
df['protein_sequence'].apply(lambda x: len(x)).quantile(0.99)

2242.0

In [7]:
max_length = 2250
trunc_type='post'
embedding_dim = 64

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_seq)
sequences = tokenizer.texts_to_sequences(train_seq)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
vocab_size = len(tokenizer.word_index)
train = tf.data.Dataset.from_tensor_slices(np.append(np.append(padded, train_ph, 1), train_tm, 1))

testing_sequences = tokenizer.texts_to_sequences(test_seq)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)
val = tf.data.Dataset.from_tensor_slices(np.append(np.append(testing_padded, val_ph, 1), val_tm, 1))

In [18]:
BATCH_SIZE = 32
def preprocess(data):
    sequence = data[:-1]
    ph = tf.reshape(data[-2], (-1, ))
    tm =  data[-1]
    return sequence, ph, tm
def get_training_dataset(dataset):
    dataset = dataset.map(preprocess).shuffle(len(sequences)).batch(BATCH_SIZE)
    return dataset
def get_validation_dataset(valid):
  valid = valid.map(preprocess).shuffle(len(sequences)).batch(BATCH_SIZE)
  return valid
def concat(input):
  # if len(input[0].shape) >=2:
  #   input[1] = tf.tile(tf.reshape(input[1], (-1, 1, 1)), (1, 1, input[0].shape[-1]))
  #   return tf.keras.layers.concatenate([input[0], input[1]], axis=1)
  # else:
    return tf.keras.layers.concatenate([input[0], input[1]], axis=1)

In [10]:
# %pdb
class MyModel(tf.keras.Model):
  def __init__(self, lstm_layers, emb_dim, lstm_units, dropout_rate):
    super(MyModel, self).__init__()
    self.lstm_layers, self.emb_dim, self.lstm_units, self.dropout_rate = lstm_layers, emb_dim, lstm_units, dropout_rate
    self.input_ph = tf.keras.layers.Input((1,))
    self.input_seq = tf.keras.layers.Input((500, ))
    self.seq_layers =  [] 
    self.seq_layers.extend([tf.keras.layers.Embedding(21, emb_dim, name='embedding'),
                        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
                        tf.keras.layers.Dropout(dropout_rate),  
                        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
                        tf.keras.layers.Dropout(dropout_rate),
                        *[tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(i, return_sequences=True, name=f'LSTM_{i}')) for j in range(lstm_layers-1) for i in lstm_units],
                        ])
    for i in range(len(self.seq_layers)):
      vars(self)[f'SEQ_LAYER_{i}'] = self.seq_layers[i]

    self.ph_layers = [
                      tf.keras.layers.Dense(512, name='dense1_ph', activation='relu'),
                      tf.keras.layers.Dropout(dropout_rate), 
                      tf.keras.layers.Dense(256, name='dense2_ph', activation='relu'),
                      tf.keras.layers.Dropout(dropout_rate), 
                      tf.keras.layers.Dense(128, name='dense3_ph', activation='relu'),
                      # tf.keras.layers.Dropout(dropout_rate),  
                      # tf.keras.layers.Dense(1, name='output_ph')
                      ]

    self.lambda_layer = tf.keras.layers.Lambda(function=concat, name='lambda_layer')
    self.flatten = tf.keras.layers.Flatten(name='flatten')
    self.dense_combined = tf.keras.layers.Dense(64, activation='relu', name='dense_combined')
    self.lambda_helper = tf.keras.layers.Lambda(lambda x: tf.reshape(x, (-1, 64, 1)))
    self.lstm_dense = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512))
    self.last_dense = tf.keras.layers.Dense(1, name='output')

  def call(self, inputs):
    seq, ph = inputs
    SEQ_LAYER_0 = vars(self)['SEQ_LAYER_0']
    x = SEQ_LAYER_0(seq)
    for i in range(1, self.lstm_layers):
        SEQ_LAYER_i = vars(self)[f'SEQ_LAYER_{i}']
        x = SEQ_LAYER_i(x)

    # for layer in self.seq_layers:
    #   seq = layer(seq)

    for layer in self.ph_layers:
      ph = layer(ph)

    x = self.lambda_layer([x, tf.tile(tf.reshape(ph, (-1, 128, 1)), (1, 1, x.shape[-1]))])
    x = self.flatten(x)
    x = self.dense_combined(x)
    x = self.lambda_helper(x)
    x = self.lstm_dense(x)
    x = self.last_dense(x)
    return x
    

In [11]:
TRAIN_STEPS = 15
PRUNING_INTERVAL_STEPS = 50
def objective(trial):    
  lstm_layers = trial.suggest_int('lstm_layers', 1, 7)
  emb_dim = trial.suggest_int('emb_dim', 256, 1024)
  lstm_units = []
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-7, 1)
  for i in range(lstm_layers):
    lstm_units.append(trial.suggest_int(f'lstm_units_l{i}', 16, 512))
  dropout_rate = trial.suggest_uniform('dropout_rate', 0, 1)
  optimizer = trial.suggest_categorical('optimizer', [Adagrad, Adadelta, Adam, Adamax, Ftrl, Nadam, SGD, RMSprop])
  regressor = MyModel(lstm_layers, emb_dim, lstm_units, dropout_rate)
  loss_obj = tf.keras.metrics.MeanAbsoluteError(name='loss_obj')
  regressor.compile(loss=loss_obj, optimizer=optimizer(learning_rate=learning_rate))
  losses, n_train_iter, step = [], len(get_training_dataset(train)), 0
  for epoch in range(1):
    print(f'Epoch # {epoch} started')
    for batch in tqdm(get_training_dataset(train)):
      predictions = regressor([batch[0], batch[1]], training=True)
      loss = loss_obj(batch[2], predictions)
      losses.append(loss)
      if step > n_train_iter//2:
        intermediate_value = loss
        if intermediate_value < best_loss:
            raise optuna.TrialPruned()
      step += 1
      best_loss = min(losses)
    print(f'Training Loss {loss:.2f}, Best Loss: {best_loss:.2f}')
    for val_batch in tqdm(get_validation_dataset(val)):
      predictions = regressor([val_batch[0], val_batch[1]], training=False)
      val_loss = loss_obj(val_batch[2], predictions)
    print(f'Validation Loss {val_loss:.2f}')
  return val_loss

In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

[32m[I 2022-10-02 08:33:55,859][0m A new study created in memory with name: no-name-6a75b006-2327-4191-9123-4ccff5d2dcfe[0m


Epoch # 0 started


  9%|▉         | 9/99 [01:00<10:02,  6.70s/it]
[33m[W 2022-10-02 08:34:56,762][0m Trial 0 failed because of the following error: InvalidArgumentError()[0m
Traceback (most recent call last):
  File "/Volumes/Environment/conda/miniconda3/envs/defaultenv/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/t0/fw3jvl196_v78v94hzhxhpgw0000gn/T/ipykernel_85779/1214606070.py", line 19, in objective
    predictions = regressor([batch[0], batch[1]], training=True)
  File "/Volumes/Environment/conda/miniconda3/envs/defaultenv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/var/folders/t0/fw3jvl196_v78v94hzhxhpgw0000gn/T/ipykernel_85779/4287616964.py", line 39, in call
    x = SEQ_LAYER_0(seq)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Exception encountered when calling layer "embedding" "         

InvalidArgumentError: Exception encountered when calling layer "embedding" "                 f"(type Embedding).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[199,2250] = 49 is not in [0, 21) [Op:ResourceGather]

Call arguments received by layer "embedding" "                 f"(type Embedding):
  • inputs=tf.Tensor(shape=(256, 2251), dtype=float32)

In [None]:
print("Best trial:")
trial = study.best_trial

trial

'Best trial:'


FrozenTrial(number=0, values=[9.231255531311035], datetime_start=datetime.datetime(2022, 10, 2, 2, 45, 57, 884555), datetime_complete=datetime.datetime(2022, 10, 2, 2, 50, 0, 948796), params={'lstm_layers': 5, 'emb_dim': 516, 'optimizer': <class 'keras.optimizer_v2.adamax.Adamax'>, 'learning_rate': 0.0003657022606919054, 'lstm_units_l0': 144, 'lstm_units_l1': 487, 'lstm_units_l2': 180, 'lstm_units_l3': 435, 'lstm_units_l4': 282, 'dropout_rate': 0.6139876507248502}, distributions={'lstm_layers': IntDistribution(high=7, log=False, low=1, step=1), 'emb_dim': IntDistribution(high=1024, log=False, low=8, step=1), 'optimizer': CategoricalDistribution(choices=(<class 'keras.optimizer_v2.adagrad.Adagrad'>, <class 'keras.optimizer_v2.adadelta.Adadelta'>, <class 'keras.optimizer_v2.adam.Adam'>, <class 'keras.optimizer_v2.adamax.Adamax'>, <class 'keras.optimizer_v2.ftrl.Ftrl'>, <class 'keras.optimizer_v2.nadam.Nadam'>, <class 'keras.optimizer_v2.gradient_descent.SGD'>, <class 'keras.optimizer_v2.

In [19]:
for i in get_training_dataset(train).take(1):
    x = Embedding(vocab_size + 1, 2250)(i[0])
    x = Conv1D(filters=64, kernel_size=5, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
    x = Bidirectional(LSTM(32, return_sequences=True))(x)
    x = GlobalAveragePooling1D()(x)
    x


<tf.Tensor: shape=(32, 64), dtype=float32, numpy=
array([[ 0.00255172, -0.00038243, -0.00433107, ...,  0.01631568,
         0.01245562, -0.0166873 ],
       [ 0.0037646 , -0.00017282, -0.00468532, ...,  0.0158443 ,
         0.0121361 , -0.01649837],
       [-0.00053325, -0.0007604 , -0.0038455 , ...,  0.01778299,
         0.01330473, -0.01690527],
       ...,
       [ 0.00970213,  0.00063291, -0.00561702, ...,  0.01281975,
         0.0099871 , -0.01621926],
       [ 0.00050922, -0.0005741 , -0.00392779, ...,  0.01714535,
         0.0130352 , -0.01674676],
       [ 0.01195663,  0.001212  , -0.00619053, ...,  0.01121793,
         0.00945734, -0.01510749]], dtype=float32)>

In [21]:
# %pdb
class MyModel2(tf.keras.Model):
  def __init__(self):
    super(MyModel2, self).__init__()
    self.emb_dim, self.dropout_rate = 2250, 0.2
    self.input_ph = tf.keras.layers.Input((1,))
    self.input_seq = tf.keras.layers.Input((2251, ))
    self.seq_layers =  [] 
    self.seq_layers.extend([
                        Embedding(vocab_size + 1, self.emb_dim, name='embedding'),
                        Conv1D(filters=64, kernel_size=5, activation='relu'),
                        Dropout(self.dropout_rate),  
                        Conv1D(filters=64, kernel_size=3, activation='relu'),
                        Dropout(self.dropout_rate),
                        # Bidirectional(LSTM(2048, return_sequences=True, name='LSTM_1')),
                        # Dropout(self.dropout_rate),
                        # LSTM(1024, return_sequences=True, name='LSTM_2'),
                        # LSTM(512, return_sequences=True, name='LSTM_3'),
                        Bidirectional(LSTM(256, return_sequences=True, name='LSTM_4')),
                        Dropout(self.dropout_rate),
                        Bidirectional(LSTM(128, return_sequences=True, name='LSTM_5')),
                        Dropout(self.dropout_rate),
                        # tf.keras.layers.LSTM(64, return_sequences=True, name='LSTM_6'),
                        Bidirectional(LSTM(32, return_sequences=True, name='LSTM_7')),
                        Dropout(self.dropout_rate),
                        Bidirectional(LSTM(16, return_sequences=True)),
                        GlobalAveragePooling1D(),
                        ])
    for i in range(len(self.seq_layers)):
      vars(self)[f'SEQ_LAYER_{i}'] = self.seq_layers[i]

    self.ph_layers = [
                      tf.keras.layers.Dense(512, name='dense1_ph', activation='relu'),
                      tf.keras.layers.Dropout(self.dropout_rate), 
                      # tf.keras.layers.Dense(256, name='dense2_ph', activation='relu'),
                      # tf.keras.layers.Dropout(self.dropout_rate), 
                      tf.keras.layers.Dense(128, name='dense3_ph', activation='relu'),
                      tf.keras.layers.Dropout(self.dropout_rate),  
                      tf.keras.layers.Dense(1, name='output_ph')
                      ]

    self.lambda_layer = Lambda(function=concat, name='lambda_layer')
    self.flatten = Flatten(name='flatten')
    self.dense_combined = Dense(64, activation='relu', name='dense_combined')
    self.lambda_helper = Lambda(lambda x: tf.reshape(x, (-1, 64, 1)))
    self.last_dense = Dense(1, activation='relu', name='output')

  def call(self, inputs):
    seq, ph = inputs

    for layer in self.seq_layers:
      seq = layer(seq)

    for layer in self.ph_layers:
      ph = layer(ph)

    x = self.lambda_layer([seq, ph])
    x = self.flatten(x)
    x = self.dense_combined(x)
    x = self.lambda_helper(x)
    x = self.last_dense(x)
    return x
    

In [30]:
model = MyModel2()
adam, SGD_fn, rms = Adam(1e-3), SGD(1e-4), RMSprop(1e-4)
loss_fn = MeanAbsoluteError()
model.compile(optimizer=adam, loss=loss_fn)

In [31]:
# %pdb
for epoch in range(10):
    print(f'Epoch # {epoch} started')
    for batch in tqdm(get_training_dataset(train)):
        predictions = model([batch[0], batch[1]], training=True)
        loss = loss_fn(batch[2], predictions)
        # print(tf.math.reduce_mean(loss))
    for val_batch in tqdm(get_validation_dataset(val)):
        predictions = model([val_batch[0], val_batch[1]], training=False)
        val_loss = loss_fn(val_batch[2], predictions)
    print(f'Validation Loss {tf.math.reduce_mean(val_loss):.2f}')


Epoch # 0 started


  0%|          | 3/785 [01:38<7:07:41, 32.82s/it]


InvalidArgumentError: Exception encountered when calling layer "embedding" "                 f"(type Embedding).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[20,2250] = 48 is not in [0, 21) [Op:ResourceGather]

Call arguments received by layer "embedding" "                 f"(type Embedding):
  • inputs=tf.Tensor(shape=(32, 2251), dtype=float32)