# COVID-19 mRNA Vaccine Degradation Prediction


**Context:** The most promising COVID-19 vaccine candidates use messenger RNA (mRNA) molecules to help the patient develop immunity. Unfortunately, mRNA molecules are extremely fragile and, in certain circumstances, degrade spontaneously. Current scientific understanding lacks a means of analyzing mRNA molecule candidates for areas of likely degradation.

**Goal: Predict degradation rates for each part of an RNA molecule.**


In [1]:
import sys
sys.path.append('/usr/local/bin')


import json
import numpy as np
import pandas as pd
import tensorflow as tf
from time import sleep

from tensorflow.keras.preprocessing.text import Tokenizer

## Automated hyper-parameter optimization

In [2]:
# Hyper-parameters
LR = 0.001                   # learning rate given to the Adam optimizer
EPOCHS = 1                   # number of epochs passed to model.fit
BATCH_SIZE = 64              # batch size passed to model.fit
EMBED_DIM = 100              # default output size of the embedding layer
HIDDEN_DIM = 128             # default units of each GRU/LSTM layer
DROPOUT = 0.5                # default dropout of the recurrent layers
SP_DROPOUT = 0.3             # default rate of the SpatialDropout1D layer
TRAIN_SEQUENCE_LENGTH = 107  # default input sequence length of build_model

## Load data

In [3]:
# Both files are read with lines=True, i.e. as one JSON record per line.
# The paths are relative to the notebook's working directory.
train_df = pd.read_json("./train.json", lines=True)
test_df = pd.read_json("./test.json", lines=True)

In [4]:
# Submission template shipped with the data set (not used further in the visible cells).
sample_submission_df = pd.read_csv("./sample_submission.csv")

## Preprocess data

In [5]:
# Every character the tokenizer is fitted on.
# NOTE(review): presumably "()." are structure symbols, "ACGU" the bases and
# "BEHIMSX" the loop-type codes — confirm against the data description.
symbols = "().ACGUBEHIMSX"

# Input columns, in the order process_features expects them.
feat_cols = [
    "sequence",
    "structure",
    "predicted_loop_type",
]

# Columns the model is trained to predict.
target_cols = [
    "reactivity",
    "deg_Mg_pH10",
    "deg_Mg_50C",
    "deg_pH10",
    "deg_50C",
]

# Error columns matching target_cols (not used in the visible cells).
error_cols = [
    "reactivity_error",
    "deg_error_Mg_pH10",
    "deg_error_Mg_50C",
    "deg_error_pH10",
    "deg_error_50C",
]

In [6]:
# Character-level tokenizer with filtering disabled (filters=""), fitted only
# on the fixed symbol alphabet rather than on the data itself.
tokenizer = Tokenizer(char_level=True, filters="")
tokenizer.fit_on_texts(symbols)

In [7]:
# get the number of elements in the vocabulary; used as the embedding layer's input_dim
# NOTE(review): the + 1 presumably accounts for index 0, which the Keras
# Tokenizer does not assign to any token — confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1

In [8]:
def process_features(example):
    """Tokenize one example's three feature strings into a nested list.

    ``example`` is indexed as ``[sequence, structure, predicted_loop_type]``.
    Each entry is passed to the module-level ``tokenizer``; the three token
    arrays are stacked along a new axis 1, the last axis is moved to the
    front, and the first element along that leading axis is returned as a
    plain Python list.

    NOTE(review): this assumes the char-level tokenizer turns a string of
    length L into L single-token lists, so the result holds one
    ``[sequence, structure, loop]`` token triple per position — confirm.
    """
    seq_text, struct_text, loop_text = example[0], example[1], example[2]

    # transform character sequences into number sequences, one array per feature
    token_columns = [
        np.array(tokenizer.texts_to_sequences(text))
        for text in (seq_text, struct_text, loop_text)
    ]

    # place the three features side by side, then bring the last axis forward
    stacked = np.stack(token_columns, axis=1)
    reordered = np.transpose(stacked, (2, 0, 1))

    # only the first entry along the leading axis is handed back
    return reordered.tolist()[0]

In [9]:
def process_labels(df):
    """Build the label array used for training.

    Reads the ``target_cols`` columns of ``df`` and returns a NumPy array in
    which the last two axes of the nested column values are swapped.

    NOTE(review): presumably each cell holds a per-position list, giving
    (examples, targets, positions) before the swap and
    (examples, positions, targets) after it — confirm against the data.

    ``df`` is only read, never modified, so no defensive copy is made.
    """
    # .values.tolist() turns the cells into nested Python lists so that
    # np.array can assemble them into one 3-D array
    labels = np.array(df[target_cols].values.tolist())

    return np.transpose(labels, (0, 2, 1))

In [10]:
# Split the test set by sequence length: rows of length 107 form the
# "public" set, rows of length 130 the "private" set.
public_test_df = test_df[test_df["seq_length"] == 107]
private_test_df = test_df[test_df["seq_length"] == 130]

In [11]:
# Training inputs are tokenized up front; labels come from the target columns.
x_train = [
    process_features(features)
    for features in train_df[feat_cols].values.tolist()
]
y_train = process_labels(train_df)

# Test inputs stay as raw strings here; they are tokenized right before
# prediction and are also sent untouched to the served model.
unprocessed_x_public_test = public_test_df[feat_cols].values.tolist()
unprocessed_x_private_test = private_test_df[feat_cols].values.tolist()

## Define and train the model

In [12]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence.

    ``hidden_dim`` is the number of GRU units and ``dropout`` the dropout
    rate passed to the GRU; the kernel uses the 'orthogonal' initializer.
    """
    recurrent = tf.keras.layers.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

In [13]:
def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that outputs the full sequence.

    ``hidden_dim`` is the number of LSTM units and ``dropout`` the dropout
    rate passed to the LSTM; the kernel uses the 'orthogonal' initializer.
    """
    recurrent = tf.keras.layers.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

In [14]:
def build_model(vocab_size, seq_length=TRAIN_SEQUENCE_LENGTH, pred_len=68,
                embed_dim=EMBED_DIM,
                hidden_dim=HIDDEN_DIM, dropout=DROPOUT, sp_dropout=SP_DROPOUT):
    """Assemble the embedding -> GRU -> LSTM -> dense regression network.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        seq_length: number of positions in each input; inputs have shape
            ``(seq_length, 3)``, one token per feature column.
        pred_len: number of leading positions for which outputs are produced.
        embed_dim: embedding size per token.
        hidden_dim: units of each recurrent layer.
        dropout: dropout rate of the recurrent layers.
        sp_dropout: rate of the spatial dropout applied before them.

    Returns:
        An uncompiled ``tf.keras.Model`` whose output has 5 values for each
        of the first ``pred_len`` positions.
    """
    layers = tf.keras.layers

    model_input = layers.Input(shape=(seq_length, 3))
    embedded = layers.Embedding(
        input_dim=vocab_size, output_dim=embed_dim
    )(model_input)

    # merge the per-feature embeddings of each position into a single vector
    merged_width = embedded.shape[2] * embedded.shape[3]
    merged = tf.reshape(embedded, shape=(-1, embedded.shape[1], merged_width))

    features = layers.SpatialDropout1D(sp_dropout)(merged)
    features = gru_layer(hidden_dim, dropout)(features)
    features = lstm_layer(hidden_dim, dropout)(features)

    # keep only the positions that get a prediction
    scored_positions = features[:, :pred_len]
    predictions = layers.Dense(5, activation="linear")(scored_positions)

    return tf.keras.Model(inputs=model_input, outputs=predictions)

In [15]:
# Training model: uses build_model's defaults (seq_length=107, pred_len=68).
model = build_model(vocab_size)

In [16]:
class MeanColumnwiseRMSE(tf.keras.losses.Loss):
    """Loss that averages per-column root-mean-squared errors.

    For every sample the squared error is averaged over axis 1, the square
    root is taken, and the resulting values are averaged over the remaining
    non-batch axis.

    NOTE(review): with the model in this notebook, axis 1 is presumably the
    sequence position and the last axis the 5 target columns — confirm.
    """

    def __init__(self, name='MeanColumnwiseRMSE'):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return one loss value per sample."""
        squared_error = tf.square(y_true - y_pred)
        per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
        return tf.reduce_mean(per_column_rmse, axis=1)

In [17]:
# Adam with learning rate LR, optimizing the custom column-wise RMSE loss.
model.compile(tf.optimizers.Adam(learning_rate=LR), loss=MeanColumnwiseRMSE())

In [18]:
# Train the model; validation_split=.1 sets aside a .1 fraction of the
# training data for validation, which is what produces "val_loss" in history.
history = model.fit(np.array(x_train), np.array(y_train), 
                    validation_split=.1, batch_size=BATCH_SIZE, epochs=EPOCHS)



In [19]:
# Validation loss of the final training epoch. history.history holds one
# entry per epoch, so index -1 stays correct if EPOCHS is raised above 1
# (index 0 would report the first epoch instead). Direct key access raises a
# clear KeyError if no validation loss was recorded.
validation_loss = history.history["val_loss"][-1]

## Evaluate the model

In [20]:
# The training model only outputs the first 68 positions. For inference,
# rebuild the network so pred_len equals the full sequence length of each
# test set, then copy the trained weights into it. This relies on the layer
# weights not depending on the sequence length.
model_public = build_model(vocab_size, seq_length=107, pred_len=107)
model_public.set_weights(model.get_weights())

model_private = build_model(vocab_size, seq_length=130, pred_len=130)
model_private.set_weights(model.get_weights())

In [21]:
# Tokenize the raw test examples one by one and predict with the model built
# for the matching sequence length.
public_preds = model_public.predict(np.array([process_features(x) for x in unprocessed_x_public_test]))
private_preds = model_private.predict(np.array([process_features(x) for x in unprocessed_x_private_test]))

In [22]:
# Show the validation loss recorded during training.
print(validation_loss)

0.6244146823883057


In [23]:
from kale.common.serveutils import serve


In [24]:
# Hand the trained model to Kale's serve helper. process_features is passed
# as the preprocessing function and the tokenizer as its asset, so requests
# can carry raw feature strings.
kfserver = serve(model, preprocessing_fn=process_features, preprocessing_assets={'tokenizer': tokenizer})


2021-03-03 21:53:30 Kale serveutils:217       [INFO]     Starting serve procedure for model '<tensorflow.python.keras.engine.functional.Functional object at 0x7f8bf402bc88>'
2021-03-03 21:53:30 Kale jputils:324          [INFO]     Retrieving absolute path of the active notebook
2021-03-03 21:53:31 Kale marshalling          [INFO]     Saving function object using Function backend: transformer_function
2021-03-03 21:53:31 Kale marshalling          [INFO]     Saving generic object using Default backend: tokenizer
2021-03-03 21:53:31 Kale podutils:82          [INFO]     Getting the current container name...
2021-03-03 21:53:31 Kale podutils:88          [INFO]     Using NB_PREFIX env var '/notebook/kubeflow-user/vaccine-demo'. Container name: 'vaccine-demo'
2021-03-03 21:53:31 Kale serveutils:242       [INFO]     Model is contained in volume 'workspace-vaccine-demo-2gn5px437'
2021-03-03 21:53:31 Kale marshalling          [INFO]     Saving tensorflow object using Tensorflow backend: model


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
[INFO]:tensorflow:Assets written to: /home/jovyan/.kale.kfserving.model.dir/model.tfkeras/1/assets


2021-03-03 21:53:55 Kale serveutils:247       [INFO]     Model saved successfully at '/home/jovyan/.kale.kfserving.model.dir/model.tfkeras'
2021-03-03 21:53:55 Kale rokutils:60          [INFO]     Taking a snapshot of PVC workspace-vaccine-demo-2gn5px437 in namespace kubeflow-user ...
2021-03-03 21:53:55 Kale rokutils:259         [INFO]     Creating Rok bucket 'serving'...
2021-03-03 21:53:55 Kale rokutils:276         [INFO]     Successfully created Rok bucket 'serving'
2021-03-03 21:54:03 Kale rokutils:79          [INFO]     Successfully took Rok snapshot
2021-03-03 21:54:03 Kale rokutils:219         [INFO]     Creating new PVC 'vaccine-demo-0-wqsfk-pvc-7wjwp' from Rok version 62ec69f1-ceab-4045-8094-9c3db034cbef ...
2021-03-03 21:54:03 Kale rokutils:231         [INFO]     Using Rok url: http://rok.rok.svc.cluster.local/swift/v1/kubeflow-user/serving/workspace-vaccine-demo-2gn5px437?version=62ec69f1-ceab-4045-8094-9c3db034cbef
2021-03-03 21:54:03 Kale rokutils:253         [INFO]     S

In [25]:
# Request payload: the raw (untokenized) public test examples under the "instances" key.
data = json.dumps({"instances": unprocessed_x_public_test})


In [None]:
# Send the same payload to the served model every 5 seconds.
# The loop has no exit condition; interrupt the kernel to stop it.
while True:
    kfserver.predict(data)
    sleep(5)

2021-03-03 21:54:34 Kale serveutils:152       [INFO]     Sending a request to the InferenceService...
2021-03-03 21:54:34 Kale serveutils:153       [INFO]     Getting InferenceService's host...
2021-03-03 21:54:34 Kale serveutils:156       [INFO]     Sending request to InferenceService...
2021-03-03 21:54:43 Kale serveutils:162       [INFO]     Response: {"predictions": [[[0.884521067, 0.979000449, 0.981851, 1.6084671, 0.958244] ..... 324251], [0.334864646, 0.464393079, 0.391387492, 0.3629722, 0.351089269]]]}
2021-03-03 21:54:48 Kale serveutils:152       [INFO]     Sending a request to the InferenceService...
2021-03-03 21:54:48 Kale serveutils:153       [INFO]     Getting InferenceService's host...
2021-03-03 21:54:48 Kale serveutils:156       [INFO]     Sending request to InferenceService...
2021-03-03 21:54:55 Kale serveutils:162       [INFO]     Response: {"predictions": [[[0.884521067, 0.979000449, 0.981851, 1.6084671, 0.958244] ..... 324251], [0.334864646, 0.464393079, 0.39138749