In [1]:
import itertools

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.metrics import MeanSquaredError
from keras.callbacks import EarlyStopping

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense

from matplotlib import pyplot as plt

Using TensorFlow backend.


In [2]:
# Load the train/test tables from the covid19v7 input directory, then the
# competition's sample submission file that will later receive predictions.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19v7/train_data.csv',
        '../input/covid19v7/test_data.csv',
    )
)
submission = pd.read_csv('../input/stanford-covid-vaccine/sample_submission.csv')

In [3]:
# (rows, columns) of the train and test tables, in that order.
tuple(frame.shape for frame in (train_data, test_data))

((142596, 87), (457953, 77))

In [4]:
# Identifier columns carry no predictive signal and are removed from both tables.
id_columns = ['id', 'id_seqpos']
# The training table additionally drops 'deg_50C' and 'deg_pH10', which are
# not among the three targets selected for Y_train later in the notebook.
unused_columns = id_columns + ['deg_50C', 'deg_pH10']

# Use the named lists instead of repeating the literals, so the column set is
# defined in exactly one place.
# NOTE: both frames are rebound in place; re-running this cell without
# reloading the data raises KeyError because the columns are already gone.
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=id_columns)

In [5]:
# Split the training table into the three regression targets and the
# remaining (feature) columns.
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
Y_train = train_data[target_cols]
X_train = train_data.drop(columns=target_cols)

In [6]:
# Collect every feature column whose name contains one of the categorical
# feature names, grouped in the order the names are listed below.
# NOTE(review): the next cell rebuilds the same list before fitting the
# encoder, so this cell is redundant.
enc_targets = ['sequence', 'structure', 'predicted_loop_type']
cat_cols = [
    col
    for target in enc_targets
    for col in X_train.columns
    if target in col
]

In [7]:
# One-hot encode the categorical feature columns.
enc_targets = ['sequence', 'structure', 'predicted_loop_type']
# Every column whose name contains one of the categorical feature names,
# grouped in the order of enc_targets.
cat_cols = [c for t in enc_targets for c in X_train.columns if t in c]

# handle_unknown='ignore': a category seen only at transform time is encoded
# as an all-zero group instead of raising.
# dtype=np.float32: the encoded values are only 0/1, so float32 is exact and
# halves the memory of the dense matrices created by .toarray() below
# (the encoder's default is float64).
ohe = OneHotEncoder(handle_unknown='ignore', dtype=np.float32)
ohe.fit(X_train[cat_cols])

# NOTE: X_train is rebound from a DataFrame to a dense ndarray here, so this
# cell cannot be re-run without first re-running the cells that build X_train.
# Only the columns in cat_cols are carried forward into the model inputs.
X_train = ohe.transform(X_train[cat_cols]).toarray()
test = ohe.transform(test_data[cat_cols]).toarray()

In [8]:
# Cross-validation configuration.
# NOTE(review): EPOCHS is not referenced by the model.fit call visible in this
# notebook, which passes epochs=100 directly.
FOLD_N, EPOCHS = 5, 25

# Splitter producing FOLD_N train/validation partitions (shuffle left at its default).
kf = KFold(FOLD_N)

In [9]:
# Sanity check before training: shapes first, then container types, for the
# feature matrix and the targets.
xy_shapes = tuple(arr.shape for arr in (X_train, Y_train))
xy_types = tuple(type(arr) for arr in (X_train, Y_train))
xy_shapes + xy_types

((142596, 383), (142596, 3), numpy.ndarray, pandas.core.frame.DataFrame)

In [10]:
# Convert the targets to a float32 ndarray so they can be indexed with the
# integer fold indices and fed to the network.
# np.asarray accepts both a DataFrame and an ndarray, so unlike
# `Y_train.values...` this cell can be re-run after Y_train has already been
# converted without raising AttributeError.
Y_train = np.asarray(Y_train, dtype=np.float32)

In [11]:
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over all elements of the batch.

    Args:
        y_true: tensor of ground-truth values.
        y_pred: tensor of predictions, broadcast-compatible with ``y_true``.

    Returns:
        Scalar tensor ``sqrt(mean((y_pred - y_true) ** 2))``.

    Note:
        The previous implementation divided this value by 3, so the number
        logged under this metric's name was one third of the actual RMSE.
        The metric is only reported during training (the loss is 'mse'), so
        removing the division does not affect optimisation or early stopping.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

def get_model(input_dim=383, n_outputs=3, output_activation="relu"):
    """Build and compile the fully connected regression network.

    Architecture: BatchNorm -> Dense(400, relu) -> BatchNorm -> Dropout(0.5)
    -> Dense(40, relu) -> BatchNorm -> Dropout(0.1) -> Dense(n_outputs).

    Args:
        input_dim: number of input features. Defaults to 383, the width of
            the one-hot encoded feature matrix shown earlier in the notebook.
        n_outputs: number of regression targets. Defaults to 3.
        output_activation: activation of the final layer. Defaults to "relu"
            to preserve the existing behaviour.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        MSE loss, and ``root_mean_squared_error`` as a reported metric.
    """
    model = tf.keras.Sequential([
        # Pass the shape as a tuple (the documented form) rather than a bare int.
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(400, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(40, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        # NOTE(review): a relu output cannot produce negative predictions;
        # confirm the targets are non-negative, otherwise pass
        # output_activation="linear".
        tf.keras.layers.Dense(n_outputs, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=[root_mean_squared_error])
    return model

In [12]:
# Stop a fold's training once the validation loss has not improved for
# `patience` consecutive epochs.
# - Uses the tf.keras callback so it comes from the same Keras implementation
#   as the tf.keras model it is attached to (the model is built with
#   tf.keras.Sequential, not standalone keras).
# - verbose is a 0/1 flag; 1 prints the "early stopping" message.
# - restore_best_weights=True rolls the model back to the epoch with the
#   lowest val_loss; without it the test predictions are made with the
#   weights from `patience` epochs after the best one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=1,
    mode='auto',
    restore_best_weights=True
)

In [13]:
# Fold-averaged test predictions: one row per test sample, one column per
# target. Shapes are derived from the arrays actually used below instead of
# being hard-coded.
preds = np.zeros((test.shape[0], Y_train.shape[1]))

# Divide by the splitter's own fold count so the average stays correct even
# if the KFold configuration is changed independently of FOLD_N.
n_folds = kf.get_n_splits()
batch_size = 8192  # shared by training and inference

for n, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    x_train, y_train = X_train[train_idx], Y_train[train_idx]
    x_val, y_val = X_train[val_idx], Y_train[val_idx]

    print(f'Training fold #{n}')
    # Drop the Keras state left behind by the previous fold's model so memory
    # does not grow with every fold.
    K.clear_session()
    model = get_model()
    results = model.fit(
        x_train,
        y_train,
        epochs=100,
        batch_size=batch_size,
        validation_data=(x_val, y_val),
        callbacks=[early_stopping]
    )

    # Predict in large batches; the default batch size of 32 would need
    # thousands of steps for the full test matrix.
    pred = model.predict(test, batch_size=batch_size)
    preds += pred / n_folds

Training fold #0
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 00061: early stopping
Training fold #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 1

In [14]:
# (number of test rows, number of targets) of the averaged predictions.
np.shape(preds)

(457953, 3)

In [15]:
# Copy each column of the averaged predictions into the matching submission
# column; the other submission columns are left untouched.
# NOTE(review): assumes submission rows are in the same order as test_data —
# TODO confirm.
for col_idx, col_name in enumerate(['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']):
    submission[col_name] = preds[:, col_idx]

submission

Unnamed: 0,id_seqpos,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,id_00073f8be_0,0.530633,0.540044,0.0,0.454679,0.0
1,id_00073f8be_1,2.203715,3.097908,0.0,2.905589,0.0
2,id_00073f8be_2,1.273226,0.762898,0.0,0.978819,0.0
3,id_00073f8be_3,1.203956,0.999577,0.0,1.345029,0.0
4,id_00073f8be_4,0.769264,0.598108,0.0,0.770769,0.0
...,...,...,...,...,...,...
457948,id_ffda94f24_125,0.317523,0.465173,0.0,0.540906,0.0
457949,id_ffda94f24_126,0.436340,0.520894,0.0,0.589646,0.0
457950,id_ffda94f24_127,0.642887,0.360048,0.0,0.390965,0.0
457951,id_ffda94f24_128,0.394461,0.373741,0.0,0.401873,0.0


In [16]:
# Persist the filled-in submission to the working directory, without the
# DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)