In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Root folder that every data path in this notebook is built from.
# NOTE(review): this is a relative path, so it only points at the mounted
# drive when the working directory is /content -- confirm before reuse.
data_dir = 'drive/My Drive'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE: this cell started life as the Kaggle notebook template. In this copy
# the data is read from a Google Drive folder rather than Kaggle's "../input/".
# Core numeric / tabular packages used throughout the notebook:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Walk the Drive copy of the Kaggle input folder and print the path of every
# file found, as a quick check that the expected data is present.

import os
for dirname, _, filenames in os.walk('drive/My Drive/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Kaggle template note (applies to Kaggle only): up to 5GB may be written to /kaggle/working/.
# Kaggle template note (applies to Kaggle only): /kaggle/temp/ is not preserved between sessions.

In [None]:
from collections import OrderedDict
from fastprogress import progress_bar
from pathlib import Path
from sklearn.model_selection import train_test_split, ShuffleSplit, GroupKFold
from torch import nn
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

import functools
import os
import pandas as pd
import random
import shutil
import torch
import torch.nn.functional as F


# Columns used as regression labels (see `preprocess`).
target_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]
# String-valued columns that are one-hot encoded into model inputs.
input_cols = [
    'sequence',
    'structure',
    'predicted_loop_type',
]
# Measurement-error columns that accompany the targets.
error_cols = [
    'reactivity_error',
    'deg_error_Mg_pH10',
    'deg_error_Mg_50C',
    'deg_error_pH10',
    'deg_error_50C',
]

# For every input column: symbol -> one-hot index, in alphabet order.
token_dicts = {
    column: {symbol: index for index, symbol in enumerate(alphabet)}
    for column, alphabet in (
        ("sequence", "ACGU"),
        ("structure", '().'),
        ("predicted_loop_type", "BEHIMSX"),
    )
}

def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator, PyTorch
    (``torch.manual_seed`` and ``torch.cuda.manual_seed``) and exports the
    value as ``PYTHONHASHSEED`` in ``os.environ``.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
from sklearn.model_selection import train_test_split, ShuffleSplit
from torch import nn
from torch.utils.data import Dataset

import functools


# Folder holding the competition files (train/test JSON and the bpps/ arrays).
BASE_PATH = f"{data_dir}/kaggle/input/stanford-covid-vaccine"
# Folder where per-epoch checkpoints and the optimizer state are written.
MODEL_SAVE_PATH = f"{data_dir}/kaggle/input/stanford-covid-vaccine/model"


def preprocess_inputs(df, cols):
    """One-hot encode several string columns and join them feature-wise.

    Args:
        df: DataFrame containing every column named in ``cols``.
        cols: Column names to encode with ``preprocess_feature_col``.

    Returns:
        Array whose last axis is the concatenation of the per-column
        one-hot encodings (axis 2).
    """
    encoded_columns = []
    for col in cols:
        encoded_columns.append(preprocess_feature_col(df, col))
    return np.concatenate(encoded_columns, axis=2)


def preprocess_feature_col(df, col):
    """One-hot encode one column of equal-length token strings.

    Args:
        df: DataFrame holding the column. Any index is accepted; rows are
            read positionally.
        col: Column name; must be a key of ``token_dicts``.

    Returns:
        Float array of shape ``(len(df), seq_length, len(token_dicts[col]))``.

    Raises:
        KeyError: If ``col`` is not in ``token_dicts`` or a string contains a
            symbol missing from the column's vocabulary.
        AssertionError: If the encoded array does not have the expected shape
            (e.g. the strings differ in length).
    """
    dic = token_dicts[col]
    dic_len = len(dic)
    # Positional access: the frame may carry a filtered / non-zero-based index,
    # in which case a label lookup of 0 would fail or pick the wrong row.
    seq_length = len(df[col].iloc[0])
    ident = np.identity(dic_len)
    # Row k of the identity matrix is the one-hot vector for token index k, so
    # indexing with the list of token indices encodes a whole string at once.
    arr = np.array([ident[[dic[x] for x in seq]] for seq in df[col]])
    # shape: data_size x seq_length x dic_length
    assert arr.shape == (len(df), seq_length, dic_len)
    return arr


def preprocess(base_data, is_test=False):
    """Turn a raw DataFrame into model inputs and (optionally) labels.

    Args:
        base_data: DataFrame with the columns in ``input_cols`` and, unless
            ``is_test`` is set, the columns in ``target_cols``.
        is_test: When true no labels are built.

    Returns:
        ``(inputs, labels)``. ``inputs`` is the one-hot encoding with 14
        channels (4 + 3 + 7 symbols of the three vocabularies). ``labels`` is
        ``None`` for test data, otherwise an array laid out as
        sample x position x target, truncated to the first 68 positions.
    """
    inputs = preprocess_inputs(base_data, input_cols)
    labels = None
    if not is_test:
        # Stored as sample x target x position; swap to sample x position x target.
        raw_targets = np.array(base_data[target_cols].values.tolist())
        labels = raw_targets.transpose((0, 2, 1))[:, :68]
        assert labels.shape[2] == len(target_cols)
    assert inputs.shape[2] == 14
    return inputs, labels


def get_bpp_feature(bpp):
    """Derive three per-position summary features from a pairing matrix.

    Args:
        bpp: Tensor indexed as batch x position x position.

    Returns:
        List of three tensors, each batch x position x 1: the maximum over the
        last axis, the sum over the last axis, and the standardised fraction
        of strictly positive entries (counted along axis 1).
    """
    bpp_nb_mean = 0.077522  # mean of bpps_nb across all training data
    bpp_nb_std = 0.08914  # std of bpps_nb across all training data
    strongest = bpp.max(-1).values
    total = bpp.sum(-1)
    positive_fraction = torch.true_divide((bpp > 0).sum(dim=1), bpp.shape[1])
    positive_fraction = torch.true_divide(positive_fraction - bpp_nb_mean, bpp_nb_std)
    return [feature.unsqueeze(2) for feature in (strongest, total, positive_fraction)]


@functools.lru_cache(5000)
def load_from_id(id_):
    """Load the ``.npy`` array stored for one sample id under ``BASE_PATH``.

    Results are memoised (up to 5000 ids), so repeated calls return the very
    same array object; callers that need to modify it should copy it first.

    Args:
        id_: Sample identifier used as the file stem.

    Returns:
        The array read from ``<BASE_PATH>/bpps/<id_>.npy``.
    """
    npy_file = Path(BASE_PATH).joinpath(f"bpps/{id_}.npy")
    return np.load(str(npy_file))


def get_distance_matrix(leng):
    """Build inverse-distance features between all pairs of positions.

    For positions ``i`` and ``j`` the base value is ``1 / (|i - j| + 1)``;
    the three output channels hold that value raised to the powers 1, 2 and 4.

    Args:
        leng: Number of positions (may be 0).

    Returns:
        Float array of shape ``(1, leng, leng, 3)``.
    """
    idx = np.arange(leng)
    # Broadcasting yields the full |i - j| table without a Python-level loop.
    gaps = np.abs(idx[:, None] - idx[None, :])
    inverse = (1 / (gaps + 1))[None, :, :]
    Ds = np.stack([inverse ** power for power in (1, 2, 4)], axis=3)
    # Shape is echoed on every call, matching the recorded notebook output.
    print(Ds.shape)
    return Ds


def get_structure_adj(df):
    """Build a base-pair adjacency matrix for every row of ``df``.

    The dot-bracket string in ``structure`` is parsed with a stack: each ``)``
    is paired with the most recent unmatched ``(``. Pairs are recorded per
    nucleotide combination and the six per-combination matrices are then
    summed into a single channel.

    Args:
        df: DataFrame with ``seq_length``, ``structure`` and ``sequence``
            columns.

    Returns:
        Array stacking one ``seq_length x seq_length x 1`` matrix per row.

    Raises:
        IndexError: If a ``)`` has no matching ``(``.
        KeyError: If a paired nucleotide combination is not one of the six
            combinations listed below.
    """
    pair_types = [
        ("A", "U"),
        ("C", "G"),
        ("U", "G"),
        ("U", "A"),
        ("G", "C"),
        ("G", "U"),
    ]
    adjacency = []
    for row in range(len(df)):
        seq_length = df["seq_length"].iloc[row]
        structure = df["structure"].iloc[row]
        sequence = df["sequence"].iloc[row]

        per_pair = OrderedDict(
            (pair, np.zeros([seq_length, seq_length])) for pair in pair_types
        )
        open_positions = []
        for pos in range(seq_length):
            symbol = structure[pos]
            if symbol == "(":
                open_positions.append(pos)
            elif symbol == ")":
                partner = open_positions.pop()
                # Mark the pair in both directions so the result is symmetric.
                per_pair[(sequence[partner], sequence[pos])][partner, pos] = 1
                per_pair[(sequence[pos], sequence[partner])][pos, partner] = 1

        stacked = np.stack(list(per_pair.values()), axis=2)
        adjacency.append(stacked.sum(axis=2, keepdims=True))

    return np.array(adjacency)


def create_loader(df, batch_size=1, is_test=False):
    """Preprocess ``df`` and wrap it in a ``DataLoader``.

    Args:
        df: Raw DataFrame accepted by ``preprocess`` and ``VacDataset``.
        batch_size: Number of samples per batch.
        is_test: When true no labels are produced.

    Returns:
        A ``DataLoader`` over a ``VacDataset``; batches are shuffled only when
        labels are present, and the last partial batch is kept.
    """
    features, labels = preprocess(df, is_test)
    features_tensor = torch.from_numpy(features)
    has_labels = labels is not None
    labels_tensor = torch.from_numpy(labels) if has_labels else None
    dataset = VacDataset(features_tensor, df, labels_tensor)
    return torch.utils.data.DataLoader(dataset, batch_size, shuffle=has_labels, drop_last=False)


class VacDataset(Dataset):
    """Dataset pairing one-hot sequence features with pairwise features.

    Each item is a dict with ``sequence`` (float feature tensor), ``bpp``
    (array whose last axis concatenates the loaded pairing matrix, the
    structure adjacency channel and the three distance channels) and ``ids``.
    Labelled datasets additionally provide ``label``, ``signal_to_noise`` and
    ``score``.

    Args:
        features: Tensor of per-sample input features.
        df: DataFrame with at least ``id``, ``seq_length``, ``structure`` and
            ``sequence`` columns (plus ``signal_to_noise`` when labels are
            given). It is read only; it is never modified.
        labels: Optional label tensor; ``None`` marks a test dataset.
    """

    def __init__(self, features, df, labels=None):
        self.features = features
        self.labels = labels
        self.test = labels is None
        self.ids = df["id"]
        self.structure_adj = get_structure_adj(df)
        # A single distance tensor is shared by all samples, so every row of
        # df is expected to have the same sequence length.
        self.distance_matrix = get_distance_matrix(self.structure_adj.shape[1])
        if "score" in df.columns:
            self.score = df["score"]
        else:
            # Default weight of 1.0 per sample, built without writing a new
            # column into the caller's DataFrame.
            self.score = pd.Series(1.0, index=df.index)
        self.signal_to_noise = None
        if not self.test:
            self.signal_to_noise = df["signal_to_noise"]
            assert self.features.shape[0] == self.labels.shape[0]
        else:
            assert self.ids is not None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        # `index` is a position (0..len-1), so use positional access; a label
        # lookup would break for frames whose index was not reset.
        sample_id = self.ids.iloc[index]
        # astype() copies, so the array cached by load_from_id is untouched.
        # assumes the stored array is position x position -- TODO confirm
        bpp = load_from_id(sample_id).astype(np.float32)
        adj = self.structure_adj[index]
        distance = self.distance_matrix[0]
        bpp = np.concatenate([bpp[:, :, None], adj, distance], axis=2)
        if self.test:
            return dict(sequence=self.features[index].float(), bpp=bpp, ids=sample_id)
        else:
            return dict(sequence=self.features[index].float(), bpp=bpp,
                        label=self.labels[index], ids=sample_id,
                        signal_to_noise=self.signal_to_noise.iloc[index],
                        score=self.score.iloc[index])


In [None]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

import math


class Conv1dStack(nn.Module):
    """Two 1-D convolution units with a residual connection around the second.

    Each unit is Conv1d (no bias) -> BatchNorm1d -> Dropout(0.1) -> LeakyReLU.
    The first unit maps ``in_dim`` to ``out_dim`` channels; the second keeps
    ``out_dim`` channels and its output is added to its input.
    """

    def __init__(self, in_dim, out_dim, kernel_size=3, padding=1, dilation=1):
        super().__init__()
        conv_options = dict(kernel_size=kernel_size, padding=padding,
                            dilation=dilation, bias=False)

        def make_unit(channels_in):
            # Layer order is kept fixed so checkpoint keys (conv.0, conv.1, ...)
            # stay the same.
            return nn.Sequential(
                nn.Conv1d(channels_in, out_dim, **conv_options),
                nn.BatchNorm1d(out_dim),
                nn.Dropout(0.1),
                nn.LeakyReLU(),
            )

        self.conv = make_unit(in_dim)
        self.res = make_unit(out_dim)

    def forward(self, x):
        projected = self.conv(x)
        return projected + self.res(projected)


class Conv2dStack(nn.Module):
    """Two 2-D convolution units with a residual connection around the second.

    Each unit is Conv2d (no bias) -> BatchNorm2d -> Dropout(0.1) -> LeakyReLU.
    The first unit maps ``in_dim`` to ``out_dim`` channels; the second keeps
    ``out_dim`` channels and its output is added to its input.
    """

    def __init__(self, in_dim, out_dim, kernel_size=3, padding=1, dilation=1):
        super().__init__()
        conv_options = dict(kernel_size=kernel_size, padding=padding,
                            dilation=dilation, bias=False)

        def make_unit(channels_in):
            # Layer order is kept fixed so checkpoint keys (conv.0, conv.1, ...)
            # stay the same.
            return nn.Sequential(
                nn.Conv2d(channels_in, out_dim, **conv_options),
                nn.BatchNorm2d(out_dim),
                nn.Dropout(0.1),
                nn.LeakyReLU(),
            )

        self.conv = make_unit(in_dim)
        self.res = make_unit(out_dim)

    def forward(self, x):
        projected = self.conv(x)
        return projected + self.res(projected)


class SeqEncoder(nn.Module):
    """Cascade of four ``Conv1dStack`` stages whose outputs are concatenated.

    The stages emit 128, 64, 32 and 32 channels; each stage consumes the
    previous stage's output, and all four outputs are joined on the channel
    axis, giving 256 channels.
    """

    def __init__(self, in_dim: int):
        super().__init__()
        self.conv0 = Conv1dStack(in_dim, 128, 3, padding=1)
        self.conv1 = Conv1dStack(128, 64, 6, padding=5, dilation=2)
        self.conv2 = Conv1dStack(64, 32, 15, padding=7, dilation=1)
        self.conv3 = Conv1dStack(32, 32, 30, padding=29, dilation=2)

    def forward(self, x):
        stage_outputs = []
        for stage in (self.conv0, self.conv1, self.conv2, self.conv3):
            x = stage(x)
            stage_outputs.append(x)
        # BATCH x 256 x seq_length
        return torch.cat(stage_outputs, dim=1)


class BppAttn(nn.Module):
    """Mix per-position features using learned pairwise weights.

    Sequence features go through a ``Conv1dStack`` and the 5-channel pairwise
    input through a ``Conv2dStack``; for every channel the resulting
    position x position matrix is multiplied with the sequence feature vector.
    """

    def __init__(self, in_channel: int, out_channel: int):
        super().__init__()
        self.conv0 = Conv1dStack(in_channel, out_channel, 3, padding=1)
        self.bpp_conv = Conv2dStack(5, out_channel)

    def forward(self, x, bpp):
        seq_features = self.conv0(x)        # BATCH x C x SEQ
        pair_weights = self.bpp_conv(bpp)   # BATCH x C x SEQ x SEQ
        # Treat each channel's features as a column vector: (SEQ x SEQ) @ (SEQ x 1).
        mixed = torch.matmul(pair_weights, seq_features.unsqueeze(-1))
        return mixed.squeeze(-1)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings, then apply dropout.

    Even feature indices hold sines and odd indices cosines of
    ``position * exp(-log(10000) * k / d_model)`` for even ``k``. The table is
    stored as the buffer ``pe`` with shape ``(max_len, 1, d_model)``, so the
    input is expected with the sequence axis first.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Middle axis of size 1 broadcasts over the batch dimension.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len, :])


class TransformerWrapper(nn.Module):
    """Transformer encoder exposing the call convention of an RNN layer.

    Takes batch-first input, returns ``(output, None)`` and offers a no-op
    ``flatten_parameters`` so it can be used interchangeably with
    ``nn.LSTM`` / ``nn.GRU``.
    """

    def __init__(self, dmodel=256, nhead=8, num_layers=2):
        super().__init__()
        # NOTE(review): `pos_encoder` is never used in forward() and is always
        # built with width 256 regardless of `dmodel`. It is kept because its
        # buffer is part of the state_dict of saved checkpoints.
        self.pos_encoder = PositionalEncoding(256)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model=dmodel, nhead=nhead), num_layers
        )
        self.pos_emb = PositionalEncoding(dmodel)

    def flatten_parameters(self):
        """No-op; present only for interface parity with RNN modules."""
        pass

    def forward(self, x):
        # batch-first -> sequence-first for the positional table and encoder
        seq_first = x.transpose(0, 1).contiguous()
        encoded = self.transformer_encoder(self.pos_emb(seq_first))
        # back to batch-first
        return encoded.transpose(0, 1).contiguous(), None


class RnnLayers(nn.Module):
    """Transformer encoder followed by a bidirectional LSTM and GRU.

    Dropout is applied before each recurrent layer. Both recurrent layers use
    ``dmodel // 2`` hidden units per direction, so the feature width stays
    ``dmodel`` throughout.
    """

    def __init__(self, dmodel, dropout=0.3, transformer_layers: int = 2):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.rnn0 = TransformerWrapper(dmodel, nhead=8, num_layers=transformer_layers)
        self.rnn1 = nn.LSTM(dmodel, dmodel // 2, batch_first=True, num_layers=1, bidirectional=True)
        self.rnn2 = nn.GRU(dmodel, dmodel // 2, batch_first=True, num_layers=1, bidirectional=True)

    def forward(self, x):
        self.rnn0.flatten_parameters()
        x, _ = self.rnn0(x)
        for recurrent in (self.rnn1, self.rnn2):
            if recurrent is None:
                continue
            recurrent.flatten_parameters()
            x, _ = recurrent(self.dropout(x))
        return x

    
class BaseAttnModel(nn.Module):
    """Shared backbone combining sequence features with pairwise features.

    The 14 one-hot channels are extended with the 3 ``get_bpp_feature``
    channels and one learned linear combination of those 17 (18 in total),
    encoded by a ``SeqEncoder``, mixed with the pairwise tensor through
    ``BppAttn``, and the two 256-channel streams are concatenated and passed
    through ``RnnLayers``.
    """

    def __init__(self, transformer_layers: int = 2):
        super().__init__()
        self.linear0 = nn.Linear(14 + 3, 1)
        self.seq_encoder_x = SeqEncoder(18)
        self.attn = BppAttn(256, 128)
        self.seq_encoder_bpp = SeqEncoder(128)
        self.seq = RnnLayers(256 * 2, dropout=0.3,
                             transformer_layers=transformer_layers)

    def forward(self, x, bpp):
        # Channel 0 of the pairwise tensor is the raw pairing matrix.
        bpp_features = get_bpp_feature(bpp[:, :, :, 0].float())
        x = torch.cat([x, *bpp_features], dim=-1)
        x = torch.cat([x, self.linear0(x)], dim=-1)
        # BATCH x 18 x seq_len
        x = x.permute(0, 2, 1).contiguous().float()
        # BATCH x 5 x seq_len x seq_len
        bpp = bpp.permute([0, 3, 1, 2]).contiguous().float()
        # BATCH x 256 x seq_len
        x = self.seq_encoder_x(x)
        # BATCH x 256 x seq_len
        bpp = self.seq_encoder_bpp(self.attn(x, bpp))
        # Both streams to BATCH x seq_len x 256, then joined to BATCH x seq_len x 512.
        merged = torch.cat(
            [x.permute(0, 2, 1).contiguous(), bpp.permute(0, 2, 1).contiguous()],
            dim=2,
        )
        return self.seq(merged)


class AEModel(nn.Module):
    """Auto-encoder head on top of ``BaseAttnModel``.

    Projects the 512-wide backbone output to 14 channels with a sigmoid, i.e.
    one probability per one-hot input channel.

    Args:
        transformer_layers: Number of transformer encoder layers in the
            backbone.
    """

    def __init__(self, transformer_layers: int = 2):
        super(AEModel, self).__init__()
        self.seq = BaseAttnModel(transformer_layers=transformer_layers)
        self.linear = nn.Sequential(
            nn.Linear(256 * 2, 14),
            nn.Sigmoid(),
        )

    def forward(self, x, bpp):
        x = self.seq(x, bpp)
        # The functional form defaults to training=True; tie it to the module
        # mode so that eval() really disables dropout.
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.linear(x)
        return x


class FromAeModel(nn.Module):
    """Regression model that reuses a (pre-trained) backbone.

    Args:
        seq: Backbone module called as ``seq(x, bpp)``; its output width must
            be ``2 * dmodel``.
        pred_len: Number of leading sequence positions to return.
        dmodel: Half of the backbone's output width.
    """

    def __init__(self, seq, pred_len=68, dmodel: int = 256):
        super().__init__()
        self.seq = seq
        self.pred_len = pred_len
        # Kept inside a Sequential so the checkpoint key stays "linear.0.*".
        self.linear = nn.Sequential(nn.Linear(dmodel * 2, len(target_cols)))

    def forward(self, x, bpp):
        hidden = self.seq(x, bpp)
        # One value per target column, truncated to the first pred_len positions.
        return self.linear(hidden)[:, :self.pred_len]

In [None]:
# Fall back to the CPU when no GPU is present (same rule as the training cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64
# Only train_mix.json is used; an earlier read of train.json was overwritten
# before being used and has been dropped.
base_train_data = pd.read_json(str(Path(BASE_PATH) / 'train_mix.json'), lines=True)
base_test_data = pd.read_json(str(Path(BASE_PATH) / 'test.json'), lines=True)
# The test file mixes two sequence lengths; split them so that each dataset
# holds sequences of a single length.
public_df = base_test_data.query("seq_length == 107").copy()
private_df = base_test_data.query("seq_length == 130").copy()
print(f'train_df: {base_train_data.shape} ')
print(f"public_df: {public_df.shape}")
print(f"private_df: {private_df.shape}")
# Give the filtered frames a fresh 0..n-1 index.
public_df = public_df.reset_index()
private_df = private_df.reset_index()

# Set to True to run the whole notebook on 30 rows per frame.
debug = False
if debug:
    base_train_data = base_train_data[:30]
    public_df = public_df[:30]
    private_df = private_df[:30]

# Auto-encoder pre-training needs no labels, so all three frames (train
# included) are preprocessed as test data and wrapped in label-free datasets.
features, _ = preprocess(base_train_data, True)
print(f'train.shape:{features.shape}')
features_tensor = torch.from_numpy(features)
dataset0 = VacDataset(features_tensor, base_train_data, None)
features, _ = preprocess(public_df, True)
print(f'ppublic.shape:{features.shape}')
features_tensor = torch.from_numpy(features)
dataset1 = VacDataset(features_tensor, public_df, None)
features, _ = preprocess(private_df, True)
print(f'private.shape:{features.shape}')
features_tensor = torch.from_numpy(features)
dataset2 = VacDataset(features_tensor, private_df, None)

loader0 = torch.utils.data.DataLoader(dataset0, BATCH_SIZE, shuffle=False, drop_last=False)
loader1 = torch.utils.data.DataLoader(dataset1, BATCH_SIZE, shuffle=False, drop_last=False)
loader2 = torch.utils.data.DataLoader(dataset2, BATCH_SIZE, shuffle=False, drop_last=False)

train_df: (4800, 31) 
public_df: (629, 7)
private_df: (3005, 7)
train.shape:(4800, 107, 14)
(1, 107, 107, 3)
ppublic.shape:(629, 107, 14)
(1, 107, 107, 3)
private.shape:(3005, 130, 14)
(1, 130, 130, 3)


In [None]:
def learn_from_batch_ae(model, data, device):
    """Compute the reconstruction loss for one auto-encoder batch.

    The first 14 feature channels of a copy of the input are corrupted with
    ``F.dropout2d`` (p=0.3); the model must reproduce the uncorrupted
    channels, scored with binary cross-entropy.

    Args:
        model: Callable as ``model(sequence, bpp)`` returning probabilities.
        data: Batch dict with ``"sequence"`` and ``"bpp"`` tensors.
        device: Device the tensors are moved to before the forward pass.

    Returns:
        Scalar loss tensor (no backward pass is run here).
    """
    clean = data["sequence"]
    # Work on a copy so the batch itself keeps the clean values.
    corrupted = clean.clone()
    corrupted[:, :, :14] = F.dropout2d(corrupted[:, :, :14], p=0.3)
    target = clean[:, :, :14]
    reconstruction = model(corrupted.to(device), data["bpp"].to(device))
    return F.binary_cross_entropy(reconstruction, target.to(device))


def train_ae(model, train_data, optimizer, lr_scheduler, epochs=10, device="cpu",
             start_epoch: int = 0, start_it: int = 0, log_path: str = "./logs"):
    """Run auto-encoder training for ``epochs`` epochs.

    After every epoch the optimizer state is written to ``optimizer.pt`` and
    the model weights to ``model-<epoch>.pt`` under ``MODEL_SAVE_PATH``.

    Args:
        model: Model trained in place.
        train_data: Iterable of batches accepted by ``learn_from_batch_ae``.
        optimizer: Optimizer over the model parameters.
        lr_scheduler: Optional scheduler stepped after every batch.
        epochs: Number of epochs to run in this call.
        device: Device used for the forward pass.
        start_epoch: Number of the first epoch (used for checkpoint names).
        start_it: Iteration counter to continue from.
        log_path: Accepted for signature compatibility; not used here.

    Returns:
        Dict with ``end_epoch`` (first epoch number not run), ``it`` (updated
        iteration counter) and ``min_loss_epoch`` (epoch of this call with the
        lowest mean loss; losses of 10.0 or more never qualify).
    """
    print(f"device: {device}")
    checkpoint_dir = Path(MODEL_SAVE_PATH)
    end_epoch = start_epoch + epochs
    it = start_it
    min_loss, min_loss_epoch = 10.0, 0
    if not checkpoint_dir.exists():
        checkpoint_dir.mkdir(parents=True)
    for epoch in progress_bar(range(start_epoch, end_epoch)):
        print(f"epoch: {epoch}")
        model.train()
        epoch_losses = []
        for data in train_data:
            optimizer.zero_grad()
            loss = learn_from_batch_ae(model, data, device)
            loss.backward()
            # Clip the global gradient norm before the update.
            nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            if lr_scheduler:
                lr_scheduler.step()
            epoch_losses.append(loss.item())
            it += 1
        loss_m = np.mean(epoch_losses)
        if loss_m < min_loss:
            min_loss, min_loss_epoch = loss_m, epoch
        print(f'epoch: {epoch} loss: {loss_m}')
        torch.save(optimizer.state_dict(), str(checkpoint_dir / "optimizer.pt"))
        torch.save(model.state_dict(), str(checkpoint_dir / f"model-{epoch}.pt"))
    return dict(end_epoch=end_epoch, it=it, min_loss_epoch=min_loss_epoch)

In [None]:
import shutil

set_seed(123)
# Remove leftovers of earlier runs in the working directory (errors ignored).
# NOTE(review): checkpoints are written under MODEL_SAVE_PATH, not "./model".
shutil.rmtree("./model", True)
shutil.rmtree("./logs", True)
save_path = Path("./model_prediction")
if not save_path.exists():
    save_path.mkdir(parents=True)

lr_scheduler = None
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AEModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
res = dict(end_epoch=0, it=0, min_loss_epoch=0)
epochs = [5, 5, 5, 5]
# Four rounds; every round trains in turn on the train, public-test and
# private-test loaders while the epoch / iteration counters keep running.
for e in epochs:
    for ae_loader in (loader0, loader1, loader2):
        res = train_ae(model, ae_loader, optimizer, lr_scheduler, e, device=device,
                       start_epoch=res["end_epoch"], start_it=res["it"])

# Keep the best checkpoint reported by the last train_ae call.
epoch = res["min_loss_epoch"]
shutil.copyfile(str(Path(MODEL_SAVE_PATH) / f"model-{epoch}.pt"), "ae-model.pt")

device: cuda


epoch: 0
epoch: 0 loss: 0.2582839167118072
epoch: 1
epoch: 1 loss: 0.12696744680404662
epoch: 2
epoch: 2 loss: 0.06903558989365896
epoch: 3
epoch: 3 loss: 0.02616279828051726
epoch: 4
epoch: 4 loss: 0.017696437761187553
device: cuda


epoch: 5
epoch: 5 loss: 0.015756546799093486
epoch: 6
epoch: 6 loss: 0.015278169233351947
epoch: 7
epoch: 7 loss: 0.014119365438818931
epoch: 8
epoch: 8 loss: 0.012903992645442485
epoch: 9
epoch: 9 loss: 0.012181867565959691
device: cuda


epoch: 10
epoch: 10 loss: 0.018767028987566207
epoch: 11
epoch: 11 loss: 0.01596976442143638
epoch: 12
epoch: 12 loss: 0.01505153959101819
epoch: 13
epoch: 13 loss: 0.013968795934256087
epoch: 14
epoch: 14 loss: 0.013345466867247795
device: cuda


epoch: 15
epoch: 15 loss: 0.13020456763605276
epoch: 16
epoch: 16 loss: 0.12889247953891755
epoch: 17
epoch: 17 loss: 0.057808584024508794
epoch: 18
epoch: 18 loss: 0.029278151417771976
epoch: 19
epoch: 19 loss: 0.020809129625558854
device: cuda


epoch: 20
epoch: 20 loss: 0.015095696225762368
epoch: 21
epoch: 21 loss: 0.014818700961768627
epoch: 22
epoch: 22 loss: 0.013127184938639402
epoch: 23
epoch: 23 loss: 0.012269510608166456
epoch: 24
epoch: 24 loss: 0.012173028849065303
device: cuda


epoch: 25
epoch: 25 loss: 0.02123260589197595
epoch: 26
epoch: 26 loss: 0.017998483527372492
epoch: 27
epoch: 27 loss: 0.016924368534633454
epoch: 28
epoch: 28 loss: 0.015830750993274627
epoch: 29
epoch: 29 loss: 0.015244632206381635
device: cuda


epoch: 30
epoch: 30 loss: 0.015591737789412339
epoch: 31
epoch: 31 loss: 0.012986373603343964
epoch: 32
epoch: 32 loss: 0.010256844957669577
epoch: 33
epoch: 33 loss: 0.009709473450978596
epoch: 34
epoch: 34 loss: 0.009259459860622883
device: cuda


epoch: 35
epoch: 35 loss: 0.01332814022898674
epoch: 36
epoch: 36 loss: 0.024473259598016738
epoch: 37
epoch: 37 loss: 0.014301322400569916
epoch: 38
epoch: 38 loss: 0.011437801364809274
epoch: 39
epoch: 39 loss: 0.010174646135419607
device: cuda


epoch: 40
epoch: 40 loss: 0.016372297196946245
epoch: 41
epoch: 41 loss: 0.014222826610537285
epoch: 42
epoch: 42 loss: 0.013445869999680114
epoch: 43
epoch: 43 loss: 0.013015292069696366
epoch: 44
epoch: 44 loss: 0.01267763383765804
device: cuda


epoch: 45
epoch: 45 loss: 0.00792471387113134
epoch: 46
epoch: 46 loss: 0.007425774056464433
epoch: 47
epoch: 47 loss: 0.007014451672633489
epoch: 48
epoch: 48 loss: 0.00677653182297945
epoch: 49
epoch: 49 loss: 0.0064524308343728385
device: cuda


epoch: 50
epoch: 50 loss: 0.004919839184731245
epoch: 51
epoch: 51 loss: 0.0052539495751261715
epoch: 52
epoch: 52 loss: 0.004985286900773644
epoch: 53
epoch: 53 loss: 0.0047832226380705835
epoch: 54
epoch: 54 loss: 0.004769714502617717
device: cuda


epoch: 55
epoch: 55 loss: 0.012200576728804315
epoch: 56
epoch: 56 loss: 0.011222339968415016
epoch: 57
epoch: 57 loss: 0.011294469217512202
epoch: 58
epoch: 58 loss: 0.010726299929492016
epoch: 59
epoch: 59 loss: 0.01058207705933997


'ae-model.pt'

In [None]:
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error, one value per sample.

    Args:
        y_true: Tensor indexed as sample x position x column.
        y_pred: Tensor of the same shape.

    Returns:
        1-D tensor with one value per sample: the RMSE over positions is
        computed for each column and then averaged over the columns.
    """
    squared_error = (y_true - y_pred) ** 2
    per_column_rmse = squared_error.mean(dim=1).sqrt()
    return per_column_rmse.mean(dim=1)


def sn_mcrmse_loss(predict, target, signal_to_noise):
    """MCRMSE weighted per sample by the signal-to-noise ratio.

    Each sample's MCRMSE is multiplied by ``0.5 * log(signal_to_noise + 1.01)``
    before averaging over the batch.

    Args:
        predict: Predicted values, sample x position x column.
        target: Ground-truth values of the same shape.
        signal_to_noise: One value per sample.

    Returns:
        Scalar loss tensor.
    """
    per_sample_error = MCRMSE(target, predict)
    sample_weight = 0.5 * torch.log(signal_to_noise + 1.01)
    return (per_sample_error * sample_weight).mean()


def learn_from_batch(model, data, optimizer, lr_scheduler, device):
    """Run one optimisation step on a labelled batch.

    Args:
        model: Model called as ``model(sequence, bpp)``.
        data: Batch dict with ``sequence``, ``bpp``, ``label``,
            ``signal_to_noise`` and ``score`` entries.
        optimizer: Optimizer over the model parameters.
        lr_scheduler: Optional scheduler stepped after the update.
        device: Device the tensors are moved to.

    Returns:
        ``(out, loss)``: the model output and the loss tensor of this step.
    """
    optimizer.zero_grad()
    out = model(data["sequence"].to(device), data["bpp"].to(device))
    # The loss weight is based on signal-to-noise scaled by the sample score.
    sample_weight = (data["signal_to_noise"] * data["score"]).to(device)
    loss = sn_mcrmse_loss(out, data["label"].to(device), sample_weight)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if lr_scheduler:
        lr_scheduler.step()
    return out, loss


def evaluate(model, valid_data, device):
    """Evaluate ``model`` on a validation loader.

    Args:
        model: Model called as ``model(sequence, bpp)``.
        valid_data: Iterable of batch dicts with ``sequence``, ``bpp``,
            ``label`` and ``signal_to_noise`` entries.
        device: Device the tensors are moved to.

    Returns:
        Dict with ``loss`` (mean over batches of the signal-to-noise weighted
        loss) and ``mcmse`` (mean over batches of the MCRMSE restricted to
        samples with ``signal_to_noise > 1``). Batches without any such sample
        are left out of ``mcmse``; it is NaN only if no batch has one.

    Note:
        The model is switched to eval mode for the pass and put back into
        train mode before returning.
    """
    model.eval()
    loss_list = []
    mcrmse = []
    with torch.no_grad():
        for data in valid_data:
            label = data["label"].to(device)
            y = model(data["sequence"].to(device), data["bpp"].to(device))
            keep = data["signal_to_noise"] > 1
            # The mean of an empty selection is NaN and would poison the
            # average over batches, so such batches are skipped.
            if keep.any():
                mcrmse.append(MCRMSE(label, y)[keep.to(device)].mean().item())
            loss = sn_mcrmse_loss(y, label, data["signal_to_noise"].to(device))
            loss_list.append(loss.item())
    model.train()
    mcmse = np.mean(mcrmse) if mcrmse else float("nan")
    return dict(loss=np.mean(loss_list), mcmse=mcmse)


def train(model, train_data, valid_data, optimizer, lr_scheduler, epochs=10, device="cpu",
          start_epoch: int = 0, log_path: str = "./logs"):
    """Train ``model`` and track the epoch with the lowest validation loss.

    After every epoch the optimizer state is written to ``optimizer.pt`` and
    the model weights to ``model-<epoch>.pt`` under ``MODEL_SAVE_PATH``; the
    training loss and validation metrics are logged to TensorBoard.

    Args:
        model: Model trained in place.
        train_data: Iterable of labelled batches for ``learn_from_batch``.
        valid_data: Iterable of labelled batches for ``evaluate``.
        optimizer: Optimizer over the model parameters.
        lr_scheduler: Optional scheduler stepped after every batch.
        epochs: Number of epochs to run.
        device: Device used for the forward pass.
        start_epoch: Number of the first epoch (used for checkpoint names).
        log_path: Directory for the TensorBoard event files.

    Returns:
        The epoch number with the lowest validation loss, or ``None`` if no
        epoch produced a comparable loss (e.g. every loss was NaN, or
        ``epochs`` is 0).
    """
    print(f"device: {device}")
    model_save_path = Path(MODEL_SAVE_PATH)
    end_epoch = start_epoch + epochs
    if not model_save_path.exists():
        model_save_path.mkdir(parents=True)
    # Start from +inf so that any finite validation loss is accepted; a fixed
    # sentinel would silently ignore runs whose loss never drops below it.
    min_eval_loss = float("inf")
    min_eval_epoch = None
    it = 0
    writer = SummaryWriter(log_path)
    try:
        for epoch in progress_bar(range(start_epoch, end_epoch)):
            print(f"epoch: {epoch}")
            model.train()
            losses = []
            for data in train_data:
                _, loss = learn_from_batch(model, data, optimizer, lr_scheduler, device)
                loss_v = loss.item()
                writer.add_scalar('loss', loss_v, it)
                losses.append(loss_v)
                it += 1
            print(f'epoch: {epoch} loss: {np.mean(losses)}')

            eval_result = evaluate(model, valid_data, device)
            eval_loss = eval_result["loss"]
            # "<=" lets a later epoch win ties.
            if eval_loss <= min_eval_loss:
                min_eval_epoch = epoch
                min_eval_loss = eval_loss

            print(f"eval loss: {eval_loss} {eval_result['mcmse']}")
            writer.add_scalar("evaluate/loss", eval_loss, epoch)
            writer.add_scalar("evaluate/mcmse", eval_result["mcmse"], epoch)
            model.train()
            torch.save(optimizer.state_dict(), str(model_save_path / "optimizer.pt"))
            torch.save(model.state_dict(), str(model_save_path / f"model-{epoch}.pt"))
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
    print(f'min eval loss: {min_eval_loss} epoch {min_eval_epoch}')
    return min_eval_epoch

In [None]:
# Fall back to the CPU when no GPU is present (same rule as the pre-training cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128
base_train_data = pd.read_json(str(Path(BASE_PATH) / 'train_mix.json'), lines=True)
if debug:
    base_train_data = base_train_data[:30]
    public_df = public_df[:30]
    private_df = private_df[:30]
samples = base_train_data
save_path = Path("./model_prediction")
if not save_path.exists():
    save_path.mkdir(parents=True)
shutil.rmtree("./model", True)
shutil.rmtree("./logs", True)
# Folds are grouped by cluster_id so that one cluster never ends up in both
# the training and the validation part of a fold.
split = GroupKFold(n_splits=8)
ids = samples.reset_index()["id"]
set_seed(124)

for fold, (train_index, test_index) in enumerate(split.split(samples, groups=samples['cluster_id'])):
    print(f"fold: {fold}")
    # split() yields positional indices, so rows are selected with iloc.
    train_df = samples.iloc[train_index].reset_index()
    val_df = samples.iloc[test_index].reset_index()
    train_loader = create_loader(train_df, BATCH_SIZE)
    valid_loader = create_loader(val_df, BATCH_SIZE)
    print(train_df.shape, val_df.shape)
    # Every fold starts from the pre-trained auto-encoder backbone.
    ae_model = AEModel()
    # The freshly built model lives on the CPU; map the checkpoint there so it
    # loads regardless of the device it was saved from.
    state_dict = torch.load("./ae-model.pt", map_location="cpu")
    ae_model.load_state_dict(state_dict)
    del state_dict
    model = FromAeModel(ae_model.seq)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    lr_scheduler = None
    epoch = train(model, train_loader, valid_loader, optimizer, lr_scheduler, 75, device=device,
                  log_path=f"logs/{fold}")
    # Keep the checkpoint of the best validation epoch as this fold's model.
    shutil.copyfile(str(Path(MODEL_SAVE_PATH) / f"./model-{epoch}.pt"), f"model_prediction/model-{fold}.pt")
    del model, epoch

fold: 0
(1, 107, 107, 3)
(1, 107, 107, 3)
(4200, 32) (600, 32)
device: cuda


epoch: 0
epoch: 0 loss: 0.2924952431780247
eval loss: 0.2635189040450353 0.3433295459777229
epoch: 1
epoch: 1 loss: 0.22760803889956308
eval loss: 0.24504332105909352 0.3185245520010889
epoch: 2
epoch: 2 loss: 0.21245488998180745
eval loss: 0.23332763597895506 0.3025593468381362
epoch: 3
epoch: 3 loss: 0.2029159529466883
eval loss: 0.22238402332355842 0.2889165411133831
epoch: 4
epoch: 4 loss: 0.19643853763081748
eval loss: 0.22036325718336963 0.28646003806920717
epoch: 5
epoch: 5 loss: 0.1884725297168256
eval loss: 0.22144560971302135 0.2866093185245076
epoch: 6
epoch: 6 loss: 0.18579741204130654
eval loss: 0.20753758289314267 0.2690712846921237
epoch: 7
epoch: 7 loss: 0.17964915283902688
eval loss: 0.20381961248186956 0.26343530834770623
epoch: 8
epoch: 8 loss: 0.17339929318088818
eval loss: 0.19592044719978902 0.25305416507385364
epoch: 9
epoch: 9 loss: 0.17030904848258083
eval loss: 0.19168386340805674 0.24843940883486543
epoch: 10
epoch: 10 loss: 0.16617785386263415
eval loss: 0.1

epoch: 0
epoch: 0 loss: 0.30102878517459825
eval loss: 0.24132417214376436 0.34148704661791773
epoch: 1
epoch: 1 loss: 0.23169364193263417
eval loss: 0.21969521074754644 0.3138779505157798
epoch: 2
epoch: 2 loss: 0.21673823969302108
eval loss: 0.21391937836302635 0.3049003221912094
epoch: 3
epoch: 3 loss: 0.2074891005839611
eval loss: 0.20322759227707024 0.28950228340584994
epoch: 4
epoch: 4 loss: 0.19942908989268962
eval loss: 0.19454110937004926 0.2769943994188735
epoch: 5
epoch: 5 loss: 0.1932447907394902
eval loss: 0.19409374274098007 0.2773980404170817
epoch: 6
epoch: 6 loss: 0.18746649814029
eval loss: 0.18660644289894365 0.26626948118548943
epoch: 7
epoch: 7 loss: 0.18303010218795826
eval loss: 0.1812436290738759 0.2577355057932582
epoch: 8
epoch: 8 loss: 0.179483277114499
eval loss: 0.18110680150176703 0.2599754918094482
epoch: 9
epoch: 9 loss: 0.17515338510547301
eval loss: 0.1733839553461301 0.24763025276697864
epoch: 10
epoch: 10 loss: 0.17109428243254413
eval loss: 0.174206

epoch: 0
epoch: 0 loss: 0.3051312837188184
eval loss: 0.28116779034255013 0.34312402912082984
epoch: 1
epoch: 1 loss: 0.23351647013692534
eval loss: 0.2569834055283754 0.3125566196518881
epoch: 2
epoch: 2 loss: 0.22024619709138643
eval loss: 0.24260420802672839 0.2949681164701519
epoch: 3
epoch: 3 loss: 0.20818064927569635
eval loss: 0.23302874571749127 0.2811218996817665
epoch: 4
epoch: 4 loss: 0.19990130063951753
eval loss: 0.22017142336605708 0.266937674519735
epoch: 5
epoch: 5 loss: 0.1939676418739605
eval loss: 0.2183942152874121 0.26495000133669055
epoch: 6
epoch: 6 loss: 0.18724676318809214
eval loss: 0.20839738008651754 0.25094381663967025
epoch: 7
epoch: 7 loss: 0.18292769349711532
eval loss: 0.20570246537736728 0.24704873797869437
epoch: 8
epoch: 8 loss: 0.1780814278785172
eval loss: 0.2027801962274028 0.24264379617712165
epoch: 9
epoch: 9 loss: 0.174312899465519
eval loss: 0.2008731500084436 0.24117695695378433
epoch: 10
epoch: 10 loss: 0.17065709594805645
eval loss: 0.19465

epoch: 0
epoch: 0 loss: 0.3067063835083503
eval loss: 0.25582147896640384 0.31574720296695663
epoch: 1
epoch: 1 loss: 0.2313202247714641
eval loss: 0.2364101500905642 0.2923638420629241
epoch: 2
epoch: 2 loss: 0.21747806768933536
eval loss: 0.22851813220924316 0.2828271305877427
epoch: 3
epoch: 3 loss: 0.20879987884246226
eval loss: 0.21441484871196445 0.2650308753784726
epoch: 4
epoch: 4 loss: 0.20086790910884003
eval loss: 0.21477757201225028 0.2675734720384966
epoch: 5
epoch: 5 loss: 0.19317714677901945
eval loss: 0.2121343053095642 0.26322361824766966
epoch: 6
epoch: 6 loss: 0.18897476709153127
eval loss: 0.20417012964526035 0.25363185370646535
epoch: 7
epoch: 7 loss: 0.18306729859295154
eval loss: 0.20424669330043116 0.2531988913826574
epoch: 8
epoch: 8 loss: 0.17939346841441156
eval loss: 0.2064500027147682 0.25652440163087065
epoch: 9
epoch: 9 loss: 0.1787219097641026
eval loss: 0.20556929235053206 0.25508895912232654
epoch: 10
epoch: 10 loss: 0.17356529265071516
eval loss: 0.19

epoch: 0
epoch: 0 loss: 0.2908892051128341
eval loss: 0.2256426917826503 0.31487860411592355
epoch: 1
epoch: 1 loss: 0.2269795236343403
eval loss: 0.20881540983924268 0.29063708022641854
epoch: 2
epoch: 2 loss: 0.21208169810795355
eval loss: 0.20213940743058795 0.28250867428857224
epoch: 3
epoch: 3 loss: 0.20265059877417352
eval loss: 0.19851119267023093 0.2759562604874924
epoch: 4
epoch: 4 loss: 0.19413898470577148
eval loss: 0.19322525689224826 0.2702189884311731
epoch: 5
epoch: 5 loss: 0.1881211015346528
eval loss: 0.1917071578588924 0.2670564273685856
epoch: 6
epoch: 6 loss: 0.18227160399111428
eval loss: 0.1887729961898205 0.2651165754458035
epoch: 7
epoch: 7 loss: 0.17692869594764912
eval loss: 0.18613888255619745 0.2598773943773728
epoch: 8
epoch: 8 loss: 0.17379826187205402
eval loss: 0.1909521739987022 0.2652117209721795
epoch: 9
epoch: 9 loss: 0.17121971460161955
eval loss: 0.1849769661782339 0.2578918817124048
epoch: 10
epoch: 10 loss: 0.16620749312215946
eval loss: 0.180582

epoch: 0
epoch: 0 loss: 0.3125520353977962
eval loss: 0.2556919545941261 0.32176589841149433
epoch: 1
epoch: 1 loss: 0.25021350204969867
eval loss: 0.24230421535017316 0.30595923325778646
epoch: 2
epoch: 2 loss: 0.23329781778238914
eval loss: 0.22833327309289012 0.2878444264043444
epoch: 3
epoch: 3 loss: 0.22159049313582285
eval loss: 0.21728265949926145 0.274275757876279
epoch: 4
epoch: 4 loss: 0.21382992025664677
eval loss: 0.2103551678420929 0.2657569541804367
epoch: 5
epoch: 5 loss: 0.20567931285182633
eval loss: 0.20528531476162898 0.26005193983388075
epoch: 6
epoch: 6 loss: 0.2014364131196594
eval loss: 0.19979844035588876 0.2541195263139112
epoch: 7
epoch: 7 loss: 0.1945102241722968
eval loss: 0.19734140273121392 0.24986219809720014
epoch: 8
epoch: 8 loss: 0.18815552112798042
eval loss: 0.19773864526810409 0.25118530108781567
epoch: 9
epoch: 9 loss: 0.18463740594265096
eval loss: 0.19358498746478836 0.24570990920344435
epoch: 10
epoch: 10 loss: 0.18082888064238137
eval loss: 0.1

epoch: 0
epoch: 0 loss: 0.2983592437560715
eval loss: 0.26638230209457553 0.3334912434273597
epoch: 1
epoch: 1 loss: 0.23134899231685777
eval loss: 0.24347838410834624 0.30799959718475867
epoch: 2
epoch: 2 loss: 0.21582940107320517
eval loss: 0.23665385060624305 0.2975518729390565
epoch: 3
epoch: 3 loss: 0.20643908043907944
eval loss: 0.22778919680535745 0.2875722273573723
epoch: 4
epoch: 4 loss: 0.19778511703104945
eval loss: 0.22118587802736706 0.2797412991174244
epoch: 5
epoch: 5 loss: 0.18962531935865812
eval loss: 0.2182632750573425 0.274605921809453
epoch: 6
epoch: 6 loss: 0.18430248456923512
eval loss: 0.21469455131653786 0.27142552528888336
epoch: 7
epoch: 7 loss: 0.17844617726644324
eval loss: 0.21281897702018943 0.27017283338249687
epoch: 8
epoch: 8 loss: 0.17346156798986567
eval loss: 0.21856106784491822 0.2747859031700956
epoch: 9
epoch: 9 loss: 0.17050263745727404
eval loss: 0.20645399461772235 0.26075689373783634
epoch: 10
epoch: 10 loss: 0.1673824459885978
eval loss: 0.2

epoch: 0
epoch: 0 loss: 0.3094528345192744
eval loss: 0.25533191463571214 0.33776001361518954
epoch: 1
epoch: 1 loss: 0.23289448805917462
eval loss: 0.23762509696548406 0.31408112770551044
epoch: 2
epoch: 2 loss: 0.21971774901348223
eval loss: 0.22897213635074415 0.3026077799971246
epoch: 3
epoch: 3 loss: 0.21009421474426365
eval loss: 0.22177307729327772 0.29336708970696934
epoch: 4
epoch: 4 loss: 0.20235379235700765
eval loss: 0.2196522635095414 0.29012907579500846
epoch: 5
epoch: 5 loss: 0.1956655333680007
eval loss: 0.2131569831342873 0.2821526645913929
epoch: 6
epoch: 6 loss: 0.1884356857802974
eval loss: 0.2126331797059909 0.28212260860849214
epoch: 7
epoch: 7 loss: 0.18420977045385475
eval loss: 0.2061576847803611 0.27394803632854386
epoch: 8
epoch: 8 loss: 0.17989164179752215
eval loss: 0.20479516078205556 0.27143945478989845
epoch: 9
epoch: 9 loss: 0.17443664674675474
eval loss: 0.19941471950442208 0.2645688054815497
epoch: 10
epoch: 10 loss: 0.17267658796283084
eval loss: 0.2

In [None]:
def predict_batch(model, data, device):
    """Predict one batch and flatten it into one record per sequence position.

    Args:
        model: callable invoked as ``model(sequence, bpp)``; must expose
            ``pred_len``, the number of positions predicted per sample.
        data: batch mapping with tensor entries ``"sequence"`` and ``"bpp"``
            and an indexable ``"ids"`` entry holding one id per sample.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        A list of dicts, one per (sample, position) pair. Each dict maps every
        name in ``target_cols`` to its predicted value and additionally holds
        ``"id_seqpos"`` formatted as ``"<id>_<position>"``.

    Raises:
        AssertionError: if a sample's prediction is not shaped
            ``(model.pred_len, len(target_cols))``.
    """
    seq_input = data["sequence"].to(device)
    bpp_input = data["bpp"].to(device)
    with torch.no_grad():
        # batch x seq_len x target_size
        batch_pred = model(seq_input, bpp_input).detach().cpu().numpy()

    n_targets = len(target_cols)
    sample_ids = data["ids"]
    records = []
    for sample_idx, sample_pred in enumerate(batch_pred):
        sample_id = sample_ids[sample_idx]
        assert sample_pred.shape == (model.pred_len, n_targets)
        for seqpos, target_values in enumerate(sample_pred):
            assert len(target_values) == n_targets
            # Target columns first, then the submission key, as one flat row.
            record = dict(zip(target_cols, target_values))
            record["id_seqpos"] = f"{sample_id}_{seqpos}"
            records.append(record)
    return records


def predict_data(model, loader, device, batch_size):
    """Run ``predict_batch`` over every batch of ``loader`` and collect the rows.

    The model is switched to evaluation mode for the duration of the call so
    that layers which behave differently while training (e.g. dropout or batch
    normalisation, if the model has any) give deterministic predictions. The
    mode the model had on entry is restored before returning.

    Args:
        model: module passed through to ``predict_batch``; must expose
            ``pred_len``, ``training``, ``eval()`` and ``train(mode)``.
        loader: iterable of batches; every batch must have an ``"ids"`` entry
            with one id per sample.
        device: device passed through to ``predict_batch``.
        batch_size: maximum number of samples a single batch may contain.

    Returns:
        The concatenated list of per-position records from ``predict_batch``.

    Raises:
        AssertionError: if a batch holds more than ``batch_size`` samples, or
            if the number of records is not ``model.pred_len`` per sample.
    """
    was_training = model.training
    model.eval()
    data_list = []
    n_samples = 0
    try:
        for data in progress_bar(loader):
            n_in_batch = len(data["ids"])
            assert n_in_batch <= batch_size, (
                f"batch has {n_in_batch} samples but batch_size = {batch_size}"
            )
            n_samples += n_in_batch
            data_list += predict_batch(model, data, device)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    # Count the samples actually seen rather than len(loader) * batch_size,
    # so a partially filled final batch does not trip the check.
    expected_length = model.pred_len * n_samples
    assert len(data_list) == expected_length, f"len = {len(data_list)} expected = {expected_length}"
    return data_list

In [None]:
# Inference: predict the test set with every saved fold checkpoint, then write
# the per-position average of all folds as the submission file.
device = torch.device('cuda') if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 1
N_FOLDS = 8  # checkpoints model-0.pt ... model-7.pt are loaded below

base_test_data = pd.read_json(str(Path(BASE_PATH) / 'test.json'), lines=True)
# The test set mixes two sequence lengths; each gets its own frame and loader.
public_df = base_test_data.query("seq_length == 107").copy()
private_df = base_test_data.query("seq_length == 130").copy()
print(f"public_df: {public_df.shape}")
print(f"private_df: {private_df.shape}")
public_df = public_df.reset_index()
private_df = private_df.reset_index()
pub_loader = create_loader(public_df, BATCH_SIZE, is_test=True)
pri_loader = create_loader(private_df, BATCH_SIZE, is_test=True)

pred_df_list = []
for fold in range(N_FOLDS):
    model_load_path = f"./model_prediction/model-{fold}.pt"
    # Same weights, two heads: the models differ only in pred_len (107 vs 130).
    ae_model0 = AEModel()
    ae_model1 = AEModel()
    model_pub = FromAeModel(pred_len=107, seq=ae_model0.seq)
    model_pub = model_pub.to(device)
    model_pri = FromAeModel(pred_len=130, seq=ae_model1.seq)
    model_pri = model_pri.to(device)
    state_dict = torch.load(model_load_path, map_location=device)
    model_pub.load_state_dict(state_dict)
    model_pri.load_state_dict(state_dict)
    del state_dict
    # Freshly constructed modules start in training mode; switch to eval so
    # train-only behaviour (e.g. dropout, batch-norm statistics updates) is
    # disabled while predicting.
    model_pub.eval()
    model_pri.eval()

    data_list = []
    data_list += predict_data(model_pub, pub_loader, device, BATCH_SIZE)
    data_list += predict_data(model_pri, pri_loader, device, BATCH_SIZE)
    pred_df = pd.DataFrame(data_list, columns=["id_seqpos"] + target_cols)
    print(pred_df.head())
    print(pred_df.tail())
    pred_df_list.append(pred_df)

# Number of folds that actually contributed to the average.
c = len(pred_df_list)

# The average below is purely positional, so every fold must list the same
# id_seqpos in the same order.
reference_ids = pred_df_list[0]["id_seqpos"]
for fold_df in pred_df_list[1:]:
    assert fold_df["id_seqpos"].equals(reference_ids), \
        "fold predictions are not row-aligned; cannot average positionally"

data_dic = dict(id_seqpos=reference_ids)
for col in target_cols:
    # Accumulate in a float64 buffer (np.zeros default dtype).
    vals = np.zeros(pred_df_list[0][col].shape[0])
    for df in pred_df_list:
        vals += df[col].values
    data_dic[col] = vals / float(c)
pred_df_avg = pd.DataFrame(data_dic, columns=["id_seqpos"] + target_cols)
pred_df_avg.to_csv(data_dir + "/kaggle/input/stanford-covid-vaccine/submission_aug_gnn_pytorch.csv", index = False)

public_df: (629, 7)
private_df: (3005, 7)
(1, 107, 107, 3)
(1, 130, 130, 3)


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.752909     0.650585  1.958495    0.524911  0.789595
1  id_00073f8be_1    2.398270     3.248760  4.362280    3.086809  2.748080
2  id_00073f8be_2    1.631015     0.532373  0.681331    0.667061  0.702567
3  id_00073f8be_3    1.293422     1.144513  1.242096    1.615935  1.782577
4  id_00073f8be_4    0.800711     0.567188  0.517936    0.853366  0.844774
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125    0.192900  ...    0.351714  0.162220
457949  id_ffda94f24_126    0.243192  ...    0.415634  0.302285
457950  id_ffda94f24_127    0.391225  ...    0.201999  0.335484
457951  id_ffda94f24_128    0.188205  ...    0.497125  0.245732
457952  id_ffda94f24_129   -0.019812  ...    0.098345 -0.329512

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.712879     0.664106  2.069820    0.457728  0.758356
1  id_00073f8be_1    2.071367     2.876157  4.173930    2.784078  2.533367
2  id_00073f8be_2    1.755341     0.692213  0.914433    0.710485  0.873653
3  id_00073f8be_3    1.239527     1.113198  1.248116    1.615267  1.760385
4  id_00073f8be_4    0.791326     0.471049  0.666894    0.838740  0.831031
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125    0.141418  ...    0.497172  0.286330
457949  id_ffda94f24_126    0.341503  ...    0.745393  0.677710
457950  id_ffda94f24_127    0.453845  ...    0.343719  0.355012
457951  id_ffda94f24_128    0.110879  ...    0.527594  0.255508
457952  id_ffda94f24_129    0.098250  ...    0.540466  0.319864

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.748139     0.716387  1.769777    0.564542  0.748192
1  id_00073f8be_1    2.323643     3.128082  4.325874    3.025921  2.913717
2  id_00073f8be_2    1.736864     0.636297  0.583476    0.641398  0.722858
3  id_00073f8be_3    1.407723     1.141170  1.180370    1.671163  1.808822
4  id_00073f8be_4    0.970919     0.513636  0.380817    0.859475  0.947829
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125   -0.114534  ...    0.476990  0.306777
457949  id_ffda94f24_126    0.101082  ...    0.546073  0.604613
457950  id_ffda94f24_127    0.171736  ...    0.442890  0.478060
457951  id_ffda94f24_128   -0.234998  ...    0.404388  0.279099
457952  id_ffda94f24_129   -0.027732  ...    0.435627  0.349832

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.640562     0.682661  2.070334    0.629239  0.828262
1  id_00073f8be_1    2.011713     2.963616  3.861885    3.078121  2.637606
2  id_00073f8be_2    1.621482     0.712545  0.566466    0.737647  0.598524
3  id_00073f8be_3    1.309342     1.298823  1.051741    1.672792  1.703100
4  id_00073f8be_4    0.802603     0.627824  0.498365    0.863579  0.909411
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125   -0.094750  ...    0.788800  0.394488
457949  id_ffda94f24_126    0.289322  ...    0.895219  1.007200
457950  id_ffda94f24_127    0.713243  ...    0.370068  0.559723
457951  id_ffda94f24_128    0.098740  ...    0.282411  0.270923
457952  id_ffda94f24_129   -0.012483  ...    0.356970  0.467099

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.751656     0.799853  2.055716    0.473655  0.716080
1  id_00073f8be_1    2.426552     2.890114  4.157564    2.830533  2.671585
2  id_00073f8be_2    1.836514     0.812600  0.812095    0.765076  0.783078
3  id_00073f8be_3    1.471417     1.363419  1.426523    1.830005  1.915176
4  id_00073f8be_4    0.997734     0.634061  0.570412    0.909458  0.941390
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125    0.222556  ...    0.612605  0.254052
457949  id_ffda94f24_126    0.387436  ...    0.485530  0.367226
457950  id_ffda94f24_127    0.659278  ...    0.469310  0.415500
457951  id_ffda94f24_128    0.155399  ...    0.616192  0.038972
457952  id_ffda94f24_129    0.320831  ...    0.423143 -0.066372

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.564133     0.592709  2.164126    0.543940  0.825868
1  id_00073f8be_1    1.982303     2.687471  4.048964    3.047003  2.997065
2  id_00073f8be_2    1.612717     0.559784  0.642528    0.754707  0.751805
3  id_00073f8be_3    1.478088     1.227162  1.189748    1.860043  1.923384
4  id_00073f8be_4    0.968500     0.639488  0.552444    1.035306  0.964893
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125    0.203738  ...    0.771774  0.463624
457949  id_ffda94f24_126    0.287028  ...    0.917539  0.740396
457950  id_ffda94f24_127    0.510692  ...    0.456542  0.515240
457951  id_ffda94f24_128    0.254002  ...    0.534364  0.369632
457952  id_ffda94f24_129    0.160654  ...    0.494596  0.248713

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.734166     0.717777  1.933690    0.606291  0.774400
1  id_00073f8be_1    2.368398     3.135816  4.455996    3.196287  2.873404
2  id_00073f8be_2    1.625775     0.605046  0.699400    0.684023  0.718651
3  id_00073f8be_3    1.341882     1.165078  1.257520    1.674375  1.617459
4  id_00073f8be_4    1.024330     0.510139  0.478057    0.827769  0.876847
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125    0.097254  ...    0.310723  0.171719
457949  id_ffda94f24_126    0.582782  ...    0.456935  0.463879
457950  id_ffda94f24_127    0.449094  ...    0.237697  0.303674
457951  id_ffda94f24_128    0.110352  ...    0.394380 -0.010609
457952  id_ffda94f24_129    0.115659  ...    0.422129  0.132410

[5 rows x 6 columns]


        id_seqpos  reactivity  deg_Mg_pH10  deg_pH10  deg_Mg_50C   deg_50C
0  id_00073f8be_0    0.670231     0.606069  2.055812    0.518237  0.739172
1  id_00073f8be_1    2.179083     3.232599  4.462213    3.229133  2.921494
2  id_00073f8be_2    1.608479     0.549009  0.693942    0.618688  0.735800
3  id_00073f8be_3    1.255320     1.020320  1.125910    1.541840  1.804706
4  id_00073f8be_4    0.807887     0.557094  0.418413    0.785865  0.778157
               id_seqpos  reactivity  ...  deg_Mg_50C   deg_50C
457948  id_ffda94f24_125    0.238581  ...    0.372664  0.161167
457949  id_ffda94f24_126    0.341022  ...    0.620286  0.593709
457950  id_ffda94f24_127    0.683104  ...    0.311629  0.384907
457951  id_ffda94f24_128    0.146306  ...    0.244314 -0.031905
457952  id_ffda94f24_129    0.169296  ...    0.496848  0.223149

[5 rows x 6 columns]


In [None]:
pred_df_avg

Unnamed: 0,id_seqpos,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,id_00073f8be_0,0.696834,0.678768,2.009721,0.539818,0.772491
1,id_00073f8be_1,2.220166,3.020327,4.231088,3.034736,2.787040
2,id_00073f8be_2,1.678523,0.637483,0.699209,0.697386,0.735867
3,id_00073f8be_3,1.349590,1.184210,1.215253,1.685178,1.789451
4,id_00073f8be_4,0.895501,0.565060,0.510417,0.871695,0.886791
...,...,...,...,...,...,...
457948,id_ffda94f24_125,0.110895,0.408810,0.389248,0.522805,0.275047
457949,id_ffda94f24_126,0.321671,0.451259,0.781053,0.635326,0.594627
457950,id_ffda94f24_127,0.504027,0.218261,0.211098,0.354232,0.418450
457951,id_ffda94f24_128,0.103611,0.284065,0.180591,0.437596,0.177169
