This notebook is an example of inference with a model trained in the [RNA starter kernel](https://www.kaggle.com/code/iafoss/rna-starter).

In [1]:
import pandas as pd
import os, gc
import numpy as np
from tqdm.notebook import tqdm
import math
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [8]:
# --- Inference configuration -------------------------------------------------
# Checkpoints to ensemble; predictions of all listed models are averaged.
MODELS = [
    '/kaggle/input/example5/example5_0.pth',
]
# Directory that holds 'test_sequences.parquet'.
PATH = '/kaggle/input/stanford-ribonanza-rna-folding-converted/'
# DataLoader settings: batch size and number of worker processes.
bs, num_workers = 256, 2
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the test sequences table from PATH into a DataFrame.
df_test = pd.read_parquet(os.path.join(PATH,'test_sequences.parquet'))

In [4]:
# Peek at the row labelled 0: the first/last id of the sequence and the
# nucleotide string itself.
id_min, id_max, seq = df_test.loc[0, ['id_min','id_max','sequence']]
# Bare expression: as the last statement of the cell it is what the notebook displays.
id_min, id_max, seq

(0,
 176,
 'GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGGGAGAGAUAGAGGCGGAGGGUCUGGGGGAGGAAUUAAAACACAAGGUCUCCUCCCCUCUCGCCUGUCCGAACUUGGGGGCACCCCGGCUCGUACUUCGGUACGAGCCGGGGAAAAGAAACAACAACAACAAC')

# Data

In [5]:
class RNA_Dataset_Test(Dataset):
    """Test-time dataset yielding padded, integer-encoded RNA sequences.

    Every item is a pair of dicts ``(inputs, targets)``:

    * ``inputs['seq']``  -- int64 tensor of length ``Lmax`` with the encoded
      nucleotides (A=0, C=1, G=2, U=3), zero-padded on the right;
    * ``inputs['mask']`` -- bool tensor of length ``Lmax``, True on the real
      (non-padding) positions;
    * ``targets['ids']`` -- integer array of length ``Lmax`` holding
      ``id_min .. id_max``, padded on the right with -1.

    ``ids`` is padded by ``Lmax - len(sequence)``, so the code relies on
    ``id_max - id_min + 1 == len(sequence)`` for every row.

    Args:
        df: DataFrame with ``id_min``, ``id_max`` and ``sequence`` columns.
            It is not modified; any index is accepted.
        mask_only: when True, ``__getitem__`` returns ``({'mask': mask}, {})``
            and skips the sequence encoding.
        **kwargs: accepted and ignored.

    Raises:
        KeyError: from ``__getitem__`` if a sequence contains a character
            other than A, C, G or U.
    """

    def __init__(self, df, mask_only=False, **kwargs):
        self.seq_map = {'A':0,'C':1,'G':2,'U':3}
        # Keep a private copy with the length column and a fresh 0..n-1 index:
        # the caller's frame is left untouched, and the positional ``idx``
        # handed over by a DataLoader always matches an index label, even if
        # ``df`` was filtered or re-ordered beforehand.
        self.df = df.assign(L=df['sequence'].apply(len)).reset_index(drop=True)
        # Plain int; an empty frame yields 0 instead of NaN.
        self.Lmax = int(self.df['L'].max()) if len(self.df) else 0
        self.mask_only = mask_only

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        id_min, id_max, seq = self.df.loc[idx, ['id_min','id_max','sequence']]
        L = len(seq)
        mask = torch.zeros(self.Lmax, dtype=torch.bool)
        mask[:L] = True
        if self.mask_only:
            return {'mask':mask}, {}

        pad = (0, self.Lmax - L)
        # Explicit dtype so that even an empty sequence encodes to integers.
        tokens = np.array([self.seq_map[s] for s in seq], dtype=np.int64)
        tokens = np.pad(tokens, pad)
        # -1 marks padding so that it can never collide with a real id.
        ids = np.pad(np.arange(id_min, id_max + 1), pad, constant_values=-1)

        return {'seq':torch.from_numpy(tokens), 'mask':mask}, \
               {'ids':ids}
            
def dict_to(x, device='cuda'):
    """Return a new dict whose values are the values of ``x`` moved with ``.to(device)``."""
    moved = {}
    for key in x:
        moved[key] = x[key].to(device)
    return moved

def to_device(x, device='cuda'):
    """Apply ``dict_to`` to every element of ``x`` and return the results as a tuple."""
    moved = [dict_to(item, device) for item in x]
    return tuple(moved)

class DeviceDataLoader:
    """Wrap a dataloader so that every batch is moved to ``device`` when iterated.

    Each batch is expected to be an iterable of dicts; every dict value is
    moved with ``.to(device)`` and the dicts are yielded as a tuple.
    """

    def __init__(self, dataloader, device='cuda'):
        self.device = device
        self.dataloader = dataloader

    def __len__(self):
        # Number of batches, as reported by the wrapped dataloader.
        return len(self.dataloader)

    def __iter__(self):
        for batch in self.dataloader:
            yield to_device(batch, self.device)

# Model

In [6]:
class SinusoidalPosEmb(nn.Module):
    """Sinusoidal embedding: appends a feature axis of ``2 * (dim // 2)`` values.

    For an input ``x`` the output is ``[sin(x * f_k), cos(x * f_k)]``
    concatenated along a new last axis, with
    ``f_k = exp(-k * log(M) / (dim // 2))`` for ``k = 0 .. dim // 2 - 1``.
    """

    def __init__(self, dim=16, M=10000):
        super().__init__()
        self.dim = dim
        self.M = M

    def forward(self, x):
        n_freqs = self.dim // 2
        step = math.log(self.M) / n_freqs
        # Geometric progression of frequencies, created on the input's device.
        freqs = torch.exp(torch.arange(n_freqs, device=x.device) * (-step))
        angles = x.unsqueeze(-1) * freqs.unsqueeze(0)
        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

class RNA_Model(nn.Module):
    """Transformer encoder producing two output values per sequence position.

    Args:
        dim: embedding / model width.
        depth: number of encoder layers.
        head_size: width of one attention head; the number of heads is
            ``dim // head_size``.
        **kwargs: accepted and ignored.

    ``forward`` takes a dict with ``'seq'`` (integer tokens in 0..3) and
    ``'mask'`` (True on real positions). Both are cut to the longest
    sequence in the batch, so the output's length axis is that batch
    maximum rather than the padded input length.
    """

    def __init__(self, dim=192, depth=12, head_size=32, **kwargs):
        super().__init__()
        self.emb = nn.Embedding(4, dim)
        self.pos_enc = SinusoidalPosEmb(dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim,
            nhead=dim // head_size,
            dim_feedforward=4 * dim,
            dropout=0.1,
            activation=nn.GELU(),
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, depth)
        self.proj_out = nn.Linear(dim, 2)

    def forward(self, x0):
        full_mask = x0['mask']
        # Longest real sequence in this batch; everything beyond it is padding.
        Lmax = full_mask.sum(-1).max()
        valid = full_mask[:, :Lmax]
        tokens = x0['seq'][:, :Lmax]

        positions = torch.arange(Lmax, device=tokens.device).unsqueeze(0)
        hidden = self.emb(tokens) + self.pos_enc(positions)

        # src_key_padding_mask is True where a position must be ignored.
        hidden = self.transformer(hidden, src_key_padding_mask=~valid)
        return self.proj_out(hidden)

# Inference

In [9]:
# Read the test table and wrap it in the dataset / loader used for inference.
df_test = pd.read_parquet(os.path.join(PATH,'test_sequences.parquet'))
ds = RNA_Dataset_Test(df_test)
# shuffle=False and drop_last=False: every row is visited once, in table order.
dl = DeviceDataLoader(torch.utils.data.DataLoader(ds, batch_size=bs, 
               shuffle=False, drop_last=False, num_workers=num_workers), device)
# Drop the notebook-level name for the frame and run the garbage collector.
del df_test
gc.collect()

# Build one model per checkpoint listed in MODELS.
models = []
for m in MODELS:
    model = RNA_Model() 
    model = model.to(device)
    # Weights are deserialized on the CPU and copied into the model on `device`.
    # NOTE(review): torch.load unpickles the file -- only load checkpoints from a
    # trusted source (consider weights_only=True if the installed torch supports it).
    model.load_state_dict(torch.load(m,map_location=torch.device('cpu')))
    # Evaluation mode: disables dropout.
    model.eval()
    models.append(model)

In [None]:
# Run every model on every batch, average the predictions and collect the
# values of the real (non-padding) positions together with their ids.
ids, preds = [], []
for x, y in tqdm(dl):
    # torch.autocast(device_type='cuda') is the non-deprecated spelling of
    # torch.cuda.amp.autocast(); without CUDA it disables itself.
    with torch.no_grad(), torch.autocast(device_type='cuda'):
        # NaNs are replaced per model before averaging; the averaged
        # prediction is clipped to [0, 1].
        p = torch.stack([torch.nan_to_num(model(x)) for model in models]
                        , 0).mean(0).clip(0, 1)

    # The model cuts each batch to its longest sequence, so p may be shorter
    # than the padded mask; the dropped tail of the mask is all False.
    # Boolean indexing keeps row-major order: all positions of sample 0,
    # then sample 1, ... -- the same order for ids and predictions.
    mask = x['mask']
    ids.append(y['ids'][mask].cpu())
    preds.append(p[mask[:, :p.shape[1]]].cpu())

ids = torch.concat(ids)
preds = torch.concat(preds)

# Output channel 0 is written as 2A3 and channel 1 as DMS.
df = pd.DataFrame({'id':ids.numpy(), 'reactivity_DMS_MaP':preds[:,1].numpy(),
                   'reactivity_2A3_MaP':preds[:,0].numpy()})
df.to_csv('submission.csv', index=False, float_format='%.4f') # 6.5GB
df.head()

  0%|          | 0/5250 [00:00<?, ?it/s]

  return torch._transformer_encoder_layer_fwd(
