In [6]:
import argparse
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import PredefinedSplit

from data_module import CodonDataModule
from fine_tuning import PLProteinBertRegressor
from calm.model import ProteinBertRegressor
from calm.alphabet import Alphabet



In [8]:
# Load the mRNA half-life table. Its 'split' column is read further down to
# build the predefined cross-validation folds.
data = pd.read_csv(
    filepath_or_buffer='./data/transcript_stability/mrna_half-life.csv',
)

In [9]:
# Hyperparameters are bundled in a single Namespace because both the data
# module and the regressor constructors below receive them as one `args` object.
args = argparse.Namespace(
    max_positions=1024,
    warmup_steps=200,
    weight_decay=0.1,
    lr_scheduler='warmup_cosine',
    learning_rate=1e-4,
    batch_size=4,
    accumulate_gradients=32,
    num_steps=15000,
    num_layers=12,
    embed_dim=768,
    attention_dropout=0.0,
    logit_bias=False,
    rope_embedding=True,
    ffn_embed_dim=3072,  # 4 x embed_dim (768 * 4)
    attention_heads=12,
)

# Alphabet for the 'CodonModel' architecture; shared by the data module and the model.
alphabet = Alphabet.from_architecture('CodonModel')

# Data module over the half-life CSV, reading sequences from 'CDS' and targets from 'y'.
# Note: the cross-validation cell constructs its own per-fold data module.
datamodule = CodonDataModule(
    args,
    alphabet,
    './data/transcript_stability/mrna_half-life.csv',
    args.batch_size,
    sequence_column='CDS',
    target_column='y',
    fine_tune=True,
)

# Regressor instance that is later handed to the checkpoint loader for each fold.
base_model = ProteinBertRegressor(args, alphabet)

In [13]:
def _collect_predictions(model, dataloader, device):
    """Run `model` over every batch of `dataloader` and gather outputs.

    Batches are indexed with 'input' and 'labels', and the model output is
    indexed with 'logits', exactly as the loop below does.

    Args:
        model: callable taking the token tensor and returning a mapping
            that contains 'logits'.
        dataloader: iterable of batches.
        device: device the input tokens are moved to before the forward pass.

    Returns:
        Tuple `(predictions, labels)` of NumPy arrays.
    """
    predictions = []
    labels = []
    with torch.no_grad():  # inference only; no autograd graph is needed
        for batch in dataloader:
            tokens = batch['input'].to(device)
            logits = model(tokens)["logits"].squeeze(-1)
            # atleast_1d keeps `extend` working should a single-example batch
            # ever collapse to a 0-d array.
            predictions.extend(np.atleast_1d(logits.cpu().numpy()))
            labels.extend(np.atleast_1d(batch['labels'].cpu().numpy()))
    return np.asarray(predictions), np.asarray(labels)


# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of unconditionally requesting 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

r_list = []
rho_list = []

# Folds come from the 'split' column of the half-life table.
split = PredefinedSplit(test_fold=data['split'])

# `idxs` is the (train, test) index pair yielded by PredefinedSplit.split().
for fold, idxs in enumerate(split.split()):

    datamodule = CodonDataModule(args, alphabet, './data/transcript_stability/mrna_half-life.csv', args.batch_size,
                                 fine_tune=True, sequence_column='CDS',
                                 target_column='y', split_idxs=idxs)

    # One fine-tuned checkpoint per fold; the same `base_model` instance is
    # passed to the loader every time.
    # NOTE(review): presumably each load overwrites all of base_model's weights - confirm.
    checkpoint_path = f'./assets/crossval/saluki_stability_fold_{fold}.ckpt'
    model = PLProteinBertRegressor.load_from_checkpoint(checkpoint_path, model=base_model, args=args)
    model.to(device)
    model.eval()

    datamodule.setup()
    dataloader = datamodule.val_dataloader()

    predictions, labels = _collect_predictions(model, dataloader, device)

    # Keep the two p-values under separate names so neither is overwritten.
    r, r_pval = pearsonr(predictions, labels)
    rho, rho_pval = spearmanr(predictions, labels)
    r_list.append(r)
    rho_list.append(rho)

    print(f'Results for fold {fold}:')
    print(f'R: {r:.4f}')
    print(f'R\u00b2: {r ** 2:.4f}')

# Averages over all folds.
print(f'R: {np.mean(r_list):.4f}')
print(f'rho: {np.mean(rho_list):.4f}')



Results for fold 0:
R: 0.6042
R²: 0.3650




Results for fold 1:
R: 0.5848
R²: 0.3420




Results for fold 2:
R: 0.5780
R²: 0.3340




Results for fold 3:
R: 0.6164
R²: 0.3799




Results for fold 4:
R: 0.5950
R²: 0.3541




Results for fold 5:
R: 0.6101
R²: 0.3722




Results for fold 6:
R: 0.6010
R²: 0.3613




Results for fold 7:
R: 0.5854
R²: 0.3427




Results for fold 8:
R: 0.6061
R²: 0.3674




Results for fold 9:
R: 0.5889
R²: 0.3468
R: 0.5970
rho: 0.5904


In [15]:
# Per-fold summary table: one row per fold, with the index labelled 'fold'.
res = pd.DataFrame(
    {
        'R': r_list,
        'R2': np.square(r_list),
        'rho': rho_list,
    }
).rename_axis('fold')

In [17]:
# Persist the per-fold metrics. The target directory is created first so the
# export also succeeds when ./results/finetuning/ does not exist yet
# (to_csv does not create missing parent directories).
out_path = Path('./results/finetuning/saluki_crossval.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(out_path)