In [1]:
import torch
import pytorch_lightning as L
from torchmetrics.classification import BinaryAccuracy
from transformers import AutoTokenizer
import pickle
import pandas as pd
import os

import sys
sys.path.append('../src')

from grover.grover import *
from utrlm.utrlm import *

In [3]:
grover_base_model = "PoetschLab/GROVER"
utrlm_base_model = "multimolecule/utrlm-te_el"

grover_ckpt = "/home/gjobenc/checkpoints/grover_ft_hek_muscle_pc3-.epoch=01-val_loss=0.567.ckpt"
utrlm_ckpt = "/ascldap/users/gjobenc/scratch/utrlm/logs/utrlm_hek_muscle_pc3/checkpoints/utrlm_ft_hek_muscle_pc3-.epoch=13-val_loss=0.617.ckpt"

In [3]:
grover_model = GroverClassifier.load_from_checkpoint(grover_ckpt)
grover_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at PoetschLab/GROVER and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GroverClassifier(
  (loss): BCEWithLogitsLoss()
  (train_accuracy): BinaryAccuracy()
  (valid_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(609, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(

In [4]:
utrlm_model = UTRLMClassifier.load_from_checkpoint(utrlm_ckpt)
utrlm_model.eval()

Some weights of UtrLmForSequencePrediction were not initialized from the model checkpoint at multimolecule/utrlm-te_el and are newly initialized: ['sequence_head.decoder.bias', 'sequence_head.decoder.weight', 'utrlm.pooler.dense.bias', 'utrlm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


UTRLMClassifier(
  (loss): BCEWithLogitsLoss()
  (train_accuracy): BinaryAccuracy()
  (valid_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (model): UtrLmForSequencePrediction(
    (utrlm): UtrLmModel(
      (embeddings): UtrLmEmbeddings(
        (word_embeddings): Embedding(26, 128, padding_idx=0)
      )
      (encoder): UtrLmEncoder(
        (layer): ModuleList(
          (0-5): 6 x UtrLmLayer(
            (attention): UtrLmAttention(
              (self): UtrLmSelfAttention(
                (query): Linear(in_features=128, out_features=128, bias=True)
                (key): Linear(in_features=128, out_features=128, bias=True)
                (value): Linear(in_features=128, out_features=128, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (rotary_embeddings): RotaryEmbedding()
              )
              (output): UtrLmSelfOutput(
                (dense): Linear(in_features=128, out_features=128, bias=True)
                (d

In [4]:
test_data_csv = "/ascldap/users/gjobenc/scratch/utrlm/ratios/hek/test.csv"

grover_tokenizer = AutoTokenizer.from_pretrained(grover_base_model)
utrlm_tokenizer = AutoTokenizer.from_pretrained(utrlm_base_model)

grover_dl = DNADataModule(test_data=test_data_csv, max_length=512, tokenizer=grover_tokenizer)
grover_dl.setup()
grover_dl = grover_dl.test_dataloader()

utrlm_dl = DNADataModule(test_data=test_data_csv, max_length=512, tokenizer=utrlm_tokenizer)
utrlm_dl.setup()
utrlm_dl = utrlm_dl.test_dataloader()

labels = [y['label'] for y in grover_dl]

acc_fn = BinaryAccuracy()

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: '/ascldap/users/gjobenc/scratch/utrlm/ratios/hek/test.csv'

In [20]:
trainer = L.Trainer(
    accelerator="gpu",
    devices=1,
    precision=32
)

/ascldap/users/gjobenc/anaconda3/envs/evo2/lib/python3.12/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /ascldap/users/gjobenc/anaconda3/envs/evo2/lib/pytho ...
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/ascldap/users/gjobenc/anaconda3/envs/evo2/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `C

In [33]:
grover_preds = trainer.predict(grover_model, grover_dl)
utrlm_preds = trainer.predict(utrlm_model, utrlm_dl)

accuracy_scorer = BinaryAccuracy()

grover_acc = accuracy_scorer(torch.cat(grover_preds), torch.cat(labels))
utrlm_acc = accuracy_scorer(torch.cat(utrlm_preds), torch.cat(labels))

print(f"Grover Accuracy: {grover_acc} UTRLM Accuracy: {utrlm_acc}")

/ascldap/users/gjobenc/anaconda3/envs/evo2/lib/python3.12/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /ascldap/users/gjobenc/anaconda3/envs/evo2/lib/pytho ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

Grover Accuracy: 0.7099459767341614 UTRLM Accuracy: 0.6743924617767334


# Grover .txt inference

In [2]:
path = "/projects/wg-mrna-antib-pred/DNABERT_data/randomUTRs/random_25mers_10M.txt"
seqs = []
with open(path, "r") as file:
    for line in file.readlines():
        seqs.append(line.strip())
seqs[:5]

['GTTCTATAGACATGAATGATTTGCG',
 'CATTGACTAGCCCCATCCAATTGTA',
 'CAGGCCTGTGGACGGCAAGTCTAGT',
 'AGTTCAGACGCTATCTCTCACGATT',
 'CGAAAGGTTGCGATCTCAGTTGTGC']

In [53]:
df = pd.DataFrame(seqs, columns=['seq'])
df['_'] = list(range(len(seqs)))
df.to_csv("/projects/wg-mrna-antib-pred/DNABERT_data/randomUTRs/random_25mers_10M.csv", index=False)

In [11]:
data_path = "/projects/wg-mrna-antib-pred/DNABERT_data/randomUTRs/random_25mers_10M.csv"

grover_tokenizer = AutoTokenizer.from_pretrained(grover_base_model)

grover_dl = DNADataModule(test_data=data_path, max_length=512, tokenizer=grover_tokenizer)
grover_dl.setup()
grover_dl = grover_dl.test_dataloader()

In [6]:
df = pd.read_csv(data_path, delimiter=',', skiprows=1, header=None)
df.head()

Unnamed: 0,0,1
0,GTTCTATAGACATGAATGATTTGCG,0
1,CATTGACTAGCCCCATCCAATTGTA,1
2,CAGGCCTGTGGACGGCAAGTCTAGT,2
3,AGTTCAGACGCTATCTCTCACGATT,3
4,CGAAAGGTTGCGATCTCAGTTGTGC,4


In [7]:
trainer = L.Trainer(
    accelerator="gpu",
    devices=1,
    precision=32
)

/home/gjobenc/anaconda3/envs/evo/lib/python3.11/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/gjobenc/anaconda3/envs/evo/lib/python3.11/site ...
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/gjobenc/anaconda3/envs/evo/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the def

In [8]:
for row in grover_dl:
    print(row)
    break

{'input_ids': tensor([[  2,   7,  28,  ...,   0,   0,   0],
        [  2,   6,  63,  ...,   0,   0,   0],
        [  2,   6, 111,  ...,   0,   0,   0],
        ...,
        [  2,   6,  35,  ...,   0,   0,   0],
        [  2,   6, 113,  ...,   0,   0,   0],
        [  2,   7, 200,  ...,   0,   0,   0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'label': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])}


In [12]:
grover_preds = trainer.predict(grover_model, grover_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [11]:
grover_preds

NameError: name 'grover_preds' is not defined

In [7]:
with open("/home/gjobenc/projects/UTRML/src/out/grover_ft_hek_muscle_pc3-.epoch=01-val_loss=0.567.ckpt", 'rb') as f:
    embeds = pickle.load(f)
embeds

[tensor([1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.]),
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]),
 tensor([1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
         1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.]),
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.]),
 tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.]),
 tensor([1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
         1., 0., 1., 1., 1., 1.

In [None]:
cat_embeds = torch.cat(embeds)

In [19]:
preds_col = [int(row.item()) for row in cat_embeds]

In [20]:
df = pd.DataFrame({
                    "sequence": seqs, 
                    "grover_pred": preds_col
                   })

In [21]:
df.head()

Unnamed: 0,sequence,grover_pred
0,GTTCTATAGACATGAATGATTTGCG,1
1,CATTGACTAGCCCCATCCAATTGTA,1
2,CAGGCCTGTGGACGGCAAGTCTAGT,0
3,AGTTCAGACGCTATCTCTCACGATT,1
4,CGAAAGGTTGCGATCTCAGTTGTGC,1


In [29]:
df = pd.DataFrame(seqs, columns=['sequence'])
df['pred'] = preds_col
df

Unnamed: 0,sequence,pred
0,GTTCTATAGACATGAATGATTTGCG,1
1,CATTGACTAGCCCCATCCAATTGTA,1
2,CAGGCCTGTGGACGGCAAGTCTAGT,0
3,AGTTCAGACGCTATCTCTCACGATT,1
4,CGAAAGGTTGCGATCTCAGTTGTGC,1
...,...,...
9999995,ATAAGGATAAGTAAGCACCGGAACG,1
9999996,CCGGCGGCAAAAATAGTTAGCCGTA,1
9999997,GCTCGGGACATGTCCGTTCCCTGCC,1
9999998,TGCTGTATGCTGAAAGTGTATGGAT,1


In [6]:
out_path = "/home/gjobenc/projects/UTRML/src/out"
df = pd.DataFrame(seqs, columns=['sequence'])
for fn in os.listdir(out_path):
    with open(os.path.join(out_path, fn), 'rb') as f:
        preds = pickle.load(f)

    preds = torch.cat(preds)
    preds_col = [int(row.item()) for row in preds]

    model = fn.split('_')[0]
    data_name = fn.split('ft_')[-1].split('-')[0]

    df[model + '_' + data_name] = preds_col
df.head()

Unnamed: 0,sequence,utrlm_hek,utrlm_hek_pc3,utrlm_muscle,grover_hek,utrlm_pc3,grover_hek_muscle,grover_muscle,utrlm_mpra,grover_hek_muscle_pc3,utrlm_muscle_pc3,utrlm_hek_muscle_pc3,grover_hek_pc3,grover_pc3,utrlm_hek_muscle
0,GTTCTATAGACATGAATGATTTGCG,0,0,1,0,1,1,1,0,1,1,0,0,1,0
1,CATTGACTAGCCCCATCCAATTGTA,0,1,0,0,1,1,0,1,1,0,1,1,1,1
2,CAGGCCTGTGGACGGCAAGTCTAGT,1,1,1,0,1,0,1,1,0,1,1,0,1,1
3,AGTTCAGACGCTATCTCTCACGATT,1,1,0,1,1,1,1,0,1,1,1,1,1,1
4,CGAAAGGTTGCGATCTCAGTTGTGC,1,1,1,1,0,1,0,1,1,1,1,1,1,1


In [7]:
df.to_csv(os.path.join(out_path, "grover_utrml_preds.csv"))