# 1. Embed using ProtT5_XL_Uniref50

In [2]:
import torch
import numpy as np
import pandas as pd
import pickle
from transformers import T5EncoderModel, T5Tokenizer

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.has_mps else 'cpu'

# Load model

In [7]:
# Load model
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")

Some weights of the model checkpoint at Rostlab/prot_t5_xl_uniref50 were not used when initializing T5EncoderModel: ['decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.2.layer.1.EncDecAttention.k.weight', 'decoder.block.23.layer.1.EncDecAttention.v.weight', 'decoder.block.1.layer.1.layer_norm.weight', 'decoder.block.9.layer.1.EncDecAttention.v.weight', 'decoder.block.4.layer.1.EncDecAttention.k.weight', 'decoder.block.11.layer.1.EncDecAttention.o.weight', 'decoder.block.22.layer.2.DenseReluDense.wo.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.19.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.1.EncDecAttention.v.weight', 'decoder.block.4.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.0.SelfAttention.k.weight', 'decoder.block.10.layer.0.SelfAttention.k.weight', 'decoder.block.15.layer.0.SelfAttention.q.weight', 'decoder.block.12.layer.2.layer_norm.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.b

# Load data

In [5]:
df = pd.read_csv('data/input_with_features.csv')
print(df.shape)
df.head()

(111564, 17)


Unnamed: 0,allele,seq,count_duplicates,aff_nM_max,aff_nM_min,aff_nM_perc_diff,aff_nM_mean,aff_log50k_mean,subset,dataset,length,allele_fam,alpha,beta,cat_bin_500,cat_multi,a_encoded
0,HLA-DPA1*01:03/DPB1*02:01,AAAAGWQTLSAALDA,1,3769.9803,3769.9803,0.0,3769.9803,0.23891,train,NetMHCIIpan4.0,15,DP,HLA-DPA1*01:03,DPB1*02:01,1,2,0.0
1,HLA-DPA1*01:03/DPB1*02:01,AAAGAEAGKATTEEQ,1,50000.0,50000.0,0.0,50000.0,0.0,test,NetMHCIIpan4.0,15,DP,HLA-DPA1*01:03,DPB1*02:01,1,3,0.0
2,HLA-DPA1*01:03/DPB1*02:01,AAASVPAADKFKTFE,2,5528.934759,5520.014,0.161608,5524.47438,0.203593,train-test,44k-NetMHCIIpan4.0,15,DP,HLA-DPA1*01:03,DPB1*02:01,1,3,0.0
3,HLA-DPA1*01:03/DPB1*02:01,AAATAGTTVYGAFAA,1,7154.820918,7154.820918,0.0,7154.820918,0.179693,train,44k,15,DP,HLA-DPA1*01:03,DPB1*02:01,1,3,0.0
4,HLA-DPA1*01:03/DPB1*02:01,AAATATATAAVGAAT,1,50000.0,50000.0,0.0,50000.0,0.0,test,NetMHCIIpan4.0,15,DP,HLA-DPA1*01:03,DPB1*02:01,1,3,0.0


# Preprocess data

In [17]:
# convert from df to list
data = [ (str(x), y) for x, y in zip(df.index, df.seq)]
assert len(data) == df.shape[0] # double check

In [18]:
def clean_data(data):
    """
    clean data for ProtT5 embedding
    params data: list of tuples (str seq_name, str single aa seq).
    return: tuple of lists: (list of seq names, list of clean seqs)
    """
    seq_names = []
    seqs = []
    
    for entry in data:
        # append seq name
        seq_names.append(entry[0])
        
        # process seq by add space between aa
        s = list(entry[1])
        seqs.append(' '.join(s))
        
    return seq_names, seqs

seq_names, seqs = clean_data(data)
assert len(seqs) == df.shape[0] # double check

In [19]:
def get_max_token_length(seqs):
    """
    Get max token length
    seqs: list of strings. Each string is in format "A B C"
    return int, max token length to use for tokenizer
    """
    # find longest seq
    longest_seq = [max(seqs, key = len)]

    # encode longest seq
    ids = tokenizer.batch_encode_plus(longest_seq, add_special_tokens=True, padding=True)

    # max token length
    max_token_length = len(ids['input_ids'][0])

    return max_token_length

max_token_length = get_max_token_length(seqs)
max_token_length

21

In [20]:
# set arbitrary batch size
batch_size = 2048

# batch data
seqs_batched = np.array_split(seqs, len(seqs)//batch_size)

In [21]:
len(seqs_batched)

54

# Activate GPU

In [22]:
model = model.to(device)

# Predict

In [23]:
# set model to eval
model = model.eval()

# Initiate features
features = []

for batch in seqs_batched:
    # tokenize
    ids = tokenizer.batch_encode_plus(batch, add_special_tokens=True, padding='max_length', max_length=max_token_length)
    
    # copy input tensors to device
    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)
    
    # predict
    with torch.no_grad():
        batch_result = model(input_ids=input_ids,attention_mask=attention_mask)
        
    # extract features
    batch_features = batch_result['last_hidden_state'].cpu().numpy()
    
    features.append(batch_features)
    
# concatenate features
features = np.concatenate(features, axis = 0)

In [24]:
# double check 
assert features.shape == (df.shape[0], max_token_length, 1024) 
# number of proteins * tokens * features

# Save

In [25]:
# save the model to disk
fp = 'embedded/prot_t5_xl_uniref50.pkl'
pickle.dump(features, open(fp, 'wb'))