In [63]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import gc

from tqdm import tqdm
tqdm.write = True

import esm
from huggingface_hub import login
from esm.models.esm3 import ESM3
from esm.models.esmc import ESMC
from esm.sdk.api import ESM3InferenceClient, ESMCInferenceClient, ESMProtein, GenerationConfig
from concurrent.futures import ThreadPoolExecutor
from typing import Sequence

from esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    ESMProteinError,
    LogitsConfig,
    LogitsOutput,
    ProteinType,
)


In [2]:
# # Will instruct you how to get an API key from huggingface hub, make one with "Read" permission.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
print(dir(esm.pretrained))

['Callable', 'ESM3', 'ESM3_FUNCTION_DECODER_V0', 'ESM3_OPEN_SMALL', 'ESM3_STRUCTURE_DECODER_V0', 'ESM3_STRUCTURE_ENCODER_V0', 'ESM3_function_decoder_v0', 'ESM3_sm_open_v0', 'ESM3_structure_decoder_v0', 'ESM3_structure_encoder_v0', 'ESMC', 'ESMC_300M', 'ESMC_300M_202412', 'ESMC_600M', 'ESMC_600M_202412', 'FunctionTokenDecoder', 'LOCAL_MODEL_REGISTRY', 'ModelBuilder', 'StructureTokenDecoder', 'StructureTokenEncoder', '__annotations__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'data_root', 'get_esm3_model_tokenizers', 'get_esmc_model_tokenizers', 'load_local_model', 'nn', 'register_local_model', 'torch']


In [64]:
#model: ESM3InferenceClient = ESM3.from_pretrained("esm3-open").to("cuda")
model: ESMCInferenceClient = ESMC.from_pretrained("esmc_600m").eval().to("cuda:3") 

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [84]:

EMBEDDING_CONFIG = LogitsConfig(
    sequence=False, return_embeddings=True, return_hidden_states=False
)


def embed_sequence(model: ESM3InferenceClient, sequence: str) -> LogitsOutput:
    with torch.inference_mode():
        protein = ESMProtein(sequence=sequence)
        protein_tensor = model.encode(protein)
        output = model.logits(protein_tensor, EMBEDDING_CONFIG)
        output_cpu = LogitsOutput(
                    embeddings=output.embeddings.to('cpu') if output.embeddings is not None else None,
                    )
    del output
    torch.cuda.empty_cache()  
    return output_cpu


def batch_embed(
    model: ESM3InferenceClient, inputs: Sequence[ProteinType], max_workers: int = 4,
) -> Sequence[LogitsOutput]:
    """Forge supports auto-batching. So batch_embed() is as simple as running a collection
    of embed calls in parallel using asyncio.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(embed_sequence, model, protein) for protein in inputs
        ]
        results = []
        #for future in tqdm(futures, desc="Embedding proteins", total=len(futures)):
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                results.append(ESMProteinError(500, str(e)))
    return results
     

In [4]:
with open("/home/user11/data/mhcii/allele_sequences.json") as file:
    allels = json.load(file)

In [5]:
def short_name(name):
    return name.split('-')[1].replace('*', '').replace(':', '')

table = []
for key1 in allels.keys():
    for key2 in allels[key1].keys():
        table.append(
            {'name': key1, 'second_name': key2, 'sequence': allels[key1][key2]}
        )

df = pd.DataFrame(table)
df['id'] = df['name'].apply(short_name)

In [14]:
outputs = batch_embed(model, df["sequence"].tolist())

Embedding proteins:   0%|          | 0/861 [00:00<?, ?it/s]

In [15]:
import os
import numpy as np

write_dir = '/home/user11/data/embeddings_proteins/emb_esmc_650m/'
write_table = []
table_write_path = '/home/user11/data/embeddings_proteins/emb_esmc_table.tsv'
for index, res in enumerate(outputs):
    if isinstance(outputs[index], LogitsOutput):
        line = df.iloc[index]
        _id = line['id']
        write_path = f'{write_dir}/{_id}.npy'
        if not os.path.exists(write_path):
            write_table.append({
                'id': _id,
                'seq': line['sequence'],
                'emb_path': write_path
            })
            np.save(write_path, res.embeddings.cpu().numpy())
pd.DataFrame(write_table).to_csv(table_write_path, sep='\t', index=False)


In [16]:
del outputs
torch.cuda.empty_cache() 
gc.collect() 


15534

## После эмьендига собираем "широкую" таблицу

In [86]:
df_hla = pd.read_csv('/home/user11/data/embeddings_proteins/emb_esmc_table.tsv', sep='\t')
df_peptide = pd.read_csv('/home/user11/data/data_processed/data.tsv', sep='\t', header=None)
df_peptide.columns = ['peptide', 'score', 'hla']
df_hla = df_hla.set_index(keys=['id'], drop=True)
df_peptide['hla'] = df_peptide['hla'].str.replace('_', '')


In [87]:
alpha_chain = []
beta_chain = []
for index, line in df_peptide.iterrows():
    name = line['hla'].split('-')
    if len(name) != 1:
        alpha_chain.append(name[1])
        beta_chain.append(name[2])
    else:
        alpha_chain.append('DRA01010101')
        beta_chain.append(name[0])
data_all = df_peptide.copy()
data_all['alpha_id'] = alpha_chain
data_all['beta_id'] = beta_chain

# оставим только те случаи для которые есть 
filter_ = [True if line['alpha_id'] in df_hla.index and line['beta_id'] in df_hla.index else False for index, line in data_all.iterrows()]
data_all = data_all[filter_]


In [88]:
filter_ = [True if line['alpha_id'] in df_hla.index and line['beta_id'] in df_hla.index else False for index, line in data_all.iterrows()]
data_all = data_all[filter_]


In [89]:
data_all = data_all.assign(alpha_seq = [str(df_hla.loc[i,'seq']) for i in data_all['alpha_id']])
data_all = data_all.assign(beta_seq = [str(df_hla.loc[i,'seq']) for i in data_all['beta_id']])

data_all = data_all.assign(alpha_path = [str(df_hla.loc[i,'emb_path']) for i in data_all['alpha_id']])
data_all = data_all.assign(beta_path = [str(df_hla.loc[i,'emb_path']) for i in data_all['beta_id']])

In [94]:
hla_interface_path = '/home/user11/base/DeepMHCII/data/pseudosequence.2016.all.X.dat'
hla_interface = pd.read_csv(hla_interface_path, sep='\t', header=None)
hla_interface.columns = ['id', 'seq']
hla_interface['id'] = hla_interface['id'].str.replace('_', '')
hla_interface = hla_interface.drop_duplicates(subset=['id'])
hla_interface = hla_interface.set_index(keys=['id'], drop=True)



In [95]:
data_all = data_all.assign(interface=[hla_interface.loc[index, 'seq'] for index in data_all['hla']])

In [96]:
data_all.to_csv('/home/user11/data/embeddings_proteins/wide_data.tsv', index=False, sep='\t')

## Эмбединг для пептидов

In [5]:
data_all = pd.read_csv('/home/user11/data/embeddings_proteins/wide_data.tsv', sep='\t')

In [None]:
outputs = batch_embed(model, data_all["peptide"].tolist(), max_workers=8)

Embedding proteins:  26%|███████▋                     | 34025/129146 [34:42<1:39:31, 15.93it/s]

In [None]:
import pickle

# Сохранение объекта на диск
write_path = '/home/user11/data/embeddings_proteins/emb_esmc_peptides_650m/raw_emb.pkl'
with open(write_path, 'wb') as file:  # 'wb' — бинарный режим записи
    pickle.dump(outputs, file)

In [None]:
import os
import numpy as np


write_table = data_all.copy()
write_dir = '/home/user11/data/embeddings_proteins/emb_esmc_peptides_650m/'
table_write_path = '/home/user11/data/embeddings_proteins/wide_data__with_pep.tsv'
write_emb_pep_path = []
for index, res in enumerate(outputs):
    line = data_all.iloc[index]
    _id = line['peptide']
    write_path = f'{write_dir}/{_id}.npy'
    np.save(write_path, res.embeddings.cpu().numpy())
    write_emb_pep_path.append(write_path)
write_table['peptide_path'] = write_emb_pep_path
pd.DataFrame(write_table).to_csv(table_write_path, sep='\t', index=False)


In [None]:
'/home/user11/data/embeddings_proteins/emb_esmc_650m/'

In [None]:
# write_table['alpha_path'] = write_table['alpha_path'].str.replace('/home/user11/data/data_processed/emb_esmc_650m/',
#                                       '/home/user11/data/embeddings_proteins/emb_esmc_650m/')
# write_table['beta_path'] = write_table['beta_path'].str.replace('/home/user11/data/data_processed/emb_esmc_650m/',
#                                       '/home/user11/data/embeddings_proteins/emb_esmc_650m/')

In [None]:
write_table.to_csv(table_write_path, sep='\t', index=False)

# Восстанавливаю поседовательнотси интерфейса

In [5]:
df = pd.read_csv('/home/user11/data/embeddings_proteins/wide_data.tsv', sep='\t')

In [6]:
alpha_positions = np.array([9, 11, 22, 24, 31, 52, 53, 58, 59, 61, 65, 66, 68, 72, 73])
beta_positions = np.array([9, 11, 13, 26, 28, 30, 47, 57, 67, 70, 71, 74, 77, 78, 81, 85, 86, 89, 90])

In [7]:
import numpy as np
import re

embeddings = df.copy()
embeddings["alpha_positions"] = pd.NA
embeddings["beta_positions"] = pd.NA

for i in tqdm(range(len(embeddings))):
    alpha_seq = embeddings.loc[i, "alpha_seq"]
    interface = embeddings.loc[i, "interface"]
    iterator = list(re.finditer(interface[7:9], alpha_seq))

    for match in iterator:
        correct_positions = []

        for letter, pos in zip(interface[:15], alpha_positions - alpha_positions[7] + match.start()):
            if letter == "X":
                correct_positions.append(int(pos))
            elif alpha_seq[pos] == letter:
                correct_positions.append(int(pos))
            elif alpha_seq[pos - 1] == letter:
                correct_positions.append(int(pos - 1))
            elif alpha_seq[pos + 1] == letter:
                correct_positions.append(int(pos + 1))
            elif alpha_seq[pos - 2] == letter:
                correct_positions.append(int(pos - 2))
            elif alpha_seq[pos + 2] == letter:
                correct_positions.append(int(pos + 2))

        embeddings.at[i, "alpha_positions"] = correct_positions
    
        if len(correct_positions) == 15:
            break

for i in tqdm(range(len(embeddings))):
    beta_seq = embeddings.loc[i, "beta_seq"]
    interface = embeddings.loc[i, "interface"]
    iterator = list(re.finditer(interface[-7:-5] + ".{2}" + interface[-5], beta_seq))

    for match in iterator:
        correct_positions = []

        for letter, pos in zip(interface[15:], beta_positions - beta_positions[-7] + match.start()):
            try:
                if letter == "X":
                    correct_positions.append(int(pos))
                elif beta_seq[pos] == letter:
                    correct_positions.append(int(pos))
                elif beta_seq[pos - 1] == letter:
                    correct_positions.append(int(pos - 1))
                elif beta_seq[pos + 1] == letter:
                    correct_positions.append(int(pos + 1))
                elif beta_seq[pos - 2] == letter:
                    correct_positions.append(int(pos - 2))
                elif beta_seq[pos + 2] == letter:
                    correct_positions.append(int(pos + 2))
            except:
                continue

        embeddings.at[i, "beta_positions"] = correct_positions
    
        if len(correct_positions) == 19:
            break


100%|███████████████████████████████████████████████| 129146/129146 [00:09<00:00, 13786.37it/s]
100%|███████████████████████████████████████████████| 129146/129146 [00:09<00:00, 14119.89it/s]


In [42]:
embeddings.to_csv('/home/user11/data/embeddings_proteins/wide_data.tsv', index=False, sep='\t')


## Генерируем независимую выборку

In [88]:
def fix_name(name):
    if name == 'DRA01010101':
        return 'DRA10101'
    else: 
        return name

table = pd.read_csv('/home/user11/data/embeddings_proteins/wide_data.tsv', sep='\t')
table['alpha_id_'] = table['alpha_id'].apply(fix_name)

train_mhc_names = []
for index, line in table.iterrows():
    alpha = line['alpha_id']
    beta = line['beta_id']
    train_mhc_names.append(f'HLA-{alpha}-{beta}')
train_mhc_names = set(train_mhc_names)

test = pd.read_csv('/home/user11/data/embeddings_test/test_dataset.csv', sep='\t')
test['allele'] = test['allele'].str.replace(':', '').str.replace('*', '')
test_mhc_names = set(test['allele'])
print(len(test))


mhc_filter = test_mhc_names - train_mhc_names
peptide_filter = set(test['peptide']) - set(table['peptide'])

_filter = [True if i['allele'] in mhc_filter and i['peptide'] in peptide_filter else False for _, i in test.iterrows()]
test = test[_filter]
test = test[[True if not i in {'H-2-IAb', 'H-2-IAk', 'H-2-IEk',} else False for i in test['allele']]]
test = test.reset_index(drop=True)

print(len(test))

2413
2161


In [66]:
with open("/home/user11/data/mhcii/allele_sequences_test.json") as file:
    allels = json.load(file)

In [89]:
def short_name(name):
    return name.split('-')[1].replace('*', '').replace(':', '')

table = []
for key1 in allels.keys():
    for key2 in allels[key1].keys():
        table.append(
            {'name': key1, 'second_name': key2, 'sequence': allels[key1][key2]}
        )

df = pd.DataFrame(table)
df['id'] = df['name'].apply(short_name)
df = df.drop_duplicates(subset=['id'])
df = df.reset_index(drop=True)
df = df.set_index('id')

In [99]:
df

Unnamed: 0_level_0,name,second_name,sequence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRB10701,HLA-DRB1*07:01,DRB1*07:01:01:01,MVCLKLPGGSCMAALTVTLMVLSSPLALAGDTQPRFLWQGKYKCHF...
DRB10101,HLA-DRB1*01:01,DRB1*01:01:01:01,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...
DQB10402,HLA-DQB1*04:02,DQB1*04:02:01:01,MSWKKALRIPGGLRVATVTLMLAMLSTPVAEGRDSPEDFVFQFKGM...
DRB11001,HLA-DRB1*10:01,DRB1*10:01:01:01,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEEVKFECHF...
DRB11101,HLA-DRB1*11:01,DRB1*11:01:01:01,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...
DRB11402,HLA-DRB1*14:02,DRB1*14:02:01:01,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...
DRB11301,HLA-DRB1*13:01,DRB1*13:01:01:01,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...
DPB10501,HLA-DPB1*05:01,DPB1*05:01:01:01,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...
DRB10402,HLA-DRB1*04:02,DRB1*04:02:01,MVCLKFPGGSCMAALTVTLMVLSSPLALAGDTRPRFLEQVKHECHF...
DPB10201,HLA-DPB1*02:01,DPB1*02:01:02:01,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...


In [97]:
import numpy as np

alpha_chain = []
beta_chain = []
for index, line in test.iterrows():
    name = line['allele'].split('-')
    alpha_chain.append(name[1])
    beta_chain.append(name[2])

data_test = test.copy()
data_test['alpha_id'] = alpha_chain
data_test['beta_id'] = beta_chain
data_test = data_test[['allele', 'peptide', 'label_quantitative', 'alpha_id', 'beta_id']]
data_test = data_test[~np.isnan(data_test['label_quantitative'])]

# оставим только те случаи для которые есть 
# filter_ = [True if line['alpha_id'] in df_hla.index and line['beta_id'] in df_hla.index else False for index, line in data_test.iterrows()]
# data_test = data_test[filter_]


In [104]:
alpha_seq = []
beta_seq = []
for index, line in data_test.iterrows():
    alpha_id = line['alpha_id']
    beta_id = line['beta_id']
    alpha_seq.append(df.loc[alpha_id]['sequence'])
    beta_seq.append(df.loc[beta_id]['sequence'])

data_test['alpha_seq'] = alpha_seq
data_test['beta_seq'] = beta_seq

In [106]:
hla_interface_path = '/home/user11/base/DeepMHCII/data/pseudosequence.2016.all.X.dat'
hla_interface = pd.read_csv(hla_interface_path, sep='\t', header=None)
hla_interface.columns = ['id', 'seq']
hla_interface['id'] = hla_interface['id'].str.replace('_', '')
hla_interface = hla_interface.drop_duplicates(subset=['id'])
hla_interface = hla_interface.set_index(keys=['id'], drop=True)

In [115]:
interface = []
for i in data_test['allele']:
    if i.startswith('HLA-DRA10101-'):
        i = i.split('-')[-1]
    interface.append(hla_interface.loc[i, 'seq'])
data_test['interface'] = interface

In [131]:
data_test

Unnamed: 0,allele,peptide,label_quantitative,alpha_id,beta_id,alpha_seq,beta_seq,interface
0,HLA-DRA10101-DRB50101,QTKEFQVLKSLGKLA,0.594998,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT
1,HLA-DRA10101-DRB10101,GASFYHLGAAVPHSQ,0.682411,DRA10101,DRB10101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...,QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
2,HLA-DRA10101-DRB10101,RTYSLGSALRPSTSR,0.778379,DRA10101,DRB10101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...,QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
3,HLA-DRA10101-DRB10101,RWGFRSGVPPKVVNY,0.307237,DRA10101,DRB10101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...,QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
4,HLA-DRA10101-DRB10101,EGFLKAQASLRGQYW,0.878086,DRA10101,DRB10101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...,QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
...,...,...,...,...,...,...,...,...
2154,HLA-DRA10101-DRB10301,VPRAAEVPGAQGQQGPRG,0.084687,DRA10101,DRB10301,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...,QEFFIASGAAVDAIMESSYDYFDLQKRNYHVVFT
2157,HLA-DRA10101-DRB10301,ERRVAGPQVGGVNPLEGGSR,0.047212,DRA10101,DRB10301,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...,QEFFIASGAAVDAIMESSYDYFDLQKRNYHVVFT
2158,HLA-DRA10101-DRB11301,AAQERRVPRAAEVPGAQG,0.084687,DRA10101,DRB11301,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...,QEFFIASGAAVDAIMESSFDYFDIDEATYHVVFT
2159,HLA-DRA10101-DRB11301,VPGAQGQQGPRGREEAPR,0.084687,DRA10101,DRB11301,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLRLPGGSCMAVLTVTLMVLSSPLALAGDTRPRFLEYSTSECHF...,QEFFIASGAAVDAIMESSFDYFDIDEATYHVVFT


In [137]:
peptides = list(set(data_test['peptide']))

In [138]:
outputs = batch_embed(model, peptides, max_workers=8)

Embedding proteins: 100%|████████████████████████████████████| 453/453 [00:25<00:00, 17.51it/s]


In [139]:
emb_peptides = dict()
for pep, emb in zip(peptides, outputs):
    if isinstance(emb, LogitsOutput):
        emb_peptides[pep] = emb
    else:
        emb_peptides[pep] = batch_embed(model, [pep], max_workers=1)[0]

In [140]:
import os
write_emb_dir = '/home/user11/data/embeddings_proteins/emb_esmc_peptides_650m'
container_emb = []
for index, line in data_test.iterrows():
    pep = line['peptide']
    emb_path = f'{write_emb_dir}/{pep}.npy'
    container_emb.append(emb_path)
    if not os.path.exists(emb_path):
        np.save(emb_path, emb_peptides[pep].embeddings.cpu().numpy())
data_test['peptide_path'] = container_emb

In [150]:
proteins = dict()
proteins['alpha'] = dict()
proteins['beta'] = dict()
for index, line in data_test.iterrows():
    proteins['alpha'][line['alpha_id']] = line['alpha_seq']
    proteins['beta'][line['beta_id']] = line['beta_seq']



In [None]:
proteins

In [158]:
import os

tag = 'beta'
_proteins = proteins[tag]
emb_proteins = dict()


for key, seq in _proteins.items():
    emb_proteins[key] = batch_embed(model, [seq], max_workers=1)[0]
    if not isinstance(emb_proteins[key], LogitsOutput):
        emb_proteins[key] = batch_embed(model, [seq], max_workers=1)[0]
        

write_emb_dir = '/home/user11/data/embeddings_proteins/emb_esmc_650m/'
container_emb = []
for index, line in data_test.iterrows():
    pep = line[f'{tag}_id']
    emb_path = f'{write_emb_dir}/{pep}.npy'
    container_emb.append(emb_path)
    if not os.path.exists(emb_path):
        np.save(emb_path, emb_proteins[pep].embeddings.cpu().numpy())
data_test[f'{tag}_path'] = container_emb

Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.22it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 13.63it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.57it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.35it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.42it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.43it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.41it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.05it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.30it/s]
Embedding proteins: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 14.37it/s]
Embedding proteins: 100%|███████████████

In [162]:
data_test = data_test.reset_index(drop=True)

### Positions of AA

In [163]:
import numpy as np
import re

alpha_positions = np.array([9, 11, 22, 24, 31, 52, 53, 58, 59, 61, 65, 66, 68, 72, 73])
beta_positions = np.array([9, 11, 13, 26, 28, 30, 47, 57, 67, 70, 71, 74, 77, 78, 81, 85, 86, 89, 90])

embeddings = data_test.copy()
embeddings["alpha_positions"] = pd.NA
embeddings["beta_positions"] = pd.NA

for i in tqdm(range(len(embeddings))):
    alpha_seq = embeddings.loc[i, "alpha_seq"]
    interface = embeddings.loc[i, "interface"]
    iterator = list(re.finditer(interface[7:9], alpha_seq))

    for match in iterator:
        correct_positions = []

        for letter, pos in zip(interface[:15], alpha_positions - alpha_positions[7] + match.start()):
            if letter == "X":
                correct_positions.append(int(pos))
            elif alpha_seq[pos] == letter:
                correct_positions.append(int(pos))
            elif alpha_seq[pos - 1] == letter:
                correct_positions.append(int(pos - 1))
            elif alpha_seq[pos + 1] == letter:
                correct_positions.append(int(pos + 1))
            elif alpha_seq[pos - 2] == letter:
                correct_positions.append(int(pos - 2))
            elif alpha_seq[pos + 2] == letter:
                correct_positions.append(int(pos + 2))

        embeddings.at[i, "alpha_positions"] = correct_positions
    
        if len(correct_positions) == 15:
            break

for i in tqdm(range(len(embeddings))):
    beta_seq = embeddings.loc[i, "beta_seq"]
    interface = embeddings.loc[i, "interface"]
    iterator = list(re.finditer(interface[-7:-5] + ".{2}" + interface[-5], beta_seq))

    for match in iterator:
        correct_positions = []

        for letter, pos in zip(interface[15:], beta_positions - beta_positions[-7] + match.start()):
            try:
                if letter == "X":
                    correct_positions.append(int(pos))
                elif beta_seq[pos] == letter:
                    correct_positions.append(int(pos))
                elif beta_seq[pos - 1] == letter:
                    correct_positions.append(int(pos - 1))
                elif beta_seq[pos + 1] == letter:
                    correct_positions.append(int(pos + 1))
                elif beta_seq[pos - 2] == letter:
                    correct_positions.append(int(pos - 2))
                elif beta_seq[pos + 2] == letter:
                    correct_positions.append(int(pos + 2))
            except:
                continue

        embeddings.at[i, "beta_positions"] = correct_positions
    
        if len(correct_positions) == 19:
            break

100%|███████████████████████████████████████████████████| 2052/2052 [00:00<00:00, 14707.80it/s]
100%|███████████████████████████████████████████████████| 2052/2052 [00:00<00:00, 13832.00it/s]


In [168]:
embeddings = embeddings.rename(columns={'label_quantitative': 'score'})

In [169]:
embeddings.to_csv('/home/user11/data/embeddings_proteins/wide_data_test.tsv', index=False, sep='\t')

# ESM 32float

In [53]:
import os
import torch


read_dir = '/home/user11/data/embeddings'
write_dir = '/home/user11/data/embeddings_numpy_float32/'

names = os.listdir(read_dir)
for name in names:
    basename = name.split('.')[0]
    emb = torch.load(f'/home/user11/data/embeddings/{basename}.pt', weights_only=False)
    emb = emb[None,:,:]
    if not os.path.exists(f'{write_dir}/{basename}.npy'):
        np.save(f'{write_dir}/{basename}.npy', emb)


In [51]:
import pandas as pd

table_path = '/home/user11/data/embeddings_proteins/wide_data_test.tsv'
table = pd.read_csv(table_path, sep='\t')

table['alpha_path'] = table['alpha_id'].apply(lambda name: f'{write_dir}/{name}.npy')
table['beta_path'] = table['beta_id'].apply(lambda name: f'{write_dir}/{name}.npy')
table['peptide_path'] = table['peptide'].apply(lambda name: f'{write_dir}/{name}.npy')

write_path = '/home/user11/data/embeddings_proteins/wide_data_test_32float.tsv'
table.to_csv(write_path, index=False, sep='\t')


In [52]:
table_path = '/home/user11/data/embeddings_proteins/wide_data.tsv'
table = pd.read_csv(table_path, sep='\t')

table['alpha_path'] = table['alpha_id'].apply(lambda name: f'{write_dir}/{name}.npy')
table['beta_path'] = table['beta_id'].apply(lambda name: f'{write_dir}/{name}.npy')
table['peptide_path'] = table['peptide'].apply(lambda name: f'{write_dir}/{name}.npy')

write_path = '/home/user11/data/embeddings_proteins/wide_data_32float.tsv'
table.to_csv(write_path, index=False, sep='\t')

In [21]:
pep.unsqueeze(0).shape

AttributeError: 'numpy.ndarray' object has no attribute 'unsqueeze'

In [11]:
import numpy as np
pep = np.load('/home/user11/data/embeddings_proteins/emb_esmc_650m/DRB10102.npy')
pep.shape

(1, 268, 1152)

# Эмбендинги эпитопа полученные по всему белку

In [60]:
import pandas as pd

inital_table = pd.read_csv('/home/user11/data/embeddings_proteins/wide_data.tsv', sep='\t')
inital_table['alpha_id'] = inital_table['alpha_id'].str.replace('DRA01010101', 'DRA10101') 
inital_table['MHC Restriction'] = [i if i.startswith('HLA') else 'HLA-' + i for i in inital_table['hla']]

df = pd.read_csv('/home/user11/data/merged_df_new.csv')
df['MHC Restriction'] = df['MHC Restriction'].str.replace(':', '').str.replace('*', '').str.replace('/', '-')

In [59]:
inital_table

Unnamed: 0,peptide,score,hla,alpha_id,beta_id,alpha_seq,beta_seq,alpha_path,beta_path,interface,peptide_path,alpha_positions,beta_positions,MHC Restriction
0,PKYVKQNTLKLAT,0.000000,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",HLA-DPA10103-DPB10201
1,DSDVGEFRAVTELG,0.047212,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",HLA-DPA10103-DPB10201
2,AAAAGWQTLSAALDA,0.238910,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",HLA-DPA10103-DPB10201
3,AALDAQAVELTARLN,0.357937,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",HLA-DPA10103-DPB10201
4,AAPAAGYTPATPAAP,0.076722,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",HLA-DPA10103-DPB10201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129141,QWHKEGSSIGKLFTQHHHHHH,0.762939,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",DRB50101
129142,SLETVAIDRPAEVRKHHHHHH,0.236037,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",DRB50101
129143,GFPVRPQVPLRPMTYKGAFDL,0.514467,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",DRB50101
129144,INAGFKAALAAAAGVPPADKY,0.655131,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",DRB50101


In [61]:
df = df.rename(columns={'Epitope Seq':'peptide',
                        'Sequence':'epitop_full_sequence',
                        'Starting Position': 'start_epitop',
                        'Ending Position': 'end_epitop',
                        })

df = df[['MHC Restriction', 'peptide', 'epitop_full_sequence', 'Uniprot ID', 'start_epitop', 'end_epitop']]
df = df.drop_duplicates()
df = df.reset_index(drop=True)

In [67]:
greate_table = pd.merge(df, inital_table, on=['MHC Restriction', "peptide"])
greate_table

Unnamed: 0,MHC Restriction,peptide,epitop_full_sequence,Uniprot ID,start_epitop,end_epitop,score,hla,alpha_id,beta_id,alpha_seq,beta_seq,alpha_path,beta_path,interface,peptide_path,alpha_positions,beta_positions
0,HLA-DPA10103-DPB10201,VPDHVVWSLFNTL,MNHIVQTFSPVNSGQPPNYEMLKEEQEVAMLGVPHNPAPPMSTVIH...,Q01629,53.0,65.0,0.871874,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1..."
1,HLA-DPA10103-DPB10201,VPDHVVWSLFNTL,MHKEEHEVAVLGPPPSTILPRSTVINIHSETSVPDHVVWSLFNTLF...,P13164,33.0,45.0,0.871874,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1..."
2,HLA-DPA10103-DPB10201,VPDHVVWSLFNTL,MNHTSQAFVNAATGGQPPNYERIKEEYEVSELGAPHGSASVRTTVI...,Q01628,54.0,66.0,0.871874,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1..."
3,HLA-DPA10103-DPB10201,HTGREIVDLMCHAT,AGVLWDVPSPPPMGKAELEDGAYRIKQKGILGYSQIGAGVYKEGTF...,P17763,251.0,264.0,0.000000,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1..."
4,HLA-DPA10103-DPB10201,AAASVPAADKFKTFE,SVKRSNGSAEVHRGAVPRRGPRGGPGRSYAADAGYAPATPAAAGAE...,Q40960,71.0,85.0,0.203668,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36231,HLA-DRB50101,SWYQQALGQGPQFIFQYYEE,MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKS...,A0A599,52.0,71.0,0.212813,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1..."
36232,HLA-DRB50101,TGTEKLIETYFSKNYQDYEY,MGLLECCARCMVGAPFASLVATGLCFFGVALFCGCGHEALTGTEKL...,P60201,41.0,60.0,0.515053,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1..."
36233,HLA-DRB50101,VYYLTRDPTTPLARAAWETA,MSTNPKPQRKTKRNTNRRPQDVKFPGGGQIVGGVYLLPRRGPRLGV...,P27958,2801.0,2820.0,0.374679,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1..."
36234,HLA-DRB50101,ALGQGPQFIFQYYEEEERQRG,MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKS...,A0A599,57.0,77.0,0.574375,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1..."


In [90]:
tag = 'epitop_full_sequence'
_proteins = set((i[0], i[1]) for i in greate_table[['epitop_full_sequence', 'Uniprot ID']].itertuples(index=False))
_proteins

{('AAAAVPRRGPRGGPGRSYTADAGYAPATPAAAGAAAGKATTEEQKLIEDINVGFKAAVAAAASVPAADKFKTFEAAFTSSSKAAAAKAPGLVPKLDAAYSVAYKAAVGATPEAKFDSFVASLTEALRVIAGALEVHAVKPVTEEPGMAKIPAGELQIIDKIDAAFKVAATAAATAPADDKFTVFEAAFNKAIKESTGGAYDTYKCIPSLEAAVKQAYAATVAAAPQVKYAVFEAALTKAITAMSEVQKVSQPATGAATVAAGAATTAAGAASGAATVAAGGYKV',
  'Q40960'),
 ('AAPVEFTVEKGSDEKNLALSIKYNKEGDSMAEVELKEHGSNEWLALKKNGDGVWEIKSDKPLKGPFNFRFVSEKGMRNVFDDVVPADFKVGTTYKPE',
  'P14947'),
 ('AAPVEFTVEKGSDEKNLALSIKYNKEGDSMAEVELKEHGSNEWLALKKNGDGVWEIKSDKPLKGPFNFRFVSEKGMRNVFDDVVPADFKVGTTYKPE',
  'Q41183'),
 ('AAQKRPSQRSKYLASASTMDHARHGFLPRHRDTGILDSLGRFFGSDRGAPKRGSGKDGHHAARTTHYGSLPQKAQGHRPQDENPVVHFFKNIVTPRTPPPSQGKGRGLSLSRFSWGAEGQKPGFGYGGRASDYKSAHKGLKGHDAQGTLSKIFKLGGRDSRSGSPMARR',
  'P02687'),
 ('AATSESLDVMASQKRPSQRHGSKYLATASTMDHARHGFLPRHRDTGILDSIGRFFGGDRGAPKRGSGKDSHHPARTAHYGSLPQKSHGRTQDENPVVHFFKNIVTPRTPPPSQGKGRGLSLSRFSWGAEGQRPGFGYGGRASDYKSAHKGFKGVDAQGTLSKIFKLGGRDSRSGSPMARR',
  'P02686'),
 ('ADAGYTPAAPAAAGAGGKATTDEQKLLEDVNAGFKTAVAAAANVPPADKYKTFEAAFTASSKASIAAAATKAPGL

In [93]:
import os

tag = 'epitop_full_sequence'
_proteins = set((i[0], i[1]) for i in greate_table[['epitop_full_sequence', 'Uniprot ID']].itertuples(index=False))
emb_proteins = dict()

for seq, key in tqdm(_proteins):
    emb_proteins[key] = batch_embed(model, [seq], max_workers=1)[0]
    if not isinstance(emb_proteins[key], LogitsOutput):
        emb_proteins[key] = batch_embed(model, [seq], max_workers=1)[0]
        if not isinstance(emb_proteins[key], LogitsOutput):
            emb_proteins[key] = batch_embed(model, [seq], max_workers=1)[0]
            if not isinstance(emb_proteins[key], LogitsOutput):
                emb_proteins[key] = batch_embed(model, [seq], max_workers=1)[0]
        



100%|████████████████████████████████████████████████████████| 787/787 [03:27<00:00,  3.79it/s]


In [95]:
failed = []
for key in emb_proteins.keys():
    if not isinstance(emb_proteins[key], LogitsOutput):
        failed.append(key)
failed = set(failed)
failed

{'A0A0J9YY99',
 'A5K724',
 'P01764',
 'P01889',
 'P04601',
 'P10321',
 'P18541',
 'P60710',
 'P9WPE7',
 'Q504P2'}

In [97]:
filter_ = [False if i in failed else True for i in greate_table['Uniprot ID']]
greate_table = greate_table[filter_]

In [98]:
import numpy as np

write_emb_dir = '/home/user11/data/embeddings_proteins/emb_esmc_full_epitopes_650m/'
container_emb = []
for index, line in greate_table.iterrows():
    key = line[f'Uniprot ID']
    emb_path = f'{write_emb_dir}/{key}.npy'
    container_emb.append(emb_path)
    if not os.path.exists(emb_path):
        np.save(emb_path, emb_proteins[key].embeddings.cpu().numpy())
greate_table[f'{tag}_emb_path'] = container_emb

In [99]:
greate_table.to_csv('/home/user11/data/embeddings_proteins/greate_data.tsv', sep='\t', index=False)

In [100]:
greate_table

Unnamed: 0,MHC Restriction,peptide,epitop_full_sequence,Uniprot ID,start_epitop,end_epitop,score,hla,alpha_id,beta_id,alpha_seq,beta_seq,alpha_path,beta_path,interface,peptide_path,alpha_positions,beta_positions,epitop_full_sequence_emb_path
0,HLA-DPA10103-DPB10201,VPDHVVWSLFNTL,MNHIVQTFSPVNSGQPPNYEMLKEEQEVAMLGVPHNPAPPMSTVIH...,Q01629,53.0,65.0,0.871874,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
1,HLA-DPA10103-DPB10201,VPDHVVWSLFNTL,MHKEEHEVAVLGPPPSTILPRSTVINIHSETSVPDHVVWSLFNTLF...,P13164,33.0,45.0,0.871874,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
2,HLA-DPA10103-DPB10201,VPDHVVWSLFNTL,MNHTSQAFVNAATGGQPPNYERIKEEYEVSELGAPHGSASVRTTVI...,Q01628,54.0,66.0,0.871874,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
3,HLA-DPA10103-DPB10201,HTGREIVDLMCHAT,AGVLWDVPSPPPMGKAELEDGAYRIKQKGILGYSQIGAGVYKEGTF...,P17763,251.0,264.0,0.000000,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
4,HLA-DPA10103-DPB10201,AAASVPAADKFKTFE,SVKRSNGSAEVHRGAVPRRGPRGGPGRSYAADAGYAPATPAAAGAE...,Q40960,71.0,85.0,0.203668,HLA-DPA10103-DPB10201,DPA10103,DPB10201,MRPEDRMFHIRAVILRALSLAFLLSLRGAGAIKADHVSTYAAFVQT...,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,YAFFMFSGGAILNTLFGQFEYFDIEEVRMHLGMT,/home/user11/data/embeddings_proteins/emb_esmc...,"[39, 41, 52, 54, 61, 82, 83, 88, 89, 91, 95, 9...","[37, 39, 38, 52, 54, 56, 73, 83, 93, 96, 97, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36231,HLA-DRB50101,SWYQQALGQGPQFIFQYYEE,MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKS...,A0A599,52.0,71.0,0.212813,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
36232,HLA-DRB50101,TGTEKLIETYFSKNYQDYEY,MGLLECCARCMVGAPFASLVATGLCFFGVALFCGCGHEALTGTEKL...,P60201,41.0,60.0,0.515053,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
36233,HLA-DRB50101,VYYLTRDPTTPLARAAWETA,MSTNPKPQRKTKRNTNRRPQDVKFPGGGQIVGGVYLLPRRGPRLGV...,P27958,2801.0,2820.0,0.374679,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
36234,HLA-DRB50101,ALGQGPQFIFQYYEEEERQRG,MGPGLLCWALLCLLGAGLVDAGVTQSPTHLIKTRGQQVTLRCSPKS...,A0A599,57.0,77.0,0.574375,DRB50101,DRA10101,DRB50101,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...,/home/user11/data/embeddings_proteins/emb_esmc...,/home/user11/data/embeddings_proteins/emb_esmc...,QEFFIASGAAVDAIMQDYFHDYDFDRATYHVGFT,/home/user11/data/embeddings_proteins/emb_esmc...,"[33, 35, 46, 48, 55, 76, 77, 82, 83, 85, 89, 9...","[37, 39, 41, 54, 56, 58, 75, 85, 95, 98, 99, 1...",/home/user11/data/embeddings_proteins/emb_esmc...
