In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
!pip install -q gdown
!gdown 1HJXSEBC-VJo2mytdTBtZOhJGwKC7cT-4 -O protein_id.csv # file protein_ids.csv
!gdown 1o-zZ0G_cCWLIVobQVfW4-dJ_VIxcAYAA -O protein_embedddings.npy # file protein_embeddings.npy

In [3]:

# Read the ID table that accompanies the public embedding matrix.
path = "/kaggle/input/cafa6-protein-embeddings-esm2/protein_ids.csv"
df = pd.read_csv(path)

# Plain-text preview of the first 20 rows.
print(df.iloc[:20])

    protein_id
0       Q9ZSA8
1       P25353
2   A0A2R8YCW8
3       G3V5N8
4   A0A140LFN4
5       B8ZZU6
6       Q01850
7       P11076
8       Q9VJ64
9       Q7YSJ4
10      Q9URX2
11  A0A384ME80
12      Q97SR4
13      O82663
14      P58958
15      P83142
16      Q388J7
17  A0A0B4K7W6
18      Q6WRH9
19      Q9V474


In [24]:
import numpy as np

# Alternative source: the full public embedding matrix from the Kaggle dataset.
# path = "/kaggle/input/cafa6-protein-embeddings-esm2/protein_embeddings.npy"
# arr = np.load(path)
# print("Shape:", arr.shape)

# NOTE(review): these files are presumably the ones written by the
# np.save("test_embeds.npy") / np.save("train_embeds.npy") cells further down
# (assuming the working directory is /kaggle/working — confirm). On a fresh
# kernel they do not exist yet when this cell runs, so guard each load instead
# of failing with FileNotFoundError during Restart & Run All.
test_path = "/kaggle/working/test_embeds.npy"
train_path = "/kaggle/working/train_embeds.npy"

if Path(test_path).exists():
    arr = np.load(test_path)
    print("Shape:", arr.shape)
else:
    print("Not found yet (run the export cells first):", test_path)

if Path(train_path).exists():
    train_arr = np.load(train_path)
    print("Shape train: ", train_arr.shape)
else:
    print("Not found yet (run the export cells first):", train_path)

Shape: (224309, 1280)
Shape train:  (82404, 1280)


In [5]:
## Parse FASTA to IDs
def load_fasta_ids(path):
    """Collect protein IDs from the header lines of a FASTA file.

    Two header layouts are recognised:

    * pipe-delimited (train), e.g. ``>sp|A0A0C5B5G6|MOTSC_HUMAN`` -> the
      second ``|``-separated field is taken as the ID;
    * whitespace-delimited (test), e.g. ``>A0A0C5B5G6 9606`` -> the first
      whitespace-separated token is taken as the ID.

    Header lines that are blank after the leading ``>`` carry no ID and are
    skipped (previously they raised ``IndexError``).

    Parameters
    ----------
    path : str or path-like
        Location of the FASTA file.

    Returns
    -------
    list of str
        IDs in the order their headers appear in the file; duplicates are kept.
    """
    ids = []
    with open(path) as f:
        for line in f:
            if not line.startswith(">"):
                continue  # sequence line, not a header

            header = line[1:].strip()  # drop the leading ">"

            if "|" in header:
                # TRAIN format: >sp|A0A0C5B5G6|MOTSC_HUMAN
                # A header containing "|" always splits into at least two parts.
                ids.append(header.split("|")[1])
            else:
                # TEST format: >A0A0C5B5G6 9606
                tokens = header.split()
                if tokens:  # a bare ">" yields no tokens
                    ids.append(tokens[0])  # e.g. A0A0C5B5G6
    return ids

# FASTA inputs: the training sequences and the test superset.
train_fasta = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
test_fasta  = "/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta"

# Parse both files with the same header parser (train first, then test).
train_ids, test_ids = (load_fasta_ids(fasta) for fasta in (train_fasta, test_fasta))

# Report how many IDs were read from each file.
print("Train FASTA:", len(train_ids))
print("Test FASTA:", len(test_ids))

Train FASTA: 82404
Test FASTA: 224309


In [6]:
## Load the public embedding dataset (ID table + embedding matrix)
emb_dir = Path("/kaggle/input/cafa6-protein-embeddings-esm2")

# IDs are read from the "protein_id" column and normalised to plain Python strings.
id_table = pd.read_csv(emb_dir / "protein_ids.csv")
public_ids = id_table["protein_id"].astype(str).tolist()

embeds = np.load(emb_dir / "protein_embeddings.npy")

# Sizes first, then a small peek at the head of each.
print("Embeddings loaded:", embeds.shape)
print("IDs loaded:", len(public_ids))

print(public_ids[:5])
print(embeds[:5])

Embeddings loaded: (287001, 1280)
IDs loaded: 287001
['Q9ZSA8', 'P25353', 'A0A2R8YCW8', 'G3V5N8', 'A0A140LFN4']
[[-0.09229109 -0.06628396 -0.01226195 ... -0.16065988  0.0159197
   0.00173827]
 [ 0.01162435 -0.03031761 -0.00580197 ... -0.15341142  0.00663222
   0.00142484]
 [ 0.02737274 -0.04104703 -0.02920537 ... -0.05123704 -0.01053242
   0.02256369]
 [ 0.03376611 -0.07888931 -0.05974137 ... -0.10354079  0.01092495
   0.1584017 ]
 [ 0.0119482  -0.00210759 -0.08492219 ... -0.05035866 -0.05547693
   0.02029048]]


In [7]:
## Dict to look up an embedding row by protein ID
# zip() stops at the shorter input, so a row-count mismatch between the ID list
# and the embedding matrix would silently drop entries; fail loudly instead.
if len(public_ids) != len(embeds):
    raise ValueError(
        f"ID/embedding count mismatch: {len(public_ids)} ids vs {len(embeds)} embedding rows"
    )

# Pair the i-th ID with the i-th row of the matrix. If an ID occurs more than
# once, the last occurrence wins (normal dict behaviour).
pid_to_emb = dict(zip(public_ids, embeds))

In [8]:
## Sample up to 1000 train and 5000 test IDs for a spot check
import random

# Fixed seed so the same IDs are drawn on every run.
random.seed(2025)

# Cap each sample size at the number of IDs actually available.
sample_train = random.sample(train_ids, k=min(len(train_ids), 1000))
sample_test  = random.sample(test_ids,  k=min(len(test_ids), 5000))

In [9]:
def check_ids(sample_ids, pid_to_emb, expected_dim=1280):
    """Sanity-check the embeddings available for a collection of protein IDs.

    Parameters
    ----------
    sample_ids : iterable
        IDs to look up.
    pid_to_emb : mapping
        Maps an ID to its embedding (a NumPy array).
    expected_dim : int, optional
        Required length of the embedding's first axis. Defaults to 1280, the
        value that was previously hard-coded.

    Returns
    -------
    tuple of (list, list, list)
        ``(missing, wrong_dim, nan_vec)``:
        ``missing``   - IDs that are not keys of ``pid_to_emb``;
        ``wrong_dim`` - IDs whose embedding has ``shape[0] != expected_dim``;
        ``nan_vec``   - IDs whose embedding contains at least one NaN.
        An ID can appear in both ``wrong_dim`` and ``nan_vec``.
    """
    missing = []
    wrong_dim = []
    nan_vec = []

    for pid in sample_ids:
        if pid not in pid_to_emb:
            missing.append(pid)
            continue

        emb = pid_to_emb[pid]
        # The two checks are independent: a vector can fail both.
        if emb.shape[0] != expected_dim:
            wrong_dim.append(pid)
        if np.isnan(emb).any():
            nan_vec.append(pid)

    return missing, wrong_dim, nan_vec


# Run the same sanity checks on the train sample and on the test sample.
missing_train, wrong_train, nan_train = check_ids(sample_train, pid_to_emb)
missing_test,  wrong_test,  nan_test  = check_ids(sample_test, pid_to_emb)

# One report per split; the test banner carries a leading newline to separate
# it from the train report.
reports = [
    ("===== TRAIN CHECK =====", missing_train, wrong_train, nan_train),
    ("\n===== TEST CHECK =====", missing_test, wrong_test, nan_test),
]
for banner, missing_ids, wrong_ids, nan_ids in reports:
    print(banner)
    print("Missing:", len(missing_ids))
    print("Wrong dim:", len(wrong_ids))
    print("NaN:", len(nan_ids))

===== TRAIN CHECK =====
Missing: 0
Wrong dim: 0
NaN: 0

===== TEST CHECK =====
Missing: 0
Wrong dim: 0
NaN: 0


In [10]:
# Train: keep only the FASTA IDs that have a public embedding, in FASTA order.
train_ids_ok = [pid for pid in train_ids if pid in pid_to_emb]

# Stack the matching rows into one array and persist it.
train_embeds = np.array([pid_to_emb[pid] for pid in train_ids_ok])
np.save("train_embeds.npy", train_embeds)

# Write the kept IDs one per line, in the same order as the rows saved above.
with open("train_ids.txt", "w") as f:
    f.writelines(pid + "\n" for pid in train_ids_ok)

print("Saved train_embeds.npy:", train_embeds.shape)


Saved train_embeds.npy: (82404, 1280)


In [11]:
# Test: keep only the FASTA IDs that have a public embedding, in FASTA order.
test_ids_ok = [pid for pid in test_ids if pid in pid_to_emb]

# Stack the matching rows into one array and persist it.
test_embeds = np.array([pid_to_emb[pid] for pid in test_ids_ok])
np.save("test_embeds.npy", test_embeds)

# Write the kept IDs one per line, in the same order as the rows saved above.
with open("test_ids.txt", "w") as f:
    f.writelines(pid + "\n" for pid in test_ids_ok)

print("Saved test_embeds.npy:", test_embeds.shape)


Saved test_embeds.npy: (224309, 1280)


## Embedding Check

In [12]:
# def load_fasta_dict(path):
#     seqs = {}
#     current_id = None
    
#     with open(path) as f:
#         for line in f:
#             line = line.strip()
            
#             if not line:
#                 continue
            
#             if line.startswith(">"):
#                 header = line[1:]  # remove ">"

#                 # Case 1 — TRAIN format: sp|A0A0C5B5G6|MOTSC_HUMAN
#                 if "|" in header:
#                     parts = header.split("|")
#                     if len(parts) >= 2:
#                         current_id = parts[1]
#                     else:
#                         continue
                
#                 # Case 2 — TEST format: A0A0C5B5G6 9606
#                 else:
#                     current_id = header.split()[0]

#                 seqs[current_id] = ""
            
#             else:
#                 if current_id:
#                     seqs[current_id] += line

#     return seqs


# train_fasta = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
# test_fasta = "/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta"

# train_seqs = load_fasta_dict(train_fasta)
# test_seqs  = load_fasta_dict(test_fasta)

# print("Train sequences:", len(train_seqs))
# print("Test sequences:", len(test_seqs))

Train sequences: 82404
Test sequences: 224309


In [13]:
# import random
# random.seed(2025)

# train_ids_local = random.sample(list(train_seqs.keys()), 5)
# print(train_ids_local)

# test_ids_local = random.sample(list(test_seqs.keys()), 20)
# test_ids_local

['Q10243', 'Q96SY0', 'C8V3W5', 'Q03200', 'Q9VQ36']


['O15116',
 'F4IUJ7',
 'Q1RMA6',
 'Q7Z1F8',
 'Q9VIS1',
 'P63101',
 'Q9LF46',
 'P25366',
 'Q94K66',
 'Q9NTN3',
 'Q8BZ97',
 'P06356',
 'Q59W33',
 'Q1ECT8',
 'Q9JM15',
 'Q9M203',
 'Q13813',
 'Q5VVP1',
 'Q5XKE5',
 'P29860']

In [14]:
# Force-reinstall protobuf pinned to the 3.20.x series.
# NOTE(review): the captured output shows this replaces protobuf 6.33.0 and that
# pip reports conflicts with packages requiring newer protobuf (onnx,
# opentelemetry-proto, a2a-sdk). Presumably needed for the model-loading cell
# below — confirm the downgrade is still required.
!pip install --upgrade --force-reinstall protobuf==3.20.*

Collecting protobuf==3.20.*
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29

In [15]:
# import torch
# from transformers import AutoTokenizer, AutoModel

# device = "cuda" if torch.cuda.is_available() else "cpu"

# tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
# model = AutoModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
# model.eval()

tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

2025-12-04 10:06:38.539955: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764842798.798463      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764842798.870756      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_feature

In [18]:
# import torch.nn.functional as F

# def get_esm2_embedding(seq: str):
#     tokens = tokenizer(seq, return_tensors="pt", add_special_tokens=True).to(device)
#     with torch.no_grad():
#         output = model(**tokens)
#     last_hidden = output.last_hidden_state.squeeze(0)  # (L, 1280)
#     mask = tokens["attention_mask"].squeeze(0).unsqueeze(-1)  # (L,1)
#     last_hidden = last_hidden * mask
#     emb = last_hidden.sum(dim=0) / mask.sum()
#     return emb.cpu().numpy()

In [19]:
# import numpy as np
# import pandas as pd

# emb_dir = "/kaggle/input/cafa6-protein-embeddings-esm2"

# public_ids = pd.read_csv(f"{emb_dir}/protein_ids.csv")["protein_id"].astype(str).tolist()
# public_embs = np.load(f"{emb_dir}/protein_embeddings.npy")

# pid2emb_pub = {pid: emb for pid, emb in zip(public_ids, public_embs)}


In [21]:
# ## Compare
# from numpy.linalg import norm

# def compare_embeddings(emb1, emb2):
#     cos_sim = np.dot(emb1, emb2) / (norm(emb1) * norm(emb2))
#     mse = np.mean((emb1 - emb2)**2)
#     max_diff = np.max(np.abs(emb1 - emb2))
#     return cos_sim, mse, max_diff


In [23]:
# ## Run
# for pid in test_ids_local:
#     print("="*50)
#     print("Protein:", pid)

#     if pid in train_seqs:
#         seq = train_seqs[pid]
#     elif pid in test_seqs:
#         seq = test_seqs[pid]
#     else:
#         print("❌ PID không nằm trong train hoặc test FASTA!")
#         continue
#     local_emb = get_esm2_embedding(seq)

#     if pid not in pid2emb_pub:
#         print("❌ ID không có trong dataset public")
#         continue

#     public_emb = pid2emb_pub[pid]

#     cos, mse, diff = compare_embeddings(local_emb, public_emb)

#     print("Cosine similarity:", cos)
#     print("MSE:", mse)
#     print("Max diff:", diff)

Protein: O15116
Cosine similarity: 0.99996233
MSE: 3.6694303e-06
Max diff: 0.03949547
Protein: F4IUJ7
Cosine similarity: 0.9999979
MSE: 2.406604e-07
Max diff: 0.008792639
Protein: Q1RMA6
Cosine similarity: 0.99999815
MSE: 2.7569257e-07
Max diff: 0.012228966
Protein: Q7Z1F8
Cosine similarity: 0.9999999
MSE: 5.6785013e-15
Max diff: 4.7683716e-07
Protein: Q9VIS1
Cosine similarity: 0.99620664
MSE: 0.00066582754
Max diff: 0.5303054
Protein: P63101
Cosine similarity: 0.99998724
MSE: 1.169933e-06
Max diff: 0.02277565
Protein: Q9LF46
Cosine similarity: 0.99999636
MSE: 2.7936716e-07
Max diff: 0.0130906105
Protein: P25366
Cosine similarity: 0.999996
MSE: 4.7244976e-07
Max diff: 0.01104188
Protein: Q94K66
Cosine similarity: 0.99998575
MSE: 1.4896821e-06
Max diff: 0.019098282
Protein: Q9NTN3
Cosine similarity: 0.9999489
MSE: 1.8434503e-06
Max diff: 0.04030907
Protein: Q8BZ97


KeyboardInterrupt: 