<a href="https://colab.research.google.com/github/yifan-grace-tang/final-project/blob/main/Renee/report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---

### Required Imports

In [4]:
from copy import deepcopy
import pandas as pd
import os
import time
import shutil
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from scipy.stats import spearmanr
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

---

### Data Collection and Cleaning



> __This section relies on having `nrPDB-EC_2020.04_sequences.fasta`, `nrPDB-EC_2020.04_annot.tsv`, `nrPDB-EC_2020.04_train.txt`, `nrPDB-EC_2020.04_valid.txt` and `esm_embeddings.zip` in your runtime.__

We start by parsing the protein sequences from the `nrPDB-EC_2020.04_sequences.fasta` file into a dictionary keyed by PDB-chain ID. Our _sequence_ will be the entry-point to generate mutated sequences from coded mutations as discussed later.

In [5]:
def parse_fasta(filename):
    """Read a FASTA file into a dictionary.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        dict mapping each header line (with the leading ">" removed) to the
        concatenation of the stripped lines that follow it, up to the next
        header. Lines appearing before the first header are discarded.
    """
    records = {}
    header = None
    chunks = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith(">"):
                # Sequence data: collect it for the record currently open.
                chunks.append(text)
                continue
            # A new header closes the previous record, if there was one.
            if header is not None:
                records[header] = ''.join(chunks)
            header, chunks = text[1:], []
    # The final record has no following header to flush it.
    if header is not None:
        records[header] = ''.join(chunks)
    return records

# Parse every FASTA record, then shorten each header to the part before
# ' nrPDB' so the keys can be matched against the 'PDB-chain' column later.
seq_dict = {
    header.split(' nrPDB')[0]: sequence
    for header, sequence in parse_fasta("nrPDB-EC_2020.04_sequences.fasta").items()
}

Load the train/validation splits (each file lists one PDB-chain ID per line).

In [6]:
# Read the protein IDs of each split. The files have no header row, so the
# single column is named explicitly.
train_ids = pd.read_csv(
    "nrPDB-EC_2020.04_train.txt",
    header=None,
    names=["PDB-chain"],
)
valid_ids = pd.read_csv(
    "nrPDB-EC_2020.04_valid.txt",
    header=None,
    names=["PDB-chain"],
)

# Preview the first five IDs of each split
print(train_ids.head(5))
print(valid_ids.head(5))

  PDB-chain
0    1R9W-A
1    3U7V-A
2    1CK7-A
3    6FLM-A
4    2WBK-A
  PDB-chain
0    1EF9-A
1    4BYF-A
2    1MVP-A
3    2BIH-A
4  6UE0-AAA


In [7]:
# Read the tab-separated annotation table. The first three lines of the file
# are skipped (skiprows=3) and the two column names are supplied by hand.
df_annot = pd.read_csv(
    "nrPDB-EC_2020.04_annot.tsv",
    sep="\t",
    skiprows=3,
    names=["PDB-chain", "EC-nums"],
)

# Turn each comma-separated EC string into a list of EC numbers
df_annot['EC-nums'] = df_annot['EC-nums'].apply(lambda ec_field: ec_field.split(','))

print(df_annot.head())

  PDB-chain              EC-nums
0    4PR3-A   [3.2.2.9, 3.2.2.-]
1    1TNT-A  [6.5.1.-, 3.1.22.-]
2    1T8A-A  [3.2.1.17, 3.2.1.-]
3    5H75-A            [4.1.1.-]
4    2FOR-A  [2.5.1.-, 2.5.1.10]


In [8]:
# Keep the annotation rows belonging to each split and attach the matching
# sequence (looked up by PDB-chain ID in seq_dict). `.assign` returns a new
# frame, so df_annot itself is left untouched.
df_train = (
    df_annot.loc[df_annot['PDB-chain'].isin(train_ids['PDB-chain'])]
    .assign(sequence=lambda frame: frame['PDB-chain'].map(seq_dict))
)
df_valid = (
    df_annot.loc[df_annot['PDB-chain'].isin(valid_ids['PDB-chain'])]
    .assign(sequence=lambda frame: frame['PDB-chain'].map(seq_dict))
)

print(df_train.head())
print(df_valid.head())

  PDB-chain              EC-nums  \
0    4PR3-A   [3.2.2.9, 3.2.2.-]   
1    1TNT-A  [6.5.1.-, 3.1.22.-]   
2    1T8A-A  [3.2.1.17, 3.2.1.-]   
3    5H75-A            [4.1.1.-]   
6    4XL3-A            [2.1.3.-]   

                                            sequence  
0  MHHHHHHHHGVDLGTENLYFQSNAMKTVAGKRLLYVMAADAEYGRH...  
1  MELWVSPKELANLPGLPKTSAGVIYVAKKQGWQNRTRAGVKGGKAI...  
2  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSINAAKSEL...  
3  GSGGGGSMSISILKDKKLLIGICGSISSVGISSYLLYFKSFFKEIR...  
6  MKEVVIASAVRTAIGSYGKSLKDVPAVDLGATAIKEAVKKAGIKPE...  
   PDB-chain                EC-nums  \
5     1Z7L-A              [6.2.1.-]   
12    1ZXA-A  [2.7.11.12, 2.7.11.-]   
26    3K13-A              [2.1.1.-]   
33    3QWZ-A     [3.6.4.6, 3.6.4.-]   
77    1SFE-A              [2.1.1.-]   

                                             sequence  
5   MGHHHHHHEFEKSIPICTLKNFPNAIEHTLQWARDEFEGLFKQPAE...  
12  GSPGIPGSTSELEEDFAKILMLKEERIKELEKRLSEKEEEIQELKR...  
26  SNALEVKPEINFVNIGERCNVAGSRKFLRLVNEKKYDEALSIARQQ

In [9]:
# Residue alphabet used for one-hot encoding: the 20 standard amino acids
# plus the character 'X' (21 symbols in total).
amino_acids = 'ACDEFGHIKLMNPQRSTVWXY'

def one_hot_encode(sequence, length=1000):
    """One-hot encode a sequence into a fixed-size flat integer vector.

    Args:
        sequence: Residue string; only the first `length` characters are used.
        length: Number of positions in the encoding. Shorter sequences leave
            the remaining positions all-zero.

    Returns:
        1-D integer numpy array of size `length * len(amino_acids)`.
        Characters that are not in `amino_acids` produce an all-zero row.
    """
    n_symbols = len(amino_acids)
    matrix = np.zeros((length, n_symbols), dtype=int)
    for position, residue in enumerate(sequence[:length]):
        column = amino_acids.find(residue)  # -1 when the residue is not in the alphabet
        if column != -1:
            matrix[position, column] = 1
    return matrix.flatten()  # Flatten to a 1D array

# Add a flat one-hot vector for every sequence in both splits
df_train['sequence_encoded'] = df_train['sequence'].apply(one_hot_encode)
df_valid['sequence_encoded'] = df_valid['sequence'].apply(one_hot_encode)

# Inspect the first training rows with the new column
print(df_train.head())

  PDB-chain              EC-nums  \
0    4PR3-A   [3.2.2.9, 3.2.2.-]   
1    1TNT-A  [6.5.1.-, 3.1.22.-]   
2    1T8A-A  [3.2.1.17, 3.2.1.-]   
3    5H75-A            [4.1.1.-]   
6    4XL3-A            [2.1.3.-]   

                                            sequence  \
0  MHHHHHHHHGVDLGTENLYFQSNAMKTVAGKRLLYVMAADAEYGRH...   
1  MELWVSPKELANLPGLPKTSAGVIYVAKKQGWQNRTRAGVKGGKAI...   
2  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSINAAKSEL...   
3  GSGGGGSMSISILKDKKLLIGICGSISSVGISSYLLYFKSFFKEIR...   
6  MKEVVIASAVRTAIGSYGKSLKDVPAVDLGATAIKEAVKKAGIKPE...   

                                    sequence_encoded  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
6  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  


In [12]:
!unzip esm_embeddings.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: esm_embeddings/test_seq_1461.pt  
  inflating: __MACOSX/esm_embeddings/._test_seq_1461.pt  
  inflating: esm_embeddings/train_seq_15915.pt  
  inflating: __MACOSX/esm_embeddings/._train_seq_15915.pt  
  inflating: esm_embeddings/train_seq_12076.pt  
  inflating: __MACOSX/esm_embeddings/._train_seq_12076.pt  
  inflating: esm_embeddings/test_seq_686.pt  
  inflating: __MACOSX/esm_embeddings/._test_seq_686.pt  
  inflating: esm_embeddings/test_seq_517.pt  
  inflating: __MACOSX/esm_embeddings/._test_seq_517.pt  
  inflating: esm_embeddings/train_seq_9108.pt  
  inflating: __MACOSX/esm_embeddings/._train_seq_9108.pt  
  inflating: esm_embeddings/train_seq_9299.pt  
  inflating: __MACOSX/esm_embeddings/._train_seq_9299.pt  
  inflating: esm_embeddings/train_seq_542.pt  
  inflating: __MACOSX/esm_embeddings/._train_seq_542.pt  
  inflating: esm_embeddings/test_seq_985.pt  
  inflating: __MACOSX/esm_embeddings/._te

In [43]:
class ProteinDataset(Dataset):
    """Dataset serving max-pooled ESM-2 embeddings, plus label indices for training data.

    On construction, a per-residue embedding (layer 33 of `esm2_t33_650M_UR50D`)
    is computed for every row of `df` that does not yet have a cached file in
    `esm_embeddings_generated/`. `__getitem__` loads the cached tensor and
    max-pools it over the sequence dimension.

    The ESM-2 model is loaded lazily, only when an embedding is actually
    missing. If every embedding is already cached, no model is downloaded or
    moved to `device`, and `model`, `alphabet` and `batch_converter` stay `None`.

    NOTE(review): cache files are keyed only by split prefix and positional row
    index, so a cache built from a different or re-ordered DataFrame is
    silently reused. Clear `esm_embeddings_generated/` when the input changes.

    Args:
        df: DataFrame with a 'sequence' column and, when `istrain` is True,
            an 'EC-nums' column. Its index is reset; row order is preserved.
        istrain: If True, labels are built and items are `(feature, target)`
            pairs; otherwise items are the feature tensor only.
        device: Torch device on which the ESM-2 model runs when it is needed.
        mode: Stored on the instance; not used by this class.
        std: Stored on the instance; not used by this class.
    """

    def __init__(self, df, istrain=True, device='cuda', mode='both', std=10):
        self.df = df.reset_index(drop=True)
        self.istrain = istrain
        self.device = device
        self.mode = mode
        self.std = std

        self.embedding_dir = "esm_embeddings_generated"
        os.makedirs(self.embedding_dir, exist_ok=True)

        # Filled in by _load_model() the first time an embedding must be computed.
        self.model = None
        self.alphabet = None
        self.batch_converter = None

        # After reset_index the positional counter equals the row index.
        for idx, seq in enumerate(self.df['sequence']):
            if not os.path.exists(self._embedding_path(idx)):
                self._compute_and_save_embedding(idx, seq)

        if self.istrain:
            # Each distinct stringified 'EC-nums' value becomes one class; class
            # indices follow the sorted order of those strings.
            ec_numbers = self.df['EC-nums'].astype(str)  # ensure they're strings
            self.label_to_idx = {label: i for i, label in enumerate(sorted(set(ec_numbers)))}
            self.targets = ec_numbers.map(self.label_to_idx).values
            self.idx_to_label = {i: label for label, i in self.label_to_idx.items()}

    def _load_model(self):
        """Load the ESM-2 model and batch converter once; later calls are no-ops."""
        if self.model is not None:
            return
        model, self.alphabet = torch.hub.load("facebookresearch/esm:main",
                                              "esm2_t33_650M_UR50D")
        self.batch_converter = self.alphabet.get_batch_converter()
        self.model = model.to(self.device)
        self.model.eval()

    def _embedding_path(self, idx):
        """Return the cache file path for row `idx` ("train_"/"test_" prefix by split)."""
        prefix = "train" if self.istrain else "test"
        return os.path.join(self.embedding_dir, f"{prefix}_seq_{idx}.pt")

    def _compute_and_save_embedding(self, idx, seq):
        """Embed `seq` with ESM-2 and save the per-residue representation for row `idx`."""
        self._load_model()

        name = f"protein{idx}"
        data = [(name, seq)]
        batch_labels, batch_strs, batch_tokens = self.batch_converter(data)
        batch_tokens = batch_tokens.to(self.device)
        # Number of non-padding tokens in the (single-sequence) batch.
        batch_lens = (batch_tokens != self.alphabet.padding_idx).sum(1)

        with torch.no_grad():
            results = self.model(batch_tokens, repr_layers=[33], return_contacts=False)
        token_representations = results["representations"][33]
        # Drop the first and last non-padding positions
        # (presumably the special start/end tokens - TODO confirm).
        rep = token_representations[0, 1:batch_lens.item()-1].cpu()

        # Write to a temporary file and move it into place, so an interrupted
        # run never leaves a truncated file that looks like a valid cache entry.
        final_path = self._embedding_path(idx)
        tmp_path = final_path + ".tmp"
        torch.save(rep, tmp_path)
        os.replace(tmp_path, final_path)

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the max-pooled embedding for row `idx`, plus its target when training.

        The target is the class index of the row's 'EC-nums' value, returned
        as a float32 tensor.
        """
        emb_path = self._embedding_path(idx)
        embedding = torch.load(emb_path)

        # Max over dim 0 (the sequence positions) gives one fixed-size vector.
        feature = embedding.max(dim=0).values

        if self.istrain:
            target = self.targets[idx]
            return feature, torch.tensor(target, dtype=torch.float32)
        else:
            return feature


In [44]:
# Build the training dataset (missing embeddings are computed on the GPU)
# and a shuffled mini-batch loader over it
train_dataset = ProteinDataset(df=df_train, istrain=True, device='cuda')
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)


Using cache found in /root/.cache/torch/hub/facebookresearch_esm_main


In [45]:
# Collect the max-pooled feature vector and the target of every training item
features, labels = [], []
for sample_idx in range(len(train_dataset)):
    pooled_embedding, label = train_dataset[sample_idx]
    features.append(pooled_embedding.numpy())
    labels.append(label.item())

# Stack into a (n_samples, n_features) matrix and a 1-D target vector
X_max = np.vstack(features)
y_full = np.array(labels)

print("Max Approach Shape:", X_max.shape)
print("y_full shape:", y_full.shape)

# Hold out 20% of the samples for validation (fixed seed for reproducibility)
X_max_train, X_max_val, y_train, y_val = train_test_split(
    X_max,
    y_full,
    test_size=0.2,
    random_state=42,
)

print("Train shape:", X_max_train.shape)
print("Val shape:", X_max_val.shape)


Max Approach Shape: (15551, 1280)
y_full shape: (15551,)
Train shape: (12440, 1280)
Val shape: (3111, 1280)


In [46]:
def spearman_score(y_true, y_pred):
    """Return the Spearman rank correlation between `y_true` and `y_pred`.

    Only the correlation coefficient is returned; the p-value that
    `spearmanr` also computes is discarded.
    """
    correlation = spearmanr(y_true, y_pred)[0]
    return correlation

# sklearn-compatible scorer wrapping spearman_score (higher is better)
spearman_scorer = make_scorer(spearman_score, greater_is_better=True)

# Fixed XGBoost hyper-parameters for the max-pooled feature model
best_params = dict(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0.01,
    reg_lambda=1,
    seed=42,
    tree_method='hist',
    device='cuda',
)

# Fit on the training split and score on the held-out validation split.
# NOTE(review): the targets are the class indices produced by ProteinDataset,
# fitted here as a regression target - confirm this is intended.
model_max = xgb.XGBRegressor(**best_params)
model_max.fit(X_max_train, y_train)

y_val_max = model_max.predict(X_max_val)

val_spearman = spearman_score(y_val, y_val_max)
val_mse = mean_squared_error(y_val, y_val_max)
print("\n[MAX] Validation Spearman correlation:", val_spearman)
print("[MAX] Validation MSE:", val_mse)

# Refit on all available data before predicting on unseen proteins
model_max.fit(X_max, y_full)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





[MAX] Validation Spearman correlation: 0.7017819998144615
[MAX] Validation MSE: 24413.58187739261


In [54]:
# Build a label-free dataset over the validation proteins; items are the
# max-pooled feature tensors only.
test_dataset_max = ProteinDataset(df_valid, istrain=False)

X_test_max = np.vstack([
    test_dataset_max[row].numpy() for row in range(len(test_dataset_max))
])

print("X_test_max shape:", X_test_max.shape)

# The predictions are the raw regressor outputs. They are not mapped back to
# EC labels: `idx_to_label` is only built when istrain=True, so it does not
# exist on this dataset.
y_test_pred = y_test_max = model_max.predict(X_test_max)

# One row per validation protein, in df_valid order
df_results = pd.DataFrame({
    "PDB-chain": df_valid["PDB-chain"],
    "EC_prediction": y_test_pred,
})
df_results.to_csv("predictions.csv", index=False)

Using cache found in /root/.cache/torch/hub/facebookresearch_esm_main


X_test_max shape: (1729, 1280)
