<a href="https://colab.research.google.com/github/yifan-grace-tang/final-project/blob/main/Renee/report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<table style="background-color: transparent; border: none;">   
  <tr>     
    <td><img src="https://cdn.prod.website-files.com/6606dc3fd5f6645318003df4/6678476dc198b5a75b8c8873_ES_Logo_Black_5.png" width="100" alt="img"/></td>     
    <td><h1>Custom Embeddings + <code>XGBoost</code></h1></td>   
  </tr>
</table>

</br>

__Updated On: `04.04`__


__Key Notes:__

- This model is a complete overhaul from previous implementations focusing on `NN` (_Neural Network_) implementations.
- Here we explore the capabilities of [_XGBoost_](https://xgboost.readthedocs.io/en/release_3.0.0/) and [_Random Search_](https://www.yourdatateacher.com/2021/05/19/hyperparameter-tuning-grid-search-and-random-search/) hyperparameter tuning.
- Validated __Spearman__ from Gradescope: `0.44`



---

### Required Imports

In [None]:
from copy import deepcopy
import pandas as pd
import os
import time
import shutil
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from scipy.stats import spearmanr
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

---

### Data Collection and Cleaning



> __This section relies on having a `sequence.fasta`, `train.csv`, `query.csv` and `test.csv` in your runtime.__

We can start by looking at our _sequence_ from the `sequence.fasta` file and analyzing its composition and length. Our _sequence_ will be the entry-point to generate mutated sequences from coded mutations as discussed later.

In [19]:
def parse_fasta(filename):
    seqs = {}
    with open(filename, 'r') as f:
        current_id = None
        current_seq = []
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                if current_id is not None:
                    seqs[current_id] = ''.join(current_seq)
                current_id = line[1:]  # remove the ">"
                current_seq = []
            else:
                current_seq.append(line)
        # Add the last entry
        if current_id is not None:
            seqs[current_id] = ''.join(current_seq)
    return seqs

seq_dict = parse_fasta("nrPDB-EC_2020.04_sequences.fasta")
seq_dict = {k.split(' nrPDB')[0]: v for k, v in seq_dict.items()}

Load train/valid splits

In [20]:
# Load train and validation protein IDs
train_ids = pd.read_csv("nrPDB-EC_2020.04_train.txt", header=None, names=["PDB-chain"])
valid_ids = pd.read_csv("nrPDB-EC_2020.04_valid.txt", header=None, names=["PDB-chain"])

print(train_ids.head(5))
print(valid_ids.head(5))

  PDB-chain
0    1R9W-A
1    3U7V-A
2    1CK7-A
3    6FLM-A
4    2WBK-A
  PDB-chain
0    1EF9-A
1    4BYF-A
2    1MVP-A
3    2BIH-A
4  6UE0-AAA


In [21]:
df_annot = pd.read_csv(
    "nrPDB-EC_2020.04_annot.tsv",
    sep="\t",
    skiprows=3,  # Skip the first two rows
    names=["PDB-chain", "EC-nums"]  # Set column names manually
)

df_annot['EC-nums'] = df_annot['EC-nums'].apply(lambda x: x.split(','))
print(df_annot.head())

  PDB-chain              EC-nums
0    4PR3-A   [3.2.2.9, 3.2.2.-]
1    1TNT-A  [6.5.1.-, 3.1.22.-]
2    1T8A-A  [3.2.1.17, 3.2.1.-]
3    5H75-A            [4.1.1.-]
4    2FOR-A  [2.5.1.-, 2.5.1.10]


In [23]:
df_train = df_annot[df_annot['PDB-chain'].isin(train_ids['PDB-chain'])].copy()
df_valid = df_annot[df_annot['PDB-chain'].isin(valid_ids['PDB-chain'])].copy()

df_train['sequence'] = df_train['PDB-chain'].map(seq_dict)
df_valid['sequence'] = df_valid['PDB-chain'].map(seq_dict)

print(df_train.head())
print(df_valid.head())

  PDB-chain              EC-nums  \
0    4PR3-A   [3.2.2.9, 3.2.2.-]   
1    1TNT-A  [6.5.1.-, 3.1.22.-]   
2    1T8A-A  [3.2.1.17, 3.2.1.-]   
3    5H75-A            [4.1.1.-]   
6    4XL3-A            [2.1.3.-]   

                                            sequence  
0  MHHHHHHHHGVDLGTENLYFQSNAMKTVAGKRLLYVMAADAEYGRH...  
1  MELWVSPKELANLPGLPKTSAGVIYVAKKQGWQNRTRAGVKGGKAI...  
2  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSINAAKSEL...  
3  GSGGGGSMSISILKDKKLLIGICGSISSVGISSYLLYFKSFFKEIR...  
6  MKEVVIASAVRTAIGSYGKSLKDVPAVDLGATAIKEAVKKAGIKPE...  
   PDB-chain                EC-nums  \
5     1Z7L-A              [6.2.1.-]   
12    1ZXA-A  [2.7.11.12, 2.7.11.-]   
26    3K13-A              [2.1.1.-]   
33    3QWZ-A     [3.6.4.6, 3.6.4.-]   
77    1SFE-A              [2.1.1.-]   

                                             sequence  
5   MGHHHHHHEFEKSIPICTLKNFPNAIEHTLQWARDEFEGLFKQPAE...  
12  GSPGIPGSTSELEEDFAKILMLKEERIKELEKRLSEKEEEIQELKR...  
26  SNALEVKPEINFVNIGERCNVAGSRKFLRLVNEKKYDEALSIARQQ

In [24]:
# Define the amino acids (20 standard ones)
amino_acids = 'ACDEFGHIKLMNPQRSTVWXY'

def one_hot_encode(sequence, length=1000):
    encoding = np.zeros((length, len(amino_acids)), dtype=int)
    for i, aa in enumerate(sequence[:length]):
        if aa in amino_acids:
            encoding[i, amino_acids.index(aa)] = 1
    return encoding.flatten()  # Flatten to a 1D array

# Apply the one-hot encoding to the sequence column
df_train['sequence_encoded'] = df_train['sequence'].apply(lambda x: one_hot_encode(x))
df_valid['sequence_encoded'] = df_valid['sequence'].apply(lambda x: one_hot_encode(x))

# Check the result
print(df_train.head())

  PDB-chain              EC-nums  \
0    4PR3-A   [3.2.2.9, 3.2.2.-]   
1    1TNT-A  [6.5.1.-, 3.1.22.-]   
2    1T8A-A  [3.2.1.17, 3.2.1.-]   
3    5H75-A            [4.1.1.-]   
6    4XL3-A            [2.1.3.-]   

                                            sequence  \
0  MHHHHHHHHGVDLGTENLYFQSNAMKTVAGKRLLYVMAADAEYGRH...   
1  MELWVSPKELANLPGLPKTSAGVIYVAKKQGWQNRTRAGVKGGKAI...   
2  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSINAAKSEL...   
3  GSGGGGSMSISILKDKKLLIGICGSISSVGISSYLLYFKSFFKEIR...   
6  MKEVVIASAVRTAIGSYGKSLKDVPAVDLGATAIKEAVKKAGIKPE...   

                                    sequence_encoded  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
6  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...  


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Prepare the features and target labels (EC-nums)
X_train = np.array(df_train['sequence_encoded'].tolist())
y_train = df_train['EC-nums']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.