# Make sure `sequences.fasta` and `train.tsv` are uploaded, and set your Colab runtime to an A100 GPU.

## Loading and processing data (skip reading unless interested, but still run these cells: later sections use `df_train`)

In [1]:
############################### Loading sequence data ###########################
%pip install Bio
import warnings
warnings.filterwarnings("ignore")
import torch
from Bio import SeqIO
from Bio.SeqUtils import seq3
sequence_data =  list(SeqIO.parse("sequences.fasta", "fasta"))
sequences = [str(seq.seq) for seq in sequence_data]

Collecting Bio
  Downloading bio-1.8.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.1-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.3/321.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
# Optional (Colab only): uncomment the two lines below to mount Google Drive at /content/drive.
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
################################ Loading pretrained model ###################################
# Using the 650-million-parameter ESM-2 model
from transformers import AutoTokenizer, AutoModel
model_name = "facebook/esm2_t33_650M_UR50D"
# Fail fast: verify that a GPU is present before downloading the multi-gigabyte checkpoint,
# instead of only discovering the problem when the model is moved to the device.
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA GPU available; switch the runtime to a GPU (e.g. A100 in Colab).")
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Send the model to the GPU
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
################################ Getting vector embedding #################################
# Embeds the sequences in batches of 16 to save GPU memory.
# takes about 1 minute
model.half()  # switch the model weights to float16
batch_size = 16
embedded_batches = []
for start in range(0, len(sequences), batch_size):
    batch = sequences[start:start + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, return_attention_mask=True).to(device)
    with torch.inference_mode():
        outputs = model(**inputs)
    # Keep only real residue positions: drop padding and the CLS / EOS special tokens.
    # The token ids are compared as integers; casting them to float16 first is unnecessary
    # and would stop being exact for ids above 2048.
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    mask = (attention_mask == 1) & (input_ids != tokenizer.cls_token_id) & (input_ids != tokenizer.eos_token_id)
    # Boolean indexing collapses the (batch, position) axes into one row per kept token.
    batch_embeddings = outputs.last_hidden_state[mask].cpu()

    embedded_batches.append(batch_embeddings)
    torch.cuda.empty_cache()
# One row per kept token across all sequences, as a NumPy array.
embeddings = torch.cat(embedded_batches, dim=0).numpy()

In [5]:
############################ Creating final training dataframe ############################
# Create an id for every amino acid that is compatible with the ids in the training data:
# "<record id>_<upper-case three-letter residue code>_<1-based position>".
# seq3 is evaluated once per distinct residue letter instead of once per residue.
three_letter = {aa: seq3(aa).upper() for aa in set("".join(sequences))}
IDs = [
    record.id + "_" + three_letter[aa] + "_" + str(position)
    for record, sequence in zip(sequence_data, sequences)
    for position, aa in enumerate(sequence, start=1)
]
# Ids and embedding rows are paired purely by row position, so the counts must agree;
# otherwise the column-wise concat below would silently pad the shorter side with NaN.
if len(IDs) != len(embeddings):
    raise ValueError(f"{len(IDs)} residue ids but {len(embeddings)} embedding rows; cannot align them")
# Get a dataframe of every amino acid's id, embedding, and label
# takes about a minute
import pandas as pd
df_IDs = pd.DataFrame({'id': pd.Series(IDs)})
df_embeddings = pd.DataFrame(embeddings)
df_fasta = pd.concat([df_IDs, df_embeddings], axis=1)
df_train_data = pd.read_csv('train.tsv', sep='\t').drop_duplicates()  # remove duplicate rows
# Ids that still occur more than once have conflicting rows; keep=False discards every copy.
train_IDs = df_train_data.id.drop_duplicates(keep=False)
df_train_filtered = df_train_data.merge(pd.DataFrame({'id': train_IDs}), on='id', how='inner')  # remove residues with conflicting labels
df_train = df_fasta.merge(df_train_filtered, on='id')  # all valid training rows together with their embeddings

## Training various models on embeddings using cuML (start here; requires `df_train` from the previous section)

In [6]:
################################# Getting train-test split #####################################
# cudf, cupy and cuml are used in place of pandas, numpy and sklearn because they run on the GPU.
from cuml.preprocessing import LabelEncoder
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score
import cudf, cupy, gc

# Features: every column of df_train except the residue id and the label.
X = df_train.drop(columns=['id', 'secondary_structure'])
# Labels: the secondary-structure column, encoded by `le`.
y = df_train['secondary_structure']
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# 70 / 30 split with a fixed seed.
# NOTE(review): rows are split per residue, so residues of one protein may end up on both
# sides of the split -- confirm this is the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(
    cupy.asarray(X.values),
    y,
    random_state=0,
    test_size=0.3,
)
# Return cached GPU blocks to the device before allocating the float32 copies.
cupy.get_default_memory_pool().free_all_blocks()
X_train = X_train.astype(cupy.float32)
X_test = X_test.astype(cupy.float32)
gc.collect()

266

In [7]:
def evalModel(Model):
    """Fit an estimator on the training split and print its accuracy on the test split.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    Model : estimator instance
        An already-constructed object exposing ``fit(X, y)`` and ``predict(X)``.
        Despite the capitalised name it is an instance, not a class; ``fit`` is
        called on the object that was passed in.

    Returns
    -------
    None
        The score is printed as ``Accuracy: <value>``.
    """
    # copy=False casts only when the dtype differs, so an already-float32 training
    # matrix is passed through as-is instead of being duplicated on every call.
    Model.fit(X_train.astype(cupy.float32, copy=False), y_train)
    y_pred = Model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)

In [8]:
from cuml.linear_model import LogisticRegression
# Logistic regression with C=15.
logreg_params = {"C": 15}
evalModel(LogisticRegression(**logreg_params))

Accuracy: 0.7386413599359464


In [9]:
from cuml.svm import LinearSVC
# Linear SVM: squared-hinge loss with an L1 penalty, C=1.
svc_params = dict(loss='squared_hinge', penalty='l1', C=1)
evalModel(LinearSVC(**svc_params))

Accuracy: 0.7305374671979784


In [10]:
from cuml.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier constructed with the library's default settings.
evalModel(KNeighborsClassifier()) # takes a while

Accuracy: 0.7121289958300357
