In [1]:
from transformers import BertTokenizer, BertModel

KeyboardInterrupt: 

In [2]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

In [4]:
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert").to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
def embed_sequence(sequence):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings

In [6]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from joblib import Parallel, delayed

In [7]:
def readCSV(path):
    with open(path, 'r') as f:
        f.readline()
        for l in f:
            a, b, i = l.strip().split(',')
            i = int(i)
            yield a, b, i

In [8]:
# Load data
data = pd.read_csv("train.csv", header=0)  # Assuming columns: gene1, gene2, label

# Display the first few rows
data.head()

Unnamed: 0,GeneA,GeneB,Interaction
0,MERSLDSLAGMAKSAFGAGTSAAMRQATSPKTILEYIINFFTCGGI...,MSKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLAKTYGGAARA...,1
1,MAAGFKTVEPLEYYRRFLKENCRPDGRELGEFRTTTVNIGSISTAD...,MAATAECDVVMAATEPELLEDEDAKREAESFKEQGNAYYAKKDYNE...,0
2,MDSQPVDVDNIIDRLLEVRGSKPGQQVDLEENEIRYLCSKARSIFI...,MTLGRRLACLFLACVLPALLLGGTALASEIVGGRRARPHAWPFMVS...,0
3,MIGVIRALVWWTLVKTAAASVCDSPVLVLTASMAGLHDAVTKTVGL...,MSDYENDDECWSTLESFRVKLISVIDPSRITPYLRQCKVLNPDDEE...,0
4,MPAERKKPASMEEKDSLPNNKEKDCSERRTVSSKERPKDDIKLTAK...,MPYLGPDTIELIFRDPAPGVDANGQPNVTDRVVSKGNCALMVTTAT...,0


In [9]:
protein_cache = {}

def get_protein_embedding(protein_seq):
    if protein_seq in protein_cache:
        return protein_cache[protein_seq]
    else:
        embedding = embed_sequence(protein_seq)
        protein_cache[protein_seq] = embedding
        return embedding

In [10]:
protein1_seqs = np.array(data['GeneA'].values)
protein2_seqs = np.array(data['GeneB'].values)

print("protein1", protein1_seqs)

print("protein2", protein2_seqs.shape)



protein1 ['MERSLDSLAGMAKSAFGAGTSAAMRQATSPKTILEYIINFFTCGGIRRRNETQYQELIETMAETLKSTMPDRGAPLPENIILDDMDGCRVEFNLPGENNEAGQVIVRVSKGDHSETREIPLASFEKICRALLFRCEFSLPQDSVILTAQGGMNLKGAVLTGANLTSENLCDADLSGANLEGAVLFMADCEGANFKGANLSGTSLGDSNFKNACLEDSIMCGATLDHANLTGANLQHASLLGCSMIECNCSGANMDHTNLSGATLIRADMSGATLQGATIMAAIMEGAVLTRANLRKASFISTNLDGADLAEANLNNTCFKDCTLTDLRTEDATMSTSTQTLFNEFYSENI'
 'MAAGFKTVEPLEYYRRFLKENCRPDGRELGEFRTTTVNIGSISTADGSALVKLGNTTVICGVKAEFAAPSTDAPDKGYVVPNVDLPPLCSSRFRSGPPGEEAQVASQFIADVIENSQIIQKEDLCISPGKLVWVLYCDLICLDYDGNILDACTFALLAALKNVQLPEVTINEETALAEVNLKKKSYLNIRTHPVATSFAVFDDTLLIVDPTGEEEHLATGTLTIVMDEEGKLCCLHKPGGSGLTGAKLQDCMSRAVTRHKEVKKLMDEVIKSMKPK'
 'MDSQPVDVDNIIDRLLEVRGSKPGQQVDLEENEIRYLCSKARSIFIKQPILLELEAPIKICGDIHGQYYDLLRLFEYGGFPPESNYLFLGDYVDRGKQSLETICLLLAYKIKYPENFFILRGNHECASINRIYGFYDECKRRYNIKLWKTFTDCFNCLPIAAIIDEKIFCMHGGLSPDLNSMEQIRRVMRPTDIPDVGLLCDLLWSDPDKDIVGWSENDRGVSFTFGPDVVNRFLQKQDMELICRAHQVVEDGYEFFSKRQLVTLFSAPNYCGEFDNAGAMMSVDESLLCSFQILKPAQKSLPRQAGGRKKK'
 ...
 'MDSNTVSSFQVDCFLWHIRKRFADQKMGDAPFLD

In [11]:

def process_sequences_on_gpu(sequences):
    sequences = [str(seq) for seq in sequences]  # Ensure all are strings
    # Batch the sequences
    batch_size = 512
    embeddings = []
    for i in tqdm(range(0, len(sequences), batch_size), desc="Processing Batches on GPU"):
        batch = sequences[i:i + batch_size]
        # Generate embeddings as a tensor
        batch_embeddings = torch.tensor(np.array([get_protein_embedding(seq) for seq in batch])).to(device)
        embeddings.append(batch_embeddings.cpu().numpy())  # Move to CPU for final concatenation
    return np.vstack(embeddings)  # Combine all batches into a single array

# Process protein sequences in parallel on GPU
protein1_embeds = process_sequences_on_gpu(protein1_seqs)
protein2_embeds = process_sequences_on_gpu(protein2_seqs)

labels = np.array(data['Interaction'].values)

print("Protein1 embeddings shape:", protein1_embeds.shape)
print("Protein2 embeddings shape:", protein2_embeds.shape)
print("Labels shape:", labels.shape)

Processing Batches on GPU: 100%|██████████| 469/469 [08:43<00:00,  1.12s/it]
Processing Batches on GPU: 100%|██████████| 469/469 [00:33<00:00, 13.80it/s] 


Protein1 embeddings shape: (239664, 1024)
Protein2 embeddings shape: (239664, 1024)
Labels shape: (239664,)


Protein1 embeddings shape: (239664, 1024)
Protein2 embeddings shape: (239664, 1024)
Labels shape: (239664,)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split your data (X_train, X_test, y_train, y_test)
X = np.hstack((protein1_embeds, protein2_embeds))
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.66


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split your data
X = np.hstack((protein1_embeds, protein2_embeds))
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM
model = SVC(kernel='rbf', probability=True)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")