In [129]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import protein_utils as utils
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [143]:
# file_path = "input_bcell.csv"
# df=pd.read_csv(file_path)
# print(df.shape)
# df.iloc[:100,:].to_csv("input_bcell_truncated.csv", index=False)

(12076, 14)


In [131]:
class ProteinDataset(Dataset):
    #def __init__(self, sequences, peptides, start_positions, end_positions, labels, tokenizer, covariates, max_len=512):
    def __init__(self, sequences, peptides, start_positions, end_positions, labels, tokenizer, max_len=512):
        self.sequences = sequences
        self.peptides = peptides
        self.start_positions = start_positions
        self.end_positions = end_positions
        self.labels = labels
        self.tokenizer = tokenizer
        #self.covariates = torch.tensor(covariates, dtype=torch.float16)  # Convert covariates to tensor
        self.max_len = max_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Tokenize parent sequence
        parent_tokens = self.tokenizer(
            self.sequences[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Tokenize peptide sequence
        peptide_tokens = self.tokenizer(
            self.peptides[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Extract covariates and labels
        #covariates = self.covariates[idx]  # Access the covariates as tensors
        label = self.labels[idx]

        return {
            "parent_tokens": {key: val.squeeze(0) for key, val in parent_tokens.items()},
            "peptide_tokens": {key: val.squeeze(0) for key, val in peptide_tokens.items()},
            "start_positions": self.start_positions[idx],
            "end_positions": self.end_positions[idx],
            #"covariates": covariates,
            "labels": label,
        }

# Sequence Training

In [None]:
# Read file
file_path = "input_bcell.csv"

# Split data
df=pd.read_csv(file_path)
covariates = df.iloc[:,5:-1].values
parent_sequences = df.iloc[:,1].values
start_positions = df.iloc[:,2].values
end_positions = df.iloc[:,3].values
peptide_sequences = df.iloc[:,4].values
target = df["target"].values

# Add spaces and update indices
parent_sequences = [" ".join(seq) for seq in parent_sequences]
peptide_sequences = [" ".join(seq) for seq in peptide_sequences]


scaler = StandardScaler()
scaled_covariates = scaler.fit_transform(covariates)
train_covariates, test_covariates, train_parent, test_parent, train_peptide, test_peptide, train_start, test_start, train_end, test_end, train_labels, test_labels = train_test_split(
    scaled_covariates, parent_sequences, peptide_sequences, start_positions, end_positions, target, test_size=0.2, random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert")

# Align start and end positions for training and testing data
aligned_train_start, aligned_train_end = utils.preprocess_data(train_parent, train_peptide, tokenizer)
aligned_test_start, aligned_test_end = utils.preprocess_data(test_parent, test_peptide, tokenizer)

# Replace raw start/end positions with aligned indices
train_start = aligned_train_start
train_end = aligned_train_end
test_start = aligned_test_start
test_end = aligned_test_end

# Create DataLoaders
# train_dataset = ProteinDataset(train_parent, train_peptide, train_start, train_end, train_labels, tokenizer, train_covariates)
# test_dataset = ProteinDataset(test_parent, test_peptide, test_start, test_end, test_labels, tokenizer,test_covariates)
train_dataset = ProteinDataset(train_parent, train_peptide, train_start, train_end, train_labels, tokenizer)
test_dataset = ProteinDataset(test_parent, test_peptide, test_start, test_end, test_labels, tokenizer)

all_dataset = ConcatDataset([train_dataset, test_dataset])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
all_loader = DataLoader(all_dataset, batch_size=16, shuffle=False)

In [None]:
'''
# Load ProtBERT model
model = BertModel.from_pretrained("Rostlab/prot_bert")

# Run each method
peptide_embeddings = utils.embed_peptide_only(model, peptide_tokens)
parent_embeddings = utils.extract_subsequence_embeddings(model, parent_tokens, start_positions, end_positions)
masked_embeddings = utils.embed_with_masking(model, parent_tokens, start_positions, end_positions)
concatenated_embeddings = utils.concatenate_embeddings(model, parent_tokens, peptide_tokens)
'''

'\n# Load ProtBERT model\nmodel = BertModel.from_pretrained("Rostlab/prot_bert")\n\n# Run each method\npeptide_embeddings = utils.embed_peptide_only(model, peptide_tokens)\nparent_embeddings = utils.extract_subsequence_embeddings(model, parent_tokens, start_positions, end_positions)\nmasked_embeddings = utils.embed_with_masking(model, parent_tokens, start_positions, end_positions)\nconcatenated_embeddings = utils.concatenate_embeddings(model, parent_tokens, peptide_tokens)\n'

In [None]:
'''
# Save results
utils.save_embeddings(peptide_embeddings, "results/peptide_embeddings.pt")
utils.save_embeddings(parent_embeddings, "results/parent_embeddings.pt")
utils.save_embeddings(masked_embeddings, "results/masked_embeddings.pt")
utils.save_embeddings(concatenated_embeddings, "results/concatenated_embeddings.pt")
'''

'\n# Save results\nutils.save_embeddings(peptide_embeddings, "results/peptide_embeddings.pt")\nutils.save_embeddings(parent_embeddings, "results/parent_embeddings.pt")\nutils.save_embeddings(masked_embeddings, "results/masked_embeddings.pt")\nutils.save_embeddings(concatenated_embeddings, "results/concatenated_embeddings.pt")\n'

# Helper Functions

In [138]:
class BertAttentionClassifier(nn.Module):
    #def __init__(self, bert_model_name="Rostlab/prot_bert", hidden_dim=256, num_covariates=covariates.shape[1]):
    def __init__(self, bert_model_name="Rostlab/prot_bert", hidden_dim=256):
        super(BertAttentionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Attention layer
        self.attention = nn.Linear(1024, 1)

        # Updated fully connected layer input size to include covariates
        #print(num_covariates)
        #self.fc1 = nn.Linear(1024 + num_covariates, hidden_dim)  # Combined input size
        self.fc1 = nn.Linear(1024, hidden_dim)  # Combined input size
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)  # Output layer for binary classification

    # def forward(self, mode, parent_tokens=None, peptide_tokens=None, start_positions=None,
    #             end_positions=None, covariates=None):
    #     # Embedding extraction based on the mode
    #     if mode == "peptide_only":
    #         embeddings = self.embed_peptide_only(peptide_tokens)
    #         #print(f"Shape of embeddings: {embeddings.shape}")
    #     elif mode == "subsequence":
    #         embeddings = self.extract_subsequence_embeddings(parent_tokens, start_positions, end_positions)
    #     elif mode == "masked":
    #         embeddings = self.embed_with_masking(parent_tokens, start_positions, end_positions)
    #     elif mode == "concatenated":
    #         embeddings = self.concatenate_embeddings(parent_tokens, peptide_tokens)
    #     elif mode == "parent_only":
    #         embeddings = self.embed_parent_only(parent_tokens)
    #     else:
    #         raise ValueError(f"Invalid embedding mode: {mode}")

    #     # Apply attention mechanism
    #     attention_scores = self.attention(embeddings)   # Capture attention scores
    #     embeddings = self.apply_attention(embeddings)  # Shape: [batch_size, 1024]

    #     # Combine embedding with additional covariates
    #     if covariates is not None:
    #         covariates = covariates.to("cpu")  # Keep covariates on CPU
    #         combined_input = torch.cat([embeddings.to("cpu"), covariates], dim=1).to(device)
    #     else: #if no covariates, that means we only want the embeddings
    #         return embeddings


    def forward(self, mode, parent_tokens=None, peptide_tokens=None, start_positions=None, end_positions=None,final=0):
        # Embedding extraction based on the mode
        if mode == "peptide_only":
            embeddings = self.embed_peptide_only(peptide_tokens)
            #print(f"Shape of embeddings: {embeddings.shape}")
        elif mode == "subsequence":
            embeddings = self.extract_subsequence_embeddings(parent_tokens, start_positions, end_positions)
        elif mode == "masked":
            embeddings = self.embed_with_masking(parent_tokens, start_positions, end_positions)
        elif mode == "concatenated":
            embeddings = self.concatenate_embeddings(parent_tokens, peptide_tokens)
        elif mode == "parent_only":
            embeddings = self.embed_parent_only(parent_tokens)
        else:
            raise ValueError(f"Invalid embedding mode: {mode}")

        # Apply attention mechanism
        attention_scores = self.attention(embeddings)   # Capture attention scores
        embeddings = self.apply_attention(embeddings)  # Shape: [batch_size, 1024]

        combined_input = embeddings
        if final==1: # if it's final extraction mode, we return directly after attention layer
            return combined_input

        # Classification layers
        x = F.relu(self.fc1(combined_input))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x, attention_scores

    def apply_attention(self, embeddings):
        """Apply attention mechanism to the embeddings."""
        #print(f"Input embeddings shape before attention: {embeddings.shape}")
        # Debugging: Ensure embeddings have the expected shape [batch_size, seq_length, 1024]

        scores = self.attention(embeddings)  # Linear layer to calculate attention scores
        #print(f"Attention scores shape after self.attention: {scores.shape}")
        # Debugging: Should be [batch_size, seq_length, 1]

        scores = torch.softmax(scores, dim=1)  # Normalize scores across seq_length
        #print(f"Attention scores shape after softmax: {scores.shape}")
        # Debugging: Should remain [batch_size, seq_length, 1]

        # Element-wise multiplication and summation to compute attended embeddings
        attended_embeddings = torch.sum(scores * embeddings, dim=1)
        #print(f"Output attended embeddings shape: {attended_embeddings.shape}")
        # Debugging: Should be [batch_size, 1024]

        return attended_embeddings

    def embed_parent_only(self, parent_tokens):
        """Baseline: Disregard peptide subsequence and only evaluate on full information"""
        outputs = self.bert(**parent_tokens)
        embeddings = outputs.last_hidden_state
        return embeddings

    def embed_peptide_only(self, peptide_tokens):
        """Generate embeddings for the peptide subsequence only."""
        outputs = self.bert(**peptide_tokens)
        embeddings = outputs.last_hidden_state
        return embeddings

    # Embed parent and extract peptide subsequence
    def extract_subsequence_embeddings(self, parent_tokens, start_positions, end_positions):
        """
        Extract embeddings for subsequences within parent sequences.
        Args:
            parent_tokens: Tokenized parent sequences.
            start_positions: Aligned start positions for subsequences.
            end_positions: Aligned end positions for subsequences.
        Returns:
            subsequence_embeddings: Tensor of subsequence embeddings.
        """
        outputs = self.bert(**parent_tokens)
        parent_embeddings = outputs.last_hidden_state  # Shape: [batch_size, seq_length, hidden_dim]

        subsequence_embeddings = []
        for i, (start, end) in enumerate(zip(start_positions, end_positions)):
            seq_length = parent_embeddings.size(1)  # Tokenized sequence length

            # Validate indices
            if 0 <= start < seq_length and 0 <= end < seq_length and start <= end:
                subsequence = parent_embeddings[i, start:end + 1, :]  # Extract subsequence
                subsequence_embeddings.append(subsequence)
            else:
                print(f"Invalid indices for sample {i}: start={start}, end={end}, seq_length={seq_length}")
                raise ValueError("Invalid start or end position")

        # Pad subsequences to ensure uniform length (optional)
        max_length = max([sub.size(0) for sub in subsequence_embeddings])
        padded_embeddings = [
            F.pad(sub, (0, 0, 0, max_length - sub.size(0))) for sub in subsequence_embeddings
        ]
        subsequence_embeddings = torch.stack(padded_embeddings, dim=0)  # Shape: [batch_size, max_length, hidden_dim]

        return subsequence_embeddings

    def embed_with_masking(self, parent_tokens, start_positions, end_positions):
        """Generate embeddings with non-peptide regions masked."""
        input_ids, attention_mask = utils.mask_non_peptide_regions(
            parent_tokens['input_ids'], parent_tokens['attention_mask'], start_positions, end_positions
        )
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        return embeddings

    # Embed parent and peptide separately and concatenate
    def concatenate_embeddings(self, parent_tokens, peptide_tokens):
        """Generate and concatenate parent and peptide embeddings."""
        parent_outputs = self.bert(**parent_tokens)
        peptide_outputs = self.bert(**peptide_tokens)

        # Extract the last hidden state
        parent_embeddings = parent_outputs.last_hidden_state  # Shape: [batch_size, parent_seq_len, hidden_dim]
        peptide_embeddings = peptide_outputs.last_hidden_state

        concatenated_embeddings = torch.cat((parent_embeddings, peptide_embeddings), dim=1)  # Combine
        return concatenated_embeddings


In [139]:
from tqdm import tqdm

def train_model(model, dataloader, criterion, optimizer, mode, num_epochs=10):

    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # Add a progress bar for the batches
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", leave=True)

        for batch in progress_bar:
        #for batch in dataloader:
            # Move data to device
            parent_tokens = {k: v.to(device) for k, v in batch["parent_tokens"].items()}
            peptide_tokens = {k: v.to(device) for k, v in batch["peptide_tokens"].items()}
            start_positions = batch["start_positions"]
            end_positions = batch["end_positions"]
            #covariates = batch["covariates"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass with the chosen embedding method
            logits, attention_scores = model(
                mode=mode,
                parent_tokens=parent_tokens,
                peptide_tokens=peptide_tokens,
                start_positions=start_positions,
                end_positions=end_positions,
                #covariates=covariates
                final=0
            )

            # Compute loss
            loss = criterion(logits.squeeze(), labels.float())

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Update progress bar
            progress_bar.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

In [140]:
def evaluate_model(model, loader, device,mode):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    mode=mode

    criterion = nn.BCEWithLogitsLoss()
    progress_bar = tqdm(loader, desc="Evaluating", unit="batch", leave=True)

    with torch.no_grad():
        for batch in progress_bar:
            # Move data to device
            parent_tokens = {k: v.to(device) for k, v in batch["parent_tokens"].items()}
            peptide_tokens = {k: v.to(device) for k, v in batch["peptide_tokens"].items()}
            start_positions = batch["start_positions"]
            end_positions = batch["end_positions"]
            #covariates = batch["covariates"].to(device)  # Ensure covariates are on the same device
            #print(covariates.size())
            labels = batch["labels"].to(device)

            # Forward pass
            logits, _ = model(
                mode=mode,
                parent_tokens=parent_tokens,
                peptide_tokens=peptide_tokens,
                start_positions=start_positions,
                end_positions=end_positions,
                #covariates=covariates
                final=0
            )

            # Compute loss
            loss = criterion(logits.squeeze(), labels.float())
            total_loss += loss.item()

            # Convert logits to probabilities
            probabilities = torch.sigmoid(logits).squeeze()
            predictions = (probabilities > 0.5).float()

            # Calculate accuracy
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = 100 * correct / total
    avg_loss = total_loss / len(loader)

    print(f"Evaluation Accuracy: {accuracy:.2f}%")
    print(f"Average Loss: {avg_loss:.4f}")

    return accuracy


def extract_final_embeddings(model, dataloader, mode, device):
    """
    Extract embeddings from the model after the attention layer.

    Args:
        model: The trained model instance.
        dataloader: DataLoader containing the dataset.
        mode: The embedding mode ("peptide_only", "subsequence", etc.).
        device: The device to run the model on ("cpu" or "cuda").

    Returns:
        final_embeddings: Tensor containing all extracted embeddings.
    """
    model.eval()  # Set model to evaluation mode
    final_embeddings = []

    # Progress bar for monitoring the extraction process
    progress_bar = tqdm(dataloader, desc="Extracting Final Embeddings", unit="batch", leave=True)

    with torch.no_grad():  # Disable gradient computation for inference
        for batch in progress_bar:
            # Move data to the appropriate device
            parent_tokens = {k: v.to(device) for k, v in batch["parent_tokens"].items()}
            peptide_tokens = {k: v.to(device) for k, v in batch["peptide_tokens"].items()}
            start_positions = batch["start_positions"]
            end_positions = batch["end_positions"]

            # Get embeddings after attention
            embeddings = model(
                mode=mode,
                parent_tokens=parent_tokens,
                peptide_tokens=peptide_tokens,
                start_positions=start_positions,
                end_positions=end_positions,
                #covariates=None  # Exclude covariates to get embeddings directly
                final=1
            )
            # Move embeddings to CPU and store
            final_embeddings.append(embeddings)

    # Combine all embeddings into a single tensor
    # final_embeddings = torch.cat(final_embeddings, dim=0)
    return final_embeddings

def save_embeddings(method,embeddings):
  output_dir = "results"
  os.makedirs(output_dir, exist_ok=True)  # Ensure directory exists
  file_path = f"{output_dir}/{method}_final_embeddings.pt"
  torch.save(embeddings, file_path)
  print(f"Saved final embeddings for {method} to {file_path}")

## Parent Only

In [145]:
# Define the embedding methods to loop through
embedding_methods = "parent_only"
#embedding_methods = ["parent_only","peptide_only", "subsequence", "masked", "concatenated"]

# Parameters
num_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset is imbalanced (train set is 27% positive) so weighting loss function
# Calculate class weights
num_positives = sum(train_labels)  # Number of positive samples in the training set
num_negatives = len(train_labels) - num_positives  # Number of negative samples
pos_weight = num_negatives / num_positives  # Positive class weight
pos_weight_tensor = torch.tensor(pos_weight).to(device)  # Ensure compatibility with GPU

# Define the loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Store results for accuracy and embeddings
results = {}
final_embeddings = {}

print(f"Training and evaluating for embedding method: {embedding_methods}")

# Initialize the model for each method
model = BertAttentionClassifier(bert_model_name="Rostlab/prot_bert", hidden_dim=256)
model.bert.gradient_checkpointing_enable()

# Move to device (GPU or CPU)
model.to(device)

# Define optimizer for each run
optimizer = torch.optim.AdamW(
    [
        {'params': model.bert.parameters(), 'lr': 1e-4},  # Fine-tuning BERT with a smaller learning rate
        {'params': model.fc1.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc2.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc3.parameters(), 'lr': 1e-3},  # Classifier output layer
        {'params': model.attention.parameters(), 'lr': 1e-3},  # Attention layer
    ]
)

# Train the model for the current embedding method
train_model(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    mode=embedding_methods,
    num_epochs=num_epochs
)

Training and evaluating for embedding method: parent_only


Epoch 1/5: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.299]


Epoch 1/5, Loss: 1.2918


Epoch 2/5: 100%|██████████| 5/5 [00:08<00:00,  1.66s/batch, loss=0.155]


Epoch 2/5, Loss: 1.1035


Epoch 3/5: 100%|██████████| 5/5 [00:08<00:00,  1.66s/batch, loss=0.108]


Epoch 3/5, Loss: 0.7085


Epoch 4/5: 100%|██████████| 5/5 [00:08<00:00,  1.66s/batch, loss=0.0899]


Epoch 4/5, Loss: 0.2796


Epoch 5/5: 100%|██████████| 5/5 [00:08<00:00,  1.66s/batch, loss=0.6]

Epoch 5/5, Loss: 0.8969





In [146]:
accuracy=evaluate_model(model, test_loader, device, embedding_methods)
final_embeddings=extract_final_embeddings(model, all_loader, embedding_methods,device=device)
#final_embeddings.shape
save_embeddings(embedding_methods,final_embeddings)

Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.43batch/s]


Evaluation Accuracy: 45.00%
Average Loss: 2.5571


Extracting Final Embeddings: 100%|██████████| 7/7 [00:02<00:00,  2.88batch/s]

Saved final embeddings for parent_only to results/parent_only_final_embeddings.pt





## Peptide Only

In [108]:
# Define the embedding methods to loop through
embedding_methods = "peptide_only"
#embedding_methods = ["parent_only","peptide_only", "subsequence", "masked", "concatenated"]

# Parameters
num_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset is imbalanced (train set is 27% positive) so weighting loss function
# Calculate class weights
num_positives = sum(train_labels)  # Number of positive samples in the training set
num_negatives = len(train_labels) - num_positives  # Number of negative samples
pos_weight = num_negatives / num_positives  # Positive class weight
pos_weight_tensor = torch.tensor(pos_weight).to(device)  # Ensure compatibility with GPU

# Define the loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Store results for accuracy and embeddings
results = {}
final_embeddings = {}

print(f"Training and evaluating for embedding method: {embedding_methods}")

# Initialize the model for each method
model = BertAttentionClassifier(bert_model_name="Rostlab/prot_bert", hidden_dim=256)
model.bert.gradient_checkpointing_enable()

# Move to device (GPU or CPU)
model.to(device)

# Define optimizer for each run
optimizer = torch.optim.AdamW(
    [
        {'params': model.bert.parameters(), 'lr': 1e-4},  # Fine-tuning BERT with a smaller learning rate
        {'params': model.fc1.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc2.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc3.parameters(), 'lr': 1e-3},  # Classifier output layer
        {'params': model.attention.parameters(), 'lr': 1e-3},  # Attention layer
    ]
)

# Train the model for the current embedding method
train_model(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    mode=embedding_methods,
    num_epochs=num_epochs
)
accuracy=evaluate_model(model, test_loader, device,embedding_methods)
final_embeddings=extract_final_embeddings(model, all_loader, embedding_methods,device=device)
save_embeddings(embedding_methods,final_embeddings)

Training and evaluating for embedding method: peptide_only


Epoch 1/3: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.195]


Epoch 1/3, Loss: 1.3107


Epoch 2/3: 100%|██████████| 5/5 [00:08<00:00,  1.66s/batch, loss=0.197]


Epoch 2/3, Loss: 1.3058


Epoch 3/3: 100%|██████████| 5/5 [00:08<00:00,  1.66s/batch, loss=0.225]


Epoch 3/3, Loss: 1.3000


Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.45batch/s]


Evaluation Accuracy: 85.00%
Average Loss: 0.6816


Extracting Final Embeddings: 100%|██████████| 7/7 [00:02<00:00,  2.88batch/s]

Saved final embeddings for peptide_only to results/peptide_only_final_embeddings.pt





In [None]:
torch.cuda.empty_cache()

## subsequence

In [26]:
# Define the embedding methods to loop through
embedding_methods = "subsequence"
#embedding_methods = ["parent_only","peptide_only", "subsequence", "masked", "concatenated"]

# Parameters
num_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset is imbalanced (train set is 27% positive) so weighting loss function
# Calculate class weights
num_positives = sum(train_labels)  # Number of positive samples in the training set
num_negatives = len(train_labels) - num_positives  # Number of negative samples
pos_weight = num_negatives / num_positives  # Positive class weight
pos_weight_tensor = torch.tensor(pos_weight).to(device)  # Ensure compatibility with GPU

# Define the loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Store results for accuracy and embeddings
results = {}
final_embeddings = {}

print(f"Training and evaluating for embedding method: {embedding_methods}")

# Initialize the model for each method
model = BertAttentionClassifier(bert_model_name="Rostlab/prot_bert", hidden_dim=256)
model.bert.gradient_checkpointing_enable()

# Move to device (GPU or CPU)
model.to(device)

# Define optimizer for each run
optimizer = torch.optim.AdamW(
    [
        {'params': model.bert.parameters(), 'lr': 1e-4},  # Fine-tuning BERT with a smaller learning rate
        {'params': model.fc1.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc2.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc3.parameters(), 'lr': 1e-3},  # Classifier output layer
        {'params': model.attention.parameters(), 'lr': 1e-3},  # Attention layer
    ]
)

# Train the model for the current embedding method
train_model(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    mode=embedding_methods,
    num_epochs=num_epochs
)

accuracy=evaluate_model(model, test_loader, device,embedding_methods)
final_embeddings=extract_final_embeddings(model, all_loader, embedding_methods,device=device)
save_embeddings(embedding_methods,final_embeddings)

Training and evaluating for embedding method: subsequence


Epoch 1/3: 100%|██████████| 5/5 [00:08<00:00,  1.68s/batch, loss=0.293]


Epoch 1/3, Loss: 1.3024


Epoch 2/3: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.208]


Epoch 2/3, Loss: 1.1029


Epoch 3/3: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.234]


Epoch 3/3, Loss: 0.8477


Evaluating:   0%|          | 0/2 [00:00<?, ?batch/s]

torch.Size([16, 8])


Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.42batch/s]


torch.Size([4, 8])
Evaluation Accuracy: 100.00%
Average Loss: 0.1587


Extracting Final Embeddings: 100%|██████████| 7/7 [00:02<00:00,  2.88batch/s]

Saved final embeddings for subsequence to results/subsequence_final_embeddings.pt





In [28]:
import torch
import pandas as pd

# Load the saved embeddings
method = 'subsequence'  # Replace with the method used when saving the embeddings
file_path = f"results/{method}_final_embeddings.pt"

# Load the embeddings from the .pt file
embeddings = torch.load(file_path)

# If embeddings are a tensor, convert to a DataFrame
if isinstance(embeddings, torch.Tensor):
    # Move the tensor to CPU and convert to numpy
    embeddings_cpu = embeddings.cpu().numpy()  # Use .cpu() before .numpy()
    df = pd.DataFrame(embeddings_cpu)  # Convert the numpy array to a DataFrame
elif isinstance(embeddings, dict):  # If embeddings are stored as a dictionary
    # If the embeddings are stored as a dictionary, you can extract the values and convert
    df = pd.DataFrame(embeddings)  # Example: If embeddings are already in a suitable format
else:
    print("Unsupported embeddings format")

# Check the first few rows of the DataFrame
print(df.head())


       0         1         2         3         4         5         6     \
0 -0.114157  0.248824  0.014217  0.050114  0.033292  0.016470 -0.013134   
1 -0.132596  0.253248 -0.082209  0.010419 -0.213525 -0.148483 -0.004346   
2 -0.170352  0.158309 -0.027124 -0.024673 -0.101800 -0.043206 -0.090205   
3 -0.076508  0.096773 -0.033277 -0.095106 -0.208793 -0.155706 -0.049120   
4 -0.081266  0.081046  0.020499 -0.128383 -0.179900 -0.045796  0.034417   

       7         8         9     ...      1014      1015      1016      1017  \
0 -0.030876 -0.062250  0.062992  ...  0.001058  0.091619  0.058876 -0.077734   
1 -0.167261 -0.102770 -0.051028  ... -0.172340  0.029804  0.058265 -0.241839   
2 -0.281302 -0.110784 -0.011791  ... -0.164307  0.093185  0.027939 -0.152268   
3 -0.163977 -0.108256  0.040491  ... -0.356882  0.155240  0.069389 -0.088114   
4 -0.242266 -0.036848 -0.001851  ... -0.283149  0.064167  0.102296 -0.104585   

       1018      1019      1020      1021      1022      1023  
0  0

  embeddings = torch.load(file_path)


## masked

In [29]:
# Define the embedding methods to loop through
embedding_methods = "masked"
#embedding_methods = ["parent_only","peptide_only", "subsequence", "masked", "concatenated"]

# Parameters
num_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset is imbalanced (train set is 27% positive) so weighting loss function
# Calculate class weights
num_positives = sum(train_labels)  # Number of positive samples in the training set
num_negatives = len(train_labels) - num_positives  # Number of negative samples
pos_weight = num_negatives / num_positives  # Positive class weight
pos_weight_tensor = torch.tensor(pos_weight).to(device)  # Ensure compatibility with GPU

# Define the loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Store results for accuracy and embeddings
results = {}
final_embeddings = {}

print(f"Training and evaluating for embedding method: {embedding_methods}")

# Initialize the model for each method
model = BertAttentionClassifier(bert_model_name="Rostlab/prot_bert", hidden_dim=256)
model.bert.gradient_checkpointing_enable()

# Move to device (GPU or CPU)
model.to(device)

# Define optimizer for each run
optimizer = torch.optim.AdamW(
    [
        {'params': model.bert.parameters(), 'lr': 1e-4},  # Fine-tuning BERT with a smaller learning rate
        {'params': model.fc1.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc2.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc3.parameters(), 'lr': 1e-3},  # Classifier output layer
        {'params': model.attention.parameters(), 'lr': 1e-3},  # Attention layer
    ]
)

# Train the model for the current embedding method
train_model(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    mode=embedding_methods,
    num_epochs=num_epochs
)

accuracy=evaluate_model(model, test_loader, device,embedding_methods)
final_embeddings=extract_final_embeddings(model, all_loader, embedding_methods,device=device)
save_embeddings(embedding_methods,final_embeddings)

Training and evaluating for embedding method: masked


Epoch 1/3: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.259]


Epoch 1/3, Loss: 1.2987


Epoch 2/3: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.219]


Epoch 2/3, Loss: 1.2901


Epoch 3/3: 100%|██████████| 5/5 [00:08<00:00,  1.67s/batch, loss=0.336]


Epoch 3/3, Loss: 1.2841


Evaluating:   0%|          | 0/2 [00:00<?, ?batch/s]

torch.Size([16, 8])


Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.42batch/s]


torch.Size([4, 8])
Evaluation Accuracy: 85.00%
Average Loss: 0.6378


Extracting Final Embeddings: 100%|██████████| 7/7 [00:02<00:00,  2.88batch/s]

Saved final embeddings for masked to results/masked_final_embeddings.pt





In [30]:
final_embeddings

tensor([[-0.0446,  0.0432, -0.1188,  ..., -0.0144,  0.0302,  0.0203],
        [-0.0447,  0.0429, -0.1186,  ..., -0.0147,  0.0304,  0.0204],
        [-0.0447,  0.0430, -0.1186,  ..., -0.0146,  0.0304,  0.0204],
        ...,
        [-0.0446,  0.0432, -0.1188,  ..., -0.0143,  0.0300,  0.0203],
        [-0.0446,  0.0431, -0.1188,  ..., -0.0143,  0.0302,  0.0203],
        [-0.0449,  0.0431, -0.1184,  ..., -0.0148,  0.0303,  0.0205]],
       device='cuda:0')

## concatenated

In [None]:
# Define the embedding methods to loop through
embedding_methods = "concatenated"
#embedding_methods = ["parent_only","peptide_only", "subsequence", "masked", "concatenated"]

# Parameters
num_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset is imbalanced (train set is 27% positive) so weighting loss function
# Calculate class weights
num_positives = sum(train_labels)  # Number of positive samples in the training set
num_negatives = len(train_labels) - num_positives  # Number of negative samples
pos_weight = num_negatives / num_positives  # Positive class weight
pos_weight_tensor = torch.tensor(pos_weight).to(device)  # Ensure compatibility with GPU

# Define the loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Store results for accuracy and embeddings
results = {}
final_embeddings = {}

print(f"Training and evaluating for embedding method: {embedding_methods}")

# Initialize the model for each method
model = BertAttentionClassifier(bert_model_name="Rostlab/prot_bert", hidden_dim=256)
model.bert.gradient_checkpointing_enable()

# Move to device (GPU or CPU)
model.to(device)

# Define optimizer for each run
optimizer = torch.optim.AdamW(
    [
        {'params': model.bert.parameters(), 'lr': 1e-4},  # Fine-tuning BERT with a smaller learning rate
        {'params': model.fc1.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc2.parameters(), 'lr': 1e-3},  # Classifier hidden layer
        {'params': model.fc3.parameters(), 'lr': 1e-3},  # Classifier output layer
        {'params': model.attention.parameters(), 'lr': 1e-3},  # Attention layer
    ]
)

# Train the model for the current embedding method
train_model(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    mode=embedding_methods,
    num_epochs=num_epochs
)

accuracy=evaluate_model(model, test_loader, device,embedding_methods)
final_embeddings=extract_final_embeddings(model, all_loader, embedding_methods,device=device)
save_embeddings(embedding_methods,final_embeddings)