In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline, T5Tokenizer, T5EncoderModel
import torch
import re
import json
import numpy as np
import sys

## Negative Sample Creation with Embeddings and Cosine Similarity

Here we create embeddings with the prot_t5_xl model and then check whether a candidate epitope's embedding lies within a certain "dissimilarity range", i.e. a self-defined cosine-similarity threshold. We chose this approach because our evaluations showed that it is (at least as far as we can tell) the "best" one.

In [None]:
# Tab-separated beta-chain dataset that serves as the source of positive samples.
read_path_beta = "../../data/customDatasets/Stitchr_beta_concatenated.tsv"
stitchr_beta_df = pd.read_csv(filepath_or_buffer=read_path_beta, delimiter="\t")

In [None]:
# Display the loaded table for a quick visual check.
stitchr_beta_df

In [None]:
# Number of distinct values in the "Epitope" column.
len(set(stitchr_beta_df["Epitope"]))

In [None]:
# Cast epitopes to strings so that they can be counted and plotted as categories.
stitchr_beta_df['Epitope'] = stitchr_beta_df['Epitope'].astype(str)
epitope_counts = stitchr_beta_df['Epitope'].value_counts().reset_index()
print(epitope_counts)
epitope_counts.columns = ['Epitope Name', 'Count']

# One bin per distinct epitope; tick labels are hidden because there are too many to read.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(stitchr_beta_df['Epitope'], bins=len(epitope_counts), edgecolor='k', alpha=0.7)
ax.set_xlabel('Epitope Value')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Epitope Data')
ax.set_xticks([])  # removes the x-axis labels
plt.show()

In [None]:
# print(epitope_counts.to_string())

In [None]:

# Prefer the first CUDA GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print("Using device: {}".format(device))


In [None]:

#@title Load encoder-part of ProtT5 in half-precision. { display-mode: "form" }
# Encoder-only checkpoint of ProtT5-XL-U50 (published in half precision).
transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"
print("Loading: {}".format(transformer_link))
model = T5EncoderModel.from_pretrained(transformer_link)

# Without a GPU the weights are cast to float32 before inference.
if device.type == "cpu":
    print("Casting model to full precision for running on CPU ...")
    model.to(torch.float32)

# Move to the selected device and switch to inference mode.
model = model.to(device).eval()
tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False, legacy=True)


In [None]:
# Distinct epitope values, so that each epitope is embedded only once.
epitopes = set(stitchr_beta_df["Epitope"].to_list())

In [None]:
# Pair every epitope with its model-ready form: rare/ambiguous amino acids
# (U, Z, O, B) become X and the residues are separated by single spaces.
processed_epitopes = [
    (sequence, " ".join(re.sub(r"[UZOB]", "X", sequence)))
    for sequence in epitopes
]
# processed_epitopes

In [None]:
def process_batch(processed_seqs):
    """Embed a batch of sequences with ProtT5 and mean-pool over the residues.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        processed_seqs: list of ``(original_sequence, spaced_sequence)`` tuples,
            where ``spaced_sequence`` is the whitespace-separated form that is
            passed to the tokenizer.

    Returns:
        dict mapping each original sequence to a 1-D numpy array holding the
        mean of the encoder's last hidden states over that sequence's residues.
        An empty input yields an empty dict.
    """
    if not processed_seqs:
        return {}

    # Only the spaced form is tokenized; the original form is used as the key.
    sequences = [spaced_seq for _, spaced_seq in processed_seqs]
    ids = tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding="longest", return_tensors="pt")
    input_ids = ids['input_ids'].to(device)
    attention_mask = ids['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    last_hidden_states = outputs.last_hidden_state

    embeddings = {}
    for i, (original_seq, _) in enumerate(processed_seqs):
        # The T5 tokenizer adds no leading [CLS]-style token; it only appends one
        # end-of-sequence token (</s>). Assuming the tokenizer's default right
        # padding, the residues therefore sit at positions 0 .. n_tokens - 2.
        # The previous `- 2` / `1:seq_len+1` slice dropped the first residue and
        # produced an empty slice (NaN mean) for single-residue sequences.
        seq_len = int(attention_mask[i].sum().item()) - 1  # exclude </s>
        valid_embeddings = last_hidden_states[i, :seq_len]
        mean_embedding = valid_embeddings.mean(dim=0)
        embeddings[original_seq] = mean_embedding.cpu().numpy()  # Use original sequence as key

    return embeddings

In [None]:
# Directory and file name of the TSV that caches the table together with the epitope embeddings.
# NOTE(review): `to_path` and `file_name` are reassigned further down for the final output file,
# so cells that use them depend on execution order.
to_path = "../../data/customDatasets/negative_samples/temp"
file_name = "Stitchr_beta_concatenated_with_epitope_embedding.tsv"

In [None]:

batch_size = 128
sequence_to_embedding = {}

# Embed the unique epitopes batch by batch, keyed by the original sequence.
for i in range(0, len(processed_epitopes), batch_size):
    batch_sequences = processed_epitopes[i:i+batch_size]
    batch_embeddings = process_batch(batch_sequences)
    sequence_to_embedding.update(batch_embeddings)

# Attach the embeddings once, after all batches are done. Mapping inside the loop
# re-scanned the whole table for every batch without changing the final result.
stitchr_beta_df["Epitope Embedding"] = stitchr_beta_df["Epitope"].map(sequence_to_embedding)

# Serialize each embedding as a single-line JSON list. Because the embedding is
# huge, it would otherwise be stored with line breaks (\n), which makes the file
# difficult to read back.
stitchr_beta_df['Epitope Embedding'] = stitchr_beta_df['Epitope Embedding'].apply(lambda x: json.dumps(x.tolist()))

stitchr_beta_df.to_csv(to_path+"/"+file_name, sep="\t", index=False)


The cell above is expensive; it can be commented out to save time once the embedding file exists. After changing anything in the underlying dataset, re-run that cell to create up-to-date embeddings!

In [None]:

# Reload the cached table and turn each JSON-encoded embedding back into a numpy array.
stitchr_beta_df = pd.read_csv(f"{to_path}/{file_name}", sep="\t")
stitchr_beta_df['Epitope Embedding'] = stitchr_beta_df['Epitope Embedding'].apply(
    lambda serialized: np.array(json.loads(serialized))
)

In [None]:
# Largest row position in the table. NOTE(review): not referenced by any other visible cell.
max_index = len(stitchr_beta_df) - 1 
# Collects one sampled negative epitope per row of the table.
negative_epitopes_cosine = []

In [None]:
def cosine_similarity(embedding1, embedding2):
    """Return the cosine of the angle between two embedding vectors.

    Computed as the dot product divided by the product of the two Euclidean norms.
    """
    dot_product = np.dot(embedding1, embedding2)
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    return dot_product / norm_product

In [None]:
def is_valid_negative(cosine_similarity, current_epitope, random_epitope):
    """Decide whether a candidate epitope may serve as a negative sample.

    A candidate is accepted when the cosine similarity lies in the closed
    interval [-1, 0.75] and the candidate differs from the current epitope.

    Returns:
        True if the candidate is acceptable, False otherwise.
    """
    cosine_min = -1
    cosine_max = 0.75

    within_range = cosine_min <= cosine_similarity <= cosine_max
    differs = current_epitope != random_epitope
    return bool(within_range and differs)


In [None]:
# Cap on the number of sampling attempts per row, derived from the interpreter's recursion limit.
# NOTE(review): the search function that uses this cap is iterative, so tying it to the
# recursion limit looks like a leftover from a recursive version — confirm.
sys_max_depth = sys.getrecursionlimit()
max_attempts_by_system = sys_max_depth - 1

In [None]:
# Number of rows in the "Epitope" column, i.e. how many negative epitopes will be sampled.
len(stitchr_beta_df["Epitope"])

In [None]:
def search_negative_epitope_embedding(df, index, current_epitope, max_attempts=max_attempts_by_system):
    """Sample a negative epitope for the row at position ``index``.

    Rows are drawn uniformly at random. A draw is accepted when
    ``is_valid_negative`` approves the cosine similarity between the current
    row's embedding and the drawn row's embedding. If no draw is accepted within
    ``max_attempts``, a random epitope that differs from ``current_epitope`` is
    returned without the similarity check.

    Args:
        df: table with the columns "Epitope" and "Epitope Embedding".
        index: positional (0-based) row index of the current sample.
        current_epitope: epitope of the current sample.
        max_attempts: maximum number of random draws before falling back.

    Returns:
        An epitope from ``df`` that is different from ``current_epitope``.

    Raises:
        ValueError: if ``df`` contains no epitope other than ``current_epitope``.
    """
    epitope_column = df["Epitope"]
    embedding_column = df["Epitope Embedding"]
    # Positional access: both `index` (from enumerate) and the random draws are
    # positions, so .iloc stays correct even if the frame has a non-default index.
    current_embedding = embedding_column.iloc[index]
    n_rows = len(df)

    for _ in range(max_attempts):
        random_epitope_index = np.random.randint(0, n_rows)
        if random_epitope_index == index:
            continue  # drawing the row itself still counts as an attempt

        random_epitope = epitope_column.iloc[random_epitope_index]
        random_epitope_embedding = embedding_column.iloc[random_epitope_index]
        cosine = cosine_similarity(current_embedding, random_epitope_embedding)

        if is_valid_negative(cosine, current_epitope, random_epitope):
            return random_epitope

    # Fallback: no draw passed the similarity check. Previously the last draw was
    # returned unchecked and could be the very same epitope as the positive one,
    # and the index-only retry loop never terminated for a one-row table. Restrict
    # the fallback to rows whose epitope really differs.
    candidates = epitope_column[epitope_column != current_epitope]
    if candidates.empty:
        raise ValueError(
            "No epitope different from {!r} available to use as a negative sample".format(current_epitope)
        )
    return candidates.iloc[np.random.randint(0, len(candidates))]

# Start from an empty list so that re-running this cell does not append a second
# round of negatives (which would break the later column assignment by length).
negative_epitopes_cosine = []
for i, epitope in enumerate(stitchr_beta_df["Epitope"]):
    negative_epitope = search_negative_epitope_embedding(stitchr_beta_df, i, epitope)
    negative_epitopes_cosine.append(negative_epitope)


In [None]:
# Number of sampled negative epitopes.
len((negative_epitopes_cosine))

In [None]:
# Wrap the sampled negatives in a single-column frame for counting and plotting.
negative_epitopes_cosine_dict = {"Negative Epitope": negative_epitopes_cosine}
negative_epitopes_cosine_df = pd.DataFrame(negative_epitopes_cosine_dict)
# print(negative_epitopes_cosine_df.to_string())
negative_epitopes_cosine_df

In [None]:
# Frequency table of the sampled negative epitopes.
epitope_counts_negative = negative_epitopes_cosine_df['Negative Epitope'].value_counts().reset_index()
epitope_counts_negative.columns = ['Epitope Name', 'Count']
print(epitope_counts_negative)

# One bin per distinct negative epitope; tick labels are hidden because there are too many to read.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(negative_epitopes_cosine_df['Negative Epitope'].astype(str), bins=len(epitope_counts_negative), edgecolor='k', alpha=0.7)
ax.set_xlabel('Epitope Value')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Negative Epitope Data')
ax.set_xticks([])  # removes the x-axis labels
plt.show()

In [None]:
# List the column names of the table.
stitchr_beta_df.columns

In [None]:
# Negative samples: the original rows without the embedding column, labeled
# Binding = 0 and paired with the sampled negative epitope.
stitchr_beta_with_negative_df = (
    stitchr_beta_df
    .drop(columns=["Epitope Embedding"])
    .assign(Binding=0, Epitope=negative_epitopes_cosine)
)
stitchr_beta_with_negative_df

In [None]:
# Build both halves from `stitchr_beta_df` inside this cell. The previous version
# concatenated onto `stitchr_beta_with_negative_df` itself, so re-running the cell
# stacked the positives on top of an already combined table.
positive_samples_df = stitchr_beta_df.drop(columns=["Epitope Embedding"])
negative_samples_df = positive_samples_df.assign(Binding=0, Epitope=negative_epitopes_cosine)

# ignore_index=True gives the combined table unique row labels instead of
# repeating 0..n-1 for the positive and the negative half.
stitchr_beta_with_negative_df = pd.concat(
    [positive_samples_df, negative_samples_df], axis=0, ignore_index=True
)
# Renumber all samples consecutively, starting at 1.
stitchr_beta_with_negative_df["TCR_name"] = range(1, len(stitchr_beta_with_negative_df)+1)
stitchr_beta_with_negative_df

In [None]:
# Output location of the final table with positive and negative samples.
# NOTE(review): this rebinds `to_path` and `file_name`, which earlier cells use for the
# embedding cache. `to_path` also already ends with "/", while the write below adds another "/".
to_path = "../../data/customDatasets/negative_samples/"
file_name = "Stitchr_beta_concatenated_with_negative.tsv"

In [None]:
# Write the combined table as a tab-separated file without the index column.
stitchr_beta_with_negative_df.to_csv(to_path+"/"+file_name, sep="\t", index=False)