In [1]:
%%capture
!pip install esm

import os
from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/COLAB/TCR_projects"
os.chdir(path)

In [2]:
from esm.sdk import client
from getpass import getpass

model_name = "esm3-small-2024-08"

# Evolutionary Scale API key Secret:
token = getpass("Token from Forge console: ")
ESM3_model = client(
    model=model_name, # https://forge.evolutionaryscale.ai/console
    url="https://forge.evolutionaryscale.ai",
    token=token,
)




Token from Forge console: ··········


In [3]:
import random
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

def preprocess_features(feat, res, train_indices, test_indices):
    x_train = feat.iloc[train_indices, :]
    y_train = res[train_indices]
    x_test = feat.iloc[test_indices, :]
    y_test = res[test_indices]
    # scale the data
    scaler = StandardScaler().fit(x_train)
    x_train = pd.DataFrame(scaler.transform(x_train), index=x_train.index, columns=x_train.columns)
    x_test = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)
    return x_train, y_train, x_test, y_test


def run_prediction(mat_test_tab,epitope_embeddings,tcr_embeddings):

  def get_embeddings(row):
    epitope = epitope_embeddings.loc[row['epitope']].values
    tcr = tcr_embeddings.loc[row[tcr_features]].values
    return np.concatenate((epitope, tcr))

  ################# test set features
  features_test = mat_test_tab.apply(get_embeddings, axis=1)
  features_test = pd.DataFrame(features_test.tolist(), index=features_test.index)
  features_test.index = mat_test_tab["epitope"] + "_" + mat_test_tab[tcr_features]
  features_test.columns = epitope_embeddings.columns.tolist() + tcr_embeddings.columns.tolist()
  ## add other information
  df_encoded_TCR_subset = df_encoded_TCR.loc[mat_test_tab[tcr_features], : ]
  df_encoded_epitope_subset = df_encoded_epitope.loc[mat_test_tab["epitope"], : ]

  ## combine
  # "ESM3 + VJ genes" "all features" "ESMonly" "withoutESM"
  if features_name == "ESM3 + VJ genes":
    features_test_all = pd.concat([features_test.reset_index(drop=True), df_encoded_TCR_subset.reset_index(drop=True)], axis=1)
  if features_name == "all features":
    features_test_all = pd.concat([features_test.reset_index(drop=True), df_encoded_TCR_subset.reset_index(drop=True), df_encoded_epitope_subset.reset_index(drop=True)], axis=1)
  if features_name == "ESMonly":
    features_test_all = features_test
  if features_name == "withoutESM":
    features_test_all = pd.concat([df_encoded_TCR_subset.reset_index(drop=True), df_encoded_epitope_subset.reset_index(drop=True)], axis=1)

  features_test_all.index = features_test.index

  ############################################ run ML ############################################
  X_test  = features_test_all
  X_test.columns  = X_test.columns.astype(str)

  x_test_proba = model.predict_proba(X_test)[:, 1]

  p_test = pd.DataFrame(
      { 'split': "test",
        'epitope': mat_test_tab.epitope,
        'sample': X_test.index,
        'predicted_prob': x_test_proba
        }
  )

  return p_test



def run_ESM3(seq,model):
  try:
    from esm.models.esm3 import ESM3
    from esm.sdk.api import ESMProtein, SamplingConfig
    from esm.utils.constants.models import ESM3_OPEN_SMALL

    # Create an ESMProtein object
    protein = ESMProtein(sequence=seq)

    # Encode the protein
    protein_tensor = model.encode(protein)

    # Get the embeddings
    output = model.forward_and_sample(
      protein_tensor,
      SamplingConfig(return_per_residue_embeddings=True)
    )

    # aggregate the per residue embedding
    df = pd.DataFrame(output.per_residue_embedding)
    column_means = df.mean(axis=0)
    return column_means.transpose(), seq

  except Exception as e:
    print(f"Error processing sequence: {seq}")  # Print the problematic sequence
    print(e)  # Print the exception details
    return None  # Or handle the error differently




In [4]:
######################################## TCR-Epitope Binding Affinity Prediction Task #################################
os.chdir(path)
combined_df = pd.read_csv("MixTCRpred/full_training_set_146pmhc.csv")

# combine cdr3
combined_df["cdr3"] = combined_df["cdr3_TRA"] + combined_df["cdr3_TRB"]
combined_df["value"] = 1
combined_df.index = combined_df["epitope"] + "_" + combined_df["cdr3"]
combined_df

##################################################### choose ESM model #################################################

# "esm3-small-2024-08" "esm2_t6_8M_UR50D"

model_name = "esm3-small-2024-08"
epitope_embeddings = pd.read_csv('MixTCRpred/data/epitope_embeddings_'+model_name+'.csv',index_col=0)
cdr3_embeddings = pd.read_csv('MixTCRpred/data/cdr3_embeddings_'+model_name+'.csv',index_col=0)

############################################# subset of available embeddings ###########################################
combined_df = combined_df.loc[combined_df["epitope"].isin(epitope_embeddings.index) , :]
combined_df = combined_df.loc[combined_df["cdr3"].isin(cdr3_embeddings.index) , :]



In [5]:
##################### encode additional information for the TCRs #####################
from sklearn.preprocessing import OneHotEncoder

# One hot encoding of categorical variables
columns_to_encode = ['TRAV','TRAJ','TRBV','TRBJ']
df = combined_df.loc[:,columns_to_encode]

one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
# Fit and transform the data
one_hot_encoded = one_hot_encoder.fit_transform(df)
feature_names = one_hot_encoder.get_feature_names_out(df.columns)
df_encoded = pd.DataFrame(one_hot_encoded, columns=feature_names)

df_encoded.index = combined_df["cdr3"]
df_encoded_TCR = df_encoded[~df_encoded.index.duplicated(keep="first")]
df_encoded_TCR

Unnamed: 0_level_0,TRAV_TCRAV12-1,TRAV_TCRAV17,TRAV_TCRAV19,TRAV_TCRAV21,TRAV_TCRAV23/DV6,TRAV_TCRAV3,TRAV_TCRAV38-1,TRAV_TCRAV38-2/DV8,TRAV_TCRAV41,TRAV_TRAV-2,...,TRBJ_TRBJ2-5,TRBJ_TRBJ2-6,TRBJ_TRBJ2-7,TRBJ_TRBJ2-7,TRBJ_TRBJ20-1,TRBJ_TRBJ24-1,TRBJ_TRBJ38-2/DV8,TRBJ_TRBJ5-1,TRBJ_TRBJ5-6,TRBJ_nan
cdr3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CAGGADRLTFCASSPAGNTLYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CAASGGSNYNVLYFCAWSLWGGPSAETLYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CAASYNYAQGLTFCASRDWGGRQDTQYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CAAQTGNYKYVFCASGDAGTGQDTQYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CAASLTGGYKVVFCAWRTDNQDTQYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CAYRSGEYGNKLVFCASSMAGSSYEQYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
CAYRSFNNNDMRFCASRSRGGHSPLHF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
CATDNDMRFCASSFGPDEQYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
CAVLNNARLMFCASSVDRVADTQYF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
###################################### enter parameters ######################################
# Setting_new_epitope_new_TCR_LOOCV

setting = "Setting_new_epitope_new_TCR_FINAL_MODEL"

# "ESM3 + VJ genes" "all features" "ESMonly" "withoutESM"
features_name = "ESM3 + VJ genes"

# MHCI MHCII all
MHC_class = "all"

species = "all" # HomoSapiens all
tcr_features = "cdr3"
repetition = 5
algorithm = "sklearn_logit"
result_folder = "MixTCRpred/output/"+setting+"/"
nfolds = 5 # here it is only for Gridsearch
n_jobs = -1

os.chdir(path)
os.makedirs(result_folder,exist_ok=True)
os.chdir(result_folder)

if species != "all":
  combined_df = combined_df.loc[combined_df["species"]==species,:]

if MHC_class != "all":
  combined_df = combined_df.loc[combined_df["MHC_class"]==MHC_class, : ]


############################################ percentile background peptides ############################################
p_test_mean_background = pd.read_csv("selected_background_peptides_embeddings_"+features_name+"_"+model_name+'_predicted_prob.csv', index_col=0)


In [7]:
combined_df.MHC_class.value_counts()

Unnamed: 0_level_0,count
MHC_class,Unnamed: 1_level_1
MHCI,13248
MHCII,4428


In [8]:
# load model
import pickle
pickle_off = open("model_"+features_name+".pickle","rb")
model = pickle.load(pickle_off)


In [10]:
####################################### INPUT: list of epitope sequences #######################################

epitopes = ["DIYKGMGPLLATVFKSV","GMGPLLATVFKSV"]

MHC = "H2-IAb"         # for now, these values do not matter since the model does not incorporate them !
MHC_class="MHCII"      # for now, these values do not matter since the model does not incorporate them !
species = "MusMusculus"# for now, these values do not matter since the model does not incorporate them !

tcr = np.unique(combined_df[tcr_features])
selected_tcrs = random.sample(list(tcr), min(10000, len(tcr)))

mat_test_tab = pd.DataFrame()
for epitope in epitopes:
  print(epitope)
  df = pd.DataFrame( {tcr_features : selected_tcrs , "epitope": epitope} )
  mat_test_tab = pd.concat( [ mat_test_tab , df ] ).reset_index(drop=True)
mat_test_tab

DIYKGMGPLLATVFKSV
GMGPLLATVFKSV


Unnamed: 0,cdr3,epitope
0,CAMRNNVGDNSKLIWCASGDAGWSNQDTQYF,DIYKGMGPLLATVFKSV
1,CAWRGGGGADGLTFCASSWDPTYNEQFF,DIYKGMGPLLATVFKSV
2,CILREGFGNVLHCCASSMRSGSEQFF,DIYKGMGPLLATVFKSV
3,CAASIGNNRIFFCAWSLQEDTQYF,DIYKGMGPLLATVFKSV
4,CAVSSNTGKLIFCASSASRVGEDTQYF,DIYKGMGPLLATVFKSV
...,...,...
19995,CAVINMGYKLTFCASEDWGGAHAEQFF,GMGPLLATVFKSV
19996,CALSGYTEGADRLTFCASSERNSGNTLYF,GMGPLLATVFKSV
19997,CVLSANNNAGAKLTFCASSDAAREGQNTLYF,GMGPLLATVFKSV
19998,CAVRDQAGTALIFCASSFGPVEQYF,GMGPLLATVFKSV


In [11]:

################################# encoding epitope #################################

epitope_embeddings = pd.DataFrame()
for epitope in epitopes:
  print(epitope)
  results = run_ESM3(epitope,ESM3_model)

  embed_total = pd.concat([results[0]], axis=1).transpose()
  sequence_left = [results[1]]
  embed_total.index = sequence_left
  # Convert column names to strings before adding "ESM3_"
  embed_total.columns = "ESM3_" + embed_total.columns.astype(str)
  epitope_embeddings = pd.concat( [ epitope_embeddings , embed_total.copy() ] )
epitope_embeddings

DIYKGMGPLLATVFKSV
GMGPLLATVFKSV


Unnamed: 0,ESM3_0,ESM3_1,ESM3_2,ESM3_3,ESM3_4,ESM3_5,ESM3_6,ESM3_7,ESM3_8,ESM3_9,...,ESM3_1526,ESM3_1527,ESM3_1528,ESM3_1529,ESM3_1530,ESM3_1531,ESM3_1532,ESM3_1533,ESM3_1534,ESM3_1535
DIYKGMGPLLATVFKSV,0.023491,-0.103977,-0.012792,0.017144,0.038514,-0.093617,-0.049821,-0.009365,0.126688,-0.118027,...,0.127035,0.001517,0.115759,0.041184,0.046807,-0.103359,0.081879,0.041203,-0.049147,-0.609815
GMGPLLATVFKSV,0.083255,-0.080921,-0.00917,0.004828,0.05329,-0.087519,-0.056259,-0.005152,0.104326,-0.114268,...,0.150475,0.080277,0.104821,-0.0163,0.032343,-0.084724,0.076475,0.116125,-0.067687,-0.679018


In [12]:
#################### encode additional information for the epitopes ####################
from sklearn.preprocessing import OneHotEncoder

# One hot encoding of categorical variables
columns_to_encode = ['MHC','MHC_class','species']
df = combined_df.loc[:,columns_to_encode]

one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
# Fit and transform the data
one_hot_encoded = one_hot_encoder.fit_transform(df)
feature_names = one_hot_encoder.get_feature_names_out(df.columns)
df_encoded = pd.DataFrame(one_hot_encoded, columns=feature_names)

df_encoded.index = combined_df["epitope"]
df_encoded_reference = df_encoded[~df_encoded.index.duplicated(keep="first")]

# Create a new row with all zeros and index name as epitope
df_encoded_epitope = pd.DataFrame()
for epitope in epitopes:
  print(epitope)
  new_row = pd.DataFrame(index=[epitope],columns=df_encoded_reference.columns)
  df_encoded_reference_temp = pd.concat([new_row, df_encoded_reference])


  if "MHC_" + MHC not in df_encoded_reference_temp.columns:
    cols_to_zero = [col for col in df_encoded_reference_temp.columns if "MHC_" in col]
    df_encoded_reference_temp[cols_to_zero] = 0
  else:
    df_encoded_reference_temp.loc[epitope,["MHC_"+MHC]] = 1

  if "MHC_class_" + MHC_class not in df_encoded_reference_temp.columns:
    cols_to_zero = [col for col in df_encoded_reference_temp.columns if "MHC_class_" in col]
    df_encoded_reference_temp[cols_to_zero] = 0
  else:
    df_encoded_reference_temp.loc[epitope,["MHC_class_"+MHC_class]] = 1

  if "species_" + species not in df_encoded_reference_temp.columns:
    cols_to_zero = [col for col in df_encoded_reference_temp.columns if "species_" in col]
    df_encoded_reference_temp[cols_to_zero] = 0
  else:
    df_encoded_reference_temp.loc[epitope,["species_"+species]] = 1


  df_encoded_reference_temp = df_encoded_reference_temp.fillna(0)
  df_encoded_epitope = pd.concat( [ df_encoded_epitope , df_encoded_reference_temp.iloc[[0]] ] )

df_encoded_epitope



DIYKGMGPLLATVFKSV
GMGPLLATVFKSV


  df_encoded_reference_temp = pd.concat([new_row, df_encoded_reference])
  df_encoded_reference_temp = pd.concat([new_row, df_encoded_reference])


Unnamed: 0,MHC_H2-Db,MHC_H2-IAb,MHC_H2-IEk,MHC_H2-Kb,MHC_H2-Kd,MHC_H2-Ld,MHC_HLA-A*02:01,MHC_HLA-A*08:01,MHC_HLA-A*11:01,MHC_HLA-A*24:02,...,MHC_HLA-DQA1:02/DQB1*06:02,MHC_HLA-DRA:01,MHC_HLA-DRA:01/DRB1:01,MHC_HLA-DRB1*04:01,MHC_HLA-DRB1*04:05,MHC_HLA-DRB1*07:01,MHC_HLA-DRB1*11:01,MHC_HLA-DRB1:01,MHC_class_MHCII,species_MusMusculus
DIYKGMGPLLATVFKSV,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
GMGPLLATVFKSV,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [13]:
############################################### run prediction on epitope ###############################################

p_test = run_prediction(mat_test_tab,epitope_embeddings,cdr3_embeddings)


In [14]:
############################################# average predicted probability #############################################

p_test = p_test.loc[:,["epitope","predicted_prob"]]
p_test_mean = p_test.groupby('epitope').agg('mean')
p_test_mean

Unnamed: 0_level_0,predicted_prob
epitope,Unnamed: 1_level_1
DIYKGMGPLLATVFKSV,0.880342
GMGPLLATVFKSV,0.6041


In [22]:
################################################ compute Percentile Rank ################################################
from scipy import stats
percentile_rank_all = pd.DataFrame()
for epitope in p_test_mean.index:

  # Calculate the mean predicted probability for the test set
  mean_predicted_prob = p_test_mean.loc[epitope,"predicted_prob"]

  # Calculate the percentile rank
  percentile_rank = round(100 - stats.percentileofscore(p_test_mean_background["predicted_prob"], mean_predicted_prob), 2 )

  df = pd.DataFrame([epitope, percentile_rank]).transpose()
  percentile_rank_all = pd.concat( [ percentile_rank_all , df ] )

  # Print the percentile rank
  print(f"Percentile Rank: {percentile_rank}")

# Percentile Rank from 0 to 100. the closer to 0 the stronger the predicted TCR recognition.
percentile_rank_all.columns = ["epitope","Percentile Rank"]
percentile_rank_all

Percentile Rank: 1.25
Percentile Rank: 16.15


Unnamed: 0,epitope,Percentile Rank
0,DIYKGMGPLLATVFKSV,1.25
0,GMGPLLATVFKSV,16.15
