In [None]:
# Install conda inside the Colab runtime via condacolab.
# NOTE: condacolab.install() restarts the kernel once (see "Restarting kernel..." in the
# output); wait for the restart to finish before running the cells below.
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:24
🔁 Restarting kernel...


In [None]:
import os
import rpy2.robjects as ro
import shutil

In [None]:
# Google Drive Mount
# Mounts Drive at /content/drive (presumably so the paths entered in the form below can
# point into Drive -- confirm against your setup).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @markdown ## Specify paths
# @markdown **NOTE**: Working path should contain the EvOlf directory where the codes, environments etc. are present
# Working_path is the PARENT of the EvOlf folder: later cells open Working_path + '/EvOlf/...'.
Working_path = '' # @param {type:"string", placeholder:"Path of EvOlf directory"}
# @markdown Path to the input file including file name
Input_data = '' # @param {type:"string", placeholder:"Input File Name and Path"}
# @markdown Path to the output directory (**OPTIONAL**)
Output_path = '' # @param {type:"string", placeholder:"Output Directory Path"}

# @markdown ---
# @markdown ## Specify column names
Ligand_column = '' # @param {type:"string", placeholder:"column header for Ligand SMILES"}
Receptor_column = '' # @param {type:"string", placeholder:"column header for Receptor Sequences"}

# @markdown ---
# @markdown Provide custom unique identifiers for each ligand SMILES, receptor sequence and the ligand-receptor pair (**OPTIONAL**)
# @markdown * If only ligand identifiers are available, specify them under Ligand_identifier.
# @markdown * If only receptor identifiers are available, specify them under Receptor_identifier.
# @markdown * If only ligand-receptor pair identifiers are available, specify them under Ligand_Receptor_pairID.
Ligand_identifier = '' # @param {type:"string", placeholder:"column header for Ligand IDs"}
Receptor_identifier = '' # @param {type:"string", placeholder:"column header for Receptor IDs"}
Ligand_Receptor_pairID = '' # @param {type:"string", placeholder:"column header for Ligand-Receptor Pair IDs"}
# @markdown In case no identifiers are provided, EvOlf will generate identifiers for you. The details of these identifiers can be found in the file `Input_ID_Information.csv` in the output directory.


In [None]:
# @title Set working directories
def setsdir(pth):
  """Make ``<pth>/EvOlf`` the current working directory of the notebook process.

  Args:
    pth: String path of the directory that contains the ``EvOlf`` folder.

  Raises:
    OSError: Propagated from ``os.chdir`` when the directory cannot be entered.
  """
  import os  # function-level import: the helper does not rely on the notebook's import cell
  evolf_dir = pth + '/EvOlf'
  os.chdir(evolf_dir)

# Enter <Working_path>/EvOlf; later cells launch scripts through the relative path Codes/...
setsdir(Working_path)

In [None]:
class EvOlfError(Exception):
    """Notebook-specific error, raised when a required user input has not been provided."""

In [None]:
# @title Create Output and Tmp Directories
# Prepares the two directories the pipeline writes to:
#   Output_path -- final results (defaults to <Working_path>/Output/)
#   Tmp_path    -- intermediate files, always <Working_path>/TMP/, recreated empty on every run
if len(Output_path)==0:
  # No output directory given: fall back to the default and start from an empty directory.
  default_output = Working_path+'/Output'
  if os.path.exists(default_output):
    shutil.rmtree(default_output)
  os.mkdir(default_output)
  Output_path=default_output+'/'
else:
  # User-supplied directory: create it if missing, but never wipe it.
  os.makedirs(Output_path, exist_ok=True)
  # The default above ends in '/', and this value is later written verbatim into the generated
  # scripts (presumably they append file names directly -- confirm in Feature_Compilation.R),
  # so give user-supplied paths the same trailing separator.
  if not Output_path.endswith('/'):
    Output_path=Output_path+'/'

Tmp_path=Working_path+'/TMP/'
# Drop intermediates left over from a previous run before recreating the directory.
if os.path.exists(Tmp_path):
  shutil.rmtree(Tmp_path)
os.mkdir(Tmp_path)

# Input File Prep

In [None]:
# @title Updating the file paths
# Generate Input_File_v2.R from Input_File.R with the values entered in the form filled in.

# Check the mandatory fields before opening any file, so that a missing value is reported by
# name and no partially written Input_File_v2.R is left behind.
missing_inputs = [
  field_name
  for field_name, field_value in (('Input_data', Input_data),
                                  ('Ligand_column', Ligand_column),
                                  ('Receptor_column', Receptor_column))
  if len(field_value.strip())==0
]
if missing_inputs:
  raise EvOlfError("Data not provided: "+", ".join(missing_inputs))

# (marker searched for in a template line, line written in its place).
# None means "keep the template line as it is" (optional identifier left empty).
# The list is scanned in order and the first matching marker wins.
r_overrides = [
  ('tempPath <- "', 'tempPath <- "'+Tmp_path+'"\n'),
  ('dataPath <- "', 'dataPath <- "'+Input_data+'"\n'),
  ('ligSmilesColumn <- "', 'ligSmilesColumn <- "'+Ligand_column+'"\n'),
  ('recSeqColumn <- "', 'recSeqColumn <- "'+Receptor_column+'"\n'),
  ('ligID <- ', 'ligID <- "'+Ligand_identifier+'"\n' if len(Ligand_identifier)!=0 else None),
  ('recID <- ', 'recID <- "'+Receptor_identifier+'"\n' if len(Receptor_identifier)!=0 else None),
  ('lrID <- ', 'lrID <- "'+Ligand_Receptor_pairID+'"\n' if len(Ligand_Receptor_pairID)!=0 else None),
]

with open(Working_path+'/EvOlf/Codes/Input_File.R') as fin, \
     open(Working_path+'/EvOlf/Codes/Input_File_v2.R','w') as fout:
  for line in fin:
    for marker, new_line in r_overrides:
      if marker in line:
        fout.write(line if new_line is None else new_line)
        break
    else:
      # Line carries no setting to override: copy it through unchanged.
      fout.write(line)

In [None]:
# @title Input File Prep
# Execute the patched R script through rpy2. The trailing ';' stops the R list returned by
# source() (value / visible) from being echoed as cell output; R's own prints still appear.
ro.r.source(Working_path+'/EvOlf/Codes/Input_File_v2.R');

[1] "Code ran successfully"


0,1
value,[16]
visible,[10]


# Ligands Featurizer

## Signaturizer

In [None]:
%%capture
# @title Create environment
!conda env create -f {Working_path}'/EvOlf/Environments/Signaturizer.yml'

In [None]:
# @title Updating the file paths
# Copy Signaturizer.py to Signaturizer_v2.py, pointing both of its path variables at Tmp_path.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
path_overrides = [
  ('outPath = "', 'outPath = "'+Tmp_path+'"\n'),
  ('dataPath = "', 'dataPath = "'+Tmp_path+'"\n'),
]
template_path = Working_path+'/EvOlf/Codes/Signaturizer.py'
patched_path = Working_path+'/EvOlf/Codes/Signaturizer_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in path_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from Signaturizer
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
# Runs the patched script with the interpreter under /usr/local/envs/Signaturizer
# (presumably the environment created from Signaturizer.yml above -- confirm the env name).
!/usr/local/envs/Signaturizer/bin/python Codes/Signaturizer_v2.py
# @markdown If you encounter an HTTPS error, re-run this cell after a short time interval

2025-03-21 14:26:21.455937: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 14:26:22.428677: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 14:26:22.433878: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
4
['A1_1', 'A1_2', 'A1_3', 'A1_4', 'A1_5', 'A1_6', 'A1_7', 'A1_8', 'A1_9', 'A1_10', 'A1_11', 'A1_12', 'A1_13', 'A1_14', 'A1_15', 'A1_16', 'A1_17', 'A1_18', 'A1_19', 'A1_20', 'A1_21', 'A1_22', 'A1_23', 'A1_24', 'A1_25', 'A1_26', 'A1_27', 'A1_28', 'A1_29', 'A1_30', 'A1_31', 'A1_32', 'A1_33', 'A1_34', 'A1_35', 'A1_36', 'A1_37', 'A1_38', 'A1_39', 'A1_40', 'A1_41', 'A1_42', 'A1_43', 'A1_44', 'A1_45', 'A1_46', 'A1_47',

## ChemBERTa

In [None]:
%%capture
# @title Create environment
!conda env create -f {Working_path}'/EvOlf/Environments/EvOlf_Clean.yml'

In [None]:
# @title Updating the file paths
# Copy ChemBERTa.py to ChemBERTa_v2.py, pointing both of its path variables at Tmp_path.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
path_overrides = [
  ('outPath = "', 'outPath = "'+Tmp_path+'"\n'),
  ('dataPath = "', 'dataPath = "'+Tmp_path+'"\n'),
]
template_path = Working_path+'/EvOlf/Codes/ChemBERTa.py'
patched_path = Working_path+'/EvOlf/Codes/ChemBERTa_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in path_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from ChemBERTa
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
# NOTE(review): the interpreter lives under envs/EvOlf_Clean_01 while the environment file is
# EvOlf_Clean.yml -- presumably the name declared inside the yml; confirm.
!/usr/local/envs/EvOlf_Clean_01/bin/python Codes/ChemBERTa_v2.py

Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MLM were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

## Mol2Vec

In [None]:
%%capture
# @title Create environment
!conda env create -f  {Working_path}'/EvOlf/Environments/Mol2Vec_Env.yml'

In [None]:
# @title Updating the file paths
# Copy Mol2Vec.py to Mol2Vec_v2.py with its I/O paths set to Tmp_path and its model loaded
# from <Working_path>/EvOlf/Models/model_300dim.pkl.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
path_overrides = [
  ('outPath = "', 'outPath = "'+Tmp_path+'"\n'),
  ('dataPath = "', 'dataPath = "'+Tmp_path+'"\n'),
  ('model =', "model = word2vec.Word2Vec.load('"+Working_path+"/EvOlf/Models/model_300dim.pkl')\n"),
]
template_path = Working_path+'/EvOlf/Codes/Mol2Vec.py'
patched_path = Working_path+'/EvOlf/Codes/Mol2Vec_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in path_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from Mol2Vec
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
!/usr/local/envs/Mol2Vec/bin/python Codes/Mol2Vec_v2.py

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Code ran successfully


## Mordred

In [None]:
%%capture
# @title Create environment
!conda env create -f  {Working_path}'/EvOlf/Environments/Mordred_Env.yml'

In [None]:
# @title Updating the file paths
# Copy Mordred.py to Mordred_v2.py with its I/O paths set to Tmp_path and csv_path set to
# <Working_path>/EvOlf/Models/. The markers here use single quotes, as matched in Mordred.py.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
path_overrides = [
  ("outPath = '", 'outPath = "'+Tmp_path+'"\n'),
  ("dataPath = '", 'dataPath = "'+Tmp_path+'"\n'),
  ("csv_path='", 'csv_path = "'+Working_path+'/EvOlf/Models/"\n'),
]
template_path = Working_path+'/EvOlf/Codes/Mordred.py'
patched_path = Working_path+'/EvOlf/Codes/Mordred_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in path_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from Mordred
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
!/usr/local/envs/Mordred/bin/python Codes/Mordred_v2.py

(4, 2)
100% 4/4 [00:01<00:00,  3.43it/s]
341
1492
Code ran successfully


## Graph2Vec
User-provided SMILES are processed together with a representative dataset so that the Graph2Vec embeddings are robust and the predictions remain reproducible.

In [None]:
%%capture
# @title Create environment
!conda env create -f {Working_path}'/EvOlf/Environments/Graph2Vec_Env.yml'

In [None]:
# @title Updating the file paths
# Copy Graph2Vec.py to Graph2Vec_v2.py with its I/O paths set to Tmp_path and csv_path set to
# <Working_path>/EvOlf/Models/.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
path_overrides = [
  ("outPath = '", 'outPath = "'+Tmp_path+'"\n'),
  ("dataPath = '", 'dataPath = "'+Tmp_path+'"\n'),
  ('csv_path = ', "csv_path = '"+Working_path+"/EvOlf/Models/'\n"),
]
template_path = Working_path+'/EvOlf/Codes/Graph2Vec.py'
patched_path = Working_path+'/EvOlf/Codes/Graph2Vec_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in path_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from Graph2Vec
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
!/usr/local/envs/Graph2Vec/bin/python Codes/Graph2Vec_v2.py

Code ran successfully


# Receptor Featurizer

## ProtR

In [None]:
# @title Install package
# Installs the R package "protr" through rpy2. capture.output(..., file="/dev/null") together
# with quiet = TRUE discards the installer's console output.
ro.r('capture.output(install.packages("protr", quiet = TRUE), file="/dev/null")')

In [None]:
# @title Updating the file paths
# Copy ProtR.R to ProtR_v2.R: the line mentioning 'ASGPCRs_Final.csv' becomes a read.csv of
# Tmp_path + '/recsData.csv', and the setwd line switches R into Tmp_path.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
r_overrides = [
  ('ASGPCRs_Final.csv', 'rec <- read.csv("'+Tmp_path+'/recsData.csv")\n'),
  ('setwd', 'setwd("'+Tmp_path+'")\n'),
]
template_path = Working_path+'/EvOlf/Codes/ProtR.R'
patched_path = Working_path+'/EvOlf/Codes/ProtR_v2.R'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in r_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from ProtR
# Execute the patched R script through rpy2. The trailing ';' stops the R list returned by
# source() (value / visible) from being echoed as cell output; R's own prints still appear.
ro.r.source(Working_path+'/EvOlf/Codes/ProtR_v2.R');

[1] "extractAAC Done"
[1] "extractDC Done"
[1] "extractTC Done"
[1] "extractMoreauBroto Done"
[1] "extractMoran Done"
[1] "extractGeary Done"
[1] "extractCTDC Done"
[1] "extractCTDT Done"
[1] "extractCTDD Done"
[1] "extractCTriad Done"
[1] "extractSOCN Done"
[1] "extractQSO Done"
[1] "extractPAAC Done"
[1] "extractAPAAC Done"
[1] "Code ran successfully"


0,1
value,[16]
visible,[10]


## ProtT5
Note: This step is slow (about 16 s per protein on CPU in the example output below).

In [None]:
%%capture
# @title Create environment
!conda env create -f {Working_path}'/EvOlf/Environments/EvOlf_Clean.yml'

In [None]:
# @title Updating the file paths
# Copy ProtT5.py to ProtT5_v2.py: the to_csv line writes Raw_ProtT5.csv into Tmp_path, seq_path
# points at Tmp_path + '/recsData.fasta', and the hard-coded output directory is replaced by Tmp_path.
hardcoded_out_dir = '/storage1/aayushim/EvOlf_3.3/01_Dry_Lab/02_Descriptors/protT5/output/'
template_path = Working_path+'/EvOlf/Codes/ProtT5.py'
patched_path = Working_path+'/EvOlf/Codes/ProtT5_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    if 'to_csv' in src_line:
      out_line = "final_df.to_csv('"+Tmp_path+"Raw_ProtT5.csv', index = False)\n"
    elif 'seq_path = "' in src_line:
      out_line = 'seq_path = "'+Tmp_path+'/recsData.fasta"\n'
    else:
      # str.replace leaves lines without the hard-coded directory unchanged.
      out_line = src_line.replace(hardcoded_out_dir, Tmp_path)
    patched.write(out_line)

In [None]:
# @title Generating Features from ProtT5
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
# NOTE(review): the interpreter lives under envs/EvOlf_Clean_01 while the environment file is
# EvOlf_Clean.yml -- presumably the name declared inside the yml; confirm.
!/usr/local/envs/EvOlf_Clean_01/bin/python Codes/ProtT5_v2.py

Using cpu
Read 4 sequences.

############# EMBEDDING STATS #############
Total number of per-residue embeddings: 4
Total number of per-protein embeddings: 4
Time for generating embeddings: 1.1[m] (16.467[s/protein])

############# END #############
Code ran successfully


## ProtBERT

In [None]:
%%capture
# @title Create environment
!conda env create -f {Working_path}'/EvOlf/Environments/EvOlf_Clean.yml'

In [None]:
# @title Updating the file paths
# Copy ProtBERT.py to ProtBERT_v2.py, pointing both of its path variables at Tmp_path.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
path_overrides = [
  ('outPath = "', 'outPath = "'+Tmp_path+'"\n'),
  ('dataPath = "', 'dataPath = "'+Tmp_path+'"\n'),
]
template_path = Working_path+'/EvOlf/Codes/ProtBERT.py'
patched_path = Working_path+'/EvOlf/Codes/ProtBERT_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in path_overrides if marker in src_line), src_line))

In [None]:
# @title Generating Features from ProtBERT
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
# NOTE(review): the interpreter lives under envs/EvOlf_Clean_01 while the environment file is
# EvOlf_Clean.yml -- presumably the name declared inside the yml; confirm.
!/usr/local/envs/EvOlf_Clean_01/bin/python Codes/ProtBERT_v2.py

Using cpu
Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
4
299
1024
Code ran successfully


## MathFeature

In [None]:
%%capture
# @title Create environment
!conda env create -f {Working_path}'/EvOlf/Environments/MathFeature.yml'

In [None]:
# @title Updating the file paths
# Copy Mappings-Protein.py to Mappings-Protein_v2.py, substituting Tmp_path + '/recsData.fasta'
# for every occurrence of the file name 'ASGPCRs_Final.csv'.
fasta_path = Tmp_path+"/recsData.fasta"
template_path = Working_path+'/EvOlf/MathFeature/methods/Mappings-Protein.py'
patched_path = Working_path+'/EvOlf/MathFeature/methods/Mappings-Protein_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  # str.replace returns the line unchanged when the file name is absent.
  patched.writelines(src_line.replace('ASGPCRs_Final.csv', fasta_path) for src_line in template)

### Generating Features from MathFeature

In [None]:
cm1='/EvOlf/MathFeature/methods/Mappings-Protein_v2.py'
cm2='MF_02.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -n 1 -o {command2} -r 2

In [None]:
cm1='/EvOlf/MathFeature/methods/Mappings-Protein_v2.py'
cm2='MF_04.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -n 1 -o {command2} -r 4

In [None]:
cm1='/EvOlf/MathFeature/methods/Mappings-Protein_v2.py'
cm2='MF_06.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -n 1 -o {command2} -r 6

In [None]:
cm1='/EvOlf/MathFeature/methods/EntropyClass.py'
cm2='recsData.fasta'
cm3='MF_08.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
command3 = f"{Tmp_path}{cm3}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -i {command2} -o {command3} -l 1 -k 10 -e Shannon

In [None]:
cm1='/EvOlf/MathFeature/methods/EntropyClass.py'
cm2='recsData.fasta'
cm3='MF_09.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
command3 = f"{Tmp_path}{cm3}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -i {command2} -o {command3} -l 1 -k 10 -e Tsallis

In [None]:
cm1='/EvOlf/MathFeature/methods/ComplexNetworksClass-v2.py'
cm2='recsData.fasta'
cm3='MF_10.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
command3 = f"{Tmp_path}{cm3}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -i {command2} -o {command3} -l 1 -k 3

In [None]:
cm1='/EvOlf/MathFeature/methods/Kgap.py'
cm2='recsData.fasta'
cm3='MF_11.csv'
command1 = f"{Working_path}{cm1}"
command2 = f"{Tmp_path}{cm2}"
command3 = f"{Tmp_path}{cm3}"
!/usr/local/envs/MathFeature/bin/python3.7 {command1} -i {command2} -o {command3} -l 1 -k 3 -bef 1 -aft 2 -seq 3

# Feature Processing

In [None]:
# @title Updating the file paths
# Copy Feature_Compilation.R to Feature_Compilation_v2.R with outPath set to Output_path and
# tempPath set to Tmp_path.
# Each entry is (marker searched for in a line, full line written instead); first match wins.
r_overrides = [
  ('outPath <- "', 'outPath <- "'+Output_path+'"\n'),
  ('tempPath <- "', 'tempPath <- "'+Tmp_path+'"\n'),
]
template_path = Working_path+'/EvOlf/Codes/Feature_Compilation.R'
patched_path = Working_path+'/EvOlf/Codes/Feature_Compilation_v2.R'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in r_overrides if marker in src_line), src_line))

In [None]:
# @title Process all the features
# Execute the patched R script through rpy2. The trailing ';' stops the R list returned by
# source() (value / visible) from being echoed as cell output; R's own prints still appear.
ro.r.source(Working_path+'/EvOlf/Codes/Feature_Compilation_v2.R');

[1] "Code ran successfully"


0,1
value,[16]
visible,[10]


# Predictions

In [None]:
%%capture
# @title Create environment
!conda env create -f  {Working_path}'/EvOlf/Environments/20240316_evolf_new.yml'

In [None]:
# @title Updating the file paths
# Copy Predictions.py to Predictions_v2.py with its module search path, model locations and
# input/output directories rewritten for this session.
# Each entry is (marker searched for in a line, full line written instead). The list is scanned
# in order and the first matching marker wins, so the order below must be kept.
prediction_overrides = [
  ('sys.path', "sys.path.append('"+Working_path+"/EvOlf/Codes')\n"),
  ('weights_file_path = f"', "weights_file_path = f'"+Working_path+"/EvOlf/Models/Final_5'\n"),
  ("test_text_file_path = f'", "test_text_file_path = f'"+Output_path+"/'\n"),
  ("test_key_embedding_file_path = f'", "test_key_embedding_file_path = f'"+Output_path+"/'\n"),
  ("test_lock_embedding_file_path = f'", "test_lock_embedding_file_path = f'"+Output_path+"/'\n"),
  ("test_concat_embedding_file_path = f'", "test_concat_embedding_file_path = f'"+Output_path+"/'\n"),
  ('mypath = "', "mypath = '"+Tmp_path+"'\n"),
  ('scaler_models', "file_path = '"+Working_path+"/EvOlf/Models/scaler_models/'\n"),
  ('pca_models', "file_path = '"+Working_path+"/EvOlf/Models/pca_models/'\n"),
]
template_path = Working_path+'/EvOlf/Codes/Predictions.py'
patched_path = Working_path+'/EvOlf/Codes/Predictions_v2.py'
with open(template_path) as template, open(patched_path,'w') as patched:
  for src_line in template:
    # Lines without any marker are copied through unchanged.
    patched.write(next((new for marker, new in prediction_overrides if marker in src_line), src_line))

In [None]:
# @title Get Final Predictions and embeddings
# chdir into <Working_path>/EvOlf first: the script path below is relative to that directory.
setsdir(Working_path)
# NOTE(review): the interpreter lives under envs/evolf_new while the environment file is
# 20240316_evolf_new.yml -- presumably the name declared inside the yml; confirm.
!/usr/local/envs/evolf_new/bin/python Codes/Predictions_v2.py

Code start time: 2025-03-21 14:31:10
Data Loaded: 00:00:00
Time elapsed till Data Intersection : 00:00:01
Time elapsed till Scaling : 00:00:01
Time elapsed till PCA : 00:00:02
Code end time: 2025-03-21 14:31:12
Code ran successfully
