In [1]:
# Standard Libraries
import os
import sys
import re
import argparse
import subprocess
from io import StringIO
import joblib

# Data Manipulation
import pandas as pd
import numpy as np

# Machine Learning & Deep Learning
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report
)

# Utilities
from tqdm import tqdm
from IPython.display import clear_output


2025-03-19 16:26:49.382715: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-19 16:26:49.405876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-19 16:26:49.426343: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-19 16:26:49.432882: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 16:26:49.452757: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Project root: taken from the directory the notebook is currently running in.
# Replace the assignment with an explicit path if the repository lives elsewhere.
MAIN_DIR = os.getcwd()
print(f"MAIN_DIR: {MAIN_DIR}")

MAIN_DIR: /home/sp2530/Desktop/pLM-DBPs


In [3]:
# Load the two pLMDBPs base models (one per embedding type) from the
# repository's assets folder. Paths are resolved first, then each saved
# Keras model is loaded.
prott5_model_path = os.path.join(MAIN_DIR, "assets/models/ProtT5_pLMDBPs.keras")
saprot_model_path = os.path.join(MAIN_DIR, "assets/models/SaProt_pLMDBPs.keras")

ProtT5_ann_model = load_model(prott5_model_path)
SaProt_ann_model = load_model(saprot_model_path)

2025-03-19 16:26:55.417359: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


### Read precomputed features for the test set

In [4]:
# Helper for normalizing protein IDs, followed by loading the precomputed ProtT5 and SaProt test embeddings
def split_protid(prot_id):
    """Extract a bare protein identifier from a raw ID / header string.

    Rules, applied in order to the whitespace-stripped input:
      * contains ``|``: return the second ``|``-separated field
        (e.g. ``sp|P12345|NAME`` -> ``P12345``);
      * otherwise contains a space: return the text before the first space;
      * otherwise: return the identifier as is.

    Surrounding whitespace is removed first so that an ID with a leading
    space does not collapse to an empty string, and stray newlines do not
    leak into the result.

    Args:
        prot_id: Raw identifier. Non-string values (e.g. numeric IDs parsed
            by pandas) contain no delimiters and are returned unchanged.

    Returns:
        The extracted identifier string, or ``prot_id`` itself when it is
        not a string.
    """
    # Non-strings cannot be searched with `in`; pass them through instead of raising.
    if not isinstance(prot_id, str):
        return prot_id

    cleaned = prot_id.strip()
    if '|' in cleaned:
        # Second field of a pipe-delimited header; strip padding around it.
        return cleaned.split('|')[1].strip()
    if ' ' in cleaned:
        return cleaned.split(' ')[0]
    return cleaned

# Locate the precomputed test-set embeddings. Paths are built with
# os.path.join, matching how the model and scaler paths are built elsewhere
# in this notebook.
test_embeddings_dir = os.path.join(MAIN_DIR, "dataset", "test", "embeddings")
prott5_dir = os.path.join(test_embeddings_dir, "prott5")
saprot_dir = os.path.join(test_embeddings_dir, "saprot")

# Load the ProtT5 embeddings. header=None makes pandas treat the first line
# as data and label the columns 0..N-1.
test_positive_prott5 = pd.read_csv(os.path.join(prott5_dir, "DBP_independent.csv"), header=None)
test_negative_prott5 = pd.read_csv(os.path.join(prott5_dir, "non_DBP_independent.csv"), header=None)

# Apply the split function to the first column to get the protein id
test_positive_prott5[0] = test_positive_prott5[0].apply(split_protid)
test_negative_prott5[0] = test_negative_prott5[0].apply(split_protid)

# Load the SaProt embeddings. The files are read without header inference and
# the first row is then dropped.
# NOTE(review): presumably that first row is a header line in the SaProt CSVs;
# confirm against the files. The SaProt ID column is not passed through
# split_protid here.
test_positive_saprot = pd.read_csv(
    os.path.join(saprot_dir, "DBP_independent.csv"), header=None, low_memory=False
).iloc[1:, :]
test_negative_saprot = pd.read_csv(
    os.path.join(saprot_dir, "non_DBP_independent.csv"), header=None, low_memory=False
).iloc[1:, :]

In [5]:
# prepare X_test
# Sanity check: y_test below is sized from the SaProt frames only, and the
# ProtT5 and SaProt predictions are later averaged row by row. Each class must
# therefore contribute the same number of rows in both embedding sets,
# otherwise labels and predictions would be silently misaligned.
assert test_positive_prott5.shape[0] == test_positive_saprot.shape[0], (
    f"Positive-set row mismatch: ProtT5 has {test_positive_prott5.shape[0]}, "
    f"SaProt has {test_positive_saprot.shape[0]}"
)
assert test_negative_prott5.shape[0] == test_negative_saprot.shape[0], (
    f"Negative-set row mismatch: ProtT5 has {test_negative_prott5.shape[0]}, "
    f"SaProt has {test_negative_saprot.shape[0]}"
)
# NOTE(review): equal row counts do not prove that both embedding sets list the
# proteins in the same order; IDs are not compared here. Confirm upstream.

# Column 0 is the protein ID; the remaining columns are the embedding features.
# Positives are stacked first, negatives second.
X_test_prott5 = np.concatenate(
    (test_positive_prott5.iloc[:, 1:].values, test_negative_prott5.iloc[:, 1:].values), axis=0
)

X_test_saprot = np.concatenate(
    (test_positive_saprot.iloc[:, 1:].values, test_negative_saprot.iloc[:, 1:].values), axis=0
)

# prepare y_test (1 = positive set, 0 = negative set), in the same order as X_test_*
test_positive_labels = np.ones(test_positive_saprot.shape[0])
test_negative_labels = np.zeros(test_negative_saprot.shape[0])
y_test = np.concatenate((test_positive_labels, test_negative_labels), axis=0)

print(f"X_test_prott5: {X_test_prott5.shape}, y_test: {y_test.shape}")
print(f"X_test_saprot: {X_test_saprot.shape}, y_test: {y_test.shape}")

# apply scaling (SaProt features only)
# The scaler was fitted on the train set and is applied, not re-fitted, here.
# joblib.load unpickles the file and can execute arbitrary code, so only load
# scaler artifacts from a trusted source.
scaler_saprot = joblib.load(os.path.join(MAIN_DIR, "assets/models/scaler_saprot.pkl"))
X_test_saprot = scaler_saprot.transform(X_test_saprot)

X_test_prott5: (997, 1024), y_test: (997,)
X_test_saprot: (997, 1280), y_test: (997,)


In [6]:
# Give both feature matrices the same numeric dtype (float32) before they are
# passed to the models.
X_test_prott5, X_test_saprot = (
    np.asarray(feature_matrix).astype(np.float32)
    for feature_matrix in (X_test_prott5, X_test_saprot)
)

In [7]:
# Score the test set with each base model (ProtT5 and SAPROT)
y_pred_prott5_prob = ProtT5_ann_model.predict(X_test_prott5)
y_pred_saprot_prob = SaProt_ann_model.predict(X_test_saprot)

# Ensemble: unweighted mean of the two models' predicted probabilities
y_pred_avg_prob = (y_pred_prott5_prob + y_pred_saprot_prob) / 2

# Hard labels: a probability strictly above 0.5 counts as the positive class
y_pred_avg = (y_pred_avg_prob > 0.5).astype(int).ravel()

# Threshold-based metrics use the hard labels; the ranking metrics
# (AUROC, AUPR) use the averaged probabilities.
accuracy = accuracy_score(y_test, y_pred_avg)
precision = precision_score(y_test, y_pred_avg)
recall = recall_score(y_test, y_pred_avg)
f1 = f1_score(y_test, y_pred_avg)
mcc = matthews_corrcoef(y_test, y_pred_avg)
auroc = roc_auc_score(y_test, y_pred_avg_prob)
aupr = average_precision_score(y_test, y_pred_avg_prob)
cm = confusion_matrix(y_test, y_pred_avg)

# Single ordered mapping that drives both the printed report and results_df
metric_summary = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1,
    "MCC": mcc,
    "AUROC": auroc,
    "AUPR": aupr,
}

# Report each metric, label left-aligned to 10 characters, value to 2 decimals
for metric_name, metric_value in metric_summary.items():
    print(f"{metric_name:<10}: {metric_value:.2f}")

# Confusion matrix: rows are the actual class, columns the predicted class
print("\nConfusion Matrix:")
print("              Predicted No     Predicted Yes")
print(f"Actual No     {cm[0, 0]:>6}            {cm[0, 1]:>6}")
print(f"Actual Yes    {cm[1, 0]:>6}            {cm[1, 1]:>6}")

# One-row summary table: model label followed by one column per metric
results_df = pd.DataFrame({
    "Model": ["ProtT5 + SAPROT (Avg Probabilities)"],
    **{metric_name: [metric_value] for metric_name, metric_value in metric_summary.items()},
})

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Accuracy  : 0.92
Precision : 0.95
Recall    : 0.89
F1 Score  : 0.92
MCC       : 0.85
AUROC     : 0.97
AUPR      : 0.97

Confusion Matrix:
              Predicted No     Predicted Yes
Actual No        474                23
Actual Yes        54               446
