<a href="https://colab.research.google.com/github/zariyagh/SimPep_and_OP-AND/blob/main/SimPep_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ========================================
#           INSTALLING Packages
# ========================================
!pip install -q torch transformers pandas numpy tensorflow

# ========================================
#              IMPORTING
# ========================================
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
import tensorflow.keras.backend as K
from transformers import BertTokenizer, BertModel
import torch
import urllib.request
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

# ========================================
#         LOADING Model WEIGHTS
# ========================================
weights_url = "https://raw.githubusercontent.com/CBRC-lab/SimPep_and_OP-AND/main/weights_only.weights.h5"
weights_path = "weights_only.weights.h5"

# The local copy is reused only if it exists and is at least 50000 bytes;
# presumably the size floor guards against a truncated download - TODO confirm.
weights_ready = os.path.exists(weights_path) and os.path.getsize(weights_path) >= 50000
if not weights_ready:
    print("Downloading weights...")
    urllib.request.urlretrieve(weights_url, weights_path)

# ========================================
#         LOADING DATA
# ========================================
def _read_unique_embeddings(url):
    """Read one header-less CSV, drop its first and last columns, and de-duplicate rows."""
    table = pd.read_csv(url, sep=",", header=None)
    # Presumably the first/last columns hold an identifier and a label - TODO confirm.
    return table.iloc[:, 1:-1].drop_duplicates()


df_pos = _read_unique_embeddings('https://raw.githubusercontent.com/CBRC-lab/SimPep_and_OP-AND/main/PPP_ProtBERT_embeddings.txt')
df_neg1 = _read_unique_embeddings('https://raw.githubusercontent.com/CBRC-lab/SimPep_and_OP-AND/main/NPP_Q5T9C2_ProtBERT_embeddings.txt')
df_neg2 = _read_unique_embeddings('https://raw.githubusercontent.com/CBRC-lab/SimPep_and_OP-AND/main/NPP_Q9CWT3_ProtBERT_embeddings.txt')
df_neg3 = _read_unique_embeddings('https://raw.githubusercontent.com/CBRC-lab/SimPep_and_OP-AND/main/NPP_O88942_ProtBERT_embeddings.txt')
# The three negative tables are stacked into one frame with a fresh row index.
df_neg = pd.concat([df_neg1, df_neg2, df_neg3], ignore_index=True)

# X1 holds the positive reference vectors, X0 the negative ones.
X1 = df_pos.to_numpy(dtype='float')
X0 = df_neg.to_numpy(dtype='float')

# ========================================
#             LOADING ProtBERT
# ========================================
# do_lower_case=False makes the tokenizer keep the input's letter case as given.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
# eval() switches the module to inference mode and returns the module itself,
# so it can be chained directly onto the load call.
modelb = BertModel.from_pretrained("Rostlab/prot_bert").eval()

def embed_peptide_with_protbert(seq):
    """Embed a peptide as the mean of ProtBERT's last-layer token vectors.

    Args:
        seq: Peptide as a string (or other iterable of strings) of one-letter
            residue codes. Letters are upper-cased before tokenisation: the
            tokenizer is loaded with ``do_lower_case=False``, and ProtBERT is
            documented as working with capital-letter amino acids only, so
            lower-case input would otherwise not match the vocabulary.

    Returns:
        A 1-D NumPy array holding the average of ``last_hidden_state`` over
        every token position the tokenizer produced (this includes any
        special tokens the tokenizer adds by default - TODO confirm this
        matches how the reference embeddings were generated).
    """
    # Residues are separated by single spaces so each one becomes its own token.
    sequence = ' '.join(seq).upper()
    inputs = tokenizer(sequence, return_tensors='pt')
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        # squeeze(0) removes the batch axis of the single-sequence batch.
        hidden_states = modelb(**inputs).last_hidden_state.squeeze(0)
    return hidden_states.mean(dim=0).numpy()

# ========================================
#           BUILDING MODEL
# ========================================
def build_siamese_model(input_dim):
    """Build the two-input Siamese network that scores a pair of vectors.

    Both inputs go through one shared stack of Dense/Dropout layers
    (512 -> 128 -> 64 -> 32 units, ReLU activations, dropout rate 0.2).
    The element-wise absolute difference of the two 32-unit encodings is
    passed through dropout and a single sigmoid unit.

    Args:
        input_dim: Length of each of the two input vectors.

    Returns:
        A Keras ``Model`` (not compiled here) that takes ``[left, right]``
        inputs and yields one sigmoid output per pair.
    """
    # One encoder instance is applied to both inputs, so its weights are shared.
    # NOTE(review): the cell output suggests Keras warns about passing
    # input_shape to a layer; left untouched here because the saved weights
    # are loaded into exactly this layer layout - confirm before changing.
    shared_model = tf.keras.models.Sequential([
        Dense(512, input_shape=(input_dim,), activation='relu'),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
    ])
    left_input = Input(shape=(input_dim,))
    right_input = Input(shape=(input_dim,))
    encoded_l = shared_model(left_input)
    encoded_r = shared_model(right_input)
    # Element-wise |left - right| of the two encodings.
    L1 = Lambda(lambda x: K.abs(x[0] - x[1]))([encoded_l, encoded_r])
    L1_D = Dropout(0.2)(L1)
    # Single sigmoid unit turns the distance vector into one score.
    prediction = Dense(1, activation='sigmoid')(L1_D)
    return Model(inputs=[left_input, right_input], outputs=prediction)

# Length of one embedding vector; the click handler also uses it to split
# concatenated [query | reference] rows back into their two halves.
input_dim = 1024
model = build_siamese_model(input_dim)
# Only the weights come from the file; the architecture is the one built above.
model.load_weights(weights_path)

# ========================================
#               PREDICTION
# ========================================
def RealTest(X, x0_train, x1_train):
    """Pair every query vector with every reference vector of both classes.

    For each query (in order) the class-0 references come first, then the
    class-1 references. Each pair is stored as the query vector followed by
    the reference vector in one concatenated row.

    Args:
        X: Sequence of 1-D query vectors.
        x0_train: Sequence of 1-D class-0 reference vectors.
        x1_train: Sequence of 1-D class-1 reference vectors.

    Returns:
        A tuple ``(pairs, labels)``: ``pairs`` is ``np.asarray`` of the
        concatenated rows and ``labels`` is a parallel list of
        ``"<query index>,<class>"`` strings.
    """
    pairs, labels = [], []
    for query_idx, query in enumerate(X):
        # Class 0 is always emitted before class 1 for a given query.
        for class_tag, bank in ((0, x0_train), (1, x1_train)):
            for reference in bank:
                pairs.append(np.concatenate((query, reference)))
                labels.append(f"{query_idx},{class_tag}")
    return np.asarray(pairs), labels

def RealPredict(X, test_pred, Indexsample):
    """Collapse per-pair predictions into one score per query.

    For every query the mean prediction against class-0 references
    (``m0``) and against class-1 references (``m1``) is computed, then
    ``pos = (1 - m0) + m1``, ``neg = m0 + (1 - m1)`` and the score is
    ``pos / (pos + neg)``.

    Labels are parsed once in a single pass instead of re-scanning the
    whole label list for every query, and predictions are flattened to
    plain floats so an ``(N, 1)`` model output does not depend on NumPy's
    deprecated array-to-scalar conversion.

    Args:
        X: The queries; only ``len(X)`` is used.
        test_pred: One prediction per pair, as a flat sequence or an
            ``(N, 1)`` array, aligned with ``Indexsample``.
        Indexsample: ``"<query index>,<class>"`` labels; class ``1`` is the
            positive class, anything else counts as class 0. Labels whose
            query index is outside ``range(len(X))`` are ignored.

    Returns:
        A list with one Python ``float`` score per query.

    Raises:
        ValueError: If ``test_pred`` holds more than one value per pair.
        ZeroDivisionError: If a query has no pairs for one of the classes.
        IndexError: If a used label has no matching prediction.
    """
    n_queries = len(X)
    if n_queries == 0:
        return []

    preds = np.asarray(test_pred, dtype=float)
    if preds.ndim > 1:
        if preds.size != preds.shape[0]:
            raise ValueError("test_pred must hold exactly one prediction per pair")
        # (N, 1) -> (N,) so each entry is a scalar.
        preds = preds.reshape(-1)

    # sums[q][c] / counts[q][c]: running total and pair count for query q, class c.
    sums = [[0.0, 0.0] for _ in range(n_queries)]
    counts = [[0, 0] for _ in range(n_queries)]
    for j, label in enumerate(Indexsample):
        fields = label.split(",")
        query = int(fields[0])
        if not 0 <= query < n_queries:
            continue
        cls = 1 if int(fields[1]) == 1 else 0
        sums[query][cls] += float(preds[j])
        counts[query][cls] += 1

    scores = []
    for (sum0, sum1), (n0, n1) in zip(sums, counts):
        mean0 = sum0 / n0
        mean1 = sum1 / n1
        pos = (1 - mean0) + mean1
        neg = mean0 + (1 - mean1)
        scores.append(float(pos / (pos + neg)))
    return scores

# ========================================
#               GUI
# ========================================
# Free-text box in which the user types the peptide to score.
peptide_input = widgets.Text(
    description='Peptide:',
    placeholder='Enter peptide sequence',
    disabled=False,
)
# Button that starts a prediction once a click handler is attached to it.
predict_button = widgets.Button(description="Predict Osteogenic Score")
# Output area used as a context manager to capture printed results.
output = widgets.Output()

def on_predict_clicked(b):
    """Score the peptide currently typed into ``peptide_input``.

    The text is stripped of all whitespace and upper-cased; input that is
    empty or contains anything other than ASCII letters is rejected with a
    message instead of being embedded and scored. A valid sequence is
    embedded, paired with every reference vector in ``X0`` and ``X1``,
    scored by ``model`` and reduced to one value by ``RealPredict``. All
    messages are printed into the ``output`` widget, which is cleared first.

    Args:
        b: The clicked button, passed in by the widget callback; unused.
    """
    with output:
        clear_output()
        # Whitespace carries no residue information, and residue codes are
        # upper-case letters, so normalise both before validating.
        seq = "".join(peptide_input.value.split()).upper()
        # Digits, punctuation or non-ASCII characters cannot be one-letter
        # residue codes; scoring them would yield a meaningless number.
        if not seq or not (seq.isascii() and seq.isalpha()):
            print("Please enter a valid peptide sequence.")
            return
        print(f"Predicting for sequence: {seq}")
        # reshape(1, -1) turns the single embedding into a one-row batch.
        vector = embed_peptide_with_protbert(seq).reshape(1, -1)
        test_data, index_sample = RealTest(vector, X0, X1)
        # Each row is [query | reference]; split it back into the two inputs.
        left = test_data[:, :input_dim]
        right = test_data[:, input_dim:]
        y_pred = model.predict([left, right], verbose=0)
        final_score = RealPredict(vector, y_pred, index_sample)
        print(f"✅ Osteogenic Score: {final_score[0]:.4f}")

# Run on_predict_clicked every time the button is pressed.
predict_button.on_click(on_predict_clicked)

# Show the text box, the button and the output area.
display(peptide_input, predict_button, output)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Text(value='', description='Peptide:', placeholder='Enter peptide sequence')

Button(description='Predict Osteogenic Score', style=ButtonStyle())

Output()