In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

from rdkit.Chem import AllChem as Chem
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root folder that is expected to hold one sub-directory per benchmark case.
root_dir = os.path.join(os.getcwd(), "datasets_int_val")

# Scan the root folder once. The `with` block closes the scandir iterator,
# plain files (e.g. .DS_Store, archives) are skipped so they cannot turn into
# bogus CSV paths, and sorting makes the three lists deterministic and aligned.
with os.scandir(root_dir) as entries:
    case_dirs = sorted(entry.path for entry in entries if entry.is_dir())

MUBD_ligand_dir = [os.path.join(case_dir, "MUBDreal/Diverse_ligands_PS.csv") for case_dir in case_dirs]
MUBDreal_decoy_dir = [os.path.join(case_dir, "MUBDreal/Final_decoys.csv") for case_dir in case_dirs]
MUBDsyn_decoy_dir = [os.path.join(case_dir, "MUBDsyn/Final_decoys.csv") for case_dir in case_dirs]

# Case identifiers; the evaluation cells match them as substrings of the paths above.
cases = ['5HT1F-AGO', '5HT1F-ANTA', 'DRD5-AGO', 'DRD5-ANTA', 'HRH4-AGO',
        'HRH4-ANTA', 'ACM4-AGO', 'ACM4-ANTA', 'OPRM-AGO', 'OPRM-ANTA',
        'BRS3-ANTA','SSR2-ANTA','AG22-ANTA','PE2R3-AGO','PE2R3-ANTA',
        'MTR1B-AGO','MTR1B-ANTA',
        ]

In [None]:
# Unique scaffold ratio
def bmsratio(final_decoys):
    """Return the share of distinct Murcko scaffolds among the molecules of a CSV.

    Parameters
    ----------
    final_decoys : path or buffer passed to ``pd.read_csv``; must provide a
        ``SMILES`` column.

    Returns
    -------
    float
        Number of distinct ``MurckoScaffoldSmiles`` results divided by the
        number of rows, rounded to two decimals.
    """
    smiles = pd.read_csv(final_decoys)['SMILES'].tolist()
    # A set keeps exactly one copy of every scaffold SMILES string.
    distinct_scaffolds = {MurckoScaffoldSmiles(smi) for smi in smiles}
    return round(len(distinct_scaffolds) / len(smiles), 2)

# Unique-scaffold ratio of every MUBD-real decoy file whose path contains a
# case name, collected in the order of `cases`.
MUBDreal_usr = np.array([
    bmsratio(decoy_path)
    for case in cases
    for decoy_path in MUBDreal_decoy_dir
    if case in decoy_path
])

# Same computation for the MUBD-syn decoy files.
MUBDsyn_usr = np.array([
    bmsratio(decoy_path)
    for case in cases
    for decoy_path in MUBDsyn_decoy_dir
    if case in decoy_path
])


In [None]:
# Display both unique-scaffold-ratio arrays (MUBD-real first, MUBD-syn second).
MUBDreal_usr, MUBDsyn_usr

In [None]:
# simp: ROC AUC of the property-similarity ranking
def simp_AUC(idx, diverse_ligand_PS, final_decoys, decoys_per_ligand=39):
    """Mean and standard deviation of per-ligand ROC AUCs from property similarity.

    Every ligand serves once as the query. All other ligands are labelled 1,
    and the decoys are labelled 0 except for the query's own decoy rows
    ``[decoys_per_ligand * k, decoys_per_ligand * (k + 1))``, which are left
    out. Each candidate is scored with

        simp = 1 - sqrt(mean((query - candidate) ** 2))

    taken over the six ``*_MCS`` property columns, and the ROC AUC of that
    ranking is computed for the query.

    Parameters
    ----------
    idx : unused; kept so existing positional calls keep working.
    diverse_ligand_PS : path or buffer passed to ``pd.read_csv``; must provide
        the columns MW_MCS, NR_MCS, HD_MCS, HA_MCS, FC_MCS and LogP_MCS.
    final_decoys : path or buffer passed to ``pd.read_csv`` with the same
        columns. Rows are assumed to be grouped per ligand, in ligand order,
        ``decoys_per_ligand`` rows each -- TODO confirm against the decoy
        generator.
    decoys_per_ligand : int, default 39
        Number of consecutive decoy rows that belong to one ligand.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-ligand AUC values (``std`` uses numpy's
        default ``ddof=0``).
    """
    property_columns = ['MW_MCS', 'NR_MCS', 'HD_MCS', 'HA_MCS', 'FC_MCS', 'LogP_MCS']

    ligand_frame = pd.read_csv(diverse_ligand_PS)
    decoy_frame = pd.read_csv(final_decoys)
    ligand_props = ligand_frame[property_columns].to_numpy(dtype=float)
    decoy_props = decoy_frame[property_columns].to_numpy(dtype=float)

    ligand_rows = np.arange(len(ligand_props))
    decoy_rows = np.arange(len(decoy_props))

    def simp_to(query_row, other_rows):
        """Property similarity between one query row and every row of ``other_rows``."""
        squared_diff = np.square(query_row - other_rows)
        return 1 - np.sqrt(squared_diff.sum(axis=1) / len(property_columns))

    auc_values = []
    for k in ligand_rows:
        query = ligand_props[k]

        # Positives: every ligand except the query itself.
        other_ligands = ligand_rows != k
        # Negatives: every decoy except the block that belongs to the query.
        own_start = decoys_per_ligand * k
        own_stop = decoys_per_ligand * (k + 1)
        foreign_decoys = (decoy_rows < own_start) | (decoy_rows >= own_stop)

        labels = np.concatenate([np.ones(other_ligands.sum()),
                                 np.zeros(foreign_decoys.sum())])
        scores = np.concatenate([simp_to(query, ligand_props[other_ligands]),
                                 simp_to(query, decoy_props[foreign_decoys])])

        # roc_curve orders the scores itself, so no explicit pre-sorting is needed.
        fpr, tpr, _ = roc_curve(labels, scores)
        auc_values.append(auc(fpr, tpr))

    auc_simps = np.array(auc_values)

    return auc_simps.mean(), auc_simps.std()
# Property-similarity AUC (mean / std) of every case for both decoy sets,
# written to simp.csv.
rows = []
for i, case in enumerate(cases):
    # Resolve the three input files afresh for every case. A missing file must
    # stop the run instead of silently reusing the paths of the previous case.
    case_paths = []
    for label, candidates in (("ligand", MUBD_ligand_dir),
                              ("MUBDreal decoy", MUBDreal_decoy_dir),
                              ("MUBDsyn decoy", MUBDsyn_decoy_dir)):
        matches = [path for path in candidates if case in path]
        if not matches:
            raise FileNotFoundError(f"No {label} path contains case name {case!r}")
        # If several paths contain the case name, the last one wins.
        case_paths.append(matches[-1])
    single_ligands, single_real_decoys, single_syn_decoys = case_paths

    real_auc_simps_m, real_auc_simps_s = simp_AUC(i, single_ligands, single_real_decoys)
    syn_auc_simps_m, syn_auc_simps_s = simp_AUC(i, single_ligands, single_syn_decoys)
    rows.append({"Case":case, "MUBDreal_mean":real_auc_simps_m, "MUBDreal_std":real_auc_simps_s,
                    "MUBDsyn_mean":syn_auc_simps_m, "MUBDsyn_std":syn_auc_simps_s,})
df_simp = pd.DataFrame(rows)
df_simp.to_csv("simp.csv")

In [None]:
# NLBscore
def nlbscore(diverse_ligands_PS, final_decoys):
    """Compute the NLB score of a ligand set against a decoy set.

    For every ligand, the MACCS-key similarity to each decoy and to each other
    ligand is computed with ``DataStructs.FingerprintSimilarity``. The ligand's
    contribution is the fraction of other ligands that are strictly more
    similar to it than its most similar decoy. The score is the average of
    these fractions over all ligands.

    Parameters
    ----------
    diverse_ligands_PS : path or buffer passed to ``pd.read_csv``; must provide
        a ``SMILES`` column.
    final_decoys : path or buffer passed to ``pd.read_csv``; must provide a
        ``SMILES`` column.

    Returns
    -------
    float
        The averaged fraction, rounded to three decimals.
    """
    ligand_frame = pd.read_csv(diverse_ligands_PS)
    decoy_frame = pd.read_csv(final_decoys)

    # SMILES -> molecule -> MACCS fingerprint, ligands first, then decoys.
    ligand_mols = [Chem.MolFromSmiles(smi) for smi in ligand_frame['SMILES']]
    decoy_mols = [Chem.MolFromSmiles(smi) for smi in decoy_frame['SMILES']]
    ligand_fps = list(map(MACCSkeys.GenMACCSKeys, ligand_mols))
    decoy_fps = list(map(MACCSkeys.GenMACCSKeys, decoy_mols))

    n_ligands = len(ligand_fps)

    def fraction_above_best_decoy(query_pos):
        """Share of the other ligands that beat the query's most similar decoy."""
        query_fp = ligand_fps[query_pos]

        decoy_sims = [DataStructs.FingerprintSimilarity(query_fp, fp) for fp in decoy_fps]
        best_decoy_sim = max(decoy_sims)

        ligand_sims = [
            DataStructs.FingerprintSimilarity(query_fp, fp)
            for pos, fp in enumerate(ligand_fps)
            if pos != query_pos
        ]

        n_better = sum(1 for sim in ligand_sims if sim > best_decoy_sim)
        return float(n_better) / float(len(ligand_sims))

    # Accumulate in ligand order, one contribution per ligand.
    running_total = 0
    for query_pos in range(n_ligands):
        running_total = running_total + fraction_above_best_decoy(query_pos)

    return round(float(running_total) / float(n_ligands), 3)

# NLB score of every case for both decoy sets, collected in the order of `cases`.
MUBDreal_nlbs, MUBDsyn_nlbs = [], []
for i, case in enumerate(cases):
    # Resolve the three input files afresh for every case. A missing file must
    # stop the run instead of silently reusing the paths of the previous case.
    case_paths = []
    for label, candidates in (("ligand", MUBD_ligand_dir),
                              ("MUBDreal decoy", MUBDreal_decoy_dir),
                              ("MUBDsyn decoy", MUBDsyn_decoy_dir)):
        matches = [path for path in candidates if case in path]
        if not matches:
            raise FileNotFoundError(f"No {label} path contains case name {case!r}")
        # If several paths contain the case name, the last one wins.
        case_paths.append(matches[-1])
    single_ligands, single_real_decoys, single_syn_decoys = case_paths

    nlb_real = nlbscore(single_ligands, single_real_decoys)
    MUBDreal_nlbs.append(nlb_real)

    nlb_syn = nlbscore(single_ligands, single_syn_decoys)
    MUBDsyn_nlbs.append(nlb_syn)

MUBDreal_nlbs = np.array(MUBDreal_nlbs)
MUBDsyn_nlbs = np.array(MUBDsyn_nlbs)

In [None]:
# Display both NLB-score arrays (MUBD-real first, MUBD-syn second).
MUBDreal_nlbs, MUBDsyn_nlbs

In [None]:
# sims: ROC AUC of the structural-similarity ranking
def sims_AUC(idx, diverse_ligand_PS, final_decoys, decoys_per_ligand=39):
    """Mean and standard deviation of per-ligand ROC AUCs from MACCS similarity.

    Every ligand serves once as the query. All other ligands are labelled 1,
    and the decoys are labelled 0 except for the query's own decoy rows
    ``[decoys_per_ligand * k, decoys_per_ligand * (k + 1))``, which are left
    out. Candidates are scored with ``DataStructs.FingerprintSimilarity`` on
    MACCS keys, and the ROC AUC of that ranking is computed for the query.

    Parameters
    ----------
    idx : unused; kept so existing positional calls keep working.
    diverse_ligand_PS : path or buffer passed to ``pd.read_csv``; must provide
        a ``SMILES`` column.
    final_decoys : path or buffer passed to ``pd.read_csv``; must provide a
        ``SMILES`` column. Rows are assumed to be grouped per ligand, in ligand
        order, ``decoys_per_ligand`` rows each -- TODO confirm against the
        decoy generator.
    decoys_per_ligand : int, default 39
        Number of consecutive decoy rows that belong to one ligand.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-ligand AUC values (``std`` uses numpy's
        default ``ddof=0``).
    """
    ligand_frame = pd.read_csv(diverse_ligand_PS)
    decoy_frame = pd.read_csv(final_decoys)

    # SMILES -> molecule -> MACCS fingerprint, ligands first, then decoys.
    ligand_mols = [Chem.MolFromSmiles(smi) for smi in ligand_frame['SMILES']]
    decoy_mols = [Chem.MolFromSmiles(smi) for smi in decoy_frame['SMILES']]
    ligand_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in ligand_mols]
    decoy_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in decoy_mols]

    auc_values = []
    for k, query_fp in enumerate(ligand_fps):
        # Positives: every ligand except the query itself.
        ligand_sims = [
            DataStructs.FingerprintSimilarity(query_fp, fp)
            for i, fp in enumerate(ligand_fps)
            if i != k
        ]

        # Negatives: every decoy except the block that belongs to the query.
        own_start = decoys_per_ligand * k
        own_stop = decoys_per_ligand * (k + 1)
        decoy_sims = [
            DataStructs.FingerprintSimilarity(query_fp, fp)
            for j, fp in enumerate(decoy_fps)
            if not (own_start <= j < own_stop)
        ]

        labels = [1] * len(ligand_sims) + [0] * len(decoy_sims)
        scores = ligand_sims + decoy_sims

        # roc_curve orders the scores itself, so no explicit pre-sorting is needed.
        fpr, tpr, _ = roc_curve(labels, scores)
        auc_values.append(auc(fpr, tpr))

    auc_simss = np.array(auc_values)

    return auc_simss.mean(), auc_simss.std()

# Structural-similarity AUC (mean / std) of every case for both decoy sets,
# written to sims.csv.
rows = []
for i, case in enumerate(cases):
    # Resolve the three input files afresh for every case. A missing file must
    # stop the run instead of silently reusing the paths of the previous case.
    case_paths = []
    for label, candidates in (("ligand", MUBD_ligand_dir),
                              ("MUBDreal decoy", MUBDreal_decoy_dir),
                              ("MUBDsyn decoy", MUBDsyn_decoy_dir)):
        matches = [path for path in candidates if case in path]
        if not matches:
            raise FileNotFoundError(f"No {label} path contains case name {case!r}")
        # If several paths contain the case name, the last one wins.
        case_paths.append(matches[-1])
    single_ligands, single_real_decoys, single_syn_decoys = case_paths

    real_auc_simss_m, real_auc_simss_s = sims_AUC(i, single_ligands, single_real_decoys)
    syn_auc_simss_m, syn_auc_simss_s = sims_AUC(i, single_ligands, single_syn_decoys)
    rows.append({"Case":case, "MUBDreal_mean":real_auc_simss_m, "MUBDreal_std":real_auc_simss_s,
                    "MUBDsyn_mean":syn_auc_simss_m, "MUBDsyn_std":syn_auc_simss_s,})
df_sims = pd.DataFrame(rows)
df_sims.to_csv("sims.csv")