In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem 
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold
from multiprocessing import Pool, cpu_count
import os
from glob import glob
from pathlib import Path

def applyParallel(df, func):
    """Apply ``func`` to chunks of ``df`` in worker processes and concatenate the results.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data to process. It is cut into ``cpu_count()`` consecutive chunks.
    func : callable
        Picklable function that takes one chunk and returns a pandas object.

    Returns
    -------
    pandas object
        ``pd.concat`` of the per-chunk results, in the original chunk order.
    """
    n_workers = cpu_count()
    # Split row positions and slice with .iloc so that every chunk stays a pandas
    # object, independent of how np.array_split treats pandas inputs.
    position_chunks = np.array_split(np.arange(len(df)), n_workers)
    df_split = [df.iloc[positions] for positions in position_chunks]
    # The context manager tears the workers down even when func raises,
    # so a failing job no longer leaks the pool.
    with Pool(n_workers) as pool:
        return pd.concat(pool.map(func, df_split))

def normalize_smiles(smiles):
    """Return the RDKit SMILES of ``smiles`` written without stereo information.

    Parameters
    ----------
    smiles : str
        SMILES string to parse.

    Returns
    -------
    str or float
        ``Chem.MolToSmiles(mol, isomericSmiles=False)`` for the parsed molecule,
        or ``np.nan`` when ``Chem.MolFromSmiles`` yields a falsy result.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # np.nan (lower case) exists in every NumPy version; the np.NaN alias
        # was removed in NumPy 2.0.
        return np.nan
    return Chem.MolToSmiles(mol, isomericSmiles=False)

def normalize_smiles_series(smiles_series):
    """Map ``normalize_smiles`` over every element of ``smiles_series``."""
    normalized = smiles_series.map(normalize_smiles)
    return normalized


def generate_murcko_scaffold(smile):
    """Return the non-isomeric SMILES of the Murcko scaffold of ``smile``.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    str or float
        Scaffold SMILES from ``MurckoScaffold.GetScaffoldForMol``, or ``np.nan``
        when the SMILES cannot be parsed or scaffold generation raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if not mol:
            return np.nan
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        return Chem.MolToSmiles(scaffold, isomericSmiles=False)
    except Exception:
        # Best effort: any RDKit failure maps to NaN. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
    

def generate_topological_scaffold(smile):
    """Return the non-isomeric SMILES of the generic (topological) scaffold of ``smile``.

    The Murcko scaffold is computed first and then passed through
    ``MurckoScaffold.MakeScaffoldGeneric``.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    str or float
        Generic scaffold SMILES, or ``np.nan`` when the SMILES cannot be parsed
        or scaffold generation raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if not mol:
            return np.nan
        murcko = MurckoScaffold.GetScaffoldForMol(mol)
        scaffold = MurckoScaffold.MakeScaffoldGeneric(murcko)
        return Chem.MolToSmiles(scaffold, isomericSmiles=False)
    except Exception:
        # Best effort: any RDKit failure maps to NaN. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate.
        return np.nan


    
                   
def generate_murcko_scaffold_series(data):
    """Map ``generate_murcko_scaffold`` over every SMILES in ``data``."""
    scaffolds = data.map(generate_murcko_scaffold)
    return scaffolds

def generate_topological_scaffold_series(data):
    """Map ``generate_topological_scaffold`` over every SMILES in ``data``."""
    scaffolds = data.map(generate_topological_scaffold)
    return scaffolds




In [None]:
import numba
from numba import njit, prange

@njit(parallel=True,fastmath=True)
def find_nns(fps, references):
    """Compute nearest-neighbour statistics of every row of ``fps`` against ``references``.

    Returns a float64 array with one row per query fingerprint and three columns:
    the highest similarity found, the index of that reference (stored as a float),
    and the mean similarity over all references.
    """
    n_queries = fps.shape[0]
    all_nns = np.zeros((n_queries, 3), dtype=np.float64)

    # Rows are independent, so the outer loop is a numba parallel range.
    for row in prange(n_queries):
        best_sim, best_idx, mean_sim = _find_nn_and_avg_nn(fps[row], references)
        all_nns[row, 0] = best_sim
        all_nns[row, 1] = best_idx
        all_nns[row, 2] = mean_sim

    return all_nns
    
@njit(fastmath=True)
def _find_nn_and_avg_nn(fp, fps):
    """Scan all rows of ``fps`` and summarise their similarity to ``fp``.

    Returns a tuple ``(nn_dist, nn_idx, avg_nn)``: the largest value returned by
    ``_minmax_two_fp`` (a similarity, despite the name), the row index at which it
    was found, and the mean of that value over all rows of ``fps``.
    """
    nn_dist = 0.0
    nn_idx = numba.int32(-1)
    avg_nn = 0.0
    
    for i in range(fps.shape[0]):
        tanimoto = _minmax_two_fp(fp, fps[i])
        avg_nn += tanimoto
        # Strict comparison: ties keep the earliest index, and nn_idx stays -1
        # when no row has a similarity above the initial 0.0.
        if tanimoto > nn_dist:
            nn_dist = tanimoto
            nn_idx = i
    # Turn the running sum into a mean over the number of rows in fps.
    avg_nn = numba.float64(avg_nn) / numba.float64(fps.shape[0])
            
    return nn_dist, nn_idx, avg_nn


@njit(fastmath=True)
def _minmax_two_fp(fp1, fp2):
    """Return the min/max (count-based Tanimoto) similarity of two count vectors.

    The result is ``sum(min(fp1[i], fp2[i])) / sum(max(fp1[i], fp2[i]))`` taken over
    the positions of ``fp1``. When the denominator is zero (both vectors are all
    zeros) the similarity is defined as 0.0 instead of dividing by zero.
    """
    common = numba.int32(0)
    maxnum = numba.int32(0)

    for i in range(len(fp1)):
        a = fp1[i]
        b = fp2[i]
        if a > b:
            common += b
            maxnum += a
        else:
            common += a
            maxnum += b

    # Guard the degenerate case of two empty fingerprints.
    if maxnum == 0:
        return 0.0
    return numba.float64(common) / numba.float64(maxnum)

def counted_tanimoto_similarity(f1, fp2, return_distance=True):
    """Return the count-based Tanimoto distance or similarity of two fingerprints.

    Parameters
    ----------
    f1, fp2 : array-like of counts
        Fingerprints passed to ``_minmax_two_fp``.
    return_distance : bool, default True
        If true, return ``1 - similarity``; otherwise return the similarity.
        Note that the default returns a distance despite the function name.
    """
    # The first parameter is named f1; the previous body referenced an undefined
    # name fp1 and therefore raised NameError.
    similarity = _minmax_two_fp(f1, fp2)
    if return_distance:
        return 1. - similarity
    return similarity
    
def bulk_counted_tanimoto(fp1, fps, return_distance=True):
    """Compare ``fp1`` with every fingerprint in ``fps``.

    Returns a list with one entry per element of ``fps``: ``1 - similarity`` when
    ``return_distance`` is true, otherwise the similarity from ``_minmax_two_fp``.
    """
    similarities = [_minmax_two_fp(fp1, other) for other in fps]
    if not return_distance:
        return similarities
    return [1. - similarity for similarity in similarities]



In [None]:
# When False, MMP_filtered.csv files that already exist are kept and skipped;
# set to True to recompute and overwrite them.
OVERWRITE_FILES = False


def read_scaffold_memory(folder):
    """Load ``data/memories/<folder>/memory_preprocessed.csv.gz`` as a DataFrame."""
    return pd.read_csv(f"data/memories/{folder}/memory_preprocessed.csv.gz")

def read_id_steps(folder):
    """Return only the ``ID``, ``step`` and ``total_score`` columns of the memory of ``folder``."""
    memory = read_scaffold_memory(folder)
    wanted_columns = ["ID", "step", "total_score"]
    return memory[wanted_columns]


def read_mmp_and_filter_pairs(mmpfile):
    """Read a headerless MMP CSV and keep pairs linking generated to reference compounds.

    The directory containing ``mmpfile`` is expected to be named
    ``<target>_<filtername>``. A pair is kept when its left compound belongs to the
    set ``filtername`` and its right compound belongs to ``training``, ``test`` or
    ``validation``. The set name is the second-to-last ``_``-separated token of the
    compound ID. Kept pairs are inner-joined with the memory of the same folder to
    attach ``STEP`` and ``SCORE``, and are returned sorted by ``STEP``.

    Parameters
    ----------
    mmpfile : str
        Path to the ``MMP.csv.gz`` file.

    Returns
    -------
    pandas.DataFrame
        Filtered pairs with the additional ``STEP`` and ``SCORE`` columns.
    """
    # Path copes with OS-specific separators and arbitrary path prefixes, which
    # splitting the string on "/" and taking element 1 does not.
    foldername = Path(mmpfile).parent.name
    filtername = foldername.split("_")[1]

    df = pd.read_csv(mmpfile, header=None)
    df.columns = ["SMILES_OF_LEFT_MMP","SMILES_OF_RIGHT_MMP","ID_OF_LEFT_MMP","ID_OF_RIGHT_MMP","SMIRKS_OF_TRANSFORMATION","SMILES_OF_CONTEXT"]

    df["SET_OF_LEFT_MMP"] = df["ID_OF_LEFT_MMP"].map(lambda x: x.split("_")[-2])
    df["SET_OF_RIGHT_MMP"] = df["ID_OF_RIGHT_MMP"].map(lambda x: x.split("_")[-2])

    reference_sets = ["training", "test", "validation"]
    keep = (df["SET_OF_LEFT_MMP"] == filtername) & df["SET_OF_RIGHT_MMP"].isin(reference_sets)
    mmp_pairs = df[keep]

    ids = read_id_steps(foldername)
    mmp_pairs = pd.merge(mmp_pairs, ids, left_on="ID_OF_LEFT_MMP", right_on="ID", how="inner")
    # "ID" duplicates ID_OF_LEFT_MMP after the join, so it is dropped.
    mmp_pairs = mmp_pairs.drop(columns="ID").rename(columns={"step": "STEP", "total_score": "SCORE"})
    return mmp_pairs.sort_values(by=['STEP'])


# Create MMP/<run>/MMP_filtered.csv next to every MMP/<run>/MMP.csv.gz.
for mmpfile in glob("MMP/*/MMP.csv.gz"):
    # Run directory name; Path copes with the OS-specific separators glob may return.
    folder = Path(mmpfile).parent.name
    if len(folder.split("_")) > 2: #we only process the "default" parameters here in this notebook
        continue
    mmpfile_filtered = mmpfile.replace("MMP.csv.gz", "MMP_filtered.csv")
    if os.path.exists(mmpfile_filtered) and not OVERWRITE_FILES:
        print(f"Skipping {mmpfile} as it seems to already be processed")
        continue
    mmp = read_mmp_and_filter_pairs(mmpfile)
    mmp.to_csv(mmpfile_filtered, index=False)
        
        





In [None]:
def _get_mmp_stats(mmp_pairs):
        nb_mmp_pairs = len(mmp_pairs)
        nb_generated_unique_cpds = len(mmp_pairs.drop_duplicates("SMILES_OF_LEFT_MMP"))
        return nb_mmp_pairs, nb_generated_unique_cpds
    

def get_mmp_cpd(df, filtername, maxstep, minactivity):
    """Count MMPs (and the generated compounds forming them) against training and test data.

    Rows of ``df`` are kept when ``SET_OF_LEFT_MMP == filtername``,
    ``STEP < maxstep`` and ``SCORE >= minactivity``. Duplicate
    (left SMILES, right SMILES) pairs are then removed, keeping the first after
    sorting by ``STEP``.

    Returns a 4-tuple: pairs and unique left compounds for right compounds in
    ``training``, then the same two numbers for ``test`` or ``validation``.
    """
    selected = (
        (df["SET_OF_LEFT_MMP"] == filtername)
        & (df["STEP"] < maxstep)
        & (df["SCORE"] >= minactivity)
    )
    mmp_pairs = df[selected].copy()
    mmp_pairs = mmp_pairs.sort_values(by=['STEP'])
    mmp_pairs = mmp_pairs.drop_duplicates(["SMILES_OF_LEFT_MMP", "SMILES_OF_RIGHT_MMP"], keep='first')

    right_set = mmp_pairs["SET_OF_RIGHT_MMP"]
    training_stats = _get_mmp_stats(mmp_pairs[right_set == "training"])
    test_stats = _get_mmp_stats(mmp_pairs[(right_set == "test") | (right_set == "validation")])

    # Both helpers return 2-tuples; concatenating yields the flat 4-tuple.
    return training_stats + test_stats


def load_mmp(target, filtername, maxstep, minactivity):
    """Read ``MMP/<target>_<filtername>/MMP_filtered.csv`` and return ``get_mmp_cpd`` statistics.

    NOTE: a later cell of this notebook defines another ``load_mmp`` with the same
    signature but a different return value; once that cell has run, it shadows
    this definition.
    """
    matches = glob(f"MMP/{target}_{filtername}/MMP_filtered.csv")
    filtered_pairs = pd.read_csv(matches[0])
    return get_mmp_cpd(filtered_pairs, filtername, maxstep, minactivity)


# Per-target analysis settings: rows with STEP < maxstep and SCORE >= minactivity
# are the ones selected by the queries in this notebook.
target_params = dict(
    DRD2=dict(maxstep=300, minactivity=0.7),
    HTR1A=dict(maxstep=300, minactivity=0.7),
    clogP=dict(maxstep=150, minactivity=1.0),
)

# Same settings restricted to the clogP target.
clogP_param = dict(
    clogP=dict(maxstep=150, minactivity=1.0),
)

# Memory filter names as they appear in folder names ...
filternames = [
    "NoFilter",
    "CompoundSimilarity",
    "IdenticalMurckoScaffold",
    "IdenticalTopologicalScaffold",
    "ScaffoldSimilarity",
]
# ... and the matching display names, in the same order.
filternames_in_plots = [
    "No memory",
    "CompoundSimilarity memory",
    "IdenticalMurckoScaffold memory",
    "IdenticalCarbonSkeleton memory",
    "ScaffoldSimilarity memory",
]

# Print MMP statistics per target/filter and collect the STEP-sorted memories.
mmp_memories = {}
for target, params in target_params.items():
    mmp_memories.setdefault(target, {})
    maxstep, minactivity = params['maxstep'], params['minactivity']
    for filtername in filternames:
        (nb_mmp_pairs_training,
         nb_generated_unique_cpds_training,
         np_mmp_pairs_test,
         nb_generated_unique_cpds_test) = load_mmp(target, filtername, maxstep, minactivity)
        for line in (
            f"{target} {filtername} STEPS: {maxstep}",
            f"Training: {nb_mmp_pairs_training} MMPs formed by generating {nb_generated_unique_cpds_training} compounds.",
            f"Test: {np_mmp_pairs_test} MMPs formed by generating {nb_generated_unique_cpds_test} compounds.",
        ):
            print(line)
        print()

        # glob()[0] raises IndexError when the run directory is missing.
        folder = Path(glob(f"MMP/{target}_{filtername}")[0]).name
        memory = (
            read_scaffold_memory(folder)
            .rename(columns={"step": "STEP", "total_score": "SCORE"})
            .sort_values(by=['STEP'])
        )
        mmp_memories[target][filtername] = memory


In [None]:
# Denominator of the "percent_core" column.
# NOTE(review): 487 is taken unchanged from the original cell; presumably the total
# number of reference compounds for this target - confirm against the data set.
NB_REFERENCE_CORES = 487

for target, params in clogP_param.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    print(f"Number of MMP analogs\t\t\tgen\tcore\tpercent_core")
    for filtername in filternames:
        file = glob(f"MMP/{target}_{filtername}/MMP_filtered.csv")[0]
        df = pd.read_csv(file)
        mmp_pairs = df.query("SET_OF_LEFT_MMP == @filtername and STEP < @maxstep and SCORE >= @minactivity")
        unique_mmp_pairs = mmp_pairs.drop_duplicates(["ID_OF_LEFT_MMP","ID_OF_RIGHT_MMP"])
        # Distinct generated (left) compounds and distinct reference (right) compounds.
        generated_cpds = unique_mmp_pairs.drop_duplicates("ID_OF_LEFT_MMP")
        covered_cores = unique_mmp_pairs.drop_duplicates("ID_OF_RIGHT_MMP")
        nb_generated_cpds = len(generated_cpds)
        nb_covered_cores = len(covered_cores)
        percent = nb_covered_cores / NB_REFERENCE_CORES * 100

        # ".2f" prints two decimals; the former ".2" meant two significant digits
        # and rendered values >= 10 in scientific notation (e.g. 5.3e+01).
        print(f"{target}\t{filtername:30}\t{nb_generated_cpds}\t{nb_covered_cores}\t{percent:.2f}")

In [None]:
# When False, memory_with_nn.pkl.gz files that already exist are kept and skipped;
# set to True to recompute and overwrite them.
OVERWRITE_FILES = False


def cecfp6_from_mol(mol):
    """Return a folded, count-based Morgan fingerprint (radius 3) of ``mol``.

    The sparse fingerprint from ``AllChem.GetMorganFingerprint`` is folded into an
    int32 vector of length 2048: each feature id is reduced modulo 2048 and the
    counts of colliding features are summed.
    """
    size = 2048
    sparse_counts = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=False).GetNonzeroElements()
    nfp = np.zeros(size, np.int32)
    for feature_id, count in sparse_counts.items():
        nfp[feature_id % size] += int(count)
    return nfp

def cecfp6_from_smiles(smiles):
    """Parse ``smiles`` with RDKit and return its ``cecfp6_from_mol`` fingerprint."""
    mol = Chem.MolFromSmiles(smiles)
    return cecfp6_from_mol(mol)

def generate_fingerprint_series(data):
    """Map ``cecfp6_from_smiles`` over every SMILES in ``data``."""
    fingerprints = data.map(cecfp6_from_smiles)
    return fingerprints


def find_nns_from_pandas(fps, reference):
    """Run ``find_nns`` on two pandas Series whose elements are fingerprint vectors.

    Both series are converted to int32 numpy arrays (one row per element) before
    being handed to ``find_nns``.
    """
    def _as_int32_matrix(series):
        # One numpy row per fingerprint stored in the series.
        return np.array([np.array(row) for row in series.values], dtype=np.int32)

    return find_nns(_as_int32_matrix(fps), _as_int32_matrix(reference))


def _annotate_nearest_neighbours(memory, reference, suffix, id_column):
    """Add nearest-neighbour columns of ``memory`` against ``reference`` (in place).

    Writes ``NN_dist``, ``NN_idx``, ``NN_avg``, ``NN_Original_Entry_ID`` and
    ``NN_RDKIT_SMILES`` (each followed by ``suffix``) plus ``id_column``, in that
    order. Both frames need a ``cfp`` column holding fingerprint vectors.
    """
    nns_arr = find_nns_from_pandas(memory['cfp'], reference['cfp'])
    memory[f"NN_dist{suffix}"] = nns_arr[:, 0]
    memory[f"NN_idx{suffix}"] = nns_arr[:, 1]
    memory[f"NN_avg{suffix}"] = nns_arr[:, 2]
    # find_nns returns a float array; positional indexing needs integers.
    # NOTE: an index of -1 (no reference with similarity > 0) selects the last row.
    nn_rows = reference.iloc[nns_arr[:, 1].astype(np.int64)]
    memory[f"NN_Original_Entry_ID{suffix}"] = nn_rows["Original_Entry_ID"].values
    memory[f"NN_RDKIT_SMILES{suffix}"] = nn_rows["RDKIT_SMILES"].values
    memory[id_column] = nn_rows["ID"].values


# For every target/filter, annotate the memory with its nearest neighbours among
# all reference actives, the training subset, and the test+validation subset.
for target, params in target_params.items():
    reference = pd.read_pickle(f"data/{target}/actives.pkl.gz")
    if target == "clogP":
        # Only for clogP are the reference fingerprints computed here.
        reference['cfp'] = applyParallel( reference["SMILES"], generate_fingerprint_series)

    reference_training = reference.query("trainingset_class == 'training'").copy()
    reference_test = reference.query("trainingset_class == 'test' or trainingset_class == 'validation'").copy()

    for filtername in filternames:
        output_file = f"data/memories/{target}_{filtername}/memory_with_nn.pkl.gz"
        if os.path.exists(output_file) and not OVERWRITE_FILES:
            print(f"Skipping {output_file} as it seems to already be processed")
            continue

        memory = pd.read_csv(f"data/memories/{target}_{filtername}/memory_preprocessed.csv.gz")
        memory = memory.rename(columns={"step": "STEP", "total_score": "SCORE"})
        memory['cfp'] = applyParallel( memory["SMILES"], generate_fingerprint_series)

        _annotate_nearest_neighbours(memory, reference, "", "NN_ID")
        _annotate_nearest_neighbours(memory, reference_training, "_training", "NN_ID_training")
        # NOTE(review): the double underscore in "NN_ID__test" is kept on purpose so
        # that previously written pickles and their consumers stay compatible.
        _annotate_nearest_neighbours(memory, reference_test, "_test", "NN_ID__test")

        memory.to_pickle(output_file)

In [None]:
def get_cumsum_per_target(memory, STEPS, whattoplot="SMILES"):
    """Return the cumulative number of distinct ``whattoplot`` values per step.

    Entry ``k`` of the returned list is the number of distinct values seen in rows
    with ``STEP`` in ``0..k``; the list has ``STEPS`` entries.
    """
    seen = set()
    counts = []
    for step in range(STEPS):
        seen.update(memory.loc[memory["STEP"] == step, whattoplot])
        counts.append(len(seen))
    return counts

# Load the annotated memories and compute cumulative counts of generated analogs
# (rows with NN_dist >= 0.4) for SMILES and both scaffold types.
cumsum = {}
memories = {}
for target, params in target_params.items():
    cumsum[target] = {}
    memories[target] = {}
    maxstep, minactivity = params['maxstep'], params['minactivity']
    for filtername in filternames:
        memory = pd.read_pickle(f"data/memories/{target}_{filtername}/memory_with_nn.pkl.gz")
        memory.rename(columns={"step": "STEP", "total_score": "SCORE"}, inplace=True)
        memories[target][filtername] = memory
        subset = memory.query("STEP < @maxstep and SCORE >= @minactivity")
        # The similarity filter does not depend on the plotted quantity, so it is
        # applied once before the loop.
        subset = subset.query("NN_dist >= 0.4")
        cumsum[target][filtername] = {}
        for stuff in ["SMILES", "Murcko Scaffold", "Topological Scaffold"]:
            cumsum[target][filtername][stuff] = get_cumsum_per_target(subset, maxstep, stuff)

# Number of unique actives

In [None]:
# Distinct generated SMILES among the selected rows, per target and filter.
for target, params in target_params.items():
    maxstep, minactivity = params['maxstep'], params['minactivity']
    for filtername in filternames:
        subset = memories[target][filtername].loc[
            lambda m: (m["STEP"] < maxstep) & (m["SCORE"] >= minactivity)
        ]
        nb = len({*subset["SMILES"]})
        print(f"{target}\t{filtername:30}\t{nb}")
    print()

# Number of unique Murcko Scaffolds

In [None]:
# Distinct Murcko scaffolds among the selected rows, per target and filter.
for target, params in target_params.items():
    maxstep, minactivity = params['maxstep'], params['minactivity']
    for filtername in filternames:
        subset = memories[target][filtername].loc[
            lambda m: (m["STEP"] < maxstep) & (m["SCORE"] >= minactivity)
        ]
        nb = len({*subset["Murcko Scaffold"]})
        print(f"{target}\t{filtername:30}\t{nb}")
    print()

# Number of unique Topological Scaffolds

In [None]:
# Distinct topological scaffolds among the selected rows, per target and filter.
for target, params in target_params.items():
    maxstep, minactivity = params['maxstep'], params['minactivity']
    for filtername in filternames:
        subset = memories[target][filtername].loc[
            lambda m: (m["STEP"] < maxstep) & (m["SCORE"] >= minactivity)
        ]
        nb = len({*subset["Topological Scaffold"]})
        print(f"{target}\t{filtername:30}\t{nb}")
    print()

# Number of exact SMILES matches from ExCAPE

In [None]:
# Number of generated SMILES that are identical to a reference RDKIT_SMILES.
for target, params in target_params.items():
    maxstep, minactivity = params['maxstep'], params['minactivity']
    if target == 'clogP':
        # clogP is skipped in this comparison.
        continue
    df = pd.read_pickle(f"data/{target}/actives.pkl.gz")
    actives = set(df["RDKIT_SMILES"])

    for filtername in filternames:
        subset = memories[target][filtername].loc[
            lambda m: (m["STEP"] < maxstep) & (m["SCORE"] >= minactivity)
        ]
        generated = set(subset["SMILES"])
        nb_overlap = len(actives & generated)
        print(f"{target}\t{filtername:30}\t{nb_overlap}")
    print()

# Number of ECFP6 analogs

In [None]:
# Unique generated compounds whose nearest reference active (all sets) has a
# similarity of at least 0.4, as a count and as a percentage of the selection.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    for filtername in filternames:
        subset = memories[target][filtername].query("STEP < @maxstep and SCORE >= @minactivity")
        subset = subset.drop_duplicates("SMILES")
        nb = len(subset.query("NN_dist >= 0.4"))
        # An empty selection would raise ZeroDivisionError; report NaN instead.
        percent = (nb / len(subset)) * 100 if len(subset) else float("nan")
        print(f"{target}\t{filtername:30}\t{nb}\t{percent:.4}%")
    print()

In [None]:
# Unique generated compounds whose nearest training-set active has a similarity
# of at least 0.4, as a count and as a percentage of the selection.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    for filtername in filternames:
        subset = memories[target][filtername].query("STEP < @maxstep and SCORE >= @minactivity")
        subset = subset.drop_duplicates("SMILES")
        nb = len(subset.query("NN_dist_training >= 0.4"))
        # An empty selection would raise ZeroDivisionError; report NaN instead.
        percent = (nb / len(subset)) * 100 if len(subset) else float("nan")
        print(f"{target}\t{filtername:30}\t{nb}\t{percent:.4}%")
    print()

In [None]:
# Unique generated compounds whose nearest test/validation-set active has a
# similarity of at least 0.4, as a count and as a percentage of the selection.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    for filtername in filternames:
        subset = memories[target][filtername].query("STEP < @maxstep and SCORE >= @minactivity")
        subset = subset.drop_duplicates("SMILES")
        nb = len(subset.query("NN_dist_test >= 0.4"))
        # An empty selection would raise ZeroDivisionError; report NaN instead.
        percent = (nb / len(subset)) * 100 if len(subset) else float("nan")
        print(f"{target}\t{filtername:30}\t{nb}\t{percent:.4}%")
    print()

In [None]:
#%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
os.makedirs("figures/", exist_ok=True)

# Legend labels, in the same order as `filternames` (the legend call below is
# currently disabled).
filternames_in_plots = ["No memory", "CompoundSimilarity memory", "IdenticalMurckoScaffold memory", "IdenticalCarbonSkeleton memory", "ScaffoldSimilarity memory"]

# y-axis label for each plotted quantity.
ylabels = {
    "SMILES": "No. generated ECFP6 analogs",
    "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    "Topological Scaffold": "No. generated carbon skeletons",
}

sns.set_context("talk")
sns.set_style("whitegrid")
# One figure is reused for all plots: draw, save, clear.
fig = plt.figure(figsize=(5,6))
for target in target_params.keys():
    for stuff in ["SMILES", "Murcko Scaffold", "Topological Scaffold"]:
        print_ylabel = ylabels[stuff]
        # One curve per memory filter, in the order of `filternames`.
        for filtername in filternames:
            plt.plot(cumsum[target][filtername][stuff])
        plt.xlabel("Iteration")
        plt.ylabel(print_ylabel)
        #plt.legend(filternames_in_plots, loc='upper right', ncol=3)
        plt.title(target)
        plt.tight_layout()
        plt.savefig("figures/{} {}.svg".format(target, stuff))
        plt.clf()
# Close the (now empty) figure so it is neither rendered as a blank plot nor kept
# in memory.
plt.close(fig)
        

In [None]:
def get_num_cuts(smiles_of_context):
    """Return the number of cuts encoded in an MMP context SMILES.

    The result is 3 if the string contains ``[*:3]``, otherwise 2 if it contains
    ``[*:2]``, otherwise 1.
    """
    # Check the highest attachment-point label first.
    for n_cuts in (3, 2):
        if f"[*:{n_cuts}]" in smiles_of_context:
            return n_cuts
    return 1
    
def load_mmp(target, filtername,maxstep, minactivity):
    """Load single-cut MMPs of a run and join them with the reference actives.

    Reads ``MMP/<target>_<filtername>/MMP_filtered.csv`` and keeps rows with
    ``SET_OF_LEFT_MMP == filtername``, ``STEP < maxstep`` and
    ``SCORE >= minactivity`` whose context has exactly one cut. Duplicate
    (left ID, right ID) pairs are removed, keeping the first after sorting. The
    result is merged with ``data/<target>/actives.pkl.gz`` on the right-hand
    SMILES; for ``clogP`` the DRD2 actives are used.

    NOTE: this redefines ``load_mmp`` from an earlier cell, with the same
    signature but a DataFrame as return value.
    """
    file = glob(f"MMP/{target}_{filtername}/MMP_filtered.csv")[0]
    df = pd.read_csv(file)

    selected = (
        (df["SET_OF_LEFT_MMP"] == filtername)
        & (df["STEP"] < maxstep)
        & (df["SCORE"] >= minactivity)
    )
    mmp_pairs = df[selected].copy()
    mmp_pairs["cuts"] = mmp_pairs["SMILES_OF_CONTEXT"].map(get_num_cuts)

    single_cut = mmp_pairs[mmp_pairs["cuts"] == 1]
    mmp_pairs = single_cut.sort_values(by=['cuts','STEP'])
    mmp_pairs = mmp_pairs.drop_duplicates(["ID_OF_LEFT_MMP","ID_OF_RIGHT_MMP"], keep='first')

    target_to_load = "DRD2" if target == 'clogP' else target
    target_df = pd.read_pickle(f"data/{target_to_load}/actives.pkl.gz")

    return mmp_pairs.merge(target_df, left_on = "SMILES_OF_RIGHT_MMP", right_on= "RDKIT_SMILES")
        
# Collect the single-cut MMP tables per target and filter, reduced to the columns
# used by the export cells.
mmps = {}
for target, params in target_params.items():
    mmps.setdefault(target, {})
    maxstep, minactivity = params['maxstep'], params['minactivity']

    for filtername in filternames:
        mmp = load_mmp(target, filtername, maxstep, minactivity)
        mmps[target][filtername] = mmp[[
            'SMILES_OF_LEFT_MMP',
            'SMILES_OF_RIGHT_MMP',
            'ID_OF_LEFT_MMP',
            'ID_OF_RIGHT_MMP',
            'SMIRKS_OF_TRANSFORMATION',
            'SMILES_OF_CONTEXT',
            'STEP',
            'Original_Entry_ID',
            'DB',
        ]]

In [None]:
from rdkit.Chem import PandasTools
# Write one HTML table of MMPs (with molecule columns) per target and filter.
for target, params in target_params.items():
    os.makedirs(f"htmls/{target}", exist_ok=True)
    for filtername in filternames:
        save_df = mmps[target][filtername].loc[
            :, ['SMILES_OF_LEFT_MMP', 'SMILES_OF_RIGHT_MMP', 'ID_OF_LEFT_MMP', 'Original_Entry_ID']
        ].copy()
        PandasTools.AddMoleculeColumnToFrame(save_df, smilesCol='SMILES_OF_LEFT_MMP', molCol='MOL_OF_LEFT_MMP')
        PandasTools.AddMoleculeColumnToFrame(save_df, smilesCol='SMILES_OF_RIGHT_MMP', molCol='MOL_OF_RIGHT_MMP')
        # Group rows that share the same reference compound.
        save_df = save_df.sort_values(by=['SMILES_OF_RIGHT_MMP'])
        with open(f"htmls/{target}/{target}_{filtername}_mmps.html", 'w') as fd:
            html = save_df.to_html()
            fd.write(html)

In [None]:
from rdkit.Chem import PandasTools
os.makedirs("figures", exist_ok=True)

# Positions (within the rows matching CHEMBL277120) of the hand-picked example
# pairs for each filter.
indices = {"NoFilter": [1,2],
           "ScaffoldSimilarity": [0,2],
           "CompoundSimilarity": [1,2],
           "IdenticalMurckoScaffold": [1,5],
           "IdenticalTopologicalScaffold": [3,4]
          }

save_df = []
for filtername in filternames:
    a = mmps["HTR1A"][filtername][['SMILES_OF_LEFT_MMP', 'SMILES_OF_RIGHT_MMP', 'ID_OF_LEFT_MMP', 'Original_Entry_ID']].copy()
    a = a[a["Original_Entry_ID"] == 'CHEMBL277120'].iloc[indices[filtername]].copy()
    save_df.append(a)
save_df = pd.concat(save_df)

PandasTools.AddMoleculeColumnToFrame(save_df, smilesCol='SMILES_OF_LEFT_MMP', molCol='MOL_OF_LEFT_MMP')
PandasTools.AddMoleculeColumnToFrame(save_df, smilesCol='SMILES_OF_RIGHT_MMP', molCol='MOL_OF_RIGHT_MMP')
with open("figures/figure4_example_mmps.html", 'w') as fd:
    html = save_df.to_html()
    fd.write(html)

In [None]:
from rdkit.Chem import PandasTools
os.makedirs("figures", exist_ok=True)

# For each filter: (row query, positions within the query result) of the
# hand-picked example pairs. An empty position list selects no rows.
indices = {"NoFilter": [("Original_Entry_ID == '409926' or Original_Entry_ID == '2845629'", [])],
           "ScaffoldSimilarity": [("Original_Entry_ID == '409926'", [2,3])],
           "CompoundSimilarity": [("Original_Entry_ID == '409926'", [24,26])],
           "IdenticalMurckoScaffold": [("Original_Entry_ID == '409926'", [0]), ("Original_Entry_ID == '2845629'", [3])],
           "IdenticalTopologicalScaffold": [("Original_Entry_ID == '409926'", [0]), ("Original_Entry_ID == '2845629'", [2])]
          }

save_df = []
for filtername in filternames:
    a = mmps["DRD2"][filtername][['SMILES_OF_LEFT_MMP', 'SMILES_OF_RIGHT_MMP', 'ID_OF_LEFT_MMP', 'Original_Entry_ID']].copy()
    for query, index in indices[filtername]:
        matching_rows = a.query(query)
        x = matching_rows.iloc[index].copy()
        save_df.append(x)
save_df = pd.concat(save_df)

PandasTools.AddMoleculeColumnToFrame(save_df, smilesCol='SMILES_OF_LEFT_MMP', molCol='MOL_OF_LEFT_MMP')
PandasTools.AddMoleculeColumnToFrame(save_df, smilesCol='SMILES_OF_RIGHT_MMP', molCol='MOL_OF_RIGHT_MMP')
#save_df = save_df.drop_duplicates("SMILES_OF_LEFT_MMP",keep='first')
with open("figures/figure5_example_mmps.html", 'w') as fd:
    html = save_df.to_html()
    fd.write(html)