In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem 
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold
from multiprocessing import Pool, cpu_count
import os
from glob import glob
from pathlib import Path
import itertools

def applyParallel(df, func):
    """Apply ``func`` to chunks of ``df`` in parallel and concatenate the results.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data to process; it is cut row-wise into ``cpu_count()`` chunks.
    func : callable
        Picklable function that receives one chunk and returns a pandas object.

    Returns
    -------
    pandas object
        ``pd.concat`` of the per-chunk results, in chunk order.
    """
    n_workers = cpu_count()
    # Split row positions and slice with ``iloc`` rather than handing the pandas
    # object to ``np.array_split``: the latter goes through the deprecated
    # ``swapaxes`` method on pandas objects. Chunk boundaries are the same.
    position_chunks = np.array_split(np.arange(len(df)), n_workers)
    chunks = [df.iloc[positions] for positions in position_chunks]
    # The context manager tears the workers down even when ``func`` raises,
    # which the manual close()/join() sequence did not guarantee.
    with Pool(n_workers) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

def normalize_smiles(smiles):
    """Return the non-isomeric RDKit SMILES for ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES string to parse.

    Returns
    -------
    str or float
        The SMILES written by ``Chem.MolToSmiles(..., isomericSmiles=False)``,
        or ``np.nan`` when RDKit cannot parse the input.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # ``np.nan`` instead of the ``np.NaN`` alias, which NumPy 2.0 removed.
        return np.nan
    return Chem.MolToSmiles(mol, isomericSmiles=False)

def normalize_smiles_series(smiles_series):
    """Apply ``normalize_smiles`` element-wise to a pandas Series of SMILES."""
    normalized = smiles_series.map(normalize_smiles)
    return normalized


def generate_murcko_scaffold(smile):
    """Return the non-isomeric SMILES of the Murcko scaffold of ``smile``.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    str or float
        Scaffold SMILES, or ``np.nan`` when the SMILES cannot be parsed or
        scaffold generation raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if not mol:
            return np.nan
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        return Chem.MolToSmiles(scaffold, isomericSmiles=False)
    except Exception:
        # Best-effort: a failing molecule yields a missing value. ``Exception``
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
    

def generate_topological_scaffold(smile):
    """Return the non-isomeric SMILES of the generic Murcko scaffold of ``smile``.

    The scaffold from ``MurckoScaffold.GetScaffoldForMol`` is passed through
    ``MurckoScaffold.MakeScaffoldGeneric`` before being written as SMILES.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    str or float
        Generic scaffold SMILES, or ``np.nan`` when the SMILES cannot be parsed
        or scaffold generation raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if not mol:
            return np.nan
        murcko = MurckoScaffold.GetScaffoldForMol(mol)
        scaffold = MurckoScaffold.MakeScaffoldGeneric(murcko)
        return Chem.MolToSmiles(scaffold, isomericSmiles=False)
    except Exception:
        # Best-effort: a failing molecule yields a missing value. ``Exception``
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return np.nan


    
                   
def generate_murcko_scaffold_series(data):
    """Map ``generate_murcko_scaffold`` over a pandas Series of SMILES."""
    scaffolds = data.map(generate_murcko_scaffold)
    return scaffolds

def generate_topological_scaffold_series(data):
    """Map ``generate_topological_scaffold`` over a pandas Series of SMILES."""
    scaffolds = data.map(generate_topological_scaffold)
    return scaffolds




In [None]:
import numba
from numba import njit, prange

@njit(parallel=True,fastmath=True)
def find_nns(fps, references):
    """Compute nearest-neighbour statistics of every row of ``fps`` against ``references``.

    Returns a float64 array of shape ``(fps.shape[0], 3)`` whose columns hold,
    per query row, the three values returned by ``_find_nn_and_avg_nn``:
    highest similarity, position of that reference, and mean similarity.
    """
    n_queries = fps.shape[0]
    all_nns = np.zeros((n_queries, 3), dtype=np.float64)

    # Rows are independent, so the outer loop is a numba parallel range.
    for row in prange(n_queries):
        best_sim, best_idx, mean_sim = _find_nn_and_avg_nn(fps[row], references)
        all_nns[row, 0] = best_sim
        all_nns[row, 1] = best_idx  # stored as float64 like the other columns
        all_nns[row, 2] = mean_sim

    return all_nns
    
@njit(fastmath=True)
def _find_nn_and_avg_nn(fp, fps):
    """Scan ``fps`` for the reference most similar to ``fp``.

    Returns ``(best_similarity, best_index, mean_similarity)``. The index stays
    ``-1`` and the best similarity ``0.0`` when no reference scores above zero.
    """
    n_refs = fps.shape[0]
    best_sim = 0.0
    best_idx = numba.int32(-1)
    sim_total = 0.0

    for ref_pos in range(n_refs):
        sim = _minmax_two_fp(fp, fps[ref_pos])
        sim_total += sim
        # Strict comparison: the first reference reaching the maximum wins.
        if sim > best_sim:
            best_sim = sim
            best_idx = ref_pos

    mean_sim = numba.float64(sim_total) / numba.float64(n_refs)
    return best_sim, best_idx, mean_sim


@njit(fastmath=True)
def _minmax_two_fp(fp1, fp2):
    """Return sum(min(fp1, fp2)) / sum(max(fp1, fp2)) over the positions of ``fp1``.

    Both inputs are indexed position by position up to ``len(fp1)``.
    """
    shared = numba.int32(0)
    total = numba.int32(0)

    for pos in range(len(fp1)):
        a = fp1[pos]
        b = fp2[pos]
        # The smaller count goes to the numerator sum, the larger to the denominator sum.
        if a > b:
            shared += b
            total += a
        else:
            shared += a
            total += b

    return numba.float64(shared) / numba.float64(total)

def counted_tanimoto_similarity(f1, fp2, return_distance=True):
    """Min/max similarity (or distance) between two count fingerprints.

    Parameters
    ----------
    f1, fp2 : array-like
        Fingerprints forwarded to ``_minmax_two_fp``.
    return_distance : bool, default True
        When True return ``1 - similarity``, otherwise the similarity itself.

    Returns
    -------
    float
    """
    # The original body read an undefined name ``fp1`` instead of the ``f1``
    # parameter; the parameter name is kept so existing callers keep working.
    similarity = _minmax_two_fp(f1, fp2)
    if return_distance:
        return 1. - similarity
    return similarity
    
def bulk_counted_tanimoto(fp1, fps, return_distance=True):
    """Min/max similarity (or distance) of ``fp1`` against each fingerprint in ``fps``.

    Returns a list with one value per element of ``fps``: ``1 - similarity``
    when ``return_distance`` is true, the similarity itself otherwise.
    """
    similarities = [_minmax_two_fp(fp1, other) for other in fps]
    if not return_distance:
        return similarities
    return [1. - value for value in similarities]



In [None]:
# When False, the loop at the end of this cell skips every MMP file whose
# filtered output already exists on disk; set to True to regenerate them.
OVERWRITE_FILES = False


def read_scaffold_memory(folder):
    """Load ``data/memories/<folder>/memory_preprocessed.csv.gz`` as a DataFrame."""
    return pd.read_csv(f"data/memories/{folder}/memory_preprocessed.csv.gz")

def read_id_steps(folder):
    """Return only the ``ID``, ``step`` and ``total_score`` columns of a scaffold memory."""
    wanted_columns = ["ID","step","total_score"]
    memory = read_scaffold_memory(folder)
    return memory[wanted_columns]


def read_mmp_and_filter_pairs(mmpfile):
    """Read a raw MMP file and keep pairs linking generated to reference compounds.

    The experiment folder is the directory containing ``mmpfile``; the second
    ``_``-separated token of that folder name is the filter name. A pair is kept
    when the set of its left compound equals the filter name and the set of its
    right compound is ``training``, ``test`` or ``validation``. The set of a
    compound is the third ``_``-separated token of its ID.

    Parameters
    ----------
    mmpfile : str
        Path to a header-less CSV with the six MMP columns named below.

    Returns
    -------
    pandas.DataFrame
        Kept pairs joined with the ``STEP`` and ``SCORE`` of the left compound
        (from ``read_id_steps``), sorted by ``STEP``.
    """
    # Take the folder from the path itself rather than ``split("/")[1]``, which
    # only worked for relative ``MMP/<folder>/...`` paths with POSIX separators.
    foldername = Path(mmpfile).parent.name
    filtername = foldername.split("_")[1]

    df = pd.read_csv(mmpfile, header=None)
    df.columns = ["SMILES_OF_LEFT_MMP","SMILES_OF_RIGHT_MMP","ID_OF_LEFT_MMP","ID_OF_RIGHT_MMP","SMIRKS_OF_TRANSFORMATION","SMILES_OF_CONTEXT"]

    df["SET_OF_LEFT_MMP"] = df["ID_OF_LEFT_MMP"].map(lambda x: x.split("_")[2])
    df["SET_OF_RIGHT_MMP"] = df["ID_OF_RIGHT_MMP"].map(lambda x: x.split("_")[2])

    left_is_generated = df["SET_OF_LEFT_MMP"] == filtername
    right_is_reference = df["SET_OF_RIGHT_MMP"].isin(["training", "test", "validation"])
    mmp_pairs = df[left_is_generated & right_is_reference]

    ids = read_id_steps(foldername)
    mmp_pairs = pd.merge(mmp_pairs, ids, left_on="ID_OF_LEFT_MMP", right_on="ID", how="inner")
    # ``ID`` duplicates ``ID_OF_LEFT_MMP`` after the join, so it is dropped.
    mmp_pairs = mmp_pairs.drop(columns="ID").rename(columns={"step": "STEP", "total_score": "SCORE"})
    return mmp_pairs.sort_values(by=['STEP'])


# Filter every raw MMP file once and cache the result next to it.
for mmpfile in glob("MMP/*/MMP.csv.gz"):
    folder = mmpfile.split("/")[1]
    # Only runs with explicit parameters (more than two "_"-separated tokens)
    # are processed in this notebook.
    has_explicit_parameters = len(folder.split("_")) > 2
    if not has_explicit_parameters:
        continue
    mmpfile_filtered = mmpfile.replace("MMP.csv.gz", "MMP_filtered.csv")
    already_processed = os.path.exists(mmpfile_filtered)
    if already_processed and not OVERWRITE_FILES:
        continue
    mmp = read_mmp_and_filter_pairs(mmpfile)
    mmp.to_csv(mmpfile_filtered, index=False)

In [None]:
def _get_mmp_stats(mmp_pairs):
        nb_mmp_pairs = len(mmp_pairs)
        nb_generated_unique_cpds = len(mmp_pairs.drop_duplicates("SMILES_OF_LEFT_MMP"))
        return nb_mmp_pairs, nb_generated_unique_cpds
    

def get_mmp_cpd(df, filtername, maxstep, minactivity):
    """Count MMPs and distinct generated compounds against training and held-out sets.

    Rows are kept when ``SET_OF_LEFT_MMP`` equals ``filtername``, ``STEP`` is
    below ``maxstep`` and ``SCORE`` is at least ``minactivity``. After sorting
    by ``STEP``, only the first occurrence of each (left, right) SMILES pair is
    kept.

    Returns
    -------
    tuple
        ``(pairs_training, compounds_training, pairs_test, compounds_test)``,
        where "test" covers right-hand sets ``test`` and ``validation``.
    """
    keep = (
        (df["SET_OF_LEFT_MMP"] == filtername)
        & (df["STEP"] < maxstep)
        & (df["SCORE"] >= minactivity)
    )
    selected = df[keep].copy()

    ordered = selected.sort_values(by=['STEP'])
    deduplicated = ordered.drop_duplicates(["SMILES_OF_LEFT_MMP", "SMILES_OF_RIGHT_MMP"], keep='first')

    right_set = deduplicated["SET_OF_RIGHT_MMP"]
    training_stats = _get_mmp_stats(deduplicated[right_set == "training"])
    heldout_stats = _get_mmp_stats(deduplicated[right_set.isin(["test", "validation"])])

    return training_stats[0], training_stats[1], heldout_stats[0], heldout_stats[1]


def load_mmp(target, filtername, maxstep, minactivity, minsimilarity=None, bucket_size = None, outputmode = None, temperature = None, experience_replay = None):
    """Load the filtered MMP table of one experiment and summarise it.

    The experiment folder is ``MMP/<target>_<filtername>``, extended with the
    five run parameters when ``minsimilarity`` is given.

    Returns
    -------
    tuple
        The four counts produced by ``get_mmp_cpd``.

    Raises
    ------
    Exception
        With message "Invalid Experiment" unless exactly one file matches.
    """
    experiment = f"{target}_{filtername}"
    if minsimilarity is not None:
        experiment = f"{experiment}_{minsimilarity}_{bucket_size}_{outputmode}_{temperature}_{experience_replay}"

    matches = glob(f"MMP/{experiment}/MMP_filtered.csv")
    if len(matches) != 1:
        raise Exception("Invalid Experiment")

    pairs = pd.read_csv(matches[0])
    return get_mmp_cpd(pairs, filtername, maxstep, minactivity)


# Per-target analysis settings: ``maxstep`` is the exclusive upper bound on STEP
# and ``minactivity`` the minimum SCORE used when filtering memories and MMPs.
target_params = {
    "DRD2":      {"maxstep": 300,
                  "minactivity": 0.7}
}

# Parameter grids; every combination is probed below via itertools.product.
bucket_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75]
minsimilarities = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
outputmodes = ["binary", "sigmoid", "linear"]
temperatures = [1.0, 1.25, 1.5, 1.75, 2.0]
experience_replays = [False, True]


# Values held fixed for the other parameters when a single one is varied.
default_bucket_size = 25
default_minsimilarity = 0.6
default_outputmode = "binary"
default_temperature = 1.0
default_experience_replay = False

# Filled below with one parameter tuple per experiment that load_mmp could load.
valid_experiments = [] 

# Filter names as used in folder names, and their display labels for plots
# (the two lists are paired by position).
filternames = ["NoFilter", "CompoundSimilarity", "IdenticalMurckoScaffold", "IdenticalTopologicalScaffold", "ScaffoldSimilarity"]
filternames_in_plots = [ "No memory", "CompoundSimilarity memory", "IdenticalMurckoScaffold memory", "IdenticalCarbonSkeleton memory", "ScaffoldSimilarity memory"]

# Probe every parameter combination, record those that exist on disk, print
# their MMP statistics and load their scaffold memories.
mmp_memories = {}
for target, params in target_params.items():
    if target not in mmp_memories:
        mmp_memories[target] = {} 
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    
    for filtername, bucket_size, minsimilarity, outputmode, temperature, experience_replay in itertools.product(filternames, bucket_sizes, minsimilarities, outputmodes, temperatures, experience_replays):
        try:
            (nb_mmp_pairs_training,
             nb_generated_unique_cpds_training,
             nb_mmp_pairs_test,
             nb_generated_unique_cpds_test) = load_mmp(target, filtername, maxstep, minactivity, minsimilarity, bucket_size, outputmode, temperature, experience_replay)
        except Exception:
            # load_mmp raises for combinations without a filtered MMP file;
            # those runs were not performed, so they are skipped on purpose.
            continue

        experiment_parts = (target, filtername, minsimilarity, bucket_size, outputmode, temperature, experience_replay)
        valid_experiments.append(experiment_parts)
        experiment = "_".join(map(str, experiment_parts))

        print(f"{experiment} STEPS: {maxstep}")
        print(f"Training: {nb_mmp_pairs_training} MMPs formed by generating {nb_generated_unique_cpds_training} compounds.")
        print(f"Test: {nb_mmp_pairs_test} MMPs formed by generating {nb_generated_unique_cpds_test} compounds.")
        print()

        # The memory folder carries the experiment name itself, so no glob
        # round-trip is needed to recover it.
        memory = read_scaffold_memory(experiment)
        memory = memory.rename(columns={"step": "STEP", "total_score": "SCORE"})
        memory = memory.sort_values(by=['STEP'])
        mmp_memories[experiment] = memory


In [None]:
# When False, experiments whose memory_with_nn.pkl.gz already exists are skipped
# by the loop at the end of this cell; set to True to recompute them.
OVERWRITE_FILES = False


def cecfp6_from_mol(mol, size=2048):
    """Build a folded count-based Morgan fingerprint (radius 3) for ``mol``.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to fingerprint.
    size : int, default 2048
        Length of the folded vector; feature ids are folded with ``id % size``.

    Returns
    -------
    numpy.ndarray
        int32 vector of length ``size`` in which colliding feature counts are summed.
    """
    fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=False)
    nfp = np.zeros(size, np.int32)
    for feature_id, count in fp.GetNonzeroElements().items():
        nfp[feature_id % size] += int(count)
    return nfp

def cecfp6_from_smiles(smiles):
    """Parse ``smiles`` with RDKit and return its ``cecfp6_from_mol`` fingerprint."""
    mol = Chem.MolFromSmiles(smiles)
    return cecfp6_from_mol(mol)

def generate_fingerprint_series(data):
    """Map ``cecfp6_from_smiles`` over a pandas Series of SMILES."""
    fingerprints = data.map(cecfp6_from_smiles)
    return fingerprints


def find_nns_from_pandas(fps, reference):
    """Stack two Series of fingerprints into int32 matrices and run ``find_nns``."""
    def _as_int32_matrix(series):
        # One row per Series element.
        return np.array([np.array(row) for row in series.values], dtype=np.int32)

    query_matrix = _as_int32_matrix(fps)
    reference_matrix = _as_int32_matrix(reference)
    return find_nns(query_matrix, reference_matrix)


def _annotate_nearest_neighbours(memory, reference, suffix="", id_column=None):
    """Add nearest-neighbour columns for ``reference`` to ``memory`` in place.

    Parameters
    ----------
    memory : pandas.DataFrame
        Generated compounds with a ``cfp`` fingerprint column.
    reference : pandas.DataFrame
        Reference compounds with ``cfp``, ``Original_Entry_ID``,
        ``RDKIT_SMILES`` and ``ID`` columns.
    suffix : str
        Appended to every new column name (e.g. ``"_training"``).
    id_column : str or None
        Overrides the name of the neighbour-ID column when given.
    """
    nns_arr = find_nns_from_pandas(memory['cfp'], reference['cfp'])
    # find_nns returns a float64 array; positional indexing needs integers, so
    # cast explicitly rather than relying on pandas coercing float positions.
    # NOTE(review): a position of -1 (no reference with similarity > 0) selects
    # the last reference row, exactly as before - confirm this is acceptable.
    nn_positions = nns_arr[:, 1].astype(np.int64)
    nn_rows = reference.iloc[nn_positions]

    memory[f"NN_dist{suffix}"] = nns_arr[:, 0]
    memory[f"NN_idx{suffix}"] = nns_arr[:, 1]
    memory[f"NN_avg{suffix}"] = nns_arr[:, 2]
    memory[f"NN_Original_Entry_ID{suffix}"] = nn_rows["Original_Entry_ID"].values
    memory[f"NN_RDKIT_SMILES{suffix}"] = nn_rows["RDKIT_SMILES"].values
    memory[id_column if id_column is not None else f"NN_ID{suffix}"] = nn_rows["ID"].values


for target, params in target_params.items():
    reference = pd.read_pickle(f"data/{target}/actives.pkl.gz")
    if target == "clogP":
        # Fingerprints are computed here only for this target; for the others
        # the pickle presumably already carries a ``cfp`` column - TODO confirm.
        reference['cfp'] = applyParallel( reference["SMILES"], generate_fingerprint_series)

    reference_training = reference.query("trainingset_class == 'training'").copy()
    reference_test = reference.query("trainingset_class == 'test' or trainingset_class == 'validation'").copy()

    for experiment_parts in valid_experiments:
        if target != experiment_parts[0]:
            continue
        experiment = "_".join(map(str,experiment_parts))
        output_path = f"data/memories/{experiment}/memory_with_nn.pkl.gz"

        if os.path.exists(output_path) and not OVERWRITE_FILES:
            # Already processed in an earlier run.
            continue

        memory = pd.read_csv(f"data/memories/{experiment}/memory_preprocessed.csv.gz")
        memory.rename(columns={"step": "STEP", "total_score": "SCORE"}, inplace=True)
        memory['cfp'] = applyParallel( memory["SMILES"], generate_fingerprint_series)

        _annotate_nearest_neighbours(memory, reference)
        _annotate_nearest_neighbours(memory, reference_training, suffix="_training")
        # The double underscore in "NN_ID__test" is kept on purpose: pickles
        # already on disk (and any readers of them) use this column name.
        _annotate_nearest_neighbours(memory, reference_test, suffix="_test", id_column="NN_ID__test")

        memory.to_pickle(output_path)

In [None]:
def get_cumsum_per_target(memory, STEPS, whattoplot="SMILES"):
    """Cumulative number of distinct ``whattoplot`` values seen up to each step.

    Returns a list of length ``STEPS``; entry ``k`` counts the distinct values
    of column ``whattoplot`` among rows whose ``STEP`` is in ``0..k``.
    """
    seen = set()
    running_counts = []

    for step in range(STEPS):
        seen.update(memory.loc[memory["STEP"] == step, whattoplot])
        running_counts.append(len(seen))

    return running_counts

# Cumulative counts of ECFP6 analogs (NN_dist >= 0.4) per experiment, for the
# runs with explicit parameters collected in ``valid_experiments``.
cumsum = dict()
memories = dict()
for target, params in target_params.items():
    cumsum[target] = dict()
    memories[target] = dict()
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    for experiment_parts in valid_experiments:
        if target != experiment_parts[0]:
            continue    
        experiment = "_".join(map(str,experiment_parts))
        memory = pd.read_pickle(f"data/memories/{experiment}/memory_with_nn.pkl.gz")
        # Rename without mutating in place; pickles written by this notebook
        # already use STEP/SCORE, in which case this is a no-op.
        memory = memory.rename(columns={"step": "STEP", "total_score": "SCORE"})
        memories[target][experiment] = memory
        # The similarity filter does not depend on the column being counted, so
        # it is applied once here instead of on every pass of the loop below.
        subset = memory.query("STEP < @maxstep and SCORE >= @minactivity").query("NN_dist >= 0.4")
        cumsum[target][experiment] = dict()
        for stuff in ["SMILES", "Murcko Scaffold", "Topological Scaffold"]: 
            cumsum[target][experiment][stuff] = get_cumsum_per_target(subset, maxstep, stuff)

In [None]:
def get_cumsum_per_target(memory, STEPS, whattoplot="SMILES"):
    """Cumulative number of distinct ``whattoplot`` values seen up to each step.

    Returns a list of length ``STEPS``; entry ``k`` counts the distinct values
    of column ``whattoplot`` among rows whose ``STEP`` is in ``0..k``.

    NOTE(review): this repeats the definition from the previous cell.
    """
    seen = set()
    running_counts = []

    for step in range(STEPS):
        seen.update(memory.loc[memory["STEP"] == step, whattoplot])
        running_counts.append(len(seen))

    return running_counts

# Same cumulative counts for the baseline runs stored under
# data/memories/<target>_<filtername>.
for target, params in target_params.items():
    if target not in cumsum:
        cumsum[target] = dict()
    if target not in memories:
        memories[target] = dict()
    maxstep = params['maxstep']
    minactivity = params['minactivity']    
    for filtername in filternames:
        memory = pd.read_pickle(f"data/memories/{target}_{filtername}/memory_with_nn.pkl.gz")
        memory = memory.rename(columns={"step": "STEP", "total_score": "SCORE"})
        memories[target][filtername] = memory
        # The similarity filter does not depend on the column being counted, so
        # it is applied once here instead of on every pass of the loop below.
        subset = memory.query("STEP < @maxstep and SCORE >= @minactivity").query("NN_dist >= 0.4")
        cumsum[target][filtername] = dict()
        for stuff in ["SMILES", "Murcko Scaffold", "Topological Scaffold"]: 
            cumsum[target][filtername][stuff] = get_cumsum_per_target(subset, maxstep, stuff)

In [None]:
# Number of distinct active SMILES (STEP < 300, SCORE >= 0.7) in the baseline run.
(
    pd.read_pickle("data/memories/DRD2_IdenticalMurckoScaffold/memory_with_nn.pkl.gz")
    .rename(columns={"step": "STEP", "total_score": "SCORE"})
    .query("STEP < 300 and SCORE >= 0.7")["SMILES"]
    .nunique(dropna=False)
)

In [None]:
# Number of distinct active SMILES (STEP < 300, SCORE >= 0.7) in the sigmoid run.
(
    pd.read_pickle("data/memories/DRD2_IdenticalMurckoScaffold_0.6_25_sigmoid_1.0_False/memory_with_nn.pkl.gz")
    .rename(columns={"step": "STEP", "total_score": "SCORE"})
    .query("STEP < 300 and SCORE >= 0.7")["SMILES"]
    .nunique(dropna=False)
)

In [None]:
def get_experiment_names(filtername, bucket_size=25, minsimilarity=0.6, outputmode="binary", temperature=1.0, experience_replay=False):
    """Return the DRD2 experiment key for the given parameters, or None.

    The key is returned only when it is present in ``cumsum["DRD2"]``.
    """
    experiment_string = f"DRD2_{filtername}_{minsimilarity}_{bucket_size}_{outputmode}_{temperature}_{experience_replay}"
    return experiment_string if experiment_string in cumsum["DRD2"] else None

In [None]:
# Sanity check: key of the NoFilter run with default parameters
# (prints None when that key is not in cumsum["DRD2"]).
print(get_experiment_names("NoFilter") )

In [None]:
# Compare the final cumulative SMILES count of each baseline run with that of
# the corresponding explicit default-parameter run.
for filtername in filternames:
    name = get_experiment_names(filtername)
    baseline_count = cumsum["DRD2"][filtername]["SMILES"][-1]
    # get_experiment_names returns None when the default-parameter run is
    # missing; show None for it instead of failing with a KeyError.
    explicit_count = cumsum["DRD2"][name]["SMILES"][-1] if name is not None else None
    print(filtername, baseline_count, explicit_count)

In [None]:
# One dict per varied parameter: filtername -> {parameter value -> experiment key}.
bucket_experiment = {}
similarity_experiment = {}
outputmode_experiment = {}

temperature_experiment = {}
experience_experiment = {}

experiments_list = [bucket_experiment, similarity_experiment, outputmode_experiment, temperature_experiment, experience_experiment]

for target in target_params:
    for filtername in filternames:
        bucket_experiment[filtername] = {}
        similarity_experiment[filtername] = {}
        outputmode_experiment[filtername] = {}

        # For "NoFilter" the varied value is replaced by its default, so only
        # the default experiment is ever registered for that filter.
        for bucket_size in bucket_sizes:
            if filtername == "NoFilter":
                bucket_size = default_bucket_size
            experiment = f"{target}_{filtername}_{default_minsimilarity}_{bucket_size}_{default_outputmode}_{default_temperature}_{default_experience_replay}"
            if experiment in cumsum[target]:
                bucket_experiment[filtername][bucket_size] = experiment
            else:
                # Missing experiments are only reported, not registered.
                print(experiment)

        for minsimilarity in minsimilarities:
            if filtername == "NoFilter":
                minsimilarity = default_minsimilarity
            experiment = f"{target}_{filtername}_{minsimilarity}_{default_bucket_size}_{default_outputmode}_{default_temperature}_{default_experience_replay}"
            if experiment in cumsum[target]:
                similarity_experiment[filtername][minsimilarity] = experiment
            else:
                print(experiment)

        for outputmode in outputmodes:
            if filtername == "NoFilter":
                outputmode = default_outputmode
            experiment = f"{target}_{filtername}_{default_minsimilarity}_{default_bucket_size}_{outputmode}_{default_temperature}_{default_experience_replay}"
            if experiment in cumsum[target]:
                outputmode_experiment[filtername][outputmode] = experiment
            else:
                print(experiment)

                
# Temperature and experience-replay sweeps exist only for the DRD2 "NoFilter"
# runs. The lookups name "DRD2" and "NoFilter" explicitly instead of relying on
# ``target`` / ``filtername`` values left over from earlier loops.
temperature_experiment["NoFilter"] = {}
filtername  = "NoFilter"
for temperature in temperatures:
    experiment = "_".join(map(str,["DRD2", "NoFilter", default_minsimilarity, default_bucket_size, default_outputmode, temperature, default_experience_replay]))
    if experiment not in cumsum["DRD2"]:
        # Missing experiments are only reported, not registered.
        print(experiment)
    else:
        temperature_experiment["NoFilter"][temperature] = experiment

experience_experiment["NoFilter"] = {}
for experience_replay in experience_replays:
    experiment = "_".join(map(str,["DRD2", "NoFilter", default_minsimilarity, default_bucket_size, default_outputmode, default_temperature, experience_replay]))
    if experiment not in cumsum["DRD2"]:
        print(experiment)
    else:
        experience_experiment["NoFilter"][experience_replay] = experiment




# Number of unique actives

In [None]:
# Print, per experiment, the number of distinct active SMILES.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    # A dedicated loop name is used here: rebinding ``valid_experiments`` would
    # overwrite the list of experiment tuples that earlier cells iterate over.
    for experiment_group in experiments_list:
        for filtername, experiment_dict in experiment_group.items():
            for experiment in experiment_dict.values():
                if experiment.split("_")[0] != target:
                    continue
                subset = memories[target][experiment].query("STEP < @maxstep and SCORE >= @minactivity")
                nb = len(set(subset["SMILES"]))
                print(f"{target:8}{experiment:65}{nb:>6}")
            print()
        print()

# Number of unique Murcko Scaffolds

In [None]:
# Print, per experiment, the number of distinct Murcko scaffolds among actives.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    # A dedicated loop name is used here: rebinding ``valid_experiments`` would
    # overwrite the list of experiment tuples that earlier cells iterate over.
    for experiment_group in experiments_list:
        for filtername, experiment_dict in experiment_group.items():
            for experiment in experiment_dict.values():
                if experiment.split("_")[0] != target:
                    continue
                subset = memories[target][experiment].query("STEP < @maxstep and SCORE >= @minactivity")
                nb = len(set(subset["Murcko Scaffold"]))
                print(f"{target:8}{experiment:65}{nb:>6}")
            print()
        print()

# Number of unique Topological Scaffolds

In [None]:
# Print, per experiment, the number of distinct topological scaffolds among actives.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    # A dedicated loop name is used here: rebinding ``valid_experiments`` would
    # overwrite the list of experiment tuples that earlier cells iterate over.
    for experiment_group in experiments_list:
        for filtername, experiment_dict in experiment_group.items():
            for experiment in experiment_dict.values():
                if experiment.split("_")[0] != target:
                    continue
                subset = memories[target][experiment].query("STEP < @maxstep and SCORE >= @minactivity")
                nb = len(set(subset["Topological Scaffold"]))
                print(f"{target:8}{experiment:65}{nb:>6}")
            print()
        print()

# Number of exact SMILES matches from ExCAPE

# Number of ECFP6 analogs

In [None]:
# Print, per experiment, how many distinct actives have NN_dist >= 0.4 against
# the full reference set, and the share of all distinct actives this represents.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    # A dedicated loop name is used here: rebinding ``valid_experiments`` would
    # overwrite the list of experiment tuples that earlier cells iterate over.
    for experiment_group in experiments_list:
        for filtername, experiment_dict in experiment_group.items():
            for experiment in experiment_dict.values():
                if experiment.split("_")[0] != target:
                    continue
                subset = memories[target][experiment].query("STEP < @maxstep and SCORE >= @minactivity")
                subset = subset.drop_duplicates("SMILES")
                nb = len(subset.query("NN_dist >= 0.4"))
                # The share is undefined when no compound passes the filters;
                # report NaN instead of raising ZeroDivisionError.
                percent = (nb / len(subset))*100 if len(subset) else float("nan")
                print(f"{target:8}{experiment:65}{nb:>6}\t{percent:.3}%")
            print()
        print()

In [None]:
# Print, per experiment, how many distinct actives have NN_dist_training >= 0.4,
# and the share of all distinct actives this represents.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    # A dedicated loop name is used here: rebinding ``valid_experiments`` would
    # overwrite the list of experiment tuples that earlier cells iterate over.
    for experiment_group in experiments_list:
        for filtername, experiment_dict in experiment_group.items():
            for experiment in experiment_dict.values():
                if experiment.split("_")[0] != target:
                    continue
                subset = memories[target][experiment].query("STEP < @maxstep and SCORE >= @minactivity")
                subset = subset.drop_duplicates("SMILES")
                nb = len(subset.query("NN_dist_training >= 0.4"))
                # The share is undefined when no compound passes the filters;
                # report NaN instead of raising ZeroDivisionError.
                percent = (nb / len(subset))*100 if len(subset) else float("nan")
                print(f"{target:8}{experiment:65}{nb:>6}\t{percent:.3}%")
            print()
        print()

In [None]:
# Print, per experiment, how many distinct actives have NN_dist_test >= 0.4,
# and the share of all distinct actives this represents.
for target, params in target_params.items():
    maxstep = params['maxstep']
    minactivity = params['minactivity']
    # A dedicated loop name is used here: rebinding ``valid_experiments`` would
    # overwrite the list of experiment tuples that earlier cells iterate over.
    for experiment_group in experiments_list:
        for filtername, experiment_dict in experiment_group.items():
            for experiment in experiment_dict.values():
                if experiment.split("_")[0] != target:
                    continue
                subset = memories[target][experiment].query("STEP < @maxstep and SCORE >= @minactivity")
                subset = subset.drop_duplicates("SMILES")
                nb = len(subset.query("NN_dist_test >= 0.4"))
                # The share is undefined when no compound passes the filters;
                # report NaN instead of raising ZeroDivisionError.
                percent = (nb / len(subset))*100 if len(subset) else float("nan")
                print(f"{target:8}{experiment:65}{nb:>6}\t{percent:.3}%")
            print()
        print()

In [None]:
#%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Output directory for the figures saved by the plotting cells below.
os.makedirs(f"plots_revisions/", exist_ok=True)
# Optional manual font sizing, currently disabled in favour of the seaborn
# context set further down.
# SMALL_SIZE = 8
# MEDIUM_SIZE = 10
# BIGGER_SIZE = 12

# plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
# plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
# plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
# plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
# plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title


# Display labels for the filters, in the same order as ``filternames``
# (repeats the list assigned in an earlier cell).
filternames_in_plots = ["No memory", "CompoundSimilarity memory", "IdenticalMurckoScaffold memory", "IdenticalCarbonSkeleton memory", "ScaffoldSimilarity memory"]
# Global seaborn look for all following figures.
sns.set_context("talk")
sns.set_style("whitegrid")
#plt.style.use('bmh')
# NOTE(review): this figure is not drawn on in this cell; the plotting cells
# below create their own figures via plt.subplots - confirm it is still needed.
fig = plt.figure(figsize=(5,6))
#

In [None]:
# Map each internal filter name to its display label (paired by position;
# zip stops at the shorter of the two lists).
filternames_mapping = dict(zip(filternames, filternames_in_plots))
filternames_mapping

In [None]:
# One figure per counted entity: last cumsum entry versus bucket size,
# one line per memory filter. Saved once without and once with a legend.
for stuff in ("SMILES", "Murcko Scaffold", "Topological Scaffold"):
    # Gather the value to plot for every (filter, bucket size) pair.
    things_to_plot = {}
    for filtername in filternames:
        per_bucket = {}
        things_to_plot[filtername] = per_bucket
        for bucket in bucket_sizes:
            # If the bucket-specific lookup returns nothing (falsy), fall back
            # to the lookup by filter name alone.
            experiment = (
                get_experiment_names(filtername, bucket_size=bucket)
                or get_experiment_names(filtername)
            )
            per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

    fig, ax = plt.subplots(figsize=(5, 6))
    for filtername in filternames:
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        ax.plot(x, y, label=filternames_mapping[filtername])

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Bucket Size")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # First export: bare plot. Second export: same plot with the legend placed
    # to the right of the axes and included in the tight bounding box.
    plt.savefig(f"plots_revisions/bucket_size_{stuff}.svg", bbox_inches='tight')
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=1, borderaxespad=0, frameon=True)
    plt.savefig(f"plots_revisions/bucket_size_{stuff}_legend.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')

In [None]:
# Same bucket-size figure as the previous cell, but every experiment lookup
# is made with experience_replay=True.
for stuff in ("SMILES", "Murcko Scaffold", "Topological Scaffold"):
    # Gather the value to plot for every (filter, bucket size) pair.
    things_to_plot = {}
    for filtername in filternames:
        per_bucket = {}
        things_to_plot[filtername] = per_bucket
        for bucket in bucket_sizes:
            # Falsy bucket-specific lookup -> retry without the bucket size.
            experiment = (
                get_experiment_names(filtername, bucket_size=bucket, experience_replay=True)
                or get_experiment_names(filtername, experience_replay=True)
            )
            per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

    fig, ax = plt.subplots(figsize=(5, 6))
    for filtername in filternames:
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        ax.plot(x, y, label=filternames_mapping[filtername])

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Bucket Size")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # Export without legend first, then again with the legend outside the axes.
    plt.savefig(f"plots_revisions/bucket_size_{stuff} with experience.svg", bbox_inches='tight')
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=1, borderaxespad=0, frameon=True)
    plt.savefig(f"plots_revisions/bucket_size_{stuff} with experience_legend.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')

In [None]:

# One figure per counted entity: last cumsum entry versus minimum similarity,
# one line per memory filter. Saved once without and once with a legend.
for stuff in ("SMILES", "Murcko Scaffold", "Topological Scaffold"):
    # Gather the value to plot for every (filter, minimum similarity) pair.
    things_to_plot = {}
    for filtername in filternames:
        per_similarity = {}
        things_to_plot[filtername] = per_similarity
        for minsimilarity in minsimilarities:
            # Falsy similarity-specific lookup -> fall back to the filter name alone.
            experiment = (
                get_experiment_names(filtername, minsimilarity=minsimilarity)
                or get_experiment_names(filtername)
            )
            per_similarity[minsimilarity] = cumsum["DRD2"][experiment][stuff][-1]

    fig, ax = plt.subplots(figsize=(5, 6))
    for filtername in filternames:
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        ax.plot(x, y, label=filternames_mapping[filtername])

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Minimum Similarity")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # NOTE: the "minsimilairty" spelling is the existing on-disk file name and
    # is deliberately left untouched.
    plt.savefig(f"plots_revisions/minsimilairty_{stuff}.svg", bbox_inches='tight')
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=1, borderaxespad=0, frameon=True)
    plt.savefig(f"plots_revisions/minsimilairty_{stuff}_legend.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')

In [None]:

# One figure per counted entity: last cumsum entry versus temperature,
# one line per memory filter. Saved once without and once with a legend.
for stuff in ("SMILES", "Murcko Scaffold", "Topological Scaffold"):
    # Gather the value to plot for every (filter, temperature) pair.
    things_to_plot = {}
    for filtername in filternames:
        per_temperature = {}
        things_to_plot[filtername] = per_temperature
        for temperature in temperatures:
            # Falsy temperature-specific lookup -> fall back to the filter name alone.
            experiment = (
                get_experiment_names(filtername, temperature=temperature)
                or get_experiment_names(filtername)
            )
            per_temperature[temperature] = cumsum["DRD2"][experiment][stuff][-1]

    fig, ax = plt.subplots(figsize=(5, 6))
    for filtername in filternames:
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        ax.plot(x, y, label=filternames_mapping[filtername])

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Temperature")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # Export without legend first, then again with the legend outside the axes.
    plt.savefig(f"plots_revisions/temperature_{stuff}.svg", bbox_inches='tight')
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=1, borderaxespad=0, frameon=True)
    plt.savefig(f"plots_revisions/temperature_{stuff}_legend.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')

In [None]:

# One figure per counted entity: last cumsum entry versus output mode,
# one line per memory filter. Saved once without and once with a legend.
for stuff in ("SMILES", "Murcko Scaffold", "Topological Scaffold"):
    # Gather the value to plot for every (filter, output mode) pair.
    things_to_plot = {}
    for filtername in filternames:
        per_mode = {}
        things_to_plot[filtername] = per_mode
        for outputmode in outputmodes:
            # Falsy mode-specific lookup -> fall back to the filter name alone.
            experiment = (
                get_experiment_names(filtername, outputmode=outputmode)
                or get_experiment_names(filtername)
            )
            per_mode[outputmode] = cumsum["DRD2"][experiment][stuff][-1]

    fig, ax = plt.subplots(figsize=(5, 6))
    for filtername in filternames:
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        ax.plot(x, y, label=filternames_mapping[filtername])

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Output Mode")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # Export without legend first, then again with the legend outside the axes.
    plt.savefig(f"plots_revisions/outputmode_{stuff}.svg", bbox_inches='tight')
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=1, borderaxespad=0, frameon=True)
    plt.savefig(f"plots_revisions/outputmode_{stuff}_legend.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
# One figure per counted entity overlaying both settings on the same axes:
# dotted, semi-transparent lines for runs without experience replay, then
# solid lines for runs with it. No legend is drawn on this figure.
for stuff in ("SMILES", "Murcko Scaffold", "Topological Scaffold"):
    fig, ax = plt.subplots(figsize=(5, 6))
    colors = sns.color_palette()[0:5]

    # Without-replay pass first, then with-replay, so draw order is unchanged.
    for experience_replay in (False, True):
        things_to_plot = {}
        for filtername in filternames:
            per_bucket = {}
            things_to_plot[filtername] = per_bucket
            for bucket in bucket_sizes:
                # Falsy bucket-specific lookup -> retry without the bucket size.
                experiment = (
                    get_experiment_names(filtername, bucket_size=bucket, experience_replay=experience_replay)
                    or get_experiment_names(filtername, experience_replay=experience_replay)
                )
                per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

        for i, filtername in enumerate(filternames):
            x = list(things_to_plot[filtername])
            y = list(things_to_plot[filtername].values())
            if experience_replay:
                # Solid line; colour is left to matplotlib's default cycle.
                ax.plot(x, y, label=f"{filternames_mapping[filtername]} with experience replay")
            else:
                # Dotted line with an explicit palette colour per filter.
                ax.plot(x, y, ':', label=f"{filternames_mapping[filtername]} without experience replay", c=colors[i], alpha=0.7)

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Bucket Size")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    plt.savefig(f"plots_revisions/bucket_size_{stuff} with and without experience.svg", bbox_inches='tight')

In [None]:
# Legend-only export: the dotted "without experience replay" curves are drawn
# so a legend can be built from them, then the axes and the curves are hidden
# before saving.
for stuff in ("SMILES",):
    fig, ax = plt.subplots(figsize=(5, 6))
    colors = sns.color_palette()[0:5]

    things_to_plot = {}
    for filtername in filternames:
        per_bucket = {}
        things_to_plot[filtername] = per_bucket
        for bucket in bucket_sizes:
            # Falsy bucket-specific lookup -> retry without the bucket size.
            experiment = (
                get_experiment_names(filtername, bucket_size=bucket, experience_replay=False)
                or get_experiment_names(filtername, experience_replay=False)
            )
            per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

    # Keep the first artist returned by each plot call so it can be hidden later.
    lines = []
    for i, filtername in enumerate(filternames):
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        lines.append(
            ax.plot(x, y, ':', label=f"{filternames_mapping[filtername]} without experience replay", c=colors[i], alpha=0.7)[0]
        )

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Bucket Size")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # Build the legend while the lines exist, then switch off the axes and
    # hide every line.
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.4), ncol=3, borderaxespad=0, frameon=True)
    ax.axis("off")
    for line in lines:
        line.set_visible(False)
    plt.savefig(f"plots_revisions/legend dotted.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
# Legend-only export for the solid "with experience replay" curves: draw them,
# build the legend, then hide the axes and the curves before saving.
for stuff in ("SMILES",):
    fig, ax = plt.subplots(figsize=(5, 6))
    colors = sns.color_palette()[0:5]

    things_to_plot = {}
    for filtername in filternames:
        per_bucket = {}
        things_to_plot[filtername] = per_bucket
        for bucket in bucket_sizes:
            # Falsy bucket-specific lookup -> retry without the bucket size.
            experiment = (
                get_experiment_names(filtername, bucket_size=bucket, experience_replay=True)
                or get_experiment_names(filtername, experience_replay=True)
            )
            per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

    # Keep the first artist returned by each plot call so it can be hidden later.
    lines = []
    for i, filtername in enumerate(filternames):
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        lines.append(
            ax.plot(x, y, label=f"{filternames_mapping[filtername]} with experience replay", c=colors[i], alpha=0.7)[0]
        )

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Bucket Size")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # Build the legend while the lines exist, then switch off the axes and
    # hide every line.
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.4), ncol=3, borderaxespad=0, frameon=True)
    ax.axis("off")
    for line in lines:
        line.set_visible(False)
    plt.savefig(f"plots_revisions/legend line.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
# Legend-only export with plain filter labels (no replay suffix): curves from
# the experience_replay=False lookups are drawn as solid lines, the legend is
# built, then the axes and the curves are hidden before saving.
for stuff in ("SMILES",):
    fig, ax = plt.subplots(figsize=(5, 6))
    colors = sns.color_palette()[0:5]

    things_to_plot = {}
    for filtername in filternames:
        per_bucket = {}
        things_to_plot[filtername] = per_bucket
        for bucket in bucket_sizes:
            # Falsy bucket-specific lookup -> retry without the bucket size.
            experiment = (
                get_experiment_names(filtername, bucket_size=bucket, experience_replay=False)
                or get_experiment_names(filtername, experience_replay=False)
            )
            per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

    # Keep the first artist returned by each plot call so it can be hidden later.
    lines = []
    for i, filtername in enumerate(filternames):
        x = list(things_to_plot[filtername])
        y = list(things_to_plot[filtername].values())
        lines.append(
            ax.plot(x, y, label=f"{filternames_mapping[filtername]}", c=colors[i], alpha=0.7)[0]
        )

    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")

    ax.set_xlabel("Bucket Size")
    ax.set_ylabel(print_ylabel)
    plt.tight_layout()

    # Build the legend while the lines exist, then switch off the axes and
    # hide every line.
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.4), ncol=3, borderaxespad=0, frameon=True)
    ax.axis("off")
    for line in lines:
        line.set_visible(False)
    plt.savefig(f"plots_revisions/legend.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
    # Creates an empty 3x2 grid of axes (shared x and y, 15x12 inches) and binds
    # it to `fig` / `ax`. Nothing is drawn or saved in this cell.
    # NOTE(review): looks like a leftover scratch cell -- the next cell rebinds
    # `fig` / `ax` with its own plt.subplots call. The statement is indented at
    # cell top level, so it presumably only runs where the front end strips
    # leading indentation -- TODO confirm or remove the cell.
    fig, ax = plt.subplots(nrows=3, ncols=2, sharex=True, sharey=True, figsize=(15,12))



In [None]:
    
# 3x2 summary figure. Rows are the counted entities (SMILES, Murcko scaffold,
# topological scaffold); the left column uses experience_replay=True lookups
# and the right column experience_replay=False. The y axis is shared per row.
fig, ax = plt.subplots(nrows=3, ncols=2, sharex=False, sharey="row", figsize=(12, 15))
colors = sns.color_palette()[0:5]

for row, stuff in enumerate(("SMILES", "Murcko Scaffold", "Topological Scaffold")):
    for col, experience_replay in enumerate((True, False)):
        # Gather the value to plot for every (filter, bucket size) pair.
        things_to_plot = {}
        for filtername in filternames:
            per_bucket = {}
            things_to_plot[filtername] = per_bucket
            for bucket in bucket_sizes:
                # Falsy bucket-specific lookup -> retry without the bucket size.
                experiment = (
                    get_experiment_names(filtername, bucket_size=bucket, experience_replay=experience_replay)
                    or get_experiment_names(filtername, experience_replay=experience_replay)
                )
                per_bucket[bucket] = cumsum["DRD2"][experiment][stuff][-1]

        panel = ax[row, col]
        for filtername in filternames:
            x = list(things_to_plot[filtername])
            y = list(things_to_plot[filtername].values())
            panel.plot(x, y, label=f"{filternames_mapping[filtername]}")

    # Only the left panel of each row gets a y label.
    # Anything that is not SMILES or Murcko Scaffold gets the carbon-skeleton label.
    print_ylabel = {
        "SMILES": "No. generated ECFP6 analogs",
        "Murcko Scaffold": "No. generated Bemis Murcko scaffolds",
    }.get(stuff, "No. generated carbon skeletons")
    ax[row, 0].set_ylabel(print_ylabel)

# X labels on the bottom row only.
for bottom_panel in ax[-1]:
    bottom_panel.set_xlabel("Bucket Size")

# Column headings in bold above the first row, plus the panel letters.
ax[0, 0].set_title(r"$\bf{with\ experience\ replay}$"+"\n\n(a)")
ax[0, 1].set_title(r"$\bf{without\ experience\ replay}$"+"\n\n(b)")
for panel_row, panel_tags in ((1, ("(c)", "(d)")), (2, ("(e)", "(f)"))):
    for panel_col, panel_tag in enumerate(panel_tags):
        ax[panel_row, panel_col].set_title(panel_tag)

plt.tight_layout()

# Saved without a legend.
plt.savefig(f"plots_revisions/bucket_size with and without experience.svg", bbox_inches='tight')