# Piggybacking Experiment

In [1]:
import subprocess
import os
print('Current conda environment:', os.environ['CONDA_DEFAULT_ENV'])

cwd = os.getcwd()
print(cwd)

import warnings
warnings.filterwarnings('ignore')

import random
random.seed(42)

Current conda environment: reinvent
/home/fts_g_ucla_edu/Projects/rips-relay/experiments


In [2]:
import pandas as pd

from crem.crem import grow_mol, mutate_mol
crem_db = '../crem_db/crem_db2.5.db'

import mols2grid

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import PandasTools
from rdkit import DataStructs

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


In [3]:
fragment_lead_pairs = pd.read_csv('data/fragment_lead_pairs.csv')

fragment_lead_pairs.head()

Unnamed: 0,Year,Table_Entry,Fragment,Lead
0,2022,1,Nc1cc(c[nH]c1=O)C(F)(F)F,N[C@H]1CCN(Cc2cccc(c2)c3ccc4c(=O)[nH]ccc4c3)C1
1,2022,2,CN1C[C@@H](O)[C@H](C1=O)c2ccc(C)cc2,COc1ccc(CN2C[C@H](O)[C@](CCC(C)C)(C2=O)c3ccc(c...
2,2022,3,Fc1cncc(c1)N2C(=O)N[C@@H](Cc3ccccc3)C2=O,Clc1ccccc1C2CC3(C2)NC(=O)N(C3=O)c4cncc5ccccc45
3,2022,4,c1ccc(cc1)c2ccccc2c3nnn[nH]3,Cc1ccc(cc1)c2cccc(c2c3nnn[nH]3)S(=O)(=O)N
4,2022,5,CN(C)C(=O)C(N)Cc1ccc(F)cc1,Clc1ccc(cc1)[C@H]2CN[C@H](C2)C(=O)N3CCN(CC3)c4...


In [4]:
def tanimoto_similarity(smi_1, smi_2, use_counts=True):
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048,countSimulation=True)
    mol_1 = Chem.MolFromSmiles(smi_1)
    mol_2 = Chem.MolFromSmiles(smi_2)
    if use_counts:
        fp_1 = rdFingerprintGenerator.GetCountFPs([mol_1])[0]
        fp_2 = rdFingerprintGenerator.GetCountFPs([mol_2])[0]
    else:
        fp_1 = rdFingerprintGenerator.GetFPs([mol_1])[0]
        fp_2 = rdFingerprintGenerator.GetFPs([mol_2])[0]
    return DataStructs.TanimotoSimilarity(fp_1, fp_2)

In [5]:
def piggyback(initial, lead, mol_list, model='reinvent'):

    initial_mol = Chem.MolFromSmiles(initial)
    results = {}    # Temporary dict to store results per-run

    if model == 'reinvent': # Generate analogs w/ REINVENT

        # Change directory to run python script
        %cd ..

        arg1 = f'--input_frag'
        subprocess.run(['python3', 'generate_analogs.py', arg1, initial],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
        
        # Change directory back to that of the current notebook
        %cd experiments
        
        # Read dataframe
        df = pd.read_csv('data/dataframe.csv')
        df.drop(['Unnamed: 0'], axis=1, inplace=True)
    
    elif model == 'crem':   # Generate analogs w/ CReM
        
        out_list = []
        mutate_list = list(mutate_mol(initial_mol, db_name=crem_db, return_mol=False))

        for idx, analog in enumerate(mutate_list):
            out_list.append([idx, analog, initial])

        df = pd.DataFrame(out_list, columns=["Idx","SMILES","Input_SMILES"])

    else:   # Raise an error if an invalid model is entered
        raise Exception('Invalid Model')
    
    # Remove duplicate values
    df.drop_duplicates(inplace=True, ignore_index=True, subset=['SMILES'])
    
    # Remove the initial fragment from the generated distribution
    if initial in df['SMILES'].values:

        index = df.loc[df['Tanimoto'] == 1].index[0]
        df.drop(index)

    df.round(3)

    # Compute similarities to lead molecule
    similarities_to_lead = [tanimoto_similarity(analog, lead, True) for analog in df['SMILES'].values]
    df['sim_to_lead'] = similarities_to_lead
    df.sort_values('sim_to_lead', ascending=False, inplace=True)

    # Calculate mean and max Tanimoto similarities
    results['mean'], results['max'] = [df['sim_to_lead'].mean()], [df['sim_to_lead'].max()]

    # Find SMILES string corresponding to best value
    best = df['SMILES'].values[0]

    # Add best to list of piggybacked molecules
    mol_list.append(Chem.MolFromSmiles(best))

    return best, mol_list, results

In [6]:
def run_experiment(initial, lead, model='reinvent', max_iters=10):

    best = initial

    best_tanimoto = tanimoto_similarity(initial, lead)

    n_iters = 0

    mol_list = [Chem.MolFromSmiles(initial)]

    results_df = pd.DataFrame()

    while best_tanimoto < .9 and n_iters < max_iters:

        n_iters += 1

        print(f'\n ===   N_ITERS = {n_iters}   === \n')

        best, mol_list, results = piggyback(best, lead, mol_list, model)

        temp_df = pd.DataFrame(data=results)

        results_df = pd.concat((results_df, temp_df))

        best_tanimoto = results['max'][0]

        print(f'\n ===   CURRENT BEST = {best_tanimoto}   === \n')

    mol_list.append(Chem.MolFromSmiles(lead))

    return results_df, mol_list

In [10]:
dataframes = []
output_mols = []

crem_dataframes = []
crem_output_mols = []

In [11]:
index = 6

initial, lead = fragment_lead_pairs['Fragment'][index], fragment_lead_pairs['Lead'][index]
    
results_df, mol_list = run_experiment(initial, lead)

dataframes.append(results_df)
output_mols.append(mol_list)

results_df, mol_list = run_experiment(initial, lead, model='crem')

crem_dataframes.append(results_df)
crem_output_mols.append(mol_list)


 ===   N_ITERS = 1   === 

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments

 ===   CURRENT BEST = 0.48484848484848486   === 


 ===   N_ITERS = 2   === 

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments

 ===   CURRENT BEST = 0.589041095890411   === 


 ===   N_ITERS = 3   === 

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments

 ===   CURRENT BEST = 0.7285714285714285   === 


 ===   N_ITERS = 4   === 

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments

 ===   CURRENT BEST = 0.7285714285714285   === 


 ===   N_ITERS = 5   === 

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments

 ===   CURRENT BEST = 0.7285714285714285   === 


 ===   N_ITERS = 6   === 

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments

 ===   CURRENT BE