# Piggybacking Experiment

In [None]:
import os
import random
import subprocess
import warnings

# Report which conda environment the kernel is running in. Use .get() so the
# cell still runs when CONDA_DEFAULT_ENV is absent (a plain [] lookup raises
# KeyError in that case).
print('Current conda environment:',
      os.environ.get('CONDA_DEFAULT_ENV', 'unknown (CONDA_DEFAULT_ENV is not set)'))

# Remember and show the working directory the notebook was started from.
cwd = os.getcwd()
print(cwd)

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Seed Python's built-in RNG. This does not seed NumPy/pandas random sampling.
random.seed(42)

In [None]:
# Third-party imports, sorted alphabetically by package.
import matplotlib.pyplot as plt
import mols2grid
import pandas as pd
import seaborn as sns
from crem.crem import grow_mol, mutate_mol
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import rdFingerprintGenerator
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Relative path of the fragment database handed to CReM as `db_name`.
crem_db = '../crem_db/crem_db2.5.db'

# Default figure size for the seaborn plots in this notebook.
sns.set(rc={'figure.figsize': (15, 8)})

In [None]:
# Load the table of fragment/lead pairs (path is relative to the working
# directory). Its 'Fragment' and 'Lead' columns are read later in the notebook.
fragment_lead_pairs = pd.read_csv('data/fragment_lead_pairs.csv')

# Preview the first rows of the table.
fragment_lead_pairs.head()

In [None]:
def tanimoto_similarity(smi_1, smi_2, use_counts=True):
    """Return the Tanimoto similarity between two molecules given as SMILES.

    Both molecules are fingerprinted with the same Morgan generator
    (radius 2, 2048 bits, count simulation enabled), so the parameters
    configured here are the ones actually applied to the comparison.

    Args:
        smi_1: SMILES string of the first molecule.
        smi_2: SMILES string of the second molecule.
        use_counts: If True (default), compare count fingerprints; otherwise
            compare bit-vector fingerprints.

    Returns:
        The similarity computed by ``DataStructs.TanimotoSimilarity``.

    Raises:
        ValueError: If either SMILES string cannot be parsed by RDKit.
    """
    mols = []
    for smi in (smi_1, smi_2):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; fail here with a
            # readable message instead of deep inside the fingerprint call.
            raise ValueError(f'Could not parse SMILES: {smi!r}')
        mols.append(mol)

    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048, countSimulation=True)
    make_fp = fpgen.GetCountFingerprint if use_counts else fpgen.GetFingerprint
    fp_1, fp_2 = (make_fp(mol) for mol in mols)
    return DataStructs.TanimotoSimilarity(fp_1, fp_2)

In [None]:
def piggyback(initial, lead, mol_list, dataframes, model='reinvent'):
    """Generate analogs of ``initial`` and pick the one most similar to ``lead``.

    Args:
        initial: SMILES string of the molecule to generate analogs from.
        lead: SMILES string of the target molecule analogs are compared to.
        mol_list: List of RDKit molecules; the best analog of this run is
            appended to it in place.
        dataframes: List of per-run analog tables; this run's table is
            appended to it in place.
        model: ``'reinvent'`` runs ``generate_analogs.py`` from the parent
            directory and then reads ``data/dataframe.csv``; ``'crem'`` uses
            ``mutate_mol`` with the ``crem_db`` database.

    Returns:
        A tuple ``(best, mol_list, results, dataframes)`` where ``best`` is
        the SMILES with the highest ``sim_to_lead`` (``None`` when no analogs
        remain) and ``results`` holds the keys ``'mean'``, ``'max'``
        (one-element lists) and ``'num_analogs'`` (an int).

    Raises:
        ValueError: If ``model`` is neither ``'reinvent'`` nor ``'crem'``.
    """
    results = {}    # Summary statistics for this run

    if model == 'reinvent':  # Generate analogs w/ REINVENT

        # Run the script from the parent directory through ``cwd`` instead of
        # changing the notebook's own working directory: no IPython magic is
        # needed and the kernel can never be left in the wrong directory.
        subprocess.run(['python3', 'generate_analogs.py', '--input_frag', initial],
                       cwd='..',
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.STDOUT)

        # Read the analog table (presumably written by generate_analogs.py --
        # TODO confirm) and drop the saved index column when it is present.
        df = pd.read_csv('data/dataframe.csv')
        df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

    elif model == 'crem':   # Generate analogs w/ CReM

        initial_mol = Chem.MolFromSmiles(initial)
        analogs = mutate_mol(initial_mol, db_name=crem_db, return_mol=False)
        df = pd.DataFrame([[analog, initial] for analog in analogs],
                          columns=["SMILES", "Input_SMILES"])

    else:   # Reject unknown model names
        raise ValueError('Invalid Model')

    # Remove duplicate values
    df.drop_duplicates(inplace=True, ignore_index=True, subset=['SMILES'])

    # Remove the initial fragment from the generated distribution. Filtering on
    # the SMILES column works for both models (the CReM table has no
    # 'Tanimoto' column) and the result is assigned back so the row really
    # goes away.
    df = df[df['SMILES'] != initial].reset_index(drop=True)

    # round() returns a new frame, so keep the result.
    df = df.round(3)

    # Compute similarities to lead molecule and rank the analogs by them
    df['sim_to_lead'] = [tanimoto_similarity(analog, lead, True) for analog in df['SMILES'].values]
    df.sort_values('sim_to_lead', ascending=False, inplace=True)

    dataframes.append(df)

    # Calculate mean and max Tanimoto similarities
    results['mean'] = [df['sim_to_lead'].mean()]
    results['max'] = [df['sim_to_lead'].max()]
    results['num_analogs'] = len(df)

    if len(df) > 0:
        # The table is sorted best-first, so row 0 holds the best analog
        best = df['SMILES'].values[0]

        # Add best to list of piggybacked molecules
        mol_list.append(Chem.MolFromSmiles(best))
    else:
        best = None

    return best, mol_list, results, dataframes

In [None]:
def run_experiment(initial, lead, model='reinvent', max_iters=10):
    """Repeatedly piggyback from ``initial`` towards ``lead``.

    Each iteration calls ``piggyback`` on the current best molecule. The loop
    ends when the best similarity reaches 1.0, when ``max_iters`` iterations
    have run, when an iteration yields no analogs, or when the best
    similarity equals that of the previous iteration.

    Args:
        initial: SMILES string of the starting molecule.
        lead: SMILES string of the target molecule.
        model: Generator name forwarded to ``piggyback``.
        max_iters: Maximum number of iterations.

    Returns:
        A tuple ``(results_df, mol_list, dataframes)``: ``results_df`` has one
        row per iteration (columns ``mean``, ``max``, ``num_analogs``),
        ``mol_list`` holds the starting molecule, each iteration's best analog
        and finally the lead, and ``dataframes`` holds the analog table of
        every iteration.
    """
    dataframes = []
    summaries = []      # one single-row frame per iteration
    best = initial
    best_tanimoto = tanimoto_similarity(initial, lead)
    n_iters = 0
    mol_list = [Chem.MolFromSmiles(initial)]

    while best_tanimoto < 1.0 and n_iters < max_iters:

        n_iters += 1

        print(f'\n ===   Iteration: {n_iters}   === \n')

        best, mol_list, results, dataframes = piggyback(best, lead, mol_list, dataframes, model)

        # Record the summary before any early exit: piggyback has already
        # appended this iteration's table to `dataframes`, and results_df must
        # keep exactly one row per table so both can be indexed together.
        summaries.append(pd.DataFrame(data=results))

        if best is None:
            print('Empty Dataframe')
            break

        if best_tanimoto == results['max'][0]:
            print(f'GOT STUCK: {best_tanimoto}')
            break

        best_tanimoto = results['max'][0]

        print(f'\n ===   CURRENT BEST: {best_tanimoto}   === \n')

    mol_list.append(Chem.MolFromSmiles(lead))

    # Concatenate once at the end; an empty frame is returned when the loop
    # never ran (the starting similarity was already 1.0 or max_iters <= 0).
    results_df = pd.concat(summaries) if summaries else pd.DataFrame()

    return results_df, mol_list, dataframes

### Exploring representative fragment-lead pairs

In [None]:
%%capture

# Row of fragment_lead_pairs to explore.
i = 0

# Starting fragment and target lead for that row.
initial, lead = fragment_lead_pairs['Fragment'][i], fragment_lead_pairs['Lead'][i]
            
# Run the experiment with run_experiment's default model and iteration limit.
results_df, mol_list, dataframes = run_experiment(initial, lead)

In [None]:
# Render the molecules collected in mol_list as a grid; size=(300, 250) is
# passed to mols2grid as the per-cell size (assumed to be pixels -- confirm).
mols2grid.display(mol_list, size=(300, 250))

In [None]:
# Print the summary statistics of every generated distribution.
for i, df in enumerate(dataframes):

    # Deliberately not named `max` / `mean`: at notebook top level that would
    # replace the built-in max() for every later cell.
    best_sim = results_df['max'].values[i]
    mean_sim = results_df['mean'].values[i]

    print(f' === Distribution: {i} === \n')
    print(f'Best similarity to lead: {best_sim}')
    print(f'Average similarity to lead: {mean_sim}')
    print(f'Size of distribution: {len(df)}\n\n')

In [None]:
# Overlap between each pair of consecutive distributions, measured as
# (rows sharing a SMILES) / (rows in the outer join on SMILES).
num_distributions = len(dataframes)

for i, (df1, df2) in enumerate(zip(dataframes, dataframes[1:])):

    # Inner join keeps SMILES present in both tables, outer join those in either.
    int_df = df1.merge(df2, how='inner', on=['SMILES'])
    union_df = df1.merge(df2, how='outer', on=['SMILES'])

    int_size, union_size = len(int_df), len(union_df)
    sim_score = int_size / union_size

    print(f'Size of intersection between distribution {i} and {i+1}: {int_size}')
    print(f'Similarity score: {sim_score}')
    print()

### Visualizing each distribution

In [None]:
# Tag every row with the position of its table in `dataframes`. The column is
# added to the tables stored in the list itself, not to copies.
for i, df in enumerate(dataframes):
    df['Distribution'] = i

# Stack all tables into one; `df` is rebound to the combined table. No
# ignore_index is passed, so the original row labels are kept and may repeat.
df = pd.concat(dataframes)

In [None]:
# Parse every SMILES of the combined table into an RDKit molecule.
smiles = df['SMILES'].values
mols = [Chem.MolFromSmiles(smile) for smile in smiles]

# Morgan generator with RDKit's default parameters (none are passed here).
fpgen = AllChem.GetMorganGenerator()
fingerprints = [fpgen.GetFingerprint(mol) for mol in mols]

# Feature matrix: one fingerprint per molecule, in the row order of `df`.
X = fingerprints

In [None]:
# Standardize the fingerprint features; X is rebound to the scaled output.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Project the scaled fingerprints onto three principal components; the fixed
# random_state keeps the fit repeatable.
pca = PCA(n_components=3, random_state=0)
pca_fps = pca.fit_transform(X)

In [None]:
# Show the explained-variance ratio of each of the three fitted components.
pca.explained_variance_ratio_

In [None]:
# Store each component as its own column; pca_fps.T[k] is column k of the PCA
# output, i.e. one value per row of df.
df['PC1'], df['PC2'], df['PC3'] = pca_fps.T[0], pca_fps.T[1], pca_fps.T[2]

In [None]:
# Down-sample to at most 500 rows for plotting. Capping n at len(df) avoids
# the ValueError DataFrame.sample raises when n exceeds the number of rows,
# and a fixed random_state makes the plotted subset reproducible.
plot_df = df.sample(n=min(500, len(df)), random_state=42)

In [None]:
# Pairwise scatter plots of the three principal components, colored by the
# distribution each molecule came from.
f = sns.pairplot(plot_df,
                 hue='Distribution',
                 vars=['PC1', 'PC2', 'PC3'],
                 palette='tab10',
                 aspect=2,
                 plot_kws=dict(s=10))

# y > 1 places the title above the grid; the trailing semicolon suppresses the
# notebook's echo of the returned Text object.
f.fig.suptitle('Pairwise Principal Component Plots', fontsize=18, y=1.04);