In [None]:
import subprocess
import os

# CONDA_DEFAULT_ENV only exists inside an activated conda environment. Read it
# with .get() so the notebook still starts under venv/system Python instead of
# failing with a KeyError in the very first cell.
print('Current conda environment:', os.environ.get('CONDA_DEFAULT_ENV', '<not set>'))
# Exported here, before the modelling libraries are imported in the next cell.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

# Remember (and show) the directory the kernel was started in.
cwd = os.getcwd()
print(cwd)

import warnings
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings('ignore')

import random
# Seeds Python's `random` module only; other libraries' generators are not
# touched by this call.
random.seed(42)

## Generating data from REINVENT, CReM, SAFE, and COATI

In [None]:
# --- Data handling and plotting ---
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size for every seaborn/matplotlib figure in this notebook.
sns.set(rc = {'figure.figsize':(15,8)})

# --- CReM fragment-replacement generator ---
from crem.crem import grow_mol, mutate_mol
# Path to the CReM fragment database, relative to the notebook's working directory.
crem_db = '../crem_db/crem_db2.5.db'

import mols2grid

# --- RDKit cheminformatics helpers ---
from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator, CanonSmiles, Draw, MolFromSmiles, PandasTools
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit import DataStructs
from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity
import useful_rdkit_utils as uru

# --- SAFE generator ---
import safe as sf
import datamol as dm

# NOTE(review): duplicate of the `import mols2grid` a few lines above; harmless.
import mols2grid

# --- Dimensionality reduction ---
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import torch

# --- COATI encoders ---
from coati.generative.coati_purifications import embed_smiles
from coati.models.io.coati import load_e3gnn_smiles_clip_e2e
from coati.models.simple_coati2.io import load_coati2

In [None]:
# SMILES of the starting fragment; every generator below is seeded with it.
initial = 'Nc1cc(c[nH]c1=O)C(F)(F)F'
# RDKit molecule parsed from the fragment SMILES.
initial_mol = MolFromSmiles(initial)
# Bare last expression so the notebook displays the molecule.
initial_mol

In [None]:
def tanimoto_similarity(smi_1, smi_2, use_counts=True):
    """Return the Tanimoto similarity of two molecules given as SMILES.

    Both molecules are fingerprinted with the same Morgan generator
    (radius=2, fpSize=2048, countSimulation=True).

    Args:
        smi_1: SMILES string of the first molecule.
        smi_2: SMILES string of the second molecule.
        use_counts: If True (default) compare count fingerprints; otherwise
            compare bit-vector fingerprints.

    Returns:
        The value returned by ``DataStructs.TanimotoSimilarity`` for the two
        fingerprints.

    Raises:
        ValueError: If either SMILES string cannot be parsed by RDKit.
    """
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048, countSimulation=True)

    mols = []
    for smi in (smi_1, smi_2):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail loudly
        # here rather than handing None to the fingerprinting code.
        if mol is None:
            raise ValueError(f"Could not parse SMILES: {smi!r}")
        mols.append(mol)

    # Use the generator configured above. The module-level GetCountFPs/GetFPs
    # helpers that were called here before do not take `fpgen`, so its
    # radius/fpSize/countSimulation settings were never applied.
    if use_counts:
        fp_1, fp_2 = (fpgen.GetCountFingerprint(mol) for mol in mols)
    else:
        fp_1, fp_2 = (fpgen.GetFingerprint(mol) for mol in mols)
    return DataStructs.TanimotoSimilarity(fp_1, fp_2)

In [None]:
def remove_odd_rings(df, min_freq=100):
    """Keep only rows whose minimum ring-system frequency exceeds ``min_freq``.

    The input frame is left untouched; the helper columns (``ring_systems``,
    ``min_ring``, ``min_freq``) are added to an internal copy only.

    Args:
        df: DataFrame with a ``SMILES`` column.
        min_freq: Rows are kept when their ``min_freq`` value is strictly
            greater than this threshold. Defaults to 100.

    Returns:
        A new DataFrame holding the first three columns of the surviving rows.
    """
    # Nothing to annotate; also avoids assigning an empty list to two columns.
    if df.empty:
        return df.iloc[:, 0:3].copy()

    ring_system_lookup = uru.RingSystemLookup.default()

    # Work on a copy so the caller's frame does not gain the helper columns.
    annotated = df.copy()
    annotated['ring_systems'] = annotated.SMILES.apply(ring_system_lookup.process_smiles)
    # get_min_ring_frequency yields a pair per row, unpacked into two columns
    # (presumably the rarest ring system and its reference frequency - confirm
    # against useful_rdkit_utils).
    annotated[['min_ring', 'min_freq']] = annotated.ring_systems.apply(uru.get_min_ring_frequency).to_list()

    common = annotated[annotated['min_freq'] > min_freq]
    # Positional slice: callers get back the first three columns only.
    return common.iloc[:, 0:3].copy()

### Generating analogs w/ REINVENT

In [None]:
%cd ..

arg1 = f'--input_frag'
subprocess.run(['python3', 'generate_analogs.py', arg1, initial],
               stdout=subprocess.DEVNULL,
               stderr=subprocess.STDOUT)
        
# Change directory back to that of the current notebook
%cd experiments

In [None]:
df = pd.read_csv('data/reinvent_dataframe.csv')

df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
df = df.sample(n=500)
df['Model'] = 'reinvent'

df

### Generating analogs w/ CReM
Here we use CReM's `mutate_mol` function to generate analogs of the initial fragment.

In [None]:
out_list = []
mutate_list = list(mutate_mol(initial_mol, db_name=crem_db, return_mol=False))

for idx, analog in enumerate(mutate_list):
    out_list.append([analog, initial])

In [None]:
temp_df = pd.DataFrame(out_list, columns=["SMILES","Input_SMILES"])

In [None]:
crem_smiles = temp_df['SMILES'].values

sim_to_initial = [tanimoto_similarity(smile, initial) for smile in crem_smiles]

In [None]:
temp_df['Tanimoto'] = sim_to_initial

temp_df = remove_odd_rings(temp_df)

temp_df['Model'] = 'crem'

In [None]:
temp_df.head()

In [None]:
df = pd.concat((df, temp_df))

In [None]:
df

### Generating analogs w/ COATI

In [None]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the cell does not fail on a machine without CUDA.
coati_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# First COATI model: used by gen_mol when coati_version == 1.
encoder1, tokenizer1 = load_e3gnn_smiles_clip_e2e(
    freeze=True,
    device=coati_device,
    doc_url="s3://terray-public/models/barlow_closed.pkl"
)

# COATI2 model: used by gen_mol when coati_version == 2.
encoder2, tokenizer2 = load_coati2(
    freeze=True,
    device=coati_device,
    doc_url="s3://terray-public/models/coati2_chiral_03-08-24.pkl"
)

In [None]:
def gen_mol(smiles, coati_version=1, num_variations=100, noise_scale=0.15):
    """Generate analogs of ``smiles`` by decoding noisy COATI embeddings.

    The input is canonicalised, embedded with the selected encoder, repeated
    ``num_variations`` times and decoded with ``noise_scale`` passed to the
    decoder. Decoded strings that RDKit cannot parse are dropped, the rest
    are canonicalised and de-duplicated, and the input molecule is always
    included in the result.

    Args:
        smiles: SMILES string of the seed molecule.
        coati_version: 1 to use ``encoder1``/``tokenizer1``, 2 to use
            ``encoder2``/``tokenizer2``.
        num_variations: Number of decoding attempts.
        noise_scale: Noise scale forwarded to the decoder.

    Returns:
        Tuple of unique canonical SMILES ordered by decreasing Tanimoto
        similarity (RDKit path fingerprints) to the seed molecule.

    Raises:
        ValueError: If ``coati_version`` is neither 1 nor 2.
    """
    # Validate first: any other value used to fall through both branches and
    # only surfaced later as an UnboundLocalError on `vector`.
    if coati_version not in (1, 2):
        raise ValueError(f"coati_version must be 1 or 2, got {coati_version!r}")

    # Embed the SMILES string
    smiles = CanonSmiles(smiles)

    # Noise is added as an isotropic Gaussian with std=noise_scale
    if coati_version == 1:
        vector = embed_smiles(smiles, encoder1, tokenizer1)
        nearby_smiles = encoder1.hclip_to_2d_batch(
            h_clip=vector.unsqueeze(0).repeat(num_variations, 1),
            tokenizer=tokenizer1,
            noise_scale=noise_scale,
        )
    else:
        vector = embed_smiles(smiles, encoder2, tokenizer2)
        nearby_smiles = encoder2.hcoati_to_2d_batch(
            h_coati=vector.unsqueeze(0).repeat(num_variations, 1),
            tokenizer=tokenizer2,
            noise_scale=noise_scale,
        )

    # Retrieve canonical SMILES of generated analogs (unparsable strings are skipped)
    generated = {CanonSmiles(smi) for smi in nearby_smiles if MolFromSmiles(smi)}

    # Store true if original molecule is in the set of generated analogs
    had_orig = smiles in generated
    generated.add(smiles)

    # Fix an order before ranking so ties are broken alphabetically rather
    # than by set iteration order.
    candidates = sorted(generated)

    # Generate molecular fingerprints
    fp = RDKFingerprint(MolFromSmiles(smiles), minPath=1, maxPath=7, fpSize=2048)
    fps = [RDKFingerprint(MolFromSmiles(x), minPath=1, maxPath=7, fpSize=2048) for x in candidates]

    # Rank on the numeric similarity itself, not on its rounded string form.
    sim = BulkTanimotoSimilarity(fp, fps)
    ranked = sorted(zip(candidates, sim), key=lambda pair: pair[1], reverse=True)
    unique_valid_smiles = tuple(smi for smi, _ in ranked)

    # Legends for the (currently disabled) grid image below; the seed is tagged
    # "(Added)" when the model did not generate it itself.
    sim_str = tuple(
        f"{round(score, 2)} (Added)" if (not had_orig and smi == smiles) else str(round(score, 2))
        for smi, score in ranked
    )

    # Output for molecule generation
    print (f"Attempted {num_variations} COATI{coati_version} generations with a noise scale of {noise_scale} and generated {len(unique_valid_smiles)} unique structures.")

    # Display molecules and tanimoto similarity to initial fragment
    # display(Draw.MolsToGridImage([MolFromSmiles(s) for s in unique_valid_smiles], molsPerRow=5, subImgSize=(200, 200), maxMols=100, legends=sim_str))

    return unique_valid_smiles

In [None]:
coati_smiles = gen_mol(initial, coati_version = 2, num_variations = 1000, noise_scale = 0.5)

In [None]:
temp_df = pd.DataFrame()
temp_df['SMILES'] = coati_smiles
temp_df['Input_SMILES'] = initial

len(temp_df)

In [None]:
sim_to_initial = [tanimoto_similarity(smile, initial) for smile in coati_smiles]

temp_df['Tanimoto'] = sim_to_initial

temp_df = remove_odd_rings(temp_df)

temp_df['Model'] = 'coati'

len(temp_df)

In [None]:
temp_df.head()

In [None]:
df = pd.concat((df, temp_df))

In [None]:
df

### Generating analogs w/ SAFE

In [None]:
designer = sf.SAFEDesign.load_default(verbose=True)

designer.model

In [None]:
generated_smiles = designer.super_structure(
    core=initial,
    n_samples_per_trial=200,
    n_trials=1,
    sanitize=True,
    do_not_fragment_further=False,
    attachment_point_depth=3,
)

generated_smiles

In [None]:
temp_df = pd.DataFrame()
temp_df['SMILES'] = generated_smiles
temp_df['Input_SMILES'] = initial

In [None]:
len(temp_df)

In [None]:
sim_to_initial = [tanimoto_similarity(smile, initial) for smile in generated_smiles]

temp_df['Tanimoto'] = sim_to_initial

temp_df = remove_odd_rings(temp_df)

temp_df['Model'] = 'safe'

len(temp_df)

In [None]:
df = pd.concat((df, temp_df))

In [None]:
df

## Visualizing the models in chemical space

### PCA

In [None]:
smiles = df['SMILES'].values
mols = [MolFromSmiles(smile) for smile in smiles]

In [None]:
fpgen = AllChem.GetMorganGenerator()

fingerprints = [fpgen.GetFingerprint(mol).ToList() for mol in mols]

In [None]:
df['Fingerprints'] = fingerprints

In [None]:
X = fingerprints

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
pca = PCA(n_components=3, random_state=0)
pca_fps = pca.fit_transform(X)

In [None]:
var1, var2, var3 = pca.explained_variance_ratio_

In [None]:
df['PC1'], df['PC2'], df['PC3'] = pca_fps.T[0], pca_fps.T[1], pca_fps.T[2]

In [None]:
plot_df = df.sample(n=500)

In [None]:
f = sns.pairplot(plot_df,
                 hue='Model',
                 vars=['PC1', 'PC2', 'PC3'],
                 palette='colorblind',
                 aspect=2,
                 plot_kws=dict(s=10))

f.fig.suptitle('Pairwise Principle Component Plots', fontsize=18, y=1.04);

### t-SNE

In [None]:
p = 100

pca_model = PCA(n_components=5, random_state=0)
tsne_model = TSNE(n_components=2, random_state=0, perplexity=p, n_iter=5000)
tsne_fps = tsne_model.fit_transform(pca_model.fit_transform(X))

In [None]:
df['TSNE1'], df['TSNE2'] = tsne_fps.T[0], tsne_fps.T[1]

In [None]:
f = sns.pairplot(df,
                 hue='Model',
                 vars=['TSNE1', 'TSNE2'],
                 palette='colorblind',
                 aspect=2,
                 plot_kws=dict(s=10))

title = f'Pairwise t-SNE plot w/ perplexity $p={p}$'

f.fig.suptitle(title, fontsize=18, y=1.04);

Compute the Davies-Bouldin index to evaluate how well the models separate into clusters

In [None]:
from sklearn.metrics import davies_bouldin_score

In [None]:
tsne_dvs = davies_bouldin_score(tsne_fps, df['Model'])

In [None]:
print(f'Davies-Bouldin Index for t-SNE: {tsne_dvs}')

### Try UMAP

In [None]:
import umap

In [None]:
# Set UMAP parameters
num_neighbors = 100 # similar to perplexity in t-SNE
reduced_dim = 2
rs = 0 # random state

# Apply UMAP
umap_model = umap.UMAP(n_components=reduced_dim, n_neighbors=num_neighbors, random_state=rs, init="pca")
umap_projection = umap_model.fit_transform(X)

In [None]:
df['UMAP1'], df['UMAP2'] = umap_projection.T[0], umap_projection.T[1]

f = sns.pairplot(df,
                    hue='Model',
                    vars=['UMAP1', 'UMAP2'],
                    palette='colorblind',
                    aspect=2,
                    plot_kws=dict(s=10))

title = f'Pairwise UMAP plot w/ neighbors $n={num_neighbors}$'


f.fig.suptitle(title, fontsize=18, y=1.04);




In [None]:
f = sns.scatterplot(data=df, x='TSNE1', y='TSNE2', hue='Model', palette='colorblind', s=20)

title = f't-SNE plot w/ perplexity $p={p}$'

plt.title(title, fontsize=18);

plt.show()

In [None]:
f = sns.scatterplot(data=df, x='UMAP1', y='UMAP2', hue='Model', palette='colorblind', s=20)

title = f'UMAP plot w/ neighbors $n={num_neighbors}$'

subtext = f'Random state: {rs}' 

plt.title(title, fontsize=18);

plt.show()

Calculate Davies Bouldin score for UMAP

In [None]:
umap_dvs = davies_bouldin_score(umap_projection, df['Model'])

In [None]:
print(f'Davies-Bouldin Index for UMAP: {umap_dvs}')

### Use KNN and SVM for classification

In [None]:
#define a train test split of the df

from sklearn.model_selection import train_test_split



In [None]:
X_train, X_test = train_test_split(df, test_size=0.1, random_state=42, stratify=df['Model'], shuffle=True)

In [None]:
print(f"Train size: {len(X_train)}")

In [None]:
print(f"Test size: {len(X_test)}")

In [None]:
X_train.head()

Convert each fingerprint to a numpy array

In [None]:
import numpy as np

In [None]:
X_train_fg = X_train['Fingerprints'].values

In [None]:
X_train_fg = np.array([np.array(x) for x in X_train_fg])

In [None]:
X_test_fg = X_test['Fingerprints'].values

In [None]:
X_test_fg = np.array([np.array(x) for x in X_test_fg])

In [None]:
X_test.head()

In [None]:
#make dictionary of models

model_dict = {'reinvent': 0, 'crem': 1, 'coati': 2, 'safe': 3}

#now relabel the models

y_train = X_train['Model'].map(model_dict)
y_test = X_test['Model'].map(model_dict)


Run KNN on the training and test dataset

In [None]:
#Run a simple classifier KNN

from sklearn.neighbors import KNeighborsClassifier


In [None]:
KNN = KNeighborsClassifier(n_neighbors=5)
UMAP_train = X_train[['UMAP1', 'UMAP2']].values
KNN.fit(UMAP_train, y_train)


In [None]:
y_pred = KNN.predict(X_test[['UMAP1', 'UMAP2']].values)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

Try Linear SVM

In [None]:
from sklearn.svm import LinearSVC


In [None]:
SVC = LinearSVC()
SVC.fit(UMAP_train, y_train)

In [None]:
y_pred = SVC.predict(X_test[['UMAP1', 'UMAP2']].values)

In [None]:
print(classification_report(y_test, y_pred))

### Plot the Tanimoto similarity of each model to the Initial Fragment

In [None]:
sim = [tanimoto_similarity(smile, initial) for smile in df['SMILES'].values]

In [None]:
len(sim)

In [None]:
df['Tanimoto'] = sim

In [None]:
sns.histplot(df, x='Tanimoto', hue='Model', bins=20, kde=True, palette='colorblind')

In [None]:
sns.boxenplot(data=df, x='Model', y='Tanimoto', palette='colorblind')

plt.title('Tanimoto Similarity to Initial Fragment', fontsize=18);

plt.show()

### Plot the Tanimoto Similarity to the Lead

In [None]:
lead = 'N[C@H]1CCN(Cc2cccc(c2)c3ccc4c(=O)[nH]ccc4c3)C1'
lead_mol = MolFromSmiles(lead)

In [None]:
lead_sim = [tanimoto_similarity(smile, lead) for smile in df['SMILES'].values]

df['Tanimoto_Lead'] = lead_sim

In [None]:
sns.boxenplot(data=df, x='Model', y='Tanimoto_Lead', palette='colorblind')

plt.title('Tanimoto Similarity to Lead Compound', fontsize=18);

plt.show()

In [None]:
sns.histplot(df, x='Tanimoto_Lead', hue='Model', bins=20, kde=True, palette='colorblind')

### We can approximate the number of unique molecules in each distribution with the Vendi Score

In [None]:
import numpy as np

In [None]:
def tanimoto_similarity_matrix(fps):
    """Build the pairwise Tanimoto similarity matrix of a set of fingerprints.

    Args:
        fps: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(fps), len(fps))`` where entry
        ``[i, j]`` is the Tanimoto similarity of ``fps[i]`` and ``fps[j]``.
        An empty input yields a ``(0, 0)`` array.
    """
    fps = list(fps)
    n = len(fps)
    sim_matrix = np.zeros((n, n))
    for i in range(n):
        # One bulk call per row replaces n separate pairwise calls. Tanimoto is
        # symmetric, so only the upper triangle is computed and then mirrored.
        row = DataStructs.BulkTanimotoSimilarity(fps[i], fps[i:])
        sim_matrix[i, i:] = row
        sim_matrix[i:, i] = row
    return sim_matrix

In [None]:
def plot_similarity_matrix(sim_matrix, title):
    """Show ``sim_matrix`` as a viridis heatmap on a 10x10 inch figure.

    Args:
        sim_matrix: 2-D array-like of similarity values.
        title: Text placed above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(sim_matrix, cmap='viridis', ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
from vendi_score import vendi

In [None]:
reinvent_fps = [fpgen.GetFingerprint(Chem.MolFromSmiles(smiles)) for smiles in df.query('Model == "reinvent"')['SMILES'].values]

In [None]:
reinvent_sim_matrix = tanimoto_similarity_matrix(reinvent_fps)
vendi.score_K(reinvent_sim_matrix)

In [None]:
crem_fps = [fpgen.GetFingerprint(Chem.MolFromSmiles(smiles)) for smiles in df.query('Model == "crem"')['SMILES'].values]

In [None]:
crem_sim_matrix = tanimoto_similarity_matrix(crem_fps)
vendi.score_K(crem_sim_matrix)

In [None]:
coati_fps = [fpgen.GetFingerprint(Chem.MolFromSmiles(smiles)) for smiles in df.query('Model == "coati"')['SMILES'].values]

In [None]:
coati_sim_matrix = tanimoto_similarity_matrix(coati_fps)
vendi.score_K(coati_sim_matrix)

In [None]:
safe_fps = [fpgen.GetFingerprint(Chem.MolFromSmiles(smiles)) for smiles in df.query('Model == "safe"')['SMILES'].values]


In [None]:
safe_sim_matrix = tanimoto_similarity_matrix(safe_fps)
vendi.score_K(safe_sim_matrix)