This notebook visualises the molecules chosen via Butina clustering and lets you inspect the members of each cluster.

In [1]:
from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import rdDepictor, rdMolDescriptors, rdFMCS, AllChem

from tqdm import tqdm
from rdkit.ML.Cluster.Butina import ClusterData
from rdkit.DataManip.Metric import GetTanimotoDistMat

from IPython.core.display import display, HTML, Math

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import mols2grid


# Ask RDKit to prefer the CoordGen backend when it generates 2D depiction coordinates.
rdDepictor.SetPreferCoordGen(True)

import rdkit
# NOTE(review): %pylab populates the interactive namespace from numpy and matplotlib
# (see the cell output); numpy and matplotlib.pyplot are already imported explicitly
# above — confirm whether this magic is still required.
%pylab inline
# Record the RDKit version this notebook was run with.
print(rdkit.__version__)


INFO:rdkit:Enabling RDKit 2020.09.1 jupyter extensions


Populating the interactive namespace from numpy and matplotlib
2020.09.1


In [7]:
# Directory holding the CSV files read by the cells below.
# NOTE(review): absolute, user-specific path — adjust to your environment before re-running.
data_dir = '/home/wjm41/ml_physics/frag-pcore-screen/data/EnamineREAL/topN'

### Mpro

In [8]:
# Load the Mpro picks and attach an RDKit molecule (parsed from SMILES) to each row.
df_mpro_picks = pd.read_csv(f'{data_dir}/mpro_taut_picks_constrained.csv')
df_mpro_picks['mol'] = df_mpro_picks['smiles'].map(Chem.MolFromSmiles)

# Paged grid: each card shows the drawing, the cluster size and the SMILES string.
mpro_grid = mols2grid.display(
    df_mpro_picks,
    template="pages",
    smiles_col='smiles',
    mol_col='mol',
    n_rows=15,
    n_cols=4,
    subset=["img", "membership", 'smiles'],
    transform={"membership": lambda x: f"Size of Cluster: {x}"},
    tooltip=['smiles'],
    maxMols=60,
    size=(300, 150),
    selection=True,
)

display(HTML('<b>Mpro Butina</b>'))
display(mpro_grid)


### Mac-1

In [9]:
# Load the Mac-1 picks and attach an RDKit molecule (parsed from SMILES) to each row.
df_mac1_picks = pd.read_csv(f'{data_dir}/mac1_taut_picks_constrained.csv')
df_mac1_picks['mol'] = df_mac1_picks['smiles'].map(Chem.MolFromSmiles)

# Paged grid: each card shows the drawing, the cluster size and the SMILES string.
mac1_grid = mols2grid.display(
    df_mac1_picks,
    template="pages",
    smiles_col='smiles',
    mol_col='mol',
    n_rows=15,
    n_cols=4,
    subset=["img", "membership", 'smiles'],
    transform={"membership": lambda x: f"Size of Cluster: {x}"},
    tooltip=['smiles'],
    maxMols=60,
    size=(300, 150),
    selection=True,
)

display(HTML('<b>Mac-1 Butina</b>'))
display(mac1_grid)


In [14]:
def plot_cluster_members(df, list_of_cluster_indices, align_mcs=True):
    """Build a mols2grid view of every molecule belonging to the selected clusters.

    Args:
        df: DataFrame with a ``cluster`` column (cluster id of each row) and a
            ``smiles`` column.
        list_of_cluster_indices: Cluster ids whose members should be drawn.
        align_mcs: When True and more than one molecule is selected, the 2D
            depictions are aligned on the maximum common substructure (MCS)
            of the selected molecules.

    Returns:
        The object returned by ``mols2grid.display`` for the selected
        molecules, or the string ``"Nothing selected"`` when no cluster id was
        given or none of the given ids occurs in ``df``.
    """
    nothing_selected = "Nothing selected"

    # Use len() rather than truthiness so array-like selections are accepted too.
    # Previously an empty selection fell through to `return grid` with `grid`
    # never assigned, raising UnboundLocalError.
    if len(list_of_cluster_indices) == 0:
        return nothing_selected

    sel_df = df.query("cluster in @list_of_cluster_indices")
    if sel_df.empty:
        # None of the requested clusters exists in df; there is nothing to draw.
        return nothing_selected

    mol_list = [Chem.MolFromSmiles(x) for x in sel_df.smiles]

    # Align structures on the MCS
    if align_mcs and len(mol_list) > 1:
        mcs = rdFMCS.FindMCS(mol_list)
        # An empty MCS gives no reference to align on, so keep default depictions.
        if mcs.numAtoms > 0:
            mcs_query = Chem.MolFromSmarts(mcs.smartsString)
            AllChem.Compute2DCoords(mcs_query)
            for m in mol_list:
                AllChem.GenerateDepictionMatching2DStructure(m, mcs_query)

    return mols2grid.display(mol_list, template="pages",
                             n_rows=15, n_cols=4, subset=["img"],
                             tooltip=['SMILES'],
                             maxMols=60, size=(300, 150), selection=False)


# Load the pre-clustered Mpro molecules and show every member of the chosen clusters.
df_mpro_already_clustered = pd.read_csv(f'{data_dir}/mpro_taut_clustered.csv')
clusters_to_plot = [0]

display(HTML(f'<b>Mpro Members in Cluster: {clusters_to_plot}</b>'))
display(
    plot_cluster_members(
        df=df_mpro_already_clustered,
        list_of_cluster_indices=clusters_to_plot,
        align_mcs=True,
    )
)


In [15]:
# Load the pre-clustered Mac-1 molecules and show every member of the chosen clusters.
df_mac1_already_clustered = pd.read_csv(f'{data_dir}/mac1_taut_clustered.csv')
clusters_to_plot = [3]
# The header must name Mac-1: this cell plots the Mac-1 frame, not Mpro.
display(HTML(f'<b>Mac-1 Members in Cluster: {clusters_to_plot}</b>'))
display(plot_cluster_members(df_mac1_already_clustered, clusters_to_plot, True))
