# Clustering compounds based on the similarity of their Morgan fingerprints

Please cite our *Nature Protocols* paper, which features this Jupyter notebook: 

Tran-Nguyen, V. K., Junaid, M., Simeon, S. & Ballester, P. J. A practical guide to machine-learning scoring for structure-based virtual screening. *Nat. Protoc.* **18**, 3460–3511 (2023)

This is a Jupyter notebook that helps users cluster compounds based on the similarity of their Morgan fingerprints. Please refer to our Nature Protocols paper cited above for more information.

## 1. Install all required Python dependencies

Several Python dependencies have to be installed beforehand: use conda to create the `protocol-env` environment from the file `protocol-env.yml` (available in our GitHub repository).

In [None]:
import oddt.pandas as opd
from oddt.pandas import ChemDataFrame
from rdkit import DataStructs
from rdkit import Chem
import pandas as pd
from rdkit.Chem.PandasTools import RenderImagesInAllDataFrames
from rdkit.Chem import AllChem

## 2. Load input structures of all compounds 

For this notebook, the input compounds must be provided in SDF format.

In [None]:
# STEP 2.1: Define a function to load all input structures:

def LoadSDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
            isomericSmiles=True, smilesName=None, embedProps=False, removeHs=True,
            strictParsing=True, sanitize=False):
    """Read an SDF file and return its records as a pandas DataFrame.

    Each successfully parsed record becomes one row whose columns are the SD
    properties of the molecule. The row label is the position of the record in
    the file, so the index has gaps wherever a record could not be parsed.

    Parameters
    ----------
    filename : str or file-like
        Path to the SDF file (opened with gzip when the name ends in ".gz"),
        or an already-open binary file object. A file object passed in by the
        caller is not closed by this function.
    idName : str
        Column that receives the molecule title (the ``_Name`` property).
    molColName : str or None
        Column that receives the Mol object. If None, molecules are not stored
        in the DataFrame (only properties are read).
    includeFingerprints : bool
        If True, the molecule is passed through RDKit's
        ``_MolPlusFingerprint`` helper before being stored.
    isomericSmiles : bool
        Forwarded to ``Chem.MolToSmiles`` when ``smilesName`` is given.
    smilesName : str or None
        If given, name of a column that receives the SMILES string
        (None for molecules whose SMILES cannot be generated).
    embedProps : bool
        If True, the SD properties are also kept on the Mol objects; otherwise
        they are cleared from the Mol after being copied into the row.
    removeHs, strictParsing : bool
        Forwarded to ``Chem.ForwardSDMolSupplier``.
    sanitize : bool
        When False (the default), every molecule goes through a partial
        sanitization (radicals, kekulization, aromaticity, conjugation,
        hybridization, ring symmetrization) with errors caught, not raised.
        When True, no sanitization is applied at all.
        NOTE(review): this reads as inverted with respect to the parameter
        name; the behaviour is kept unchanged here -- confirm the intent.

    Returns
    -------
    pandas.DataFrame
        One row per parsed molecule, indexed by record position in the file.

    Side effects
    ------------
    Prints the position of every record that could not be parsed, and calls
    ``RenderImagesInAllDataFrames(images=True)`` before returning.
    """
    import logging

    log = logging.getLogger(__name__)

    # The supplier is always created with sanitize=False; this subset of
    # operations is what gets applied afterwards when `sanitize` is False.
    partial_sanitize_ops = (Chem.SanitizeFlags.SANITIZE_FINDRADICALS |
                            Chem.SanitizeFlags.SANITIZE_KEKULIZE |
                            Chem.SanitizeFlags.SANITIZE_SETAROMATICITY |
                            Chem.SanitizeFlags.SANITIZE_SETCONJUGATION |
                            Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION |
                            Chem.SanitizeFlags.SANITIZE_SYMMRINGS)

    if isinstance(filename, str):
        if filename.lower().endswith(".gz"):
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        owns_file = True
    else:
        f = filename
        owns_file = False

    records = []
    indices = []
    try:
        supplier = Chem.ForwardSDMolSupplier(f, sanitize=False, removeHs=removeHs,
                                             strictParsing=strictParsing)
        for i, mol in enumerate(supplier):
            # Unparsable records come back as None; they must be skipped
            # BEFORE any RDKit call that requires a real Mol object.
            if mol is None:
                print(i)
                continue
            if not sanitize:
                Chem.SanitizeMol(mol, partial_sanitize_ops, catchErrors=True)

            row = {k: mol.GetProp(k) for k in mol.GetPropNames()}
            if molColName is not None and not embedProps:
                for prop in mol.GetPropNames():
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                try:
                    row[smilesName] = Chem.MolToSmiles(
                        mol, isomericSmiles=isomericSmiles)
                except Exception:
                    log.warning(
                        'No valid smiles could be generated for molecule %s', i)
                    row[smilesName] = None
            if molColName is not None and not includeFingerprints:
                row[molColName] = mol
            elif molColName is not None:
                # Private RDKit helper that this loader was adapted to use;
                # imported here because the notebook never brings it into scope.
                from rdkit.Chem.PandasTools import _MolPlusFingerprint
                row[molColName] = _MolPlusFingerprint(mol)
            records.append(row)
            indices.append(i)
    finally:
        # Release the handle even if a record raises; never close a file
        # object that the caller opened.
        if owns_file:
            f.close()

    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)

In [None]:
# STEP 2.2: Read the input compounds into a DataFrame with the loader from STEP 2.1.
# Replace the placeholder string below with the path to your own SDF file.

input_sdf = LoadSDF(filename="Provide_the_pathway_to_your_input_sdf")

## 3. Compute the Morgan fingerprints of all input compounds 

In [None]:
# Morgan fingerprints (radius 2, 2048 bits), one per molecule, in row order:

fps = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=2048)
    for molecule in input_sdf['ROMol']
]

# Keep the fingerprints next to their molecules for the following steps.
input_sdf["FP_2048b_r2"] = fps

## 4. Calculate the Tanimoto similarity of Morgan fingerprints and create a Tanimoto dissimilarity matrix 

In [None]:
# Define the function for computing the Tanimoto dissimilarity matrix based on Morgan fingerprints:

def tanimoto_distance_matrix(fp_list, cutoff = 0.3):
    """Return the pairwise Tanimoto distances of the fingerprints as a flat list.

    The list holds the lower triangle of the distance matrix, row by row: for
    every i from 1 to n-1 it contains the distances (1 - Tanimoto similarity)
    between fingerprint i and fingerprints 0..i-1, in that order.

    Parameters
    ----------
    fp_list : iterable of fingerprints
        Any iterable (list, pandas Series, ...); it is consumed in iteration
        order, so index labels of a Series play no role.
    cutoff : float
        Not used by this function; kept so existing calls stay valid.

    Returns
    -------
    list of float
        n * (n - 1) / 2 distances; empty when fewer than two fingerprints
        are given.
    """
    # Materialize to a plain list so that [i] and [:i] are both positional.
    # On a pandas Series, [i] is a label lookup while [:i] is positional,
    # which breaks as soon as the index is not exactly 0..n-1.
    fingerprints = list(fp_list)
    dissimilarity_matrix = []
    for i in range(1, len(fingerprints)):
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprints[i], fingerprints[:i])
        dissimilarity_matrix.extend(1 - x for x in similarities)
    return dissimilarity_matrix

dist_matrix_raw = tanimoto_distance_matrix(input_sdf['FP_2048b_r2'])

## 5. Cluster all compounds based on the Tanimoto (dis)similarity of their Morgan fingerprints 

Read this paper for more information: https://pubs.acs.org/doi/10.1021/ci9803381.

In [None]:
# Butina clustering on the precomputed distances. With a distance threshold of 0.3,
# compounds whose Morgan fingerprints have a Tanimoto similarity >= 0.7
# (i.e. a Tanimoto distance <= 0.3) are treated as neighbours when clusters are formed:

from rdkit.ML.Cluster.Butina import ClusterData
clusters = ClusterData(
    data=dist_matrix_raw,
    nPts=len(input_sdf),
    distThresh=0.3,
    isDistData=True,
)

In [None]:
from itertools import chain

# One list of member positions per cluster, in the order the clusters were returned.
get_index_cluster = [list(members) for members in clusters]

# Same shape as above, but every member slot holds the number of its cluster.
cluster_group = [
    [cluster_number] * len(members)
    for cluster_number, members in enumerate(get_index_cluster)
]

# Flatten both nested lists into two parallel lists: compound position -> cluster number.
index_cluster_list = list(chain.from_iterable(get_index_cluster))
res_cluster_group = list(chain.from_iterable(cluster_group))

In [None]:
# Table with one row per clustered compound: its cluster number and its position
# (0-based row number) in input_sdf.
cluster_df = pd.DataFrame({"Cluster_Group": res_cluster_group, "Index_Cluster": index_cluster_list})

# The clustering step works on row positions, whereas the merge below matches index
# labels. Translate positions into the labels of input_sdf so the two stay aligned
# even when the index of input_sdf is not exactly 0..n-1 (e.g. records skipped on load).
cluster_df.index = input_sdf.index.take(index_cluster_list)

# Attach the cluster assignment to the compound table and write the result to CSV.
df_cluster = pd.merge(input_sdf, cluster_df, left_index = True, right_index = True)
df_cluster.to_csv('Provide_the_pathway_to_a_directory_to_store_your_output_clustering_results_in_csv')