### Analysis of frequently occurring scaffolds and fragments in the training and test sets

The goal is to understand whether particular structures occur frequently among the interfering compounds. This could also help predict what type of molecules Reinvent will output and check how far the tool strays from the training chemical space.

In [1]:
import pandas as pd
from tqdm import tqdm
import os

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import BRICS
from rdkit import RDLogger

from FtF.path import training, testing

In [2]:
# Disable the RDKit logger (every channel matching 'rdApp.*') for cleaner cell output.
# NOTE(review): this presumably also hides RDKit's SMILES parsing messages — confirm that is intended.
RDLogger.DisableLog('rdApp.*') 

#### Data loading

In [3]:
def preprocess_pipeline(datadir):
    """
    Load the positive (interfering) compounds from every CSV file in a directory.

    Entries that do not end in ".csv", or whose name contains "fluo", are
    skipped. Each remaining file must have exactly two columns; they are
    renamed to "SMILES" and "Outcome".

    Parameters
    ----------
    datadir : str or os.PathLike
        Directory containing the CSV files.

    Returns
    -------
    dict
        Maps each file name (the part before its first ".") to an independent
        DataFrame holding only the rows where ``Outcome == 1``.
    """
    # Load the data.
    print("Loading data...")
    store_dataset = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the dict
    # (and everything printed from it later) in a reproducible order.
    for i in tqdm(sorted(os.listdir(datadir))):
        if i.endswith(".csv") and "fluo" not in i:
            fname = i.split(".")[0]
            # os.path.join works for both str and Path directories.
            arr = pd.read_csv(os.path.join(datadir, i))
            arr.columns = ["SMILES", "Outcome"]
            # Only keep the interfering compounds. The explicit copy detaches the
            # result from `arr`, so columns can be added to it later without
            # pandas raising SettingWithCopyWarning.
            store_dataset[fname] = arr[arr["Outcome"] == 1].copy()

    return store_dataset

In [4]:
# Collect train and test positives: one DataFrame per CSV file in each directory,
# restricted to the rows with Outcome == 1 (see preprocess_pipeline).
training_data = preprocess_pipeline(training)
testing_data = preprocess_pipeline(testing)

Loading data...


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00, 138.85it/s]


Loading data...


100%|██████████| 7/7 [00:00<00:00, 431.42it/s]


#### Analyze Murcko scaffolds

In [5]:
def get_scaffold(smile, include_chirality=False):
    """
    Generate the Murcko scaffold SMILES for one SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    include_chirality : bool, optional
        Forwarded to ``MurckoScaffoldSmiles`` as ``includeChirality``.

    Returns
    -------
    The scaffold SMILES as returned by ``MurckoScaffold.MurckoScaffoldSmiles``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None. Fail here with
        # the offending input named, instead of letting the scaffold call raise
        # an error that does not say which SMILES was bad.
        raise ValueError(f"Could not parse SMILES string: {smile!r}")
    scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
    return scaffold

In [6]:
def scaffolds_pipeline(dataset):
    """
    Add a "Scaffold" column to every DataFrame in ``dataset``.

    The column holds ``get_scaffold`` applied to each entry of the "SMILES"
    column. The DataFrames are modified in place and the same dict object
    that was passed in is returned.
    """
    for frame in dataset.values():
        smiles_column = frame["SMILES"]
        frame["Scaffold"] = smiles_column.apply(get_scaffold)
    return dataset

In [7]:
# Generate Murcko Scaffolds for the training and testing data.
# scaffolds_pipeline adds the "Scaffold" column in place and returns the dict it was given,
# so training_scaffolds / testing_scaffolds are the same objects as training_data / testing_data.
training_scaffolds = scaffolds_pipeline(training_data)
testing_scaffolds = scaffolds_pipeline(testing_data)

In [8]:
# Report the three most frequent scaffolds of every training dataset.
for name, frame in training_scaffolds.items():
    print(f"Most common scaffolds in {name}:")
    scaffold_counts = frame["Scaffold"].value_counts()
    print(scaffold_counts.head(3))
    # Sum of all counts, i.e. the number of scaffold entries (one per compound),
    # not the number of distinct scaffolds.
    print("total scaffolds:", scaffold_counts.sum())
    print("\n")
    # TODO: draw the top scaffolds in a grid (e.g. rdkit.Chem.Draw.MolsToGridImage).

Most common scaffolds in fluc:
O=C(Nc1ccccc1)c1ccccc1                 2
c1ccc(C2COc3ccccc3C2)cc1               2
c1cc(Oc2ccc3nc(NC4CCCCC4)sc3c2)ccn1    1
Name: Scaffold, dtype: int64
total scaffolds: 80


Most common scaffolds in nluc:
c1ccccc1                                                 2
O=C(Nc1cnoc1-c1ccc(-c2ccccc2)cc1)OCc1ccccc1              2
O=C(Nc1cccc(N2CCNCC2)c1)c1ccc(-c2ccc(-c3nnco3)cc2)cc1    2
Name: Scaffold, dtype: int64
total scaffolds: 69


Most common scaffolds in redox:
c1ccc2c(OC3CCCCO3)cccc2c1                                                             2
C1CCC(OC2CCCOC2OC2COC(OC3CCC4C(CCC5C4CCC46OCC7(CCCCC74)CCC56)C3)C(OC3CCCCO3)C2)OC1    2
O=C1C(=O)c2c(ccc3ccccc23)-c2occc21                                                    1
Name: Scaffold, dtype: int64
total scaffolds: 101


Most common scaffolds in thiol:
c1ccc(OC2CCCCO2)cc1          21
C1=CC2CCCC2C(OC2CCCCO2)O1    12
c1ccccc1                      6
Name: Scaffold, dtype: int64
total scaffolds: 724




In [10]:
# Now check most common scaffolds in the test data.
# Iterate over the test dict itself: taking the keys from the training dict would
# raise a KeyError (or silently skip datasets) whenever the two directories do not
# contain exactly the same files.
for key, frame in testing_scaffolds.items():
    scaffold_counts = frame["Scaffold"].value_counts()
    print(f"Most common scaffolds in {key}:")
    print(scaffold_counts.head(3))
    # Sum of all counts, i.e. the number of scaffold entries (one per compound),
    # not the number of distinct scaffolds.
    print("total scaffolds:", scaffold_counts.sum())
    print("\n")

Most common scaffolds in fluc:
c1ccc(Nc2ccnc(Nc3ccccc3)n2)cc1    1
c1ccc(-c2nc3ccccc3s2)cc1          1
O=C(NCc1ccccc1)Nc1nccs1           1
Name: Scaffold, dtype: int64
total scaffolds: 14


Most common scaffolds in nluc:
c1ccc(-c2cc(OCc3ccccc3-n3cccn3)ncn2)cc1    1
c1ccccc1                                   1
c1ccc(Nc2ccc3ccccc3c2)cc1                  1
Name: Scaffold, dtype: int64
total scaffolds: 15


Most common scaffolds in redox:
C=C1CCCCC1=CC=C1CCCC2CCCC12                      2
c1ccc(CCc2nc3cc(-c4cnoc4)ccc3n2CCN2CCOCC2)cc1    1
O=C1OCC2C3=C(C(=O)c4occ1c42)C1CCC(=O)C1CC3       1
Name: Scaffold, dtype: int64
total scaffolds: 23


Most common scaffolds in thiol:
c1ccc(OC2CCCCO2)cc1                           8
c1ccc(Nc2ccnc(Nc3ccc(N4CCNCC4)cc3)n2)cc1      2
O=C(C=Cc1ccccc1)OC1CC2C=COC(OC3CCCCO3)C2C1    2
Name: Scaffold, dtype: int64
total scaffolds: 178




Scaffolds are very different between the training and testing data.

#### Analyze fragments

In [11]:
def get_frags(smile):
    """
    Decompose one SMILES string into BRICS fragments.

    The SMILES is parsed with RDKit and handed to ``BRICS.BRICSDecompose``
    with ``minFragmentSize=3``; whatever that call returns is returned
    unchanged.
    """
    return BRICS.BRICSDecompose(Chem.MolFromSmiles(smile), minFragmentSize=3)

In [12]:
def fragments_pipeline(dataset):
    """
    Add a "Fragments" column to every DataFrame in ``dataset``.

    The column holds ``get_frags`` applied to each entry of the "SMILES"
    column. The DataFrames are modified in place and the same dict object
    that was passed in is returned.
    """
    for frame in dataset.values():
        smiles_column = frame["SMILES"]
        frame["Fragments"] = smiles_column.apply(get_frags)
    return dataset

In [13]:
# Generate Fragments for the training and testing data.
# fragments_pipeline adds the "Fragments" column in place and returns the dict it was given,
# so training_fragments / testing_fragments are the same objects as training_data / testing_data.
training_fragments = fragments_pipeline(training_data)
testing_fragments = fragments_pipeline(testing_data)

In [14]:
# Report the three most frequent BRICS fragments of every training dataset.
for name, frame in training_fragments.items():
    # Flatten the per-molecule fragment collections into one list, then count
    # how often each fragment occurs across the whole dataset.
    pooled = [frag for frags in frame["Fragments"] for frag in frags]
    print(f"Most common fragments in {name}:")
    fragment_counts = pd.Series(pooled).value_counts()
    print(fragment_counts.head(3))
    print("\n")

Most common fragments in fluc:
[6*]C(=O)O              12
[16*]c1ccc([16*])cc1    11
[16*]c1ccccc1           11
dtype: int64


Most common fragments in nluc:
[6*]C(=O)O              12
[16*]c1ccccc1           11
[16*]c1ccc([16*])cc1    11
dtype: int64


Most common fragments in redox:
[1*]C(C)=O                  14
[13*]C1OC(CO)C(O)C(O)C1O    13
[3*]OC1OC(CO)C(O)C(O)C1O    13
dtype: int64


Most common fragments in thiol:
[3*]OC1OC(CO)C(O)C(O)C1O        172
[13*]C1OC(CO)C(O)C(O)C1O        167
[3*]OCC1OC(O[3*])C(O)C(O)C1O     56
dtype: int64




In [15]:
# Report the three most frequent BRICS fragments of every test dataset.
for name, frame in testing_fragments.items():
    # Flatten the per-molecule fragment collections into one list, then count
    # how often each fragment occurs across the whole dataset.
    pooled = [frag for frags in frame["Fragments"] for frag in frags]
    print(f"Most common fragments in {name}:")
    fragment_counts = pd.Series(pooled).value_counts()
    print(fragment_counts.head(3))
    print("\n")

Most common fragments in fluc:
[5*]Nc1nccc([14*])n1    2
[14*]c1ccnc([14*])n1    2
[1*]C(=O)c1ccccn1       2
dtype: int64


Most common fragments in nluc:
[16*]c1ccccc1           5
[6*]C(=O)O              4
[16*]c1ccc([16*])cc1    3
dtype: int64


Most common fragments in redox:
[13*]C1OC(CO)C(O)C(O)C1O    3
[3*]OC1OC(CO)C(O)C(O)C1O    3
[16*]c1ccc(OC)cc1           2
dtype: int64


Most common fragments in thiol:
[3*]OC1OC(CO)C(O)C(O)C1O    41
[13*]C1OC(CO)C(O)C(O)C1O    38
[16*]c1ccccc1               15
dtype: int64


