## Beta-2 adrenergic receptor - part 4 (library enumeration)

### Import libraries

In [1]:
import pandas as pd

# pandarallel is initialised once here (with progress bars); the enumeration below
# relies on it through DataFrame.parallel_apply
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# NOTE(review): hard-coded absolute path to a local checkout, added to sys.path so that
# the `mmpa` import below resolves. This is not portable to other machines - consider
# installing the package or making the path configurable.
import sys
sys.path.append('/home/daniel/wizepair2')
from mmpa.chem import strip_stereo

from rdkit import Chem

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Read in the datasets

In [2]:
# seed molecules first, then the transformations that will be applied to them
df, df_trans = (
    pd.read_csv(csv_path)
    for csv_path in ('beta2_agonists_stripped.csv', 'beta2_agonists_transformations.csv')
)

### Take adrenaline as the only seed

In [3]:
# keep only the rows whose stripped SMILES is exactly the adrenaline seed
is_adrenaline = df['stripped_smiles'] == 'CNC[C@H](O)c1ccc(O)c(O)c1'
df = df.loc[is_adrenaline]

### Keep only seed records with a pChEMBL value (the merge on target / assay type that enumerates all seed molecule / transformation pairs happens inside `enumerate_recursive` below)

In [4]:
# discard seed rows that have no pchembl_value
has_pchembl = df['pchembl_value'].notna()
df = df.loc[has_pchembl]

### Apply the transformations

In [5]:
def apply_reaction(smiles, smirks):
    """Apply one reaction to one molecule and return the distinct product SMILES.

    Parameters
    ----------
    smiles : str
        SMILES of the reactant molecule.
    smirks : str
        Reaction definition, parsed with ``ReactionFromSmarts``.

    Returns
    -------
    list of str
        Unique product SMILES in sorted order. Multi-part products are joined
        with ``'.'``. The list is empty when the reaction produces nothing or
        when ``smiles`` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None (rather than raising) for an unparsable SMILES; passing
        # that on to AddHs would raise and abort a whole parallel run over many rows.
        return []

    rxn = Chem.rdChemReactions.ReactionFromSmarts(smirks)

    # hydrogens are made explicit on the reactant before the reaction is run and are
    # removed again from every product before it is written out as SMILES
    productsets = rxn.RunReactants((Chem.AddHs(mol),))
    unique_products = {
        '.'.join(Chem.MolToSmiles(Chem.RemoveHs(part)) for part in productset)
        for productset in productsets
    }

    # sorted so the result does not depend on per-process string hashing
    return sorted(unique_products)

In [6]:
def enumerate_recursive(df: pd.DataFrame, df_trans: pd.DataFrame, generations=1,
                        target_smiles='CC(C)(C)NCC(O)c1ccc(O)c(CO)c1'):
    """Lazily enumerate products of repeatedly applying transformations to seeds.

    This is a generator: one DataFrame is yielded per generation, and the
    products of each generation become the seeds of the next.

    Parameters
    ----------
    df : pd.DataFrame
        Seed molecules. Must provide ``stripped_smiles``, ``pchembl_value`` and
        the merge keys ``target_pref_name`` / ``standard_type``.
    df_trans : pd.DataFrame
        Transformations, merged onto the seeds by ``target_pref_name`` and
        ``standard_type``. After the merge the frame must provide ``smirks``,
        ``mean_pchembl_exact_delta``, ``mean_percentmcs`` and ``radius``.
    generations : int, default 1
        Number of generations still to enumerate; nothing is yielded when it
        is not positive.
    target_smiles : str, optional
        Stereo-stripped SMILES to look for; ``'hit!'`` is printed for every
        generation that contains it. Pass ``None`` to disable the check.

    Yields
    ------
    pd.DataFrame
        One row per distinct (``last_smiles``, ``stripped_smiles``,
        ``target_pref_name``, ``standard_type``) with the mean projected
        ``pchembl_value``, plus ``generation`` and ``achiral_smiles``.
    """
    # running-summary columns describing the trail that led to each seed
    trail_columns = ['last_smiles', 'min_pchembl_value', 'min_mean_percentmcs', 'max_radius']

    if generations > 0:

        # merge with transformations and enumerate products
        # (rows are shuffled before the parallel apply; results are re-aligned by index below)
        df = df.merge(df_trans, on=['target_pref_name', 'standard_type'])
        df_product = df.sample(frac=1).parallel_apply(lambda x: apply_reaction(x.stripped_smiles, x.smirks), axis=1)

        # fan out individual products (one row per product; rows without products are dropped)
        df_product = df.join(pd.DataFrame(df_product.explode(), columns=['products']))
        # explicit copy so the column assignments below act on an independent frame
        df_product = df_product.loc[df_product['products'].notna()].copy()

        # trail tracking
        # The check now names the same columns that the branch reads and writes
        # (it previously tested 'min_percentmcs', which is never created).
        # NOTE(review): the pivot below keeps only its index columns and pchembl_value,
        # so the summary columns do not survive into the next generation; the
        # accumulating branch therefore only runs when the caller's df already carries
        # them - confirm whether trails are meant to accumulate across generations.
        if all(elem in df_product.columns for elem in trail_columns):
            df_product['last_smiles'] = df_product['last_smiles'] + '.' + df_product['stripped_smiles']
            df_product['min_pchembl_value'] = df_product[['min_pchembl_value', 'pchembl_value']].min(axis=1)
            df_product['min_mean_percentmcs'] = df_product[['min_mean_percentmcs', 'mean_percentmcs']].min(axis=1)
            df_product['max_radius'] = df_product[['max_radius', 'radius']].max(axis=1)
        else:
            df_product['last_smiles'] = df_product['stripped_smiles']
            df_product['min_pchembl_value'] = df_product['pchembl_value']
            df_product['min_mean_percentmcs'] = df_product['mean_percentmcs']
            df_product['max_radius'] = df_product['radius']

        # update seed parameters: the product becomes the new seed and its activity is
        # projected by adding the transformation's mean delta
        df_product['stripped_smiles'] = df_product['products']
        df_product['pchembl_value'] = df_product['pchembl_value'] + df_product['mean_pchembl_exact_delta']

        # condense duplicate trails
        df_product = pd.pivot_table(df_product, values='pchembl_value', index=[
            'last_smiles', 'stripped_smiles', 'target_pref_name', 'standard_type'], aggfunc='mean').reset_index()
        # this is the remaining-generations counter, so the first pass carries the highest value
        df_product['generation'] = generations

        # detect if target has been identified
        df_product['achiral_smiles'] = df_product['stripped_smiles'].apply(strip_stereo)
        if (df_product['achiral_smiles'] == target_smiles).any():
            print('hit!')

        # yield this generation, then recurse with the counter decremented
        yield df_product
        yield from enumerate_recursive(df_product, df_trans, generations - 1, target_smiles)


In [7]:
# builds the generator only - no enumeration happens until it is consumed
df_products = enumerate_recursive(df=df, df_trans=df_trans, generations=3)

In [8]:
# consuming the generator is what actually runs the enumeration, one frame per generation.
# Note that df_products is rebound from the generator to the stacked DataFrame, so this
# cell cannot be re-run on its own without re-creating the generator first.
generation_frames = list(df_products)
df_products = pd.concat(generation_frames)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2656), Label(value='0 / 2656'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13217), Label(value='0 / 13217')))…

hit!


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=238888), Label(value='0 / 238888')…

hit!


In [9]:
# number of rows per distinct product, shown in ascending order of that count
rows_per_product = df_products.groupby('stripped_smiles').count()
rows_per_product.sort_values(by='generation')

Unnamed: 0_level_0,last_smiles,target_pref_name,standard_type,pchembl_value,generation,achiral_smiles
stripped_smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CNCC(O)c1ccc2sc(O)nc2c1,1,1,1,1,1,1
CCNCC(O)c1ccc(O)c(O)c1,1,1,1,1,1,1
CC(C)NCC(O)c1ccc(CO)cc1,1,1,1,1,1,1
CC(C)NCC(O)c1ccc(CO)c2nc(O)sc12,1,1,1,1,1,1
CCN[C@@H](CC)[C@H](O)c1ccc(O)c(O)c1,1,1,1,1,1,1
...,...,...,...,...,...,...
CNCC(O)c1ccc(O)c(O)c1,586,586,586,586,586,586
CC(C)NC[C@H](O)c1ccc(O)c(O)c1,782,782,782,782,782,782
CC(C)NCC(O)c1ccc(O)c(O)c1,806,806,806,806,806,806
NC[C@H](O)c1ccc(O)c(O)c1,987,987,987,987,987,987


In [10]:
# hold-out molecules, used below to check which of them were enumerated
df_holdout = pd.read_csv(filepath_or_buffer='beta2_agonists_stripped_holdout.csv')

In [11]:
# inner join: keep only hold-out rows whose molecule / target / assay type was enumerated
df_holdout = pd.merge(
    left=df_holdout,
    right=df_products,
    how='inner',
    on=['stripped_smiles', 'target_pref_name', 'standard_type'],
)

In [None]:
# draw one matched hold-out row at random and render the molecule it was derived from
random_last_smiles = df_holdout['last_smiles'].sample(n=1).iloc[0]
Chem.MolFromSmiles(random_last_smiles)

### Write to file

In [13]:
# Renumber the rows and discard the old index instead of turning it into a column:
# a plain reset_index() wrote a meaningless 'index' column into the CSV and added yet
# another column every time this cell was re-run. drop=True keeps the cell idempotent.
df_products = df_products.reset_index(drop=True)
df_products.to_csv('beta2_agonists_products.csv', index=False)