## Histone deacetylase 1 - part 4 (library enumeration)

### Import libraries

In [1]:
from functools import lru_cache

import pandas as pd

from rdkit import Chem
from rdkit.Chem import rdChemReactions

### Read in the datasets

In [2]:
# Load the stripped seed molecules and the transformation table, in that order.
df, df_trans = map(
    pd.read_csv,
    ('hdac1_inhibitors_stripped.csv', 'hdac1_inhibitors_transformations.csv'),
)

### Merge on assay identifier to enumerate all seed molecule / transformation pairs

In [3]:
# Keep only seeds that have a pchembl value, then pair each remaining seed with every
# transformation sharing its assay_chembl_id (default inner merge).
df = df.loc[df['pchembl_value'].notna()].merge(df_trans, on='assay_chembl_id')

### Apply the transformations

In [4]:
@lru_cache(maxsize=None)
def _reaction_from_smirks(smirks):
    """Parse a reaction SMIRKS once and reuse the reaction object on repeated calls."""
    return rdChemReactions.ReactionFromSmarts(smirks)


def apply_reaction(smiles, smirks):
    """Apply one transformation to one seed molecule and return its distinct products.

    Parameters
    ----------
    smiles : str
        SMILES of the seed molecule.
    smirks : str
        Reaction SMARTS / SMIRKS describing the transformation.

    Returns
    -------
    list of str
        Sorted, de-duplicated product SMILES. Each entry corresponds to one product set
        returned by the reaction; molecules within a set are joined with '.'.
        The list is empty when the seed cannot be parsed or the transformation yields
        no products.
    """
    seed = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if seed is None:
        # MolFromSmiles returns None for unparsable input and AddHs would raise on it;
        # report "no products" instead of aborting the whole row-wise apply.
        return []

    rxn = _reaction_from_smirks(smirks)
    # The seed is run with explicit hydrogens and they are stripped again from each product
    # (presumably because the SMIRKS patterns refer to hydrogens explicitly -- TODO confirm).
    productsets = rxn.RunReactants((Chem.AddHs(seed),))
    products = {
        '.'.join(Chem.MolToSmiles(Chem.RemoveHs(part)) for part in productset)
        for productset in productsets
    }
    # sorted() rather than list(): set iteration order of strings changes between
    # interpreter runs, which would make downstream row order non-reproducible.
    return sorted(products)

# One list of product SMILES per seed / transformation row, aligned on df's index.
df_seed = df.apply(
    lambda row: apply_reaction(row['stripped_smiles'], row['smirks']),
    axis='columns',
)

### Associate products with seed data

In [5]:
# Explode the per-row product lists into one row per product, attach them to the seed data
# by index, and discard rows whose product is missing (rows with an empty product list).
df_seed = (
    df.join(df_seed.explode().to_frame(name='products'))
      .dropna(subset=['products'])
)

### Seed + delta = prediction, and aggregate (optionally weighted according to count of pairs in transformation)

In [6]:
# Prediction for each product = pchembl of the seed + mean delta of the transformation.
# df_seed is a row-filtered frame, so the column is added with `assign` (which returns a
# new frame) instead of item assignment, avoiding pandas' SettingWithCopyWarning.
# Optional weighting by pair count: enable the commented multiplication here together with
# the commented normalisation after the pivot.
df_seed = df_seed.assign(
    pchembl_predicted=(df_seed.pchembl_value + df_seed.mean_pchembl_exact_delta)  #* df_seed.count_pchembl_exact_delta
)

# Average the predictions (and pair counts) over all seeds that lead to the same product,
# separately per radius and assay.
df_products = pd.pivot_table(
    df_seed,
    values=['count_pchembl_exact_delta', 'pchembl_predicted'],
    index=['products', 'radius', 'assay_chembl_id'],
    aggfunc='mean',
)
#df_products['pchembl_predicted'] = df_products.pchembl_predicted / df_products.count_pchembl_exact_delta

### Write to file

In [7]:
# Move the (products, radius, assay_chembl_id) index levels back into ordinary columns,
# then write the table without the fresh integer index.
df_products = df_products.reset_index(drop=False)
df_products.to_csv(path_or_buf='hdac1_inhibitors_products.csv', index=False)

### ... and finally tabulate (tables 4 & 5 from the WizePairZ paper)

In [8]:
# Match every enumerated product against the held-out compounds of the same assay:
# a product counts as a hit when its SMILES equals the holdout's stripped_smiles.
# Columns present in both frames receive the _x (seed) / _y (holdout) suffixes.
df_holdout = df_seed.merge(
    pd.read_csv('hdac1_inhibitors_stripped_holdout.csv'),
    left_on=['assay_chembl_id', 'products'],
    right_on=['assay_chembl_id', 'stripped_smiles'],
)

# Columns shown in the summary tables below.
tablecols = [
    '_metadata.parent_molecule_data.compound_key_x',
    'fragment1',
    'fragment2',
    'pchembl_value_x',
    'mean_pchembl_exact_delta',
    'pchembl_predicted',
]

In [9]:
# Radius-3 predictions in assay CHEMBL927948 whose product is holdout compound 13c.
df_holdout.loc[
    (df_holdout['radius'] == 3)
    & (df_holdout['assay_chembl_id'] == 'CHEMBL927948')
    & (df_holdout['_metadata.parent_molecule_data.compound_key_y'] == '13c'),
    tablecols,
]

Unnamed: 0,_metadata.parent_molecule_data.compound_key_x,fragment1,fragment2,pchembl_value_x,mean_pchembl_exact_delta,pchembl_predicted
0,13b,[cH2][cH][c]([C]#[N])[c](-[cH3])[nH],[cH2][cH][c]([Cl])[c](-[cH3])[nH],8.01,-0.16,7.85
5,13a,[cH2][cH][c]([CH3])[c](-[cH3])[nH],[cH2][cH][c]([Cl])[c](-[cH3])[nH],7.67,0.11,7.78
9,13d,[cH2][cH][c]([F])[c](-[cH3])[nH],[cH2][cH][c]([Cl])[c](-[cH3])[nH],7.3,0.4,7.7


In [10]:
# Radius-3 predictions in assay CHEMBL927948 whose product is holdout compound 14d.
df_holdout.loc[
    (df_holdout['radius'] == 3)
    & (df_holdout['assay_chembl_id'] == 'CHEMBL927948')
    & (df_holdout['_metadata.parent_molecule_data.compound_key_y'] == '14d'),
    tablecols,
]

Unnamed: 0,_metadata.parent_molecule_data.compound_key_x,fragment1,fragment2,pchembl_value_x,mean_pchembl_exact_delta,pchembl_predicted
13,15d,[CH3][CH2][N]([CH2][CH3])[CH]([CH3])[CH3],[CH3][CH2][N]([CH2][CH3])[CH2][CH3],7.48,-0.016667,7.463333
18,14a,[cH2][cH][c]([CH3])[c](-[cH3])[nH],[cH2][cH][c]([F])[c](-[cH3])[nH],7.73,-0.305,7.425
21,14b,[cH2][cH][c]([C]#[N])[c](-[cH3])[nH],[cH2][cH][c]([F])[c](-[cH3])[nH],8.01,-0.605,7.405
25,14c,[cH2][cH][c]([Cl])[c](-[cH3])[nH],[cH2][cH][c]([F])[c](-[cH3])[nH],7.79,-0.4,7.39
