# Imports

In [1]:
from rdkit.Chem import AllChem, Descriptors, MolFromSmiles, MolToSmiles, MolFromSmarts, FindMolChiralCenters
from rdkit import DataStructs
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.core.display import display, HTML

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import matplotlib.pyplot as plt

# Main

## Import and decipher PAINS

In [2]:
# Parse the PAINS sieve file as whitespace-separated columns, skipping its
# first 10 lines, and keep only the filter identifier and SMARTS pattern.
df_pains = pd.read_csv(
    '../data/PAINS.sieve',
    sep=r'\s+',  # same parsing as the deprecated delim_whitespace=True
    skiprows=10,
    names=['family', 'regID', 'smarts', 'a', 'b'],
)[['regID', 'smarts']]
# Strip the literal 'regId=' prefix; regex=False makes the literal match
# explicit regardless of the pandas version's default.
df_pains['regID'] = df_pains['regID'].str.replace('regId=', '', regex=False)

print('Number of PAINS filters: {}'.format(len(df_pains)))

# One query molecule per SMARTS pattern, in the same order as df_pains.
pains_lib = [MolFromSmarts(x) for x in df_pains['smarts']]

Number of PAINS filters: 481


In [3]:
def pains_match(smi, pains_lib=pains_lib, log=False):
    """Report whether a molecule matches at least one PAINS substructure filter.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule to screen.
    pains_lib : list
        Query molecules to test against. Defaults to the ``pains_lib`` list as
        it existed when this function was defined.
    log : bool
        When True, print the per-filter list of match booleans.

    Returns
    -------
    numpy.bool_
        True if any filter in ``pains_lib`` matches the molecule.

    Raises
    ------
    ValueError
        If ``smi`` cannot be parsed into a molecule.
    """
    mol = MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message that names the offending input instead of an AttributeError.
        raise ValueError('Could not parse SMILES: {!r}'.format(smi))
    # Only existence of a match matters, so avoid enumerating every match.
    matches = [mol.HasSubstructMatch(pain) for pain in pains_lib]
    if log:
        print(matches)
    return np.any(matches)

# print(pains_match(
#     '[H]O[C@H]1CN(C(=O)c2cnn3ccn([H])c23)C[C@H]1N([H])C(=O)c1cc(F)cn1[H]', pains_lib=pains_lib, log=True))

def chiral_match(smi, log=False):
    """Report whether a molecule has more than one chiral centre.

    Both assigned and unassigned centres are counted
    (``includeUnassigned=True``), using the non-legacy implementation.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule to inspect.
    log : bool
        Unused; kept so existing callers passing it keep working.

    Returns
    -------
    bool
        True if more than one chiral centre is found, otherwise False.

    Raises
    ------
    ValueError
        If ``smi`` cannot be parsed into a molecule.
    """
    mol = MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError('Could not parse SMILES: {!r}'.format(smi))
    centres = FindMolChiralCenters(mol, includeUnassigned=True, useLegacyImplementation=False)
    return len(centres) > 1

## Useful Functions

In [4]:
# Where the ranked candidate tables live, and which target is being processed.
data_dir = '/rds-d7/project/rds-ZNFRY9wKoeE/EnamineREAL/data/'
target = 'dpp11'
N = 50000  # number of best-scoring molecules kept after filtering

topn_in_path = data_dir + '../topN/topN_new_' + target + '.csv'
topn_out_path = data_dir + '../topN/topN_new_filtered_' + target + '.csv'

df_topn = pd.read_csv(topn_in_path)
# df_tot = pd.read_csv(data_dir+'../topN/top15M_'+target+'.csv')

tqdm.pandas()  # enables Series.progress_apply

# Flag each molecule with both filters and report how many were hit.
for flag_name, flag_func in (('pains', pains_match), ('diastereomer', chiral_match)):
    df_topn[flag_name] = df_topn['smiles'].progress_apply(flag_func)
    print(df_topn[flag_name].value_counts())

# A molecule is kept only when neither filter flagged it; the N highest
# 'new_score' survivors are then written out (index column included).
df_topn['keep'] = ~(df_topn['pains'] | df_topn['diastereomer'])
df_topn = df_topn.loc[df_topn['keep']].nlargest(n=N, columns='new_score')
df_topn.to_csv(topn_out_path)


100%|██████████| 100000/100000 [04:50<00:00, 343.89it/s]


False    99295
True       705
Name: pains, dtype: int64


100%|██████████| 100000/100000 [00:57<00:00, 1740.92it/s]


False    77488
True     22512
Name: diastereomer, dtype: int64


In [5]:
import mols2grid

display_n = 50

# First display_n kept molecules, with the SMILES column renamed to the
# 'SMILES' spelling used in the tooltip below.
grid_df = (
    df_topn[df_topn['keep']]
    .iloc[:display_n]
    .rename(columns={'smiles': 'SMILES'})
)

# Paged 10 x 5 grid showing only the structure image, with SMILES and score
# available as a tooltip.
mols2grid.display(
    grid_df,
    template="pages",
    n_rows=10,
    n_cols=5,
    subset=["img"],
    tooltip=['SMILES', 'new_score'],
    size=(250, 150),
)


In [None]:
# Flag every molecule in the full table with both filters and report the
# number of hits for each.
for column, predicate in [('pains', pains_match), ('diastereomer', chiral_match)]:
    df_tot[column] = df_tot['smiles'].progress_apply(predicate)
    print(df_tot[column].value_counts())

N = 50000

# Keep molecules that neither filter flagged, then take the N best by score.
df_tot['keep'] = ~(df_tot['pains'] | df_tot['diastereomer'])
df_topn_new = df_tot.loc[df_tot['keep']].nlargest(n=N, columns='new_score')


In [9]:
# Pair-type score columns that are averaged into 'new_score'.
important = ['Donor-Aromatic', 'Donor-Acceptor', 'Aromatic-Aromatic']
unimportant = ['Donor-Donor', 'Aromatic-Acceptor', 'Acceptor-Acceptor']
pairs = important + unimportant

# Merge the per-chunk tables, keeping the first row seen for each SMILES.
df_tot = pd.concat(df_all).drop_duplicates(subset=['smiles'], keep='first')

# Missing pair scores are replaced by -100 before taking the row mean.
x = df_tot[pairs].to_numpy().astype(float)
x = np.where(np.isnan(x), -100, x)
df_tot['new_score'] = x.mean(axis=1)

# Persist the N highest-scoring molecules.
df_topn = df_tot.nlargest(n=N, columns='new_score')
topn_path = data_dir + '../topN/topN_new_' + target + '.csv'
df_topn.to_csv(topn_path, index=False)
# print(len(df_tot))


In [38]:
# NOTE(review): this path is the same one the full ranked table (including
# 'new_score') is written to elsewhere in this notebook, so running this cell
# replaces that file with a single 'smiles' column — confirm this is intended.
smiles_only = df_topn[['smiles']]
smiles_only.to_csv(data_dir + '../topN/topN_new_' + target + '.csv', index=False)


In [35]:
def _morgan_bitvect(mol):
    """Return the radius-3, 2048-bit Morgan fingerprint bit vector of ``mol``."""
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=2048)


# Load the activity table and keep only rows where both the 'acrylamide' and
# 'chloroacetamide' flags are False.
df = pd.read_csv('../data/activity_data.csv')
not_flagged = (df['acrylamide'] == False) & (df['chloroacetamide'] == False)
# reset_index() returns a fresh frame (the old index is kept as an 'index'
# column, as before), so the column assignment below does not write into a
# filtered slice of the original table.
df = df[not_flagged].reset_index()

# Parse both SMILES columns into molecules and fingerprint them with
# identical settings so the bit vectors are directly comparable.
df['mol'] = [MolFromSmiles(smi) for smi in df['SMILES']]
df_topn['mol'] = [MolFromSmiles(smi) for smi in df_topn['smiles']]
fprints_moonshot = [_morgan_bitvect(mol) for mol in df['mol'].values]
fprints_topn = [_morgan_bitvect(mol) for mol in df_topn['mol'].values]

In [None]:
threshold = 0.8

# sim_mat[i, j] is the Tanimoto similarity between activity-set fingerprint i
# and top-N fingerprint j; each row is filled with one bulk comparison.
sim_mat = np.empty((len(fprints_moonshot), len(fprints_topn)))
for i, query_fp in enumerate(tqdm(fprints_moonshot)):
    sim_mat[i] = DataStructs.BulkTanimotoSimilarity(query_fp, fprints_topn)

fig = plt.figure(dpi=400)
# matshow opens a brand-new figure unless told otherwise, which would leave
# the dpi=400 figure empty; draw into the figure created above instead.
plt.matshow(sim_mat, fignum=fig.number)

In [None]:
top_n = 100

# Never request more pairs than the matrix holds: argpartition raises when
# the requested position is out of bounds.
n_pairs = min(top_n, sim_mat.size)
flat_sim = sim_mat.ravel()
if n_pairs:
    # argpartition isolates the n_pairs largest entries in arbitrary order;
    # sort just those so the pairs are listed from most to least similar.
    top_flat = np.argpartition(flat_sim, -n_pairs)[-n_pairs:]
    top_flat = top_flat[np.argsort(flat_sim[top_flat])[::-1]]
else:
    top_flat = np.array([], dtype=int)

# Convert flat positions back to (activity row, top-N row) index pairs.
max_inds = np.unravel_index(top_flat, sim_mat.shape)

x = max_inds[0]
y = max_inds[1]
for n in range(n_pairs):
    print(df_topn.iloc[y[n]]['smiles'], sim_mat[x[n], y[n]])
    display(df.iloc[x[n]][['SMILES', 'CID', 'f_avg_IC50']])
    


## Visualise results