-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_templates.py
347 lines (309 loc) · 16.2 KB
/
extract_templates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
import argparse
import json
import numpy as np
import os
import multiprocessing
import pickle
import sys
import pandas as pd
from collections import Counter
from rdkit import Chem
from tqdm import tqdm
from rdchiral.template_extractor import extract_from_reaction
from rdchiral.main import rdchiralReaction, rdchiralReactants, rdchiralRun
from chemutils import canonicalize_rxn
from chemutils import cano_smiles, cano_smiles_, cano_smarts, cano_smarts_
from chemutils import smarts_to_cano_smiles, get_pattern_fingerprint_bitstr
class _Reactor(object):
    """Memoizing wrapper around rdchiral template application.

    Three dictionaries are kept for the lifetime of the instance:

    * ``rxn_cooked``     -- template string -> ``rdchiralReaction`` object, or
      ``None`` when building the reaction raised.
    * ``src_cooked``     -- SMILES string -> ``rdchiralReactants`` object.
    * ``cached_results`` -- ``(src, template, keep_mapnums)`` -> outcomes of
      ``rdchiralRun``, or ``None`` when the run raised.
    """

    def __init__(self):
        self.rxn_cooked = {}
        self.src_cooked = {}
        self.cached_results = {}

    def get_rxn(self, rxn):
        """Return the (cached) ``rdchiralReaction`` for template ``rxn``.

        ``rxn`` must contain exactly two ``'>'`` characters. When its first
        field contains a ``'.'`` and is not already parenthesised, it is
        wrapped in ``(...)`` before the reaction is built and cached.

        Returns ``None`` when ``rdchiralReaction`` raises; the failure is
        cached too, so a bad template is only attempted once.
        """
        p, a, r = rxn.split('>')
        if '.' in p:  # we assume the product has only one molecule
            if p[0] != '(':
                p = '(' + p + ')'
            rxn = '>'.join((p, a, r))
        if rxn not in self.rxn_cooked:
            try:
                cooked = rdchiralReaction(rxn)
            except Exception:
                # Best effort: an unusable template is remembered as None.
                # ``Exception`` (not a bare except) keeps Ctrl-C working.
                cooked = None
            self.rxn_cooked[rxn] = cooked
        return self.rxn_cooked[rxn]

    def get_src(self, smiles):
        """Return the (cached) ``rdchiralReactants`` built from ``smiles``.

        Exceptions raised by ``rdchiralReactants`` propagate to the caller.
        """
        if smiles not in self.src_cooked:
            self.src_cooked[smiles] = rdchiralReactants(smiles)
        return self.src_cooked[smiles]

    def run_reaction(self, src, template, keep_mapnums=False):
        """Apply ``template`` to the molecule ``src`` and return the outcomes.

        Args:
            src: SMILES string the template is applied to.
            template: reaction template string (see ``get_rxn``).
            keep_mapnums: forwarded to ``rdchiralRun``.

        Returns:
            Whatever ``rdchiralRun`` returns, or ``None`` when the template
            could not be built or the run raised.
        """
        # keep_mapnums is part of the key: results computed with one flag
        # value must not be served for a call that uses the other.
        key = (src, template, bool(keep_mapnums))
        if key in self.cached_results:
            return self.cached_results[key]
        rxn = self.get_rxn(template)
        src = self.get_src(src)
        if rxn is None or src is None:
            return None
        try:
            outcomes = rdchiralRun(rxn, src, keep_mapnums=keep_mapnums)
        except Exception:
            # Failed runs are cached as None so they are not retried.
            outcomes = None
        self.cached_results[key] = outcomes
        return outcomes


# Module-wide shared instance used by get_tpl() and match_template().
Reactor = _Reactor()
def get_tpl(task):
    """Extract a retro template for one reaction and check that it round-trips.

    The reaction is canonicalized, a template is extracted with
    ``extract_from_reaction``, and the template is applied back to the
    product through ``Reactor.run_reaction``. Every predicted reactant set
    is then compared with the canonical ground-truth reactants.

    Args:
        task: 5-tuple ``(idx, react, prod, id, cls)`` -- running index,
            reactant SMILES, product SMILES, and the reaction's id and class,
            which are passed through unchanged.

    Returns:
        Tuple ``(idx, react, prod, reaction_smarts, cano_react,
        retro_okay_list, id, cls)``. ``react``/``prod`` are the canonicalized
        strings and ``reaction_smarts`` is ``''`` when no template was
        extracted. ``retro_okay_list`` holds one entry per predicted outcome:
        ``'exact_match'`` (identical canonical SMILES), ``'equal_mol'``
        (mutual chirality-aware substructure match) or ``False``.
    """
    idx, react, prod, rxn_id, cls = task
    # reassign product mapping numbers
    react, prod = canonicalize_rxn(react, prod)
    reaction = {'_id': idx, 'reactants': react, 'products': prod}
    template = extract_from_reaction(reaction)
    retro_okay_list = []
    reaction_smarts = ''
    cano_react = cano_smiles(react)
    # NOTE(review): the truthiness guard assumes the extractor may return
    # nothing for some reactions -- confirm against rdchiral.
    if template and 'reaction_smarts' in template:
        reaction_smarts = template['reaction_smarts']
        pred_mols = Reactor.run_reaction(prod, reaction_smarts)
        if pred_mols:
            react_mol = Chem.MolFromSmiles(cano_react)
            for pred_react in pred_mols:
                pred_react = cano_smiles(pred_react)
                retro_okay = False
                if cano_react == pred_react:
                    retro_okay = 'exact_match'
                else:
                    pred_react_mol = Chem.MolFromSmiles(pred_react)
                    # MolFromSmiles returns None for unparsable input; such a
                    # prediction simply counts as "not okay" instead of
                    # crashing the worker.
                    if (
                        react_mol is not None
                        and pred_react_mol is not None
                        and react_mol.HasSubstructMatch(pred_react_mol, useChirality=True)
                        and pred_react_mol.HasSubstructMatch(react_mol, useChirality=True)
                    ):
                        retro_okay = 'equal_mol'
                retro_okay_list.append(retro_okay)
    return idx, react, prod, reaction_smarts, cano_react, retro_okay_list, rxn_id, cls
def match_template(task):
    """Find every training template that applies to one reaction.

    Reads these module-level globals, which the ``__main__`` section sets up
    before the worker pool is created: ``prod_smarts_fp_to_templates``,
    ``prod_smarts_fp_list``, ``prod_smarts_list``, ``templates_train``,
    ``react_smarts_list``, ``smarts_mol_cache`` and ``Reactor``.

    Args:
        task: ``(idx, val)`` where ``val`` is a dict with at least the keys
            ``'reactant'`` and ``'product'`` (SMILES strings).

    Returns:
        ``(idx, tmpl_res)``; ``tmpl_res`` is a dict with the keys

        * ``templates`` -- templates whose outcomes contain the canonical
          reactant,
        * ``template_cands`` -- their indexes in ``templates_train``,
        * ``template_sequences`` -- per template, the product-fingerprint
          index followed by the sorted ``react_smarts_list`` indexes of its
          reactant patterns,
        * ``reaction_center_cands`` / ``reaction_center_cands_smarts`` /
          ``reaction_center_cands_labels`` /
          ``reaction_center_atom_indexes`` -- parallel lists with one entry
          per (matched atom tuple, product-fingerprint index) pair: the
          fingerprint index, the product SMARTS indexes that matched there,
          whether any of them produced a valid template, and the sorted atom
          index tuple.
    """
    idx, val = task
    reactant = cano_smiles(val['reactant'])
    params = Chem.SmilesParserParams()
    params.removeHs = False
    mol_prod = Chem.MolFromSmiles(val['product'], params)
    prod_fp_vec = int(get_pattern_fingerprint_bitstr(mol_prod), 2)

    sequences = []
    template_cands = []
    templates_list = []
    # {sorted atom index tuple: {fp index: [[smarts indexes], [okay flags]]}}
    atom_indexes_fp_labels = {}
    # multiple templates may be valid for a reaction, find all of them
    for prod_smarts_fp_idx, prod_smarts_tmpls in prod_smarts_fp_to_templates.items():
        # keys are strings when the mapping was loaded back from JSON
        prod_smarts_fp_idx = int(prod_smarts_fp_idx)
        prod_smarts_fp = prod_smarts_fp_list[prod_smarts_fp_idx]
        # Skip if fingerprint does not match: every bit set in the pattern
        # fingerprint must also be set in the product fingerprint. The test
        # does not depend on the inner loop, so it is done once per bucket.
        if (prod_smarts_fp & prod_fp_vec) != prod_smarts_fp:
            continue
        for prod_smarts_idx, tmpls in prod_smarts_tmpls.items():
            prod_smarts_idx = int(prod_smarts_idx)
            prod_smarts = prod_smarts_list[prod_smarts_idx]
            if prod_smarts not in smarts_mol_cache:
                smarts_mol_cache[prod_smarts] = Chem.MergeQueryHs(Chem.MolFromSmarts(prod_smarts))
            # we also need the matched atom indexes
            matches = mol_prod.GetSubstructMatches(smarts_mol_cache[prod_smarts])
            if not matches:
                continue

            found_okay_tmpl = False
            for tmpl in tmpls:
                pred_mols = Reactor.run_reaction(val['product'], tmpl)
                if reactant and pred_mols and (reactant in pred_mols):
                    found_okay_tmpl = True
                    template_cands.append(templates_train.index(tmpl))
                    templates_list.append(tmpl)
                    reacts = tmpl.split('>>')[1].split('.')
                    if len(reacts) > 2:
                        print('too many reacts:', reacts, idx)
                    seq_reacts = [react_smarts_list.index(cano_smarts(r)) for r in reacts]
                    sequences.append([prod_smarts_fp_idx] + sorted(seq_reacts))

            # for each prod center, there may be multiple matches
            for match in matches:
                match_key = tuple(sorted(match))
                smarts_idxs, okay_flags = (
                    atom_indexes_fp_labels
                    .setdefault(match_key, {})
                    .setdefault(prod_smarts_fp_idx, [[], []])
                )
                smarts_idxs.append(prod_smarts_idx)
                okay_flags.append(found_okay_tmpl)

    reaction_center_cands = []
    reaction_center_cands_labels = []
    reaction_center_cands_smarts = []
    reaction_center_atom_indexes = []
    for atom_index in sorted(atom_indexes_fp_labels):
        for fp_idx, (smarts_idxs, okay_flags) in atom_indexes_fp_labels[atom_index].items():
            reaction_center_cands.append(fp_idx)
            reaction_center_cands_smarts.append(smarts_idxs)
            reaction_center_cands_labels.append(any(okay_flags))
            reaction_center_atom_indexes.append(atom_index)

    tmpl_res = {
        'templates': templates_list,
        'template_cands': template_cands,
        'template_sequences': sequences,
        'reaction_center_cands': reaction_center_cands,
        'reaction_center_cands_labels': reaction_center_cands_labels,
        'reaction_center_cands_smarts': reaction_center_cands_smarts,
        'reaction_center_atom_indexes': reaction_center_atom_indexes,
    }
    return idx, tmpl_res
if __name__ == "__main__":
    # Script entry point. Three stages:
    #   1. extract one template per reaction for each split (skipped for a
    #      split whose templates_<split>.json already exists),
    #   2. canonicalize/filter the training templates and build the lookup
    #      tables, written to templates_cano_train.json,
    #   3. for every reaction of every split, find all applicable training
    #      templates and write them back into templates_<split>.json.
    #
    # NOTE: the lookup tables built in stage 2 are deliberately kept as
    # module-level names: match_template() reads prod_smarts_fp_to_templates,
    # prod_smarts_fp_list, prod_smarts_list, templates_train,
    # react_smarts_list and smarts_mol_cache as globals inside the worker
    # processes (presumably relying on the fork start method -- confirm
    # before running on a spawn-based platform).
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='USPTO50K', help='dataset: USPTO50K')
    parser.add_argument('--prod_k', type=int, default=1, help='product min counter to be kept')
    parser.add_argument('--react_k', type=int, default=1, help='reactant min counter to be kept')
    parser.add_argument('--num_workers', type=int, default=16, help='number of worker processes')
    args = parser.parse_args()

    print('extract templates for dataset {}...'.format(args.dataset))
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if args.dataset not in ['USPTO50K']:
        raise ValueError('unsupported dataset: {}'.format(args.dataset))

    # ---- stage 1: per-reaction template extraction ----------------------
    for data_set in ['valid', 'train', 'test']:
        data_file = './data/{}/templates_{}.json'.format(args.dataset, data_set)
        if os.path.isfile(data_file):
            continue
        # The raw CSV is only needed when the templates still have to be built.
        raw_df = pd.read_csv('./data/{}/raw_{}.csv'.format(args.dataset, data_set))
        reaction_list = raw_df['reactants>reagents>production']
        reactant_list = [rxn.split('>')[0] for rxn in reaction_list]
        product_list = [rxn.split('>')[-1] for rxn in reaction_list]
        ids = raw_df['id'].tolist()
        classes = raw_df['class'].tolist()
        rxns = [
            (idx, reactant, product, rxn_id, cls)
            for idx, (reactant, product, rxn_id, cls)
            in enumerate(zip(reactant_list, product_list, ids, classes))
        ]
        print('total rxns:', len(rxns))

        cnt = 0
        train_templates = {}
        # Context manager so the workers are released once all results are in.
        with multiprocessing.Pool(args.num_workers) as pool:
            for result in tqdm(pool.imap_unordered(get_tpl, rxns), total=len(rxns)):
                idx, react, prod, reaction_smarts, cano_reacts, retro_okay, rxn_id, cls = result
                if 'exact_match' in retro_okay or 'equal_mol' in retro_okay:
                    cnt += 1
                train_templates[idx] = {
                    'id': rxn_id,
                    'class': cls,
                    'reactant': react,
                    'product': prod,
                    'reaction_smarts': reaction_smarts,
                    'cano_reactants': cano_reacts,
                }

        print('retro_okay cnt:', cnt, len(rxns), cnt / len(rxns) if rxns else 0.0)
        with open(data_file, 'w') as f:
            json.dump(train_templates, f, indent=4)

    # ---- stage 2: canonical training templates --------------------------
    # find cano templates
    templates_cano_train = './data/{}/templates_cano_train.json'.format(args.dataset)
    # The original code hard-disabled reuse of an existing file
    # (``if False and ...``); the switch is kept off so the tables are always
    # rebuilt from templates_train.json.
    reuse_cano_cache = False
    if reuse_cano_cache and os.path.isfile(templates_cano_train):
        with open(templates_cano_train) as f:
            data = json.load(f)
        templates_train = data['templates_train']
        react_smarts_list = data['react_smarts_list']
        prod_smarts_list = data['prod_smarts_list']
        prod_smarts_fp_list = data['prod_smarts_fp_list']
        fp_prod_smarts_dict = data['fp_prod_smarts_dict']
        prod_smarts_fp_to_templates = data['prod_smarts_fp_to_templates']
        for key, val in data.items():
            print(key, len(val))
    else:
        print('find all cano training templates')
        data_file = './data/{}/templates_{}.json'.format(args.dataset, 'train')
        with open(data_file) as f:
            templates_data_train = json.load(f)

        smarts_fp_cache = {}          # canonical product SMARTS -> fingerprint int
        fp_prod_smarts_dict = {}      # fingerprint int -> {'cnt', 'cano_smarts'}
        cano_react_smarts_dict = Counter()
        for idx, val in tqdm(templates_data_train.items()):
            items = val['reaction_smarts'].split('>>')
            if len(items) == 2 and items[0] and items[1]:
                prod_smarts, reacts_smarts = items
                prod_smarts = cano_smarts_(prod_smarts)
                if prod_smarts not in smarts_fp_cache:
                    mol = Chem.MergeQueryHs(Chem.MolFromSmarts(prod_smarts))
                    smarts_fp_cache[prod_smarts] = int(get_pattern_fingerprint_bitstr(mol), 2)
                fp_entry = fp_prod_smarts_dict.setdefault(
                    smarts_fp_cache[prod_smarts], {'cnt': 0, 'cano_smarts': []})
                fp_entry['cnt'] += 1
                fp_entry['cano_smarts'].append(prod_smarts)

                cano_reacts_smarts = cano_smarts(reacts_smarts)
                cano_react_smarts_dict.update(cano_reacts_smarts.split('.'))
                val['cano_reaction_smarts'] = prod_smarts + '>>' + cano_reacts_smarts
            else:
                print('invalid reaction_smarts:', idx, items)

        print('fp_prod_smarts_dict and cano_react_smarts_dict size:', len(fp_prod_smarts_dict), len(cano_react_smarts_dict))
        print('smarts filter threshold: ', args.prod_k, args.react_k)

        # Drop product fingerprints seen fewer than prod_k times and collect
        # the distinct product SMARTS of the remaining ones. ``sorted`` (not
        # ``list``) keeps the output independent of set iteration order.
        kept_fp_smarts = {}
        prod_smarts_set = set()
        for fp, fp_entry in fp_prod_smarts_dict.items():
            if fp_entry['cnt'] < args.prod_k:
                continue
            unique_smarts = sorted(set(fp_entry['cano_smarts']))
            kept_fp_smarts[fp] = unique_smarts
            prod_smarts_set.update(unique_smarts)

        prod_smarts_fp_list = sorted(kept_fp_smarts)
        prod_smarts_list = sorted(prod_smarts_set)
        # O(1) position lookups instead of repeated list.index() scans.
        prod_smarts_index = {smarts: pos for pos, smarts in enumerate(prod_smarts_list)}
        prod_smarts_fp_index = {fp: pos for pos, fp in enumerate(prod_smarts_fp_list)}
        # find smarts indexes
        fp_prod_smarts_dict = {
            fp: [prod_smarts_index[smarts] for smarts in smarts_of_fp]
            for fp, smarts_of_fp in kept_fp_smarts.items()
        }

        cano_react_smarts_dict = {
            smarts: count
            for smarts, count in cano_react_smarts_dict.items()
            if count >= args.react_k
        }
        # sort reactants by frequency from high to low
        react_smarts_list = [
            smarts for smarts, _ in
            sorted(cano_react_smarts_dict.items(), key=lambda item: item[1], reverse=True)
        ]

        print('after filtering, prod_smarts_fp_list and cano_react_smarts_list size:', len(prod_smarts_fp_list),
              len(cano_react_smarts_dict))

        # {fingerprint index: {product SMARTS index: templates}}
        prod_smarts_fp_to_templates = {}
        for idx, val in tqdm(templates_data_train.items()):
            if 'cano_reaction_smarts' not in val:
                continue
            cano_prod_smarts, cano_reacts_smarts = val['cano_reaction_smarts'].split('>>')
            if cano_prod_smarts not in smarts_fp_cache:
                mol = Chem.MergeQueryHs(Chem.MolFromSmarts(cano_prod_smarts))
                smarts_fp_cache[cano_prod_smarts] = int(get_pattern_fingerprint_bitstr(mol), 2)
            cano_prod_fp = smarts_fp_cache[cano_prod_smarts]
            if cano_prod_fp not in fp_prod_smarts_dict:
                print('skip cano_prod_smarts:', idx, cano_prod_smarts)
                continue
            cano_reacts_smarts = set(cano_reacts_smarts.split('.'))
            if not cano_reacts_smarts.issubset(cano_react_smarts_dict):
                print('skip cano_reacts_smarts:', idx, cano_reacts_smarts)
                continue

            cano_prod_smarts_fp_idx = prod_smarts_fp_index[cano_prod_fp]
            cano_prod_smarts_idx = prod_smarts_index[cano_prod_smarts]
            (prod_smarts_fp_to_templates
             .setdefault(cano_prod_smarts_fp_idx, {})
             .setdefault(cano_prod_smarts_idx, set())
             .add(val['reaction_smarts']))

        tmpl_lens = []
        templates_train = set()
        for smarts_to_tmpls in prod_smarts_fp_to_templates.values():
            for cano_prod_smarts_idx, tmpls in smarts_to_tmpls.items():
                tmpl_lens.append(len(tmpls))
                templates_train.update(tmpls)
                # sets are not JSON-serializable; sorted for a stable order
                smarts_to_tmpls[cano_prod_smarts_idx] = sorted(tmpls)
        print('#average template variants per cano_prod_smarts:', np.mean(tmpl_lens))

        print('templates_data_train:', len(templates_data_train))
        templates_train = sorted(templates_train)
        data = {
            'templates_train': templates_train,
            'react_smarts_list': react_smarts_list,
            'prod_smarts_list': prod_smarts_list,
            'prod_smarts_fp_list': prod_smarts_fp_list,
            'fp_prod_smarts_dict': fp_prod_smarts_dict,
            'prod_smarts_fp_to_templates': prod_smarts_fp_to_templates,
        }
        for key, val in data.items():
            print(key, len(val))
        with open(templates_cano_train, 'w') as f:
            json.dump(data, f, indent=4)

    # ---- stage 3: applicable templates per reaction ---------------------
    # Must exist before the pool is created: match_template() fills it.
    smarts_mol_cache = {}
    smarts_fp_cache = {}
    # find all applicable templates for each reaction
    # since multiple templates may be valid for a reaction
    for data_set in ['test', 'train', 'valid']:
        data_file = './data/{}/templates_{}.json'.format(args.dataset, data_set)
        print('find all applicable templates for each reaction:', data_file)
        with open(data_file) as f:
            templates_data = json.load(f)
        tasks = list(templates_data.items())
        cnt = 0
        with multiprocessing.Pool(args.num_workers) as pool:
            for idx, tmpl_res in tqdm(pool.imap_unordered(match_template, tasks), total=len(tasks)):
                templates_data[idx].update(tmpl_res)
                if tmpl_res['templates']:
                    cnt += 1

        print('template coverage: {} for {} dataset'.format(cnt / len(tasks) if tasks else 0.0, data_set))
        # The enriched records overwrite the input file in place.
        with open(data_file, 'w') as f:
            json.dump(templates_data, f, indent=4)