# 0. Imports

In [57]:
%load_ext autoreload
%autoreload 2

In [58]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# for flattening tuples and lists
from itertools import chain

from tqdm import tqdm
import os
import random
import pickle
import multiprocessing

#import own modules
import balancing_workflow as bw


In [299]:
#(implement elsewhere, zenodo file maybe?) load GDB13S, one SMILES per line (newline removed)
with open('/home/yves/Documents/GitHub/ReactionClasses/16_02_23/GDB/13S.smi', 'r') as f:
    GDB13S = [line.split('\n')[0] for line in f]

#load formatted template dataframe
#temp_r0 = pd.read_pickle('/home/yves/Documents/GitHub/USPTO_balance/data/amol_USPTO_templates_set_r0_v2_formatted.pkl')

#load df_templates prepared in (*)
df_templates = pd.read_pickle(
    '/home/yves/Documents/GitHub/USPTO_balance/data/df_templates_to_enrich.pkl'
)

#load GDB13S_mol (3%) prepared in (*)
#with open('./data/GDB13S_003perc_mol.pkl', 'rb') as f:
#    GDB13S_mol = pickle.load(f)

# (*) 1. Data preparation

df_template

In [6]:
#(*)
#retrieve list of templates to work on
# NOTE(review): `temp_r0` is only created by the commented-out `pd.read_pickle` line in the
# data-loading cell above; uncomment it first, otherwise this raises NameError on a fresh kernel.
templates_to_enrich, templates_to_enrich_appearances = bw.select_templates_to_enrich(temp_r0['template_hash'])

In [5]:
#(*)
#create a dataframe to store the enrichment information

df_templates = pd.DataFrame(columns=['template_hash', 'frequency'])
df_templates['template_hash'] = templates_to_enrich
df_templates['frequency'] = templates_to_enrich_appearances

#look up the retro template of every selected hash (~3 min)
df_templates['retro_templates'] = [
    bw.find_reaction_template_of_hash(temp_r0, template_hash)
    for template_hash in templates_to_enrich
]

#SMILES of the sanitized reactant pattern of each retro template
df_templates['retro_reac'] = [
    Chem.MolToSmiles(bw.rxn_smarts_to_sanitized_reactant_smarts(retro_template))
    for retro_template in df_templates['retro_templates']
]

In [6]:
#(*)
#save df_templates to pickle; this is the file read back by the data-loading cell at the top
df_templates.to_pickle('/home/yves/Documents/GitHub/USPTO_balance/data/df_templates_to_enrich.pkl')

GDB13S_mol version

In [None]:
#choose a subset of GDB13S ~3M SMILES
# NOTE(review): this overwrites GDB13S with its own 3% sample, so re-running the cell
# samples from the already reduced list.
random.seed(42)
GDB13S = random.sample(GDB13S,round(0.03*len(GDB13S)))

#convert SMILES to RDKit mol format
from multiprocessing import Pool

dataset = GDB13S
# leave two cores free, but never request fewer than one worker:
# os.cpu_count() may return None, and Pool() rejects a process count < 1
processes = max(1, (os.cpu_count() or 1) - 2)

def MolFromSmiles(smi):
    """Return the RDKit molecule parsed from the SMILES string `smi`.

    Module-level wrapper around Chem.MolFromSmiles, used as the Pool worker function
    (presumably so that the callable can be pickled -- TODO confirm).
    """
    return Chem.MolFromSmiles(smi)

if __name__ == '__main__':
    with Pool(processes) as p:
        # chunksize batches the SMILES sent to each worker instead of dispatching them
        # one by one; imap still returns the results in input order
        GDB13S_mol = list(tqdm(p.imap(MolFromSmiles, dataset, chunksize=1000), total = len(dataset)))

    del dataset

    # kept inside the guard: GDB13S_mol only exists when the conversion above has run
    with open('./data/GDB13S_003perc_mol.pkl', 'wb') as f:
        pickle.dump(GDB13S_mol, f)

# 2. Framework dev

## 2.1 Prepare data to run all templates on a single portion of GDB13S

In [300]:
#after loading GDB13S (full) and df_templates (full)

#1. select a short enough part of GDB13S and save the SMILES (one per line)

fraction = 0.01 #input if function
GDB_version = 1 #input if function
percentage = fraction*100

#fixed seed so that the same subset is drawn on every fresh run
random.seed(42)
n_selected = round(fraction*len(GDB13S))
GDB13S = random.sample(GDB13S, n_selected)

with open(f'./data/GDB13S_{GDB_version}.txt', 'w') as f:
    f.writelines(item + '\n' for item in GDB13S)

100%|██████████| 993942/993942 [00:43<00:00, 22894.14it/s]


In [None]:
#2. split df_template

In [347]:
#save 100 different df_templates, each holding ~1 percent of the unique retro_reac
frac_retro_reac_per_split = 0.01 #input if function

def split(items, n_chunks):
    """Yield `n_chunks` contiguous slices of `items` whose lengths differ by at most one.

    NOTE(review): no `split` helper is defined or imported anywhere in this notebook, so
    this cell used to depend on hidden kernel state. This even-chunking version is assumed
    to match the missing helper -- confirm against the original.
    """
    size, extra = divmod(len(items), n_chunks)
    return (items[k*size + min(k, extra):(k+1)*size + min(k+1, extra)] for k in range(n_chunks))

n_parts = int(1/frac_retro_reac_per_split)
datasplits = list(split(list(df_templates['retro_reac'].unique()), n_parts))

#make sure the output folder exists before writing the splits
split_dir = '/home/yves/Documents/GitHub/USPTO_balance/data/templates_split'
os.makedirs(split_dir, exist_ok=True)

for i in range(n_parts):
    #all templates whose retro_reac falls into the i-th chunk of unique retro_reac
    df_templates_split = df_templates[df_templates['retro_reac'].isin(datasplits[i])]
    df_templates_split.to_pickle(f'{split_dir}/df_templates_to_enrich_part_{i+1}.pkl')

In [349]:
#create one config file per template split (n_parts files) for part 1, all with the same GDB version
for i in range(n_parts):
    config_lines = (
        f'GDB13S_path: "/home/yves/Documents/GitHub/USPTO_balance/data/GDB13S_{GDB_version}.txt"\n',
        f'df_templates_path_to_pkl: "/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_{i+1}.pkl"\n',
        f'GDB_version: "{GDB_version}"\n',
        f'template_version: "{i+1}"\n',
    )
    with open(f'/home/yves/Documents/GitHub/USPTO_balance/config_files/config_part1_{i+1}.yaml', 'w') as f:
        f.writelines(config_lines)

RUN PART 1-------------------------------------------

In [350]:
#create one config file per template split (n_parts files) for part 2, all with the same GDB version
for i in range(n_parts):
    config_text = ''.join([
        f'df_templates_path_to_pkl: "/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_{i+1}.pkl"\n',
        f'GDB_version: "{GDB_version}"\n',
        f'template_version: "{i+1}"\n',
    ])
    with open(f'/home/yves/Documents/GitHub/USPTO_balance/config_files/config_part2_{i+1}.yaml', 'w') as f:
        f.write(config_text)

RUN PART 2----------------------------------------

In [234]:
#create one config file per template split (n_parts files) for part 3, all with the same GDB version
for i in range(n_parts):
    #the last entry deliberately has no trailing newline
    config_lines = (
        f'df_templates_path_to_pkl: "/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_{i+1}.pkl"\n',
        f'GDB_version: "{GDB_version}"\n',
        f'template_version: "{i+1}"\n',
        f'Model_path_T2: "/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T2_Reagent_Pred_225000.pt"\n',
        f'Model_path_T3: "/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T3_Forward_255000.pt"',
    )
    with open(f'/home/yves/Documents/GitHub/USPTO_balance/config_files/config_part3_{i+1}.yaml', 'w') as f:
        f.writelines(config_lines)

RUN PART 3----------------------------------------

## 2.2 Development of the validation step for reactions with confidence score > 0.9

In [None]:
# function that takes a list of reactions and makes them pass through T2
# function that takes a list of reactions and makes them pass through T3
# function that checks confidence score 

function that takes a list of reactions and makes them pass through T2

In [2]:
# function that takes a list of reactions and makes them pass through T2 (1)

#imports
from ttlretro.single_step_retro import SingleStepRetrosynthesis
singlestepretrosynthesis = SingleStepRetrosynthesis()


Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

In [8]:
# function that takes a list of reactions and makes them pass through T2 (2)

#let's load a random list (will be an input in the real function)
GDB_version = '1'
template_version = '1'
retro_reac = '[C:1][C:2]'
retro_template = '([CH2;D2;+0:1]-[CH2;D2;+0:2])>>([C;H0;D2;+0:1]#[C;H0;D2;+0:2])'

#one reaction per line, newline removed
with open(f'/home/yves/Documents/GitHub/USPTO_balance/created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt', 'r') as f:
    rxns_list = [line.split('\n')[0] for line in f]

rxns_list = rxns_list[:100] #ONLY FOR QUICK TESTING

#associated function 
def load_rxns(GDB_version, template_version, retro_reac, retro_template):
    """Read the reactions created for one (retro_reac, retro_template) couple.

    The file is looked up under
    created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt
    in the USPTO_balance folder.

    Returns:
        list of str, one entry per line of the file with the newline removed.
    """
    # the body of the `with` statement was not indented before, which made the cell fail to parse
    with open(f'/home/yves/Documents/GitHub/USPTO_balance/created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt', 'r') as f:
        rxns_list = [line.split('\n')[0] for line in f]
    return rxns_list

In [10]:
# function that takes a list of reactions and makes them pass through T2 (3)

#tokenize reactions
tok_rxns_list = []
for rxn in rxns_list:
    tok_rxns_list.append(singlestepretrosynthesis.smi_tokenizer(rxn))

#associated function
def tokenize_rxn_list(rxns_list):
    """Return the result of singlestepretrosynthesis.smi_tokenizer for every entry of rxns_list, in order."""
    tokenized = []
    for rxn in rxns_list:
        tokenized.append(singlestepretrosynthesis.smi_tokenizer(rxn))
    return tokenized

In [19]:
# function that takes a list of reactions and makes them pass through T2 (4)
# Inputs of the T2 run: the tokenized reactions and the T2 model checkpoint.
SMILES_list = tok_rxns_list 
Model_path  = '/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T2_Reagent_Pred_225000.pt'
beam_size   = 1 
batch_size  = 64
untokenize_output = True
# Execute_Prediction returns a [predictions, probabilities] pair; the predictions to use are
# preds_T2[0], not preds_T2 itself (see the indexing in the next cell).
[preds_T2, probs_T2] = singlestepretrosynthesis.Execute_Prediction(SMILES_list, Model_path, beam_size, batch_size, untokenize_output) #weird but results are under preds_T2[0], not preds_T2

#associated function
def run_T2_predictions(SMILES_list, Model_path, beam_size, batch_size, untokenize_output):
    """Run Execute_Prediction with the given model and return the predictions part of its result.

    All arguments are forwarded positionally and unchanged. The probabilities are discarded;
    the predictions are returned as they are (not indexed with [0]).
    """
    predictions, _probabilities = singlestepretrosynthesis.Execute_Prediction(
        SMILES_list, Model_path, beam_size, batch_size, untokenize_output
    )
    return predictions

function that takes the T2 outputs and prepares them to pass through T3

In [40]:
# function that takes the T2 outputs and prepares them to pass through T3 (1)

#reactants>reagents>product, using the T2 predictions stored in preds_T2[0]
rxns_T2_list = [
    rxns_list[i].split('>>')[0] + '>' + reagents + '>' + rxns_list[i].split('>>')[1]
    for i, reagents in enumerate(preds_T2[0])
]
#reactants>reagents only: the input of T3
rxns_T2_to_T3 = [
    rxns_list[i].split('>>')[0] + '>' + reagents
    for i, reagents in enumerate(preds_T2[0])
]
rxns_T2_to_T3_tok = []
for rxn in rxns_T2_to_T3:
    rxns_T2_to_T3_tok.append(singlestepretrosynthesis.smi_tokenizer(rxn))

#associated function
def prepare_rxns_T2_for_T3(rxns_list, preds_T2):
    """Combine reactions and T2 predictions into the strings needed for T3.

    rxns_list holds 'reactants>>product' strings; the reagents of reaction i are read
    from preds_T2[0][i].

    Returns:
        (rxns_T2_list, rxns_T2_to_T3, rxns_T2_to_T3_tok): the 'reactants>reagents>product'
        strings, the 'reactants>reagents' strings and the latter passed through smi_tokenizer.
    """
    full_rxns = []
    for i, reagents in enumerate(preds_T2[0]):
        parts = rxns_list[i].split('>>')
        full_rxns.append(parts[0] + '>' + reagents + '>' + parts[1])

    t3_inputs = [rxns_list[i].split('>>')[0] + '>' + reagents for i, reagents in enumerate(preds_T2[0])]

    t3_inputs_tok = []
    for rxn in t3_inputs:
        t3_inputs_tok.append(singlestepretrosynthesis.smi_tokenizer(rxn))
    return full_rxns, t3_inputs, t3_inputs_tok

 function that takes a list of reactions and makes them pass through T3

In [44]:
# Inputs of the T3 run: the tokenized 'reactants>reagents' strings and the T3 model checkpoint.
# NOTE: SMILES_list, Model_path and beam_size reuse the names of the T2 cell with new values.
SMILES_list = rxns_T2_to_T3_tok 
Model_path  = '/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T3_Forward_255000.pt'
beam_size   = 3 
batch_size  = 64
untokenize_output = True
# Execute_Prediction returns a [predictions, probabilities] pair; later cells read index [0] of both.
[preds_T3, probs_T3] = singlestepretrosynthesis.Execute_Prediction(SMILES_list, Model_path, beam_size, batch_size, untokenize_output)

#associated function
def run_T3_predictions(SMILES_list, Model_path, beam_size, batch_size, untokenize_output):
    """Run Execute_Prediction with the given model and return the first entry of its predictions.

    All arguments are forwarded positionally and unchanged; the probabilities are discarded.
    """
    predictions, _probabilities = singlestepretrosynthesis.Execute_Prediction(
        SMILES_list, Model_path, beam_size, batch_size, untokenize_output
    )
    return predictions[0]



function comparing T3 predictions and reference and giving back the indices 

In [115]:
#compare preds_T3[0] with predictions reference

#canonicalize preds_T3 (the first entry is replaced in place)
preds_T3[0] = [singlestepretrosynthesis.canonicalize_smiles(pred) for pred in preds_T3[0]]
#preds_ref (product side of each reaction) considered already canonicalized here
preds_ref = [rxn.split('>>')[1] for rxn in rxns_list]
#positions at which the T3 prediction equals the reference product
ind = [i for i, pred in enumerate(preds_T3[0]) if pred == preds_ref[i]]
ind


#associated function
def find_ind_match_T3_preds_ref(preds_T3, rxns_list):
    """Return the positions at which the canonicalized T3 prediction equals the reference product.

    The reference product of reaction i is the part of rxns_list[i] after '>>'; it is
    compared as is (considered already canonicalized).
    """
    canonical_preds = []
    for pred in preds_T3:
        canonical_preds.append(singlestepretrosynthesis.canonicalize_smiles(pred))

    reference_products = [rxn.split('>>')[1] for rxn in rxns_list]

    return [pos for pos, pred in enumerate(canonical_preds) if pred == reference_products[pos]]

[0, 2]

function that gives back the full reactions with a confidence score > 0.9

In [54]:
#flag every prediction whose probability exceeds 0.9 ...
ind_keep = [prob > 0.9 for prob in probs_T3[0]]
#... and keep the corresponding reactants>reagents strings
rxns_conf = [rxns_T2_to_T3[pos] for pos, keep in enumerate(ind_keep) if keep]

#associated function
def keeps_match_confident_rxns(rxns_T2_to_T3, probs_T3, ind: list, conf_score: float = 0.9):
    """Keep the reactions that are both matched and predicted with enough confidence.

    Args:
        rxns_T2_to_T3: reaction strings, aligned with probs_T3.
        probs_T3: one probability per reaction.
        ind: positions whose T3 prediction matched the reference product.
        conf_score: a reaction is kept only if its probability is strictly greater than
            this value. The default is 0.9, the threshold used in the cell above; the
            previous default of 90 could never be exceeded by a probability.

    Returns:
        list of the entries of rxns_T2_to_T3 whose position is in `ind` and whose
        probability exceeds conf_score.
    """
    # `ind` now comes before `conf_score`: a parameter without default cannot follow one
    # with a default (the former signature was a SyntaxError).
    matched = set(ind)  # constant-time membership test
    return [
        rxn
        for pos, (rxn, prob) in enumerate(zip(rxns_T2_to_T3, probs_T3))
        if prob > conf_score and pos in matched
    ]

function that saves the reactions with validated confidence scores replacing the old file

In [63]:
GDB_version = '1'
template_version = '1'
retro_reac = '[C:1][C:2]'
retro_template = '([CH2;D2;+0:1]-[CH2;D2;+0:2])>>([C;H0;D2;+0:1]#[C;H0;D2;+0:2])'

#opened in 'w' mode: the file of created reactions is replaced by the validated ones
with open(f'/home/yves/Documents/GitHub/USPTO_balance/created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt', 'w') as f:
    f.writelines(item + '\n' for item in rxns_conf)



#associated function 
def save_conf_rxns(rxns_conf, GDB_version, template_version, retro_reac, retro_template,
                   base_dir: str = '/home/yves/Documents/GitHub/USPTO_balance'):
    """Write the validated reactions to saved_rxns_{GDB_version}_{template_version}.

    Args:
        rxns_conf: iterable of reaction strings, written one per line.
        GDB_version, template_version: used to name the output folder.
        retro_reac, retro_template: used to name the output file
            (rxns_{retro_reac}_{retro_template}.txt).
        base_dir: folder in which the output folder is created; defaults to the
            project folder that was hard-coded before.
    """
    # The folder used to be created relative to the current working directory while the
    # file was written under the absolute project path; both now use the same location.
    folder_path = os.path.join(base_dir, f'saved_rxns_{GDB_version}_{template_version}')
    os.makedirs(folder_path, exist_ok=True)

    with open(os.path.join(folder_path, f'rxns_{retro_reac}_{retro_template}.txt'), 'w') as f:
        f.writelines(f'{item}\n' for item in rxns_conf)

function that reads a specific df_templates_split

In [64]:
# Load the template split whose file name ends with this template_version.
template_version = '1'
df_templates_split = pd.read_pickle(f'/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_{template_version}.pkl')

#associated function
def load_template_version(template_version):
    """Return the unpickled template split df_templates_to_enrich_part_{template_version}.pkl."""
    return pd.read_pickle(
        f'/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_{template_version}.pkl'
    )

function that returns the list of retro_reac and retro_template of a given df_template

In [78]:
def extract_temp_and_retro_from_df(df_templates_split):
    """Return the 'retro_templates' and 'retro_reac' columns of the dataframe as two lists.

    Returns:
        (retro_templates, retro_reac): both in row order, independent of the index labels.
    """
    # Series.tolist() replaces one .iloc lookup per row
    retro_templates = df_templates_split['retro_templates'].tolist()
    retro_reac = df_templates_split['retro_reac'].tolist()
    return retro_templates, retro_reac

In [70]:
df_templates_split

Unnamed: 0,template_hash,frequency,retro_templates,retro_reac
0,154a266ded329783e2edd97afda37dcc79a77eb501e212...,9594,([O;H0;D2;+0:2]-[c;H0;D3;+0:1])>>(Cl-[c;H0;D3;...,[cH3:1][OH:2]
1,d572f15ce843682d57bd39cd6c8575947b70a50f4c6527...,9461,([CH;D2;+0:1]=[O;H0;D1;+0:2])>>([CH2;D2;+0:1]-...,[C:1]=[O:2]
2,ab03ec2e113ad21265223676d461baf815168d6c7b2190...,8230,([C;H0;D3;+0:1]=[N;H0;D2;+0:2])>>(O=[C;H0;D3;+...,[CH2:1]=[NH:2]
3,075d673cff5ec4531a74899eab1365e439a32f4c8b9037...,7853,([CH2;D2;+0:1]-[CH2;D2;+0:2])>>([CH;D2;+0:1]=[...,[C:1][C:2]
4,3c23d7484bb108274ce3044df7aec2c2dd6107ec5ca792...,7833,([CH2;D2;+0:1]-[n;H0;D3;+0:2])>>(Br-[CH2;D2;+0...,[C:1][nH2:2]
...,...,...,...,...
5540,8ca49036809ede2d44b731e140a50503fb00bf912ba207...,10,([C;H0;D3;+0:2]=[CH2;D1;+0:1])>>(Br-P(-[CH3;D1...,[C:1]=[CH2:2]
5549,b1bb6c849f1a4b49a0b872cfe028397cf16e47bb82dd65...,10,([C;H0;D3;+0:2]=[CH2;D1;+0:1])>>(C-N(-C)-[CH2;...,[C:1]=[CH2:2]
5553,4fd1f87303fb64d23793a5141afe558cc8e8706f8862d7...,10,([C;H0;D3;+0:1]-[NH;D2;+0:2])>>(C-O-C(=O)-C-N-...,[CH3:1][N:2]
5554,540e2d1754cd1abfe987ca68d05a0d3e0ab395ec72ee9e...,10,([c;H0;D3;+0:1]-[c;H0;D3;+0:2])>>(O-[c;H0;D3;+...,[cH3:1]-[cH3:2]


full version 

In [None]:
import argparse
import os
import sys

import pandas as pd
import yaml
from tqdm import tqdm
from ttlretro.single_step_retro import SingleStepRetrosynthesis
singlestepretrosynthesis = SingleStepRetrosynthesis()


def load_rxns(GDB_version, template_version, retro_reac, retro_template):
    """Return the lines (newline removed) of the reaction file of one (retro_reac, retro_template) couple.

    The file is created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt
    in the USPTO_balance folder.
    """
    with open(f'/home/yves/Documents/GitHub/USPTO_balance/created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt', 'r') as rxns_file:
        return [row.split('\n')[0] for row in rxns_file]


def tokenize_rxn_list(rxns_list):
    """Tokenize every reaction of rxns_list with singlestepretrosynthesis.smi_tokenizer (order kept)."""
    tokenized_rxns = []
    for reaction in rxns_list:
        tokenized_rxns.append(singlestepretrosynthesis.smi_tokenizer(reaction))
    return tokenized_rxns


def run_T2_predictions(SMILES_list, Model_path, beam_size: int = 1, batch_size: int = 64, untokenize_output:bool = True):
    """Run Execute_Prediction with the T2 model and return the first entry of its predictions.

    All arguments are forwarded positionally and unchanged; the probabilities are discarded.
    """
    predictions, _probabilities = singlestepretrosynthesis.Execute_Prediction(
        SMILES_list, Model_path, beam_size, batch_size, untokenize_output
    )
    return predictions[0]


def prepare_rxns_T2_for_T3(rxns_list, preds_T2):
    """Combine reactions and T2 predictions into the strings needed for T3.

    Args:
        rxns_list: 'reactants>>product' strings.
        preds_T2: flat list with the predicted reagents of each reaction, i.e. what
            run_T2_predictions of this script returns.

    Returns:
        (rxns_T2_list, rxns_T2_to_T3, rxns_T2_to_T3_tok): the 'reactants>reagents>product'
        strings, the 'reactants>reagents' strings and the latter passed through smi_tokenizer.
    """
    # preds_T2 is already the flat prediction list; indexing preds_T2[0][i] (as before)
    # picked single characters of the first prediction.
    rxns_T2_list = []
    rxns_T2_to_T3 = []
    for i, reagents in enumerate(preds_T2):
        parts = rxns_list[i].split('>>')
        rxns_T2_list.append(parts[0] + '>' + reagents + '>' + parts[1])
        rxns_T2_to_T3.append(parts[0] + '>' + reagents)
    rxns_T2_to_T3_tok = [singlestepretrosynthesis.smi_tokenizer(rxn) for rxn in rxns_T2_to_T3]
    return rxns_T2_list, rxns_T2_to_T3, rxns_T2_to_T3_tok


def run_T3_predictions(SMILES_list, Model_path, beam_size: int = 3, batch_size: int = 64, untokenize_output:bool = True):
    """Run Execute_Prediction with the T3 model.

    All arguments are forwarded positionally and unchanged.

    Returns:
        (predictions[0], probabilities[0]): the first entry of each part of the result.
    """
    predictions, probabilities = singlestepretrosynthesis.Execute_Prediction(
        SMILES_list, Model_path, beam_size, batch_size, untokenize_output
    )
    return predictions[0], probabilities[0]


def find_ind_match_T3_preds_ref(preds_T3, rxns_list):
    """Return the indices i for which the canonicalized preds_T3[i] equals the product of rxns_list[i].

    The product is the part of the reaction string after '>>' and is compared as is.
    """
    #canonicalize the T3 predictions
    canonical = list(map(lambda smi: singlestepretrosynthesis.canonicalize_smiles(smi), preds_T3))
    #reference products taken from the reactions
    references = [reaction.split('>>')[1] for reaction in rxns_list]

    matches = []
    for idx, predicted in enumerate(canonical):
        if predicted == references[idx]:
            matches.append(idx)
    return matches


def keeps_match_confident_rxns(rxns_T2_to_T3, probs_T3, match_ind: list, conf_score: float = 0.9):
    """Keep the reactions that are both matched and predicted with enough confidence.

    Args:
        rxns_T2_to_T3: reaction strings, aligned with probs_T3.
        probs_T3: one probability per reaction.
        match_ind: positions whose T3 prediction matched the reference product.
        conf_score: a reaction is kept only if its probability is strictly greater than
            this value. The default is 0.9, the value passed by reactions_conf_validation;
            the previous default of 90 could never be exceeded by a probability.

    Returns:
        list of the entries of rxns_T2_to_T3 whose position is in `match_ind` and whose
        probability exceeds conf_score.
    """
    # `match_ind` now comes before `conf_score`: a parameter without default cannot follow
    # one with a default (the former signature was a SyntaxError). This is also the
    # positional order used by the caller.
    matched = set(match_ind)  # constant-time membership test
    return [
        rxn
        for pos, (rxn, prob) in enumerate(zip(rxns_T2_to_T3, probs_T3))
        if prob > conf_score and pos in matched
    ]


def save_conf_rxns(rxns_conf, GDB_version, template_version, retro_reac, retro_template,
                   base_dir: str = '/home/yves/Documents/GitHub/USPTO_balance'):
    """Write the validated reactions to saved_rxns_{GDB_version}_{template_version}.

    Args:
        rxns_conf: iterable of reaction strings, written one per line.
        GDB_version, template_version: used to name the output folder.
        retro_reac, retro_template: used to name the output file
            (rxns_{retro_reac}_{retro_template}.txt).
        base_dir: folder in which the output folder is created; defaults to the
            project folder that was hard-coded before.
    """
    # The folder used to be created relative to the current working directory while the
    # file was written under the absolute project path; both now use the same location.
    folder_path = os.path.join(base_dir, f'saved_rxns_{GDB_version}_{template_version}')
    os.makedirs(folder_path, exist_ok=True)

    with open(os.path.join(folder_path, f'rxns_{retro_reac}_{retro_template}.txt'), 'w') as f:
        f.writelines(f'{item}\n' for item in rxns_conf)


def load_template_version(template_version):
    """Unpickle and return the template split df_templates_to_enrich_part_{template_version}.pkl."""
    split_path = f'/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_{template_version}.pkl'
    return pd.read_pickle(split_path)


def delete_evaluated_rxns(GDB_version, template_version, retro_reac, retro_template):
    """Remove the reaction file of one (retro_reac, retro_template) couple.

    The file is ./created_rxns_{GDB_version}_{template_version}/rxns_{retro_reac}_{retro_template}.txt,
    resolved relative to the current working directory.
    """
    source_dir = f'./created_rxns_{GDB_version}_{template_version}'
    stem = f'rxns_{retro_reac}_{retro_template}'
    os.remove(source_dir + '/' + stem + '.txt')


def read_config(config_file):
    """Open config_file and return the result of yaml.safe_load on its content."""
    with open(config_file, 'r') as stream:
        return yaml.safe_load(stream)


def reactions_conf_validation(GDB_version, template_version, retro_reac, retro_template, Model_path_T2, Model_path_T3):
    """Validate the reactions of one (retro_reac, retro_template) couple and save the kept ones.

    Steps: load the reactions, tokenize them, predict with T2, build the T3 inputs,
    predict with T3, find the predictions matching the reference product, keep the
    matches above a confidence score of 0.9 and save them. Returns nothing.
    """
    rxn_key = (GDB_version, template_version, retro_reac, retro_template)

    # reactions -> T2 predictions
    candidate_rxns = load_rxns(*rxn_key)
    t2_preds = run_T2_predictions(
        tokenize_rxn_list(candidate_rxns), Model_path_T2,
        beam_size = 1, batch_size = 64, untokenize_output = True,
    )

    # T2 predictions -> T3 predictions (the first returned list is not used here)
    _rxns_T2_list, t3_inputs, t3_inputs_tok = prepare_rxns_T2_for_T3(candidate_rxns, t2_preds)
    t3_preds, t3_probs = run_T3_predictions(
        t3_inputs_tok, Model_path_T3,
        beam_size = 3, batch_size = 64, untokenize_output = True,
    )

    # keep what matches the reference product with enough confidence, then save
    matching = find_ind_match_T3_preds_ref(t3_preds, candidate_rxns)
    confident_rxns = keeps_match_confident_rxns(t3_inputs, t3_probs, matching, conf_score = 0.9)
    save_conf_rxns(confident_rxns, *rxn_key)
    #delete_evaluated_rxns(GDB_version, template_version, retro_reac, retro_template)

def main(GDB_version, template_version, Model_path_T2, Model_path_T3):
    """Run reactions_conf_validation for every row of one template split.

    Args:
        GDB_version, template_version: identify the reaction files and the template split.
        Model_path_T2, Model_path_T3: model checkpoints forwarded to the validation.
    """
    df_templates_split = load_template_version(template_version)

    template_pairs = zip(df_templates_split['retro_reac'], df_templates_split['retro_templates'])
    # zip() has no length, so the total is given explicitly to let tqdm show
    # the percentage and remaining time
    for retro_reac, retro_template in tqdm(template_pairs, total=len(df_templates_split)):
        reactions_conf_validation(GDB_version, template_version, retro_reac, retro_template, Model_path_T2, Model_path_T3)


if __name__ == '__main__':

    # Command-line entry point: read the YAML configuration given with -c/--config
    # and run the validation for the template split it describes.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='Path to the configuration file')
    args = parser.parse_args()

    # sys.exit(message) prints the message to stderr and exits with status 1, so a
    # missing configuration is reported as a failure (a bare sys.exit() exits with 0).
    if not args.config:
        sys.exit('Please provide a configuration file')
    if not os.path.exists(args.config):
        sys.exit('The configuration file does not exist')
    config = read_config(args.config)

    main(
        config['GDB_version'],
        config['template_version'],
        config['Model_path_T2'],
        config['Model_path_T3']
    )

In [None]:
#values needed in the config file of part 3 (same keys as read in the __main__ block above)
template_version = '1'
GDB_version = '1'
Model_path_T2 = '/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T2_Reagent_Pred_225000.pt'
Model_path_T3 = '/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T3_Forward_255000.pt'

Test part 3 of the framework

In [190]:
#config file values used for this manual test of part 3
template_version = '1'
GDB_version = '1'
Model_path_T2 = '/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T2_Reagent_Pred_225000.pt'
Model_path_T3 = '/home/yves/Documents/GitHub/TTL_versions/1.4/models/USPTO_STEREO_separated_T3_Forward_255000.pt'

In [155]:
# load the template split through the balancing_workflow module (presumably the module copy of load_template_version above -- confirm)
df_templates_split = bw.load_template_version(template_version)

In [157]:
#choose retro_reac and retro_template (done in the for loop for all couples in df_templates_split)
# NOTE: this template ends in a double bond ('='), unlike the '#' template used in section 2.2
retro_reac = '[C:1][C:2]'
retro_template = '([CH2;D2;+0:1]-[CH2;D2;+0:2])>>([CH;D2;+0:1]=[CH;D2;+0:2])'
                 

In [161]:
# load the reactions created for the chosen retro_reac / retro_template couple
rxns_list = bw.load_rxns(GDB_version, template_version, retro_reac, retro_template)

In [164]:
#keep a subset to go faster (only for testing)
# NOTE: rxns_list is overwritten, so the full list has to be reloaded with the previous cell
rxns_list = rxns_list[:100]

In [168]:
# tokenize the reactions before passing them to T2
tok_rxns_list = bw.tokenize_rxn_list(rxns_list)

Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

In [173]:
# T2 predictions for the tokenized reactions
preds_T2 = bw.run_T2_predictions(tok_rxns_list, Model_path_T2, beam_size = 1, batch_size = 64, untokenize_output = True)


Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

In [179]:
# sanity check: both lengths are expected to be equal (one T2 prediction per reaction)
len(rxns_list), len(preds_T2)

(100, 100)

In [186]:
# build the T3 inputs from the reactions and the T2 predictions
rxns_T2_list, rxns_T2_to_T3, rxns_T2_to_T3_tok = bw.prepare_rxns_T2_for_T3(rxns_list, preds_T2)


Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

In [191]:
# T3 predictions and their probabilities for the tokenized T3 inputs
preds_T3, probs_T3 = bw.run_T3_predictions(rxns_T2_to_T3_tok, Model_path_T3, beam_size = 3, batch_size = 64, untokenize_output = True)


Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

In [195]:
# indices of the reactions whose T3 prediction matches the reference product
ind_match = bw.find_ind_match_T3_preds_ref(preds_T3, rxns_list)


Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

In [218]:
# keep the matching reactions with a confidence score above 0.9; ind_match is passed as third positional argument
rxns_conf = bw.keeps_match_confident_rxns(rxns_T2_to_T3, probs_T3, ind_match, conf_score = 0.9) #adapt arguments to added function

In [222]:
# save the validated reactions of this retro_reac / retro_template couple
bw.save_conf_rxns(rxns_conf, GDB_version, template_version, retro_reac, retro_template)

Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transformers/albert_heads_8_uspto_all_1310k were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/yves/anaconda3/envs/MultiStepRetro/lib/python3.8/site-packages/rxnmapper/models/transfo

Create a function that deletes the reaction files that have already been evaluated

In [229]:
def delete_evaluated_rxns(GDB_version, template_version, retro_reac, retro_template):
    """Delete the reaction file rxns_{retro_reac}_{retro_template}.txt.

    The file is looked up in ./created_rxns_{GDB_version}_{template_version}, relative to
    the current working directory.
    """
    path_parts = [
        f'./created_rxns_{GDB_version}_{template_version}',
        f'rxns_{retro_reac}_{retro_template}' + '.txt',
    ]
    os.remove('/'.join(path_parts))

In [230]:
# NOTE: destructive and not re-runnable: os.remove deletes the reaction file, so running this cell a second time fails
delete_evaluated_rxns(GDB_version, template_version, retro_reac, retro_template)

# 3. Tests

## 3.1 Add canonicalization in format_reaction function (both sides or only right side?)

### 3.1.1 Compare part 1 results before and after canonicalization of the subset

### 3.1.2 Assess the impact of canonicalization on part 2

In [116]:
#config: values that are normally read from the part 2 config file
df_templates_path = "/home/yves/Documents/GitHub/USPTO_balance/data/templates_split/df_templates_to_enrich_part_1.pkl"
GDB_version = "1"
template_version = "1"

In [119]:
# NOTE: this rebinds df_templates (loaded in full at the top of the notebook) to the split stored at df_templates_path
df_templates = pd.read_pickle(df_templates_path)

#insert manually both retro_reac and retro_template as it is in a for loop usually
retro_reac = '[C:1][C:2]'
retro_template = '([CH2;D2;+0:1]-[CH2;D2;+0:2])>>([CH;D2;+0:1]=[CH;D2;+0:2])'

In [122]:
# load the GDB13S subset for this retro_reac, presumably as SMILES and as RDKit mols (going by the names -- confirm in balancing_workflow)
GDB13S_sub, GDB13S_sub_mol = bw.load_subsets(retro_reac, GDB_version, template_version)

In [126]:
#shorten a bit the datasets for testing (same slice on both lists so that they stay aligned)
GDB13S_sub = GDB13S_sub[:100]
GDB13S_sub_mol = GDB13S_sub_mol[:100]

In [128]:
# apply the retro template on every mol of the subset; the next cell treats an empty tuple as "template did not apply"
GDB13S_sub_app_temp = bw.apply_rxn_template_on_mols_list(GDB13S_sub_mol, retro_template)

In [130]:
#flag the entries for which applying the template gave an empty tuple
ind_remove = [products == () for products in GDB13S_sub_app_temp]
#drop the flagged entries from both the template results and the SMILES
GDB13S_sub_app_temp_sort = [products for pos, products in enumerate(GDB13S_sub_app_temp) if not ind_remove[pos]]
GDB13S_sub_sort = [smi for pos, smi in enumerate(GDB13S_sub) if not ind_remove[pos]]

In [141]:
#format every (template result, SMILES) couple and flatten the resulting lists into one
fictive_rxns_list = list(chain.from_iterable(
    [bw.format_reaction(GDB13S_sub_app_temp_sort[k], smi) for k, smi in enumerate(GDB13S_sub_sort)]
))