In [1]:
import pickle
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd

In [2]:
# Load the pickled routes file into `data`. The following cells index `data`
# by position and treat each entry as a sequence of reaction strings.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this against a file from a trusted source.
with open('../data/evaluation/routes_possible_test_hard.pkl', 'rb') as f:
    data = pickle.load(f)


In [3]:
# Parse every reaction string of the first entry in `data` with RDKit.
# The parsed reactions are not collected; after the loop only the last one
# remains bound to `rxn`.
# NOTE(review): presumably a smoke test that the strings are parseable —
# confirm, or remove the cell if it is no longer needed.
for rxn_str in data[0]:
    rxn = AllChem.ReactionFromSmarts(rxn_str)

In [4]:
# Collect one stereo-free canonical SMILES and one label per route.
# Both lists are appended to together, so they always have equal length and
# stay aligned index-by-index for the DataFrame built in the next cell.
cleaned_SMILES = []
molecule_labels = []

for i, route in enumerate(data):
    # The text before '>>' in the route's first reaction string is taken as
    # the target molecule.
    target_SMILES = route[0].split('>>')[0]
    # A '.' would mean several disconnected components; a target must be a
    # single molecule, so stop here rather than write a bad benchmark row.
    assert '.' not in target_SMILES, (
        f"route {i}: target {target_SMILES!r} contains more than one component"
    )

    try:
        # convert the target SMILES provided to a mol object
        target_mol = Chem.MolFromSmiles(target_SMILES)
        if target_mol is None:
            # MolFromSmiles reports a parse failure by returning None rather
            # than raising; fail explicitly so the message names the SMILES
            # instead of surfacing an obscure argument-type error downstream.
            raise ValueError(f"could not parse SMILES {target_SMILES!r}")

        # then remove stereochemistry and sanitize the mol object
        Chem.RemoveStereochemistry(target_mol)
        Chem.SanitizeMol(target_mol)

        # convert the mol object back to a SMILES string to get its canonical representation
        target_SMILES = Chem.MolToSmiles(target_mol)

    except Exception as e:
        # Best effort: report the failing route and continue with the rest.
        # `i` is the 0-based position in `data`; labels below are 1-based, so
        # a skipped route leaves a gap in the label numbering.
        print(f"Error processing route {i}: {e}")

    else:
        # Only reached when every RDKit step succeeded.
        cleaned_SMILES.append(target_SMILES)
        molecule_labels.append(f'Retro190_target_{i+1}')


In [5]:
# Assemble the benchmark table from the two lists built in the previous cell:
# one row per successfully processed target, label first, SMILES second.
benchmark_columns = {
    'molecule_label': molecule_labels,
    'cleaned_SMILES': cleaned_SMILES,
}
Retro190_df = pd.DataFrame(benchmark_columns)

# Write the table next to the source data, without the pandas row index.
Retro190_df.to_csv(
    path_or_buf="../data/evaluation/Retro190_benchmark_smiles.csv",
    index=False,
)