In [None]:
import pandas as pd
import sys
import os

# Helpful utility tools

In [None]:
import pandas as pd
import torch
from rxnmapper import RXNMapper
from rdkit import Chem


def get_mol(smiles: str, kekulize: bool = False) -> Chem.Mol:
    """Parse a SMILES string into an RDKit molecule.

    Parameters
    ----------
    smiles: str,
        SMILES string for molecule
    kekulize: bool,
        Whether to kekulize the molecule (in place) after parsing

    Returns
    -------
    The object returned by ``Chem.MolFromSmiles``. When that result is
    ``None`` it is returned unchanged and no kekulization is attempted.
    """
    parsed = Chem.MolFromSmiles(smiles)
    # Nothing more to do if parsing produced no molecule or kekulization was not requested.
    if parsed is None or not kekulize:
        return parsed
    Chem.Kekulize(parsed)
    return parsed

def canonical_smiles(x):
    """Return the SMILES that RDKit writes for the molecule parsed from ``x``.

    Parameters
    ----------
    x: str,
        SMILES string for molecule

    Raises
    ------
    ValueError
        If ``x`` could not be parsed (``get_mol`` returned ``None``).
    """
    mol = get_mol(x, kekulize=False)
    if mol is None:
        # Fail with the offending input instead of passing None to MolToSmiles.
        raise ValueError(f'Invalid SMILES string: {x!r}')
    return Chem.MolToSmiles(mol)

def clear_map_number(smi):
    """Clear the atom mapping number of a SMILES sequence.

    Parameters
    ----------
    smi: str,
        SMILES string, possibly carrying atom-map numbers

    Returns
    -------
    The SMILES produced by ``canonical_smiles`` after the
    ``molAtomMapNumber`` property has been removed from every atom.

    Raises
    ------
    ValueError
        If ``smi`` could not be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Fail with the offending input instead of an AttributeError on None.
        raise ValueError(f'Invalid SMILES string: {smi!r}')
    for atom in mol.GetAtoms():
        if atom.HasProp('molAtomMapNumber'):
            atom.ClearProp('molAtomMapNumber')
    # The stripped molecule is written out and re-parsed by canonical_smiles;
    # presumably so the final string does not depend on the removed map numbers.
    return canonical_smiles(Chem.MolToSmiles(mol))


def canonical_rxn(rxn):
    """Strip atom-map numbers from both sides of a ``reactants>>products`` string.

    The input is split on ``'>>'`` and must yield exactly two parts; each part
    is passed through ``clear_map_number`` and the results are re-joined with
    ``'>>'``.
    """
    reactants, products = rxn.split('>>')
    cleaned_sides = (clear_map_number(reactants), clear_map_number(products))
    return '>>'.join(cleaned_sides)


# Add atom-mapping information

In [None]:
# Change your source data csv path here; please ensure the 'rxn' column is in the csv file.
src_data_path = 'Canonicalized_SMILES_Reactions_input_data.csv'
dst_data_path = 'clear_data.csv'
source_data = pd.read_csv(src_data_path, header=0, index_col=False)

# Strip atom-map numbers from every reaction before handing it to the mapper.
source_data['canonicalized_rxn'] = source_data['rxn'].apply(canonical_rxn)
rxns = source_data['canonicalized_rxn'].values

# Each returned entry is read for its 'mapped_rxn' and 'confidence' keys.
mapper = RXNMapper()
mapped_rxns_info = mapper.get_attention_guided_atom_maps(rxns)
mapped_rxns = [info['mapped_rxn'] for info in mapped_rxns_info]
confidence = [info['confidence'] for info in mapped_rxns_info]
source_data['mapped_rxn'] = mapped_rxns
source_data['mapped_score'] = confidence

# Keep only the columns that are written to the cleaned file.
columns = ['canonicalized_rxn', 'Output', 'mapped_rxn', 'mapped_score', 'Arene_smi', 'Radical_smi', 'Product_smi', 'React_sites_Ar_R', 'Prod_sites']
source_data: pd.DataFrame = source_data[columns]
# DataFrame.rename returns a new frame; the result has to be assigned,
# otherwise the csv is written with the old column names.
source_data = source_data.rename(columns={'canonicalized_rxn':'rxn', 'Arene_smi':'arene', 'Radical_smi':'radical', 'Output':'output'})
# index=False: do not persist the positional index, which would otherwise be
# read back as an extra 'Unnamed: 0' column when the file is loaded again.
source_data.to_csv(dst_data_path, index=False)

In [None]:
# Split the dataset into train/val/test parts.

In [None]:
source_data = 'clear_data.csv'
which_data = 'denmark'
dst_root_dir = os.path.join("YOUR-DST-DIR", which_data)
ratio = 0.7
seeds = [42, 2024, 10086, 23333, 66666, 99999, 123456, 654321, 987654, 13579]

# The source file does not change between seeds, so read it once.
full_df = pd.read_csv(source_data)

for seed in seeds:
    # Shuffle all rows reproducibly for this seed.
    df = full_df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # sample train[70% * 0.9], val [70% * 0.1] and test[30%] data
    train_end = int(len(df) * ratio * 0.9)
    val_end = int(len(df) * ratio)
    train_df = df[:train_end]
    val_df = df[train_end:val_end]
    test_df = df[val_end:]

    # save to csv, one sub-directory per seed
    dst_data_dir = os.path.join(dst_root_dir, str(seed))
    os.makedirs(dst_data_dir, exist_ok=True)
    train_df.to_csv(os.path.join(dst_data_dir, 'train.csv'), index=False)
    val_df.to_csv(os.path.join(dst_data_dir, 'val.csv'), index=False)
    test_df.to_csv(os.path.join(dst_data_dir, 'test.csv'), index=False)

    print(f'seed {seed} done!')


seed 42 done!
seed 2024 done!
seed 10086 done!
seed 23333 done!
seed 66666 done!
seed 99999 done!
seed 123456 done!
seed 654321 done!
seed 987654 done!
seed 13579 done!
