# Prepping data for molecular transformer

https://github.com/pschwllr/MolecularTransformer

In [1]:
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')  

## Check that all reactions in test set only have 1 product

In [7]:
# MIT mixed augmented
path = '../../data/mol_transformer_data/MIT_mixed_augm/tgt-test.txt'

with open(path, 'r') as file:
    test_set = file.read()

In [8]:
'.' in test_set

True

In [9]:
count = 0
for char in test_set:
    if char == '.':
        count += 1
        
print(count)

471


# Implement separate functions

## Remove reactions with a product found in the test set

In [13]:
# convert from .txt file to list of smiles
#path = '../mol_transformer_datasets/ORDerly_mixed_augm_strict/tgt-test.txt'
path = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_D/tgt-test.txt'

with open(path, 'r') as file:
    lines = file.readlines()

products = [product.strip().replace(" ", "") for product in lines]

In [14]:
# Only keep the canonicalised smiles strings
# Canonicalize the SMILES strings
products = products[:10000]
print(len(products))
canonical_smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles)) for smiles in tqdm(products)]
products == canonical_smiles_list


10000


100%|██████████| 10000/10000 [00:03<00:00, 2726.15it/s]


False

In [5]:
for i in range(40000):
    if products[i] != canonical_smiles_list[i]:
        print(i, products[i], canonical_smiles_list[i])
        break

1368 O=C(Nc1cc2[nH]c(-c3ccccc3)c3cn[nH]c(=O)c(c1)c23)C1CCCCN1 O=C1NN=Cc2c(-c3ccccc3)[nH]c3cc(NC(=O)C4CCCCN4)cc1c23


In [6]:
canonical_smiles_list = list(set(canonical_smiles_list))
print(len(canonical_smiles_list))

39418


In [4]:
# Read in data
strict_filtering = pd.read_parquet('/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_response_datasets/strict_filtering.parquet')

In [5]:
# Add mask to remove any rows that have a reaction contained within canonical_smiles_list

# Add a new column 'test_set', which is True if the value in 'product_000' is in canonical_smiles_list, otherwise False
strict_filtering['test_set'] = strict_filtering['product_000'].isin(canonical_smiles_list)

# if there are multiple products:
if 'product_001' in df.columns:
    strict_filtering['prod_0.prod_1'] = strict_filtering['product_000']+'.'+strict_filtering['product_001']
    strict_filtering['prod_1.prod_0'] = strict_filtering['product_001']+'.'+strict_filtering['product_000']
    strict_filtering['test_set_01'] = strict_filtering['prod_0.prod_1'].isin(canonical_smiles_list)
    strict_filtering['test_set_10'] = strict_filtering['prod_1.prod_0'].isin(canonical_smiles_list)
    
    strict_filtering['test_set'] = strict_filtering['test_set'] and strict_filtering['test_set_01'] and strict_filtering['test_set_10']
    



In [6]:
train_val_df = strict_filtering[strict_filtering['test_set'] == False]
train_val_df.reset_index(inplace=True, drop=True)

In [7]:
len(train_val_df)

340285

In [8]:
len(strict_filtering)

356906

In [9]:
356906-340285

16621

In [10]:
# Now create mask for validation set
# Lets first find out how large the validation set should be
30000/(30000+409035)

0.06833168198435205

In [11]:
# prepare src (ie the source inputs for the model, which are the reactants, solvents, and agents)
def process_dataframe(df):
    df = df.copy()
    # Create a list of all columns beginning with "reactant", "solvent", and "agent"
    reagent_cols = [col for col in df.columns if col.startswith(("reactant", "solvent", "agent"))]

    # Define a function to shuffle the values and concatenate them
    def concatenate_reagents(row):
        values = [row[col] for col in reagent_cols if row[col] is not None]
        random.shuffle(values)
        for item in values:
            if type(item) is not str:
                print(row)
        return '.'.join(values)

    # Apply the function to each row and create a new 'src' column
    df['src'] = df.apply(concatenate_reagents, axis=1)

    return df

In [12]:
df = process_dataframe(train_val_df)
# Splitting the DataFrame into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.068, random_state=42)

In [15]:
# create list of strings for src and tgt
src_list = df['src'].tolist()
tgt_list = df['product_000'].tolist()

In [17]:
tgt_list[:10]

['Nc1cccc2cnc(Cl)cc12',
 'CCCCC[C@H](O)C=CC1CCC(=O)C1CC=CCCCC(=O)O',
 'CCOC(=O)C=C1Cc2ccccc2N(C)c2ccc(SCC)cc21',
 'CCSc1ccc2c(c1)C(CC(=O)O)=Cc1ccccc1N2C',
 'Clc1ccc2nc3n(c2c1)CCC3',
 'O=C(O)c1ccc2c(c1)nc1n2CCC1',
 'CC(C)(C)N=NC(C)(C#N)C1CC1',
 'CC(C)(C)N=NC(C)(C#N)CC(=O)O',
 'O=C(O)c1ccc(NC(=O)C2CC2)cc1[N+](=O)[O-]',
 'O=C(C=Cc1ccccc1)NN1CC(=O)NC1=O']

In [18]:
src_list[:10]

['[OH-].CC(=O)O.[Na+].[Fe].O=[N+]([O-])c1cccc2cnc(Cl)cc12.O',
 'Cl.CCO.CCCCC[C@H](O)C=CC1C=CC(=O)C1CC=CCCCC(=O)O',
 'CCOC(=O)CC1(O)Cc2ccccc2N(C)c2ccc(SCC)cc21.CCO.Cl',
 'Cl.[K+].CCO.CCOC(=O)C=C1Cc2ccccc2N(C)c2ccc(SCC)cc21.[OH-]',
 'Nc1ccc2nc3n(c2c1)CCC3.O=N[O-].Cl.O.[Na+]',
 '[Na+].CC(=O)O.CCO.[OH-].CCOC(=O)c1ccc2c(c1)nc1n2CCC1',
 'BrBr.CC(C)(C)NNC(C)(C#N)C1CC1.O.ClCCl',
 'O.CO.[OH-].[Na+].CCOC(=O)CC(C)(C#N)N=NC(C)(C)C',
 'C1CCOC1.O=C(Cl)C1CC1.Nc1ccc(C(=O)O)c([N+](=O)[O-])c1.[Na+].O.[OH-]',
 'NN1CC(=O)NC1=O.Cl.O=C(Cl)C=Cc1ccccc1.c1ccncc1']

In [26]:
# augment
def augment_smiles_list(smiles_list, n=10):
    new_smiles = []
    for smiles in tqdm(smiles_list):
        random_equivalent_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom=True)
    return smiles_list + new_smiles

In [28]:
src_list_aug = augment_smiles_list(src_list[:100])
tgt_list_aug = augment_smiles_list(tgt_list[:100])

100%|██████████| 100/100 [00:00<00:00, 1740.71it/s]
100%|██████████| 100/100 [00:00<00:00, 1673.53it/s]


In [30]:
# tokenize smiles
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction
    """
    import re
    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = [token for token in regex.findall(smi)]
    assert smi == ''.join(tokens)
    return ' '.join(tokens)

def smi_tokenizer_list(smi_list):
    new_smi_list = []
    for smi in smi_list:
        new_smi_list.append(smi_tokenizer(smi))
        
    return new_smi_list

In [31]:
src_final = smi_tokenizer_list(src_list_aug)

In [33]:
# write to txt
with open('output_file_using_join.txt', 'w') as file:
    file.write('\n'.join(src_final))

# Implement overall function

In [2]:
def prep_data_for_mol_transformer(df, path_to_tgt_test, output_folder_path):
    """
    df: dataframe created by ORDerly
    tgt_test_list: a list of strings for the targents (tgt, i.e. the products) in the test set. These will be removed when generating the train and val sets.
    output_folder_path: path to where to save the txt files
    
    """
    with open(path_to_tgt_test, 'r') as file:
        lines = file.readlines()

    products = [product.strip().replace(" ", "") for product in lines]
    
    print('Canonicalising...')
    # Double check that all the tgt_smiles are canonicalised
    canonical_smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles)) for smiles in tqdm(products)]
    canonical_smiles_list = list(set(canonical_smiles_list))

    # Add mask col for any rows that have product(s) contained in the test set
    
    # Add mask to remove any rows that have a reaction contained within canonical_smiles_list

    # Add a new column 'test_set', which is True if the value in 'product_000' is in canonical_smiles_list, otherwise False
    df['test_set'] = df['product_000'].isin(canonical_smiles_list)

    # if there are multiple products:
    if 'product_001' in df.columns:
        df['prod_0.prod_1'] = df['product_000']+'.'+df['product_001']
        df['prod_1.prod_0'] = df['product_001']+'.'+df['product_000']
        df['test_set_01'] = df['prod_0.prod_1'].isin(canonical_smiles_list)
        df['test_set_10'] = df['prod_1.prod_0'].isin(canonical_smiles_list)
        
        df['test_set'] = df['test_set'] + df['test_set_01'] + df['test_set_10']
        
        
    # We now can create our train_val_df
    train_val_df = df[df['test_set'] == False]
    train_val_df.reset_index(inplace=True, drop=True)
    
    pre_drop_size = len(df)
    post_drop_size = len(train_val_df)
    assert pre_drop_size > post_drop_size
    
    
    # Define a function to shuffle the values and concatenate them
    def concatenate_reagents(row, cols):
        values = [row[col] for col in cols if row[col] is not None]
        random.shuffle(values)
        for item in values:
            if type(item) is not str:
                print(row)
        return '.'.join(values)
    
    # Create src col (ie the source inputs for the model, which are the reactants, solvents, and agents)
    def create_src(df):
        df = df.copy()
        # Create a list of all columns beginning with "reactant", "solvent", and "agent"
        reagent_cols = [col for col in df.columns if col.startswith(("reactant", "solvent", "agent"))]

        # Apply the function to each row and create a new 'src' column
        df['src'] = df.apply(concatenate_reagents, args=(reagent_cols,), axis=1)
        return df
    
    def create_tgt(df):
        df = df.copy()
        prod_cols = [col for col in df.columns if col.startswith(("product"))]
        
        if len(prod_cols) == 1:
            df['tgt'] = df['product_000']
        else:
            df['tgt'] = df.apply(concatenate_reagents, args=(prod_cols,), axis=1)
        return df
    print('Creating src...')
    df = create_src(df)
    print('Creating tgt...')
    df = create_tgt(df)
    
    print('Augmenting...')
    # augment
    def augment_smiles_list(smiles_list):
        new_smiles = []
        for smiles in tqdm(smiles_list):
            random_equivalent_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom=True)
            new_smiles.append(random_equivalent_smiles)
        return smiles_list + new_smiles
    
    src = augment_smiles_list(list(df['src']))
    tgt = augment_smiles_list(list(df['tgt']))
    
    assert len(src) == 2*len(df['src'])
    assert len(tgt) == 2*len(df['tgt'])
    
    print('Tokenizing...')
    
    # tokenize smiles
    def smi_tokenizer(smi):
        """
        Tokenize a SMILES molecule or reaction
        """
        import re
        pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
        regex = re.compile(pattern)
        tokens = [token for token in regex.findall(smi)]
        assert smi == ''.join(tokens)
        return ' '.join(tokens)

    def smi_tokenizer_list(smi_list):
        new_smi_list = []
        for smi in smi_list:
            new_smi_list.append(smi_tokenizer(smi))
            
        return new_smi_list
    
    src_final = smi_tokenizer_list(src)
    tgt_final = smi_tokenizer_list(tgt)
    
    src_tgt_df = pd.DataFrame({'src': src_final, 'tgt': tgt_final})
    src_tgt_df = src_tgt_df.drop_duplicates()
    # Splitting the DataFrame into training and validation sets
    train_df, val_df = train_test_split(src_tgt_df, test_size=0.068, random_state=42)
    
    src_train = train_df['src'].tolist()
    src_val = val_df['src'].tolist()
    
    tgt_train = train_df['tgt'].tolist()
    tgt_val = val_df['tgt'].tolist()
    
    
    
    # write to txt
    with open(f'{output_folder_path}/src-train.txt', 'w') as file:
        file.write('\n'.join(src_train))
    with open(f'{output_folder_path}/src-val.txt', 'w') as file:
        file.write('\n'.join(src_val))
    with open(f'{output_folder_path}/tgt-train.txt', 'w') as file:
        file.write('\n'.join(tgt_train))
    with open(f'{output_folder_path}/tgt-val.txt', 'w') as file:
        file.write('\n'.join(tgt_val))
    
    print('Done!')
    


In [4]:
# Strict dataset/ orderly benchmark / dataset D
df = pd.read_parquet('/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_generated_datasets/strict_filtering.parquet')

# convert from .txt file to list of smiles
path_to_tgt_test = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_D/tgt-test.txt'

output_folder_path = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_D/'

prep_data_for_mol_transformer(df, path_to_tgt_test, output_folder_path)

Canonicalising...


100%|██████████| 40000/40000 [00:14<00:00, 2747.82it/s]


Creating src...
Creating tgt...
Augmenting...


100%|██████████| 356906/356906 [03:50<00:00, 1545.09it/s]
100%|██████████| 356906/356906 [02:16<00:00, 2614.26it/s]


Tokenizing...
Done!


In [3]:
# forward prediction dataset/ mid filtering / dataset E
df = pd.read_parquet('/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_generated_datasets/forward_pred_data.parquet')

# convert from .txt file to list of smiles
path_to_tgt_test = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_E/tgt-test.txt'

output_folder_path = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_E/'

prep_data_for_mol_transformer(df, path_to_tgt_test, output_folder_path)

Canonicalising...


100%|██████████| 40000/40000 [00:15<00:00, 2588.84it/s]


Creating src...
Creating tgt...
Augmenting...


100%|██████████| 919231/919231 [10:22<00:00, 1476.72it/s]
100%|██████████| 919231/919231 [05:52<00:00, 2608.53it/s]


Tokenizing...
Done!
