# Inspect USPTO 50k

In [None]:
from tdc.generation import RetroSyn

# Load the USPTO-50K retrosynthesis dataset and fetch its splits
# (the 'train' split is inspected below).
retro_dataset_name = 'USPTO-50K'
data = RetroSyn(name=retro_dataset_name)
split = data.get_split()

Downloading...
100%|██████████| 5.22M/5.22M [00:01<00:00, 4.48MiB/s]
Loading...
Done!


In [6]:
# Show one example row of the training split (it has an 'input' and an
# 'output' SMILES column).
train_split = split['train']
train_split.iloc[4]

input                C#CCCCCC(C)(C)c1cc(OC)cc(OC)c1
output    COc1cc(OC)cc(C(C)(C)CCCCC#C[Si](C)(C)C)c1
Name: 4, dtype: object

# Generate txt files

In [2]:
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')  

In [None]:
def prep_data_for_retrosynth_transformer(df, path_to_src_test, output_folder_path):
    """
    Write tokenized, augmented train/val txt files for a retrosynthesis transformer.

    The model input (src) is the product SMILES in 'product_000'; the model
    output (tgt) is the '.'-joined, randomly ordered reactant SMILES. Every
    reaction whose product also appears in the test set is removed before the
    train/val files are generated, so that the test set stays unseen.

    df: dataframe created by ORDerly. Must contain 'product_000' and the
        'reactant*' columns and must not contain 'product_001'. It is not
        modified by this function.
    path_to_src_test: path to a txt file holding one test-set product SMILES
        per line (spaces between tokens are stripped before canonicalising).
    output_folder_path: path to the folder where src_train.txt, src_val.txt,
        tgt_train.txt and tgt_val.txt are written.

    Raises AssertionError if df has a second product column, if the test-set
    filter removes no reaction at all, or if a SMILES string cannot be
    tokenized without losing characters.
    """
    import re  # local import keeps this notebook cell self-contained

    with open(path_to_src_test, 'r') as file:
        test_products = [line.strip().replace(" ", "") for line in file]

    print('Canonicalising...')
    # Canonicalise the test products so that string comparison against
    # 'product_000' is meaningful; a set gives de-duplication for free.
    test_product_set = {
        Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
        for smiles in tqdm(test_products)
    }

    # Retrosynthesis can only handle reactions with exactly one product.
    assert 'product_001' not in df.columns, \
        "df must contain a single product column ('product_000')"

    # Boolean mask of the rows whose product is part of the test set. The mask
    # is kept local so the caller's dataframe does not gain an extra column.
    in_test_set = df['product_000'].isin(test_product_set)
    train_val_df = df[~in_test_set].reset_index(drop=True)

    pre_drop_size = len(df)
    post_drop_size = len(train_val_df)
    # Sanity check: the test set is expected to overlap with df.
    assert pre_drop_size > post_drop_size, \
        'no reaction in df has a product contained in the test set'
    n_dropped = pre_drop_size - post_drop_size
    print(f'{n_dropped} of {pre_drop_size} reactions dropped '
          f'({n_dropped / pre_drop_size:.2%})')

    def concatenate_reagents(row, cols):
        """Join the non-None values of `cols` in random order with '.'."""
        values = [row[col] for col in cols if row[col] is not None]
        random.shuffle(values)
        for item in values:
            if not isinstance(item, str):
                # Show the offending row before str.join raises on it.
                print(row)
        return '.'.join(values)

    # From here on only train_val_df is used: building src/tgt from the
    # unfiltered df would leak the test-set reactions into train/val.
    print('Creating src...')
    src = train_val_df['product_000'].tolist()

    print('Creating tgt...')
    reactant_cols = [col for col in train_val_df.columns if col.startswith('reactant')]
    tgt = [
        concatenate_reagents(row, reactant_cols)
        for _, row in train_val_df.iterrows()
    ]

    print('Augmenting...')

    def augment_smiles_list(smiles_list):
        """Return smiles_list followed by one randomised equivalent per entry."""
        randomised = [
            Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom=True)
            for smiles in tqdm(smiles_list)
        ]
        return smiles_list + randomised

    # Both lists are extended in the same order, so src[i] still pairs with tgt[i].
    src = augment_smiles_list(src)
    tgt = augment_smiles_list(tgt)

    print('Tokenizing...')

    # Compiled once (raw string, so the escapes are regex escapes rather than
    # invalid Python string escapes).
    token_regex = re.compile(
        r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    )

    def smi_tokenizer(smi):
        """
        Tokenize a SMILES molecule or reaction into space-separated tokens
        """
        tokens = token_regex.findall(smi)
        # Every character must end up in some token, otherwise the pattern
        # does not cover this SMILES string.
        assert smi == ''.join(tokens), f'could not tokenize SMILES: {smi}'
        return ' '.join(tokens)

    src_final = [smi_tokenizer(smi) for smi in src]
    tgt_final = [smi_tokenizer(smi) for smi in tgt]

    src_tgt_df = pd.DataFrame({'src': src_final, 'tgt': tgt_final})
    src_tgt_df = src_tgt_df.drop_duplicates()
    # Splitting the DataFrame into training and validation sets
    train_df, val_df = train_test_split(src_tgt_df, test_size=0.068, random_state=42)

    # write to txt (one tokenized SMILES per line)
    outputs = {
        'src_train': train_df['src'],
        'src_val': val_df['src'],
        'tgt_train': train_df['tgt'],
        'tgt_val': val_df['tgt'],
    }
    for stem, column in outputs.items():
        with open(f'{output_folder_path}/{stem}.txt', 'w') as file:
            file.write('\n'.join(column.tolist()))

    print('Done!')
    


In [None]:
# Strict dataset/ orderly benchmark / dataset D
df = pd.read_parquet('/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_generated_datasets/strict_filtering.parquet')

# Path to the txt file with the test-set SMILES (despite the variable name,
# this is the src-test.txt file; the function reads and canonicalises it).
# NOTE(review): both paths below point into the forward/ dataset folder although
# this notebook prepares retrosynthesis data - confirm the intended locations.
path_to_tgt_test = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_D/src-test.txt'

output_folder_path = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_D/'

# The function defined in this notebook is prep_data_for_retrosynth_transformer;
# prep_data_for_mol_transformer does not exist here and raised a NameError.
prep_data_for_retrosynth_transformer(df, path_to_tgt_test, output_folder_path)

In [None]:
# retrosynth prediction dataset/ loose filtering / dataset F
df = pd.read_parquet('/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_generated_datasets/retrosynth_data.parquet')

# Path to the txt file with the test-set SMILES (despite the variable name,
# this is the src-test.txt file; the function reads and canonicalises it).
path_to_tgt_test = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/retrosynth/dataset_F/src-test.txt'

# NOTE(review): the test file above lives under retrosynth/dataset_F but the
# output goes to forward/dataset_F - confirm this is the intended folder.
output_folder_path = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_transformer_datasets/forward/dataset_F/'

# The function defined in this notebook is prep_data_for_retrosynth_transformer;
# prep_data_for_mol_transformer does not exist here and raised a NameError.
prep_data_for_retrosynth_transformer(df, path_to_tgt_test, output_folder_path)

In [3]:
# Load the strict-filtering parquet into `df`.
strict_filtering_path = '/Users/dsw46/Projects_local/orderly_reviewer_response/orderly_generated_datasets/strict_filtering.parquet'
df = pd.read_parquet(strict_filtering_path)