# Prepping data for molecular transformer

https://github.com/pschwllr/MolecularTransformer

In [1]:
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from rdkit import RDLogger 
import re
import os
RDLogger.DisableLog('rdApp.*')

In [2]:
# prepare src (ie the source inputs for the model, which are the reactants, solvents, and agents)
def process_dataframe(df):
    df = df.copy()
    # Create a list of all columns beginning with "reactant", "solvent", and "agent"
    reagent_cols = [col for col in df.columns if col.startswith(("reactant", "solvent", "agent"))]

    # Define a function to shuffle the values and concatenate them
    def concatenate_molecules(row):
        values = [row[col] for col in reagent_cols if row[col] is not None]
        random.shuffle(values)
        for item in values:
            if type(item) is not str:
                print(row)
        return '.'.join(values)

    # Apply the function to each row and create a new 'src' column
    df['src'] = df.apply(concatenate_molecules, axis=1)

    return df

In [12]:
df = process_dataframe(train_val_df)
# Splitting the DataFrame into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.068, random_state=42)

In [15]:
# create list of strings for src and tgt
src_list = df['src'].tolist()
tgt_list = df['product_000'].tolist()

In [2]:
# augment
def augment_smiles_list(smiles_list, n=10):
    new_smiles = []
    for smiles in tqdm(smiles_list):
        random_equivalent_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom=True)
    return smiles_list + new_smiles

In [28]:
src_list_aug = augment_smiles_list(src_list[:100])
tgt_list_aug = augment_smiles_list(tgt_list[:100])

100%|██████████| 100/100 [00:00<00:00, 1740.71it/s]
100%|██████████| 100/100 [00:00<00:00, 1673.53it/s]


In [30]:
# tokenize smiles
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction
    """
    import re
    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = [token for token in regex.findall(smi)]
    assert smi == ''.join(tokens)
    return ' '.join(tokens)

def smi_tokenizer_list(smi_list):
    new_smi_list = []
    for smi in smi_list:
        new_smi_list.append(smi_tokenizer(smi))
        
    return new_smi_list

In [31]:
src_final = smi_tokenizer_list(src_list_aug)

In [33]:
# write to txt
with open('output_file_using_join.txt', 'w') as file:
    file.write('\n'.join(src_final))

# Implement overall function

In [2]:
# Algorithm overview
# 1. Read in data from parquet file
# 2. Mix order of molecules (reactants, solvents, agents, products, depends on what you're doing)
# 3. create src and tgt lists
# 4. augment
# 5. tokenize
# 6. write to txt file (either split to train val or just test)

In [3]:
def prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=True, val_size=1/9, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42, condition_prediction=False):
    """
    path_to_data (str): path to the parquet file generated by ORDerly
    output_folder_path (str): path to where to save the txt files    
    is_it_train_val_data (bool): True if the data is train/val data, False if it's test data
    train_val_split (float): the proportion of the data to use for validation
    src_cols, tgt_cols (tuple): list of columns to use for the src and tgt.
        For forward prediction:
            src_cols = ["reactant", "solvent", "agent"]
            tgt_cols = ["product"]
        For retrosynthesis prediction:
            src_cols = ["product"]
            tgt_cols = ["reactant"]
    reactants_and_agents_solvents_separated (bool): True if the reactants and agents/solvents will be separated by > in the src, False if they will be separated by .
        example src separated: reactant1.reactant3.reactant2 > agent1.solvent1.agent2 
        example src mixed: reactant1.agent1.reactant2.solvent1.reactant3.agent2
        NB: the tgt will always be separated by .
        NB: the order of molecules will always be scrambled
    augment (bool): True if you want to augment the data by creating one random equivalent SMILES strings per molecule
    random_state (int): random state for train_test_split
    """
    # read in data
    print('Reading in data...')
    df = pd.read_parquet(path_to_data)
    print(f'Number of rows in df: {len(df)}')
    
    print('verifying output_folder_path...')
    if not os.path.exists(output_folder_path):
        print('Creating output_folder_path...')
        os.makedirs(output_folder_path)
    
    # remove reactions that contain ->
    # Schwaller's regex doesn't work with it
    # Filter out rows with '->' and handle None values
    df = df[df['agent_000'].apply(lambda x: x is None or '->' not in x)]

    
    
    
    # Define a function to shuffle the values and concatenate them
    def concatenate_molecules(row, cols):
        values = [row[col] for col in cols if (row[col] is not None and row[col] != 'NULL')]
        random.shuffle(values)
        for item in values:
            if type(item) is not str:
                print('Problem found: ',row)
        return '.'.join(values)
    
    # Create src col (ie the source inputs for the model, which are the reactants, solvents, and agents)
    def create_src(df, src_col_names, reactants_and_agents_solvents_separated=True, condition_prediction=False):
        """
        src_cols list: list of columns to use for the src
        """
        df = df.copy() # to avoid the SettingWithCopyWarning
        # Create a list of all columns beginning with "reactant", "solvent", and "agent"
        print(f'Creating src from {src_col_names}...')
        src_cols = [col for col in df.columns if col.startswith(src_col_names)]
        if (not reactants_and_agents_solvents_separated) or (len(src_col_names) == 1): # this one is super confusing, should change
            # Apply the function to each row and create a new 'src' column
            df['src'] = df.apply(concatenate_molecules, args=(src_cols,), axis=1)
            return df
        elif condition_prediction:
            # separate reactants and agents/solvents
            reactant_cols = [col for col in df.columns if col.startswith('reactant')]
            prod_cols = [col for col in df.columns if col.startswith(('product'))]
            
            # Apply the function to each row and create a new 'src' column
            df['reactants'] = df.apply(concatenate_molecules, args=(reactant_cols,), axis=1)
            df['products'] = df.apply(concatenate_molecules, args=(prod_cols,), axis=1)
            df['src'] = df['reactants'] + '>' + df['products']
            return df
            
        else:
            # separate reactants and agents/solvents
            reactant_cols = [col for col in df.columns if col.startswith('reactant')]
            agent_solvent_cols = [col for col in df.columns if col.startswith(('agent', 'solvent'))]
            
            # Apply the function to each row and create a new 'src' column
            df['reactants'] = df.apply(concatenate_molecules, args=(reactant_cols,), axis=1)
            df['agent_solvents'] = df.apply(concatenate_molecules, args=(agent_solvent_cols,), axis=1)
            df['src'] = df['reactants'] + '>' + df['agent_solvents']
            return df

    
    def create_tgt(df, tgt_cols):
        df = df.copy()
        tgt_cols = [col for col in df.columns if col.startswith(tgt_cols)]
        df['tgt'] = df.apply(concatenate_molecules, args=(tgt_cols,), axis=1)
        return df
    
    print('Creating src...')
    df = create_src(df, src_cols, reactants_and_agents_solvents_separated, condition_prediction)
    print('Creating tgt...')
    df = create_tgt(df, tgt_cols)
    
    print(f'Number of rows in df: {len(df)}')
    
    print('Augmenting...')
    # augment
    def augment_smiles_list(smiles_list):
        new_smiles = []
        if '>' not in smiles_list[0]: 
            # This separator is used to separate reactants and agents/solvents
            # The presence of it means that MolFromSmiles won't work, so we have to split up the smiles
            for smiles in tqdm(smiles_list):
                random_equivalent_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom=True)
                new_smiles.append(random_equivalent_smiles)
            return smiles_list + new_smiles
        else:
            for smiles in tqdm(smiles_list):
                r, a = smiles.split('>') # reactants and agents/solvents
                r_random_equivalent_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(r), doRandom=True)
                if a is not None:
                    a_random_equivalent_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(a), doRandom=True)
                    r_a = r_random_equivalent_smiles + '>' + a_random_equivalent_smiles
                else:
                    r_a = r_random_equivalent_smiles
                new_smiles.append(r_a)
            return smiles_list + new_smiles
            
    
    src = augment_smiles_list(list(df['src']))
    tgt = augment_smiles_list(list(df['tgt']))
    
    assert len(src) == 2*len(df['src'])
    assert len(tgt) == 2*len(df['tgt'])
    print(f'Number of rows in df (after augmentation). src: {len(src)}, tgt: {len(tgt)}')
    print('Tokenizing...')
    
    # tokenize smiles
    def smi_tokenizer(smi):
        """
        Tokenize a SMILES molecule or reaction
        """
        import re
        pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
        regex = re.compile(pattern)
        tokens = [token for token in regex.findall(smi)]
        assert smi == ''.join(tokens)
        return ' '.join(tokens)

    def smi_tokenizer_list(smi_list):
        new_smi_list = []
        for smi in smi_list:
            new_smi_list.append(smi_tokenizer(smi))
            
        return new_smi_list
    
    src_final = smi_tokenizer_list(src)
    tgt_final = smi_tokenizer_list(tgt)
    
    src_tgt_df = pd.DataFrame({'src': src_final, 'tgt': tgt_final})
    print(f'len before dropping duplicates: {len(src_tgt_df)}')
    src_tgt_df = src_tgt_df.drop_duplicates()
    print(f'len after dropping duplicates: {len(src_tgt_df)}')
    # Check if there are any rows with empty src or tgt
    print(f'Number of rows with empty src: {len(src_tgt_df[src_tgt_df["src"] == ""])}')
    print(f'Number of rows with empty tgt: {len(src_tgt_df[src_tgt_df["tgt"] == ""])}')
    # Drop rows with empty src or tgt
    src_tgt_df = src_tgt_df[src_tgt_df["src"] != ""]
    src_tgt_df = src_tgt_df[src_tgt_df["tgt"] != ""]
    print(f'len after dropping empty src and tgt: {len(src_tgt_df)}')
    
    if is_it_train_val_data:
        print('Splitting into train and val...')
        
        # Splitting the DataFrame into training and validation sets
        train_df, val_df = train_test_split(src_tgt_df, test_size=val_size, random_state=42)
        src_train = train_df['src'].tolist()
        src_val = val_df['src'].tolist()
        
        tgt_train = train_df['tgt'].tolist()
        tgt_val = val_df['tgt'].tolist()
        
        print(f'Number of rows in train_df: {len(train_df)}')
        print(f'Number of rows in val_df: {len(val_df)}')
    
        # write to txt
        with open(f'{output_folder_path}/src-train.txt', 'w') as file:
            file.write('\n'.join(src_train))
        with open(f'{output_folder_path}/src-val.txt', 'w') as file:
            file.write('\n'.join(src_val))
        with open(f'{output_folder_path}/tgt-train.txt', 'w') as file:
            file.write('\n'.join(tgt_train))
        with open(f'{output_folder_path}/tgt-val.txt', 'w') as file:
            file.write('\n'.join(tgt_val))
        
    else:
        print('saving test data...')
        print('Number of rows in test data: ', len(src_tgt_df))
        src_test = src_tgt_df['src'].tolist()
        tgt_test = src_tgt_df['tgt'].tolist()
        
        # write to txt
        with open(f'{output_folder_path}/src-test.txt', 'w') as file:
            file.write('\n'.join(src_test))
        with open(f'{output_folder_path}/tgt-test.txt', 'w') as file:
            file.write('\n'.join(tgt_test))
    
    print('Done!')


In [4]:
# orderly_forward_mixed
#train
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_train.parquet'
output_folder_path = parent+'transformer_data/orderly_forward_mixed'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=True, val_size=1/9, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product",), reactants_and_agents_solvents_separated=False, augment=True, random_state=42)
# NB: previous size of df: Number of rows in df: 833112
# NB: new size of df: Number of rows in df: 832809

Reading in data...
Number of rows in df: 832809
verifying output_folder_path...
Creating output_folder_path...
Creating src...
Creating src from ('reactant', 'solvent', 'agent')...
Creating tgt...
Number of rows in df: 832809
Augmenting...


100%|██████████| 832809/832809 [05:40<00:00, 2445.99it/s]
100%|██████████| 832809/832809 [03:26<00:00, 4028.86it/s]


Number of rows in df (after augmentation). src: 1665618, tgt: 1665618
Tokenizing...
len before dropping duplicates: 1665618
len after dropping duplicates: 1665595
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 1665595
Splitting into train and val...
Number of rows in train_df: 1480528
Number of rows in val_df: 185067
Done!


In [5]:
# orderly_forward_mixed
#test
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_test.parquet'
output_folder_path = parent+'transformer_data/orderly_forward_mixed'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1/9, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product",), reactants_and_agents_solvents_separated=False, augment=True, random_state=42)

Reading in data...
Number of rows in df: 86268
verifying output_folder_path...
Creating src...
Creating src from ('reactant', 'solvent', 'agent')...
Creating tgt...
Number of rows in df: 86268
Augmenting...


100%|██████████| 86268/86268 [00:36<00:00, 2382.11it/s]
100%|██████████| 86268/86268 [00:22<00:00, 3822.01it/s]


Number of rows in df (after augmentation). src: 172536, tgt: 172536
Tokenizing...
len before dropping duplicates: 172536
len after dropping duplicates: 172535
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 172535
saving test data...
Number of rows in test data:  172535
Done!


In [6]:
# orderly_forward_separated
#train
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_train.parquet'
output_folder_path = parent+'transformer_data/orderly_forward_separated'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=True, val_size=1/9, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42)

Reading in data...
Number of rows in df: 832809
verifying output_folder_path...
Creating output_folder_path...
Creating src...
Creating src from ('reactant', 'solvent', 'agent')...
Creating tgt...
Number of rows in df: 832809
Augmenting...


100%|██████████| 832809/832809 [05:37<00:00, 2470.56it/s]
100%|██████████| 832809/832809 [03:24<00:00, 4077.53it/s]


Number of rows in df (after augmentation). src: 1665618, tgt: 1665618
Tokenizing...
len before dropping duplicates: 1665618
len after dropping duplicates: 1665602
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 1665602
Splitting into train and val...
Number of rows in train_df: 1480535
Number of rows in val_df: 185067
Done!


In [7]:
# orderly_forward_separated
#test
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_test.parquet'
output_folder_path = parent+'transformer_data/orderly_forward_separated'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1/9, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42)

Reading in data...
Number of rows in df: 86268
verifying output_folder_path...
Creating src...
Creating src from ('reactant', 'solvent', 'agent')...
Creating tgt...
Number of rows in df: 86268
Augmenting...


100%|██████████| 86268/86268 [00:36<00:00, 2355.28it/s]
100%|██████████| 86268/86268 [00:22<00:00, 3837.48it/s]


Number of rows in df (after augmentation). src: 172536, tgt: 172536
Tokenizing...
len before dropping duplicates: 172536
len after dropping duplicates: 172535
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 172535
saving test data...
Number of rows in test data:  172535
Done!


In [8]:
# orderly_retro
# train
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_retro_train.parquet'
output_folder_path = parent+'transformer_data/orderly_retro'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=True, val_size=1/9, src_cols = ("product",), tgt_cols=("reactant",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42)

Reading in data...
Number of rows in df: 852476
verifying output_folder_path...
Creating output_folder_path...
Creating src...
Creating src from ('product',)...
Creating tgt...
Number of rows in df: 852476
Augmenting...


100%|██████████| 852476/852476 [03:24<00:00, 4170.29it/s]
100%|██████████| 852476/852476 [03:57<00:00, 3584.72it/s]


Number of rows in df (after augmentation). src: 1704952, tgt: 1704952
Tokenizing...
len before dropping duplicates: 1704952
len after dropping duplicates: 1674222
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 1674222
Splitting into train and val...
Number of rows in train_df: 1488197
Number of rows in val_df: 186025
Done!


In [9]:
# orderly_retro
# test
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_retro_test.parquet'
output_folder_path = parent+'transformer_data/orderly_retro'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1/9, src_cols = ("product",), tgt_cols=("reactant",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42)

Reading in data...
Number of rows in df: 87172
verifying output_folder_path...
Creating src...
Creating src from ('product',)...
Creating tgt...
Number of rows in df: 87172
Augmenting...


100%|██████████| 87172/87172 [00:22<00:00, 3865.63it/s]
100%|██████████| 87172/87172 [00:26<00:00, 3322.14it/s]


Number of rows in df (after augmentation). src: 174344, tgt: 174344
Tokenizing...
len before dropping duplicates: 174344
len after dropping duplicates: 174161
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 174161
saving test data...
Number of rows in test data:  174161
Done!


In [10]:
# # orderly_condition
# # train
# parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
# path_to_data = parent+'orderly_benchmarks/orderly_condition_train.parquet'
# output_folder_path = parent+'transformer_data/orderly_condition'


# df = prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=True, val_size=1/9, src_cols = ("reactant","product",), tgt_cols=("reactant", "solvent", "agent",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42, condition_prediction=True)


In [11]:
# # orderly_condition
# # test
# parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
# path_to_data = parent+'orderly_benchmarks/orderly_condition_test.parquet'
# output_folder_path = parent+'transformer_data/orderly_condition'


# prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1/9, src_cols = ("reactant","product",), tgt_cols=("reactant", "solvent", "agent",), reactants_and_agents_solvents_separated=True, augment=True, random_state=42, condition_prediction=True)


In [15]:
# Not uspto forward separated
#test
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_non_uspto.parquet'
output_folder_path = parent+'transformer_data/not_uspto_forward_separated'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product"), reactants_and_agents_solvents_separated=True, augment=True, random_state=42, condition_prediction=False)

Reading in data...
Number of rows in df: 29417
verifying output_folder_path...
Creating output_folder_path...
Creating src...
Creating src from ('reactant', 'solvent', 'agent')...
Creating tgt...
Number of rows in df: 25105
Augmenting...


100%|██████████| 25105/25105 [00:09<00:00, 2556.62it/s]
100%|██████████| 25105/25105 [00:07<00:00, 3528.49it/s]


Number of rows in df (after augmentation). src: 50210, tgt: 50210
Tokenizing...
len before dropping duplicates: 50210
len after dropping duplicates: 50210
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 50210
saving test data...
Number of rows in test data:  50210
Done!


In [16]:
# Not uspto forward mixed
#test
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_non_uspto.parquet'
output_folder_path = parent+'transformer_data/not_uspto_forward_mixed'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1, src_cols = ("reactant", "solvent", "agent",), tgt_cols=("product"), reactants_and_agents_solvents_separated=False, augment=True, random_state=42, condition_prediction=False)

Reading in data...
Number of rows in df: 29417
verifying output_folder_path...
Creating output_folder_path...
Creating src...
Creating src from ('reactant', 'solvent', 'agent')...
Creating tgt...
Number of rows in df: 25105
Augmenting...


100%|██████████| 25105/25105 [00:09<00:00, 2616.30it/s]
100%|██████████| 25105/25105 [00:07<00:00, 3553.24it/s]


Number of rows in df (after augmentation). src: 50210, tgt: 50210
Tokenizing...
len before dropping duplicates: 50210
len after dropping duplicates: 50210
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 50210
saving test data...
Number of rows in test data:  50210
Done!


In [17]:
# Not uspto retro
#test
parent = '/Users/dsw46/Projects_local/ORDerly_jcim_response/'
path_to_data = parent+'orderly_benchmarks/orderly_forward_non_uspto.parquet'
output_folder_path = parent+'transformer_data/not_uspto_retro'


prep_data_for_mol_transformer(path_to_data, output_folder_path, is_it_train_val_data=False, val_size=1, src_cols = ("product",), tgt_cols=("reactant"), reactants_and_agents_solvents_separated=True, augment=True, random_state=42, condition_prediction=False)

Reading in data...
Number of rows in df: 29417
verifying output_folder_path...
Creating output_folder_path...
Creating src...
Creating src from ('product',)...
Creating tgt...
Number of rows in df: 25105
Augmenting...


100%|██████████| 25105/25105 [00:07<00:00, 3567.58it/s]
100%|██████████| 25105/25105 [00:09<00:00, 2727.94it/s]


Number of rows in df (after augmentation). src: 50210, tgt: 50210
Tokenizing...
len before dropping duplicates: 50210
len after dropping duplicates: 49206
Number of rows with empty src: 0
Number of rows with empty tgt: 0
len after dropping empty src and tgt: 49206
saving test data...
Number of rows in test data:  49206
Done!
