# Generate SMILES file from ORDerly dataset

In [7]:
import pandas as pd
import numpy as np

In [8]:
import pandas as pd

def save_react_prod_to_smi(input_path, output_path, include_agents=False):
    """
    Write one reaction SMILES line per row of an ORDerly parquet file.

    Each line has the form ``reactants>agents>products`` when include_agents
    is True and ``reactants>>products`` otherwise. Molecules within a group
    are joined with '.'. Missing entries (None, NaN, pd.NA or the literal
    string "NULL") are skipped, so a row without any agents still yields
    ``reactants>>products``.

    input_path: path to parquet file; it must contain the columns
        reactant_000, reactant_001 and product_000, plus solvent_000,
        solvent_001, agent_000, agent_001 and agent_002 when include_agents
        is True
    output_path: path to save the output file
    include_agents: if True, include agents in the SMILES string

    Raises KeyError if a required column is absent from the parquet file.
    """
    reactant_columns = ['reactant_000', 'reactant_001']
    agent_columns = ['solvent_000', 'solvent_001', 'agent_000', 'agent_001', 'agent_002']
    product_columns = ['product_000']

    # Only the groups that end up in the output are read from each row, so the
    # agent/solvent columns are not required when include_agents is False.
    if include_agents:
        groups = [reactant_columns, agent_columns, product_columns]
        separator = '>'
    else:
        groups = [reactant_columns, product_columns]
        separator = '>>'

    orderly_data = pd.read_parquet(input_path)
    required_columns = [column for group in groups for column in group]

    def is_present(element):
        # pd.isna covers None, NaN and pd.NA; "NULL" is the textual
        # placeholder for a missing molecule.
        return not pd.isna(element) and element != "NULL"

    # Concatenate the present elements of one group, separated by '.'
    def concatenate_group(row, group):
        return '.'.join(str(row[column]) for column in group if is_present(row[column]))

    # Plain dict records avoid the per-row Series construction of
    # DataFrame.apply(axis=1) and leave the loaded frame unmodified.
    rxn = [
        separator.join(concatenate_group(row, group) for group in groups)
        for row in orderly_data[required_columns].to_dict('records')
    ]

    with open(output_path, 'w', encoding='utf-8') as file:
        # Write each string followed by a newline character
        file.writelines(string + '\n' for string in rxn)

    print("File saved successfully.")

In [4]:
# Takes about 25s
# Export the train and test splits as reaction SMILES that include the agents.
parent = '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/'
for parquet_name, smi_name in [
    ('orderly_no_trust_no_min_freq_train.parquet', 'orderly_condition_train.smi'),
    ('orderly_no_trust_no_min_freq_test.parquet', 'orderly_condition_test.smi'),
]:
    save_react_prod_to_smi(parent + parquet_name, parent + smi_name, include_agents=True)

File saved successfully.
File saved successfully.


In [9]:
# Takes about 25s
# Export the train and test splits as reaction SMILES without the agents.
parent = '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/'
for parquet_name, smi_name in [
    ('orderly_no_trust_no_min_freq_train.parquet', 'orderly_condition_wo_ag_train.smi'),
    ('orderly_no_trust_no_min_freq_test.parquet', 'orderly_condition_wo_ag_test.smi'),
]:
    save_react_prod_to_smi(parent + parquet_name, parent + smi_name, include_agents=False)

File saved successfully.
File saved successfully.


In [10]:
# Print the namerxn invocation for the agent-free training file.
# run this command:
# ./namerxn <infile> [<outfile>]
in_path, out_path = (
    '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_wo_ag_train.smi',
    '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_wo_ag_train_classified.smi',
)
print(''.join(['./namerxn ', in_path, ' ', out_path]))

./namerxn /Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_wo_ag_train.smi /Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_wo_ag_train_classified.smi


In [5]:
# Print the namerxn invocation for the test file that includes agents.
# run this command:
# ./namerxn <infile> [<outfile>]
in_path, out_path = (
    '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_test.smi',
    '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_test_classified.smi',
)
print(''.join(['./namerxn ', in_path, ' ', out_path]))

./namerxn /Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_test.smi /Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_test_classified.smi


In [None]:
# A bare shell command is not valid Python, so running this cell raised a
# SyntaxError. Print the command instead, as the neighbouring cells do.
# run this command:
# ./namerxn <infile> [<outfile>]
in_path = '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_wo_ag_test.smi'
out_path = '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_condition_wo_ag_test_classified.smi'
print('./namerxn '+in_path+' '+out_path)

# Inspect the training data

In [35]:
# Load the training parquet file into a DataFrame for inspection.
path = '/Users/danielwigh/projects_local/chemical-parameter-sharing/data/v6/orderly_no_trust_no_min_freq_train.parquet'
df = pd.read_parquet(path=path)

In [34]:
# Total number of rows
print(df.shape[0])
# Number of rows whose reactant_001 is not missing
print(int(df['reactant_001'].notna().sum()))
# check if "NULL" is in the column
print(any(value == 'NULL' for value in df['agent_001'].dropna()))

682576
682576
False


In [32]:
# Show the rows whose agent_002 is the literal string 'NULL'
df.loc[df['agent_002'] == 'NULL']

Unnamed: 0_level_0,original_index,agent_000,agent_001,agent_002,date_of_experiment,extracted_from_file,grant_date,is_mapped,procedure_details,product_000,reactant_000,reactant_001,rxn_str,rxn_time,solvent_000,solvent_001,temperature,yield_000
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
