In [1]:
import pandas as pd
from typing import List
import numpy as np
from rdkit import Chem


In [10]:
def canonicalize_smiles(smiles):
    """
    Return the RDKit canonical form of a SMILES string.

    Parameters:
    smiles (str): SMILES string describing a molecule.

    Returns:
    str: Canonical SMILES as produced by ``Chem.MolToSmiles``.

    Raises:
    ValueError: If ``Chem.MolFromSmiles`` returns None for ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    # A None result is treated as an unparseable input.
    if parsed is None:
        raise ValueError("Invalid SMILES string")
    return Chem.MolToSmiles(parsed, canonical=True)

# Example usage
smiles = "[Pd]/C"  # palladium attached to a carbon atom -- not benzene
canonical_smiles = canonicalize_smiles(smiles)
print(canonical_smiles)


C[Pd]


# Unique values above/below a threshold

In [2]:
def get_freq(df, cols:List[str], threshold:int):
    """
    Summarise how often each distinct value occurs across a group of columns.

    Parameters:
    df (pd.DataFrame): Frame containing the columns named in ``cols``.
    cols (List[str]): Columns whose values are pooled together before counting.
    threshold (int): Occurrence count separating frequent from rare values.

    Returns:
    tuple: ``(n_frequent, n_rare, pct_missing)``. ``n_frequent`` is the number
    of distinct values occurring at least ``threshold`` times, ``n_rare`` the
    number occurring fewer than ``threshold`` times, and ``pct_missing`` the
    mean of the per-column null fractions as a percentage rounded to two
    decimals. When ``cols`` contains "reactant_000" or "product_000" the
    result is ``(n_distinct, 0, pct_missing)`` instead.
    """
    selected = df[cols]

    # Pool the selected columns into one Series and count each distinct value
    occurrences = selected.stack().value_counts()

    # Distinct values at or above the threshold, and strictly below it
    n_frequent = len(occurrences[occurrences >= threshold])
    n_rare = len(occurrences[occurrences < threshold])

    # Mean of the per-column null fractions, as a percentage
    pct_missing = round(selected.isnull().mean().mean() * 100, 2)

    # Reactant/product groups report only the total number of distinct values
    reports_total_only = "reactant_000" in cols or "product_000" in cols
    if reports_total_only:
        return len(occurrences), 0, pct_missing
    return n_frequent, n_rare, pct_missing

    


In [3]:
def build_overleaf_table(path, list_of_cols, threshold):
    """
    Print one table entry per column group of a parquet dataset.

    Parameters:
    path (str): Location of the parquet file, read with ``pd.read_parquet``.
    list_of_cols (list of list of str): Column groups; each group is
        summarised as a whole by ``get_freq``.
    threshold (int): Frequency threshold forwarded to ``get_freq``.

    Returns:
    None: Each entry is printed, nothing is returned.
    """
    df = pd.read_parquet(path)
    for cols in list_of_cols:
        above_threshold, below_threshold, none_percentage = get_freq(df, cols, threshold)
        if len(cols) > 4:
            # Groups of more than four columns are labelled collectively
            component = "everything"
        else:
            # Label the row with the first column's name minus its trailing
            # "_<suffix>". rsplit (instead of unpacking split("_")) also copes
            # with names that contain zero or several underscores.
            component = cols[0].rsplit("_", 1)[0]

        # Same bytes as the original triple-quoted f-string; the newlines and
        # the trailing space after "&" are written out explicitly so they
        # cannot be lost to whitespace stripping.
        table_entry = (
            "\n"
            f"        {component} & {above_threshold} // {below_threshold} // {none_percentage} & "
            "\n        "
        )
        print(table_entry)
    
    
    

In [4]:
# Column groups, one list per reaction component; these lists (or
# concatenations of them) are the entries passed to build_overleaf_table.
solv_cols = ["solvent_000", "solvent_001"]
catalyst_cols = ["catalyst_000"]
agent_cols = ["agent_000", "agent_001", "agent_002"]
reagent_cols = ["reagent_000", "reagent_001"]
reactant_cols = ["reactant_000", "reactant_001"]
product_cols = ["product_000"]

In [5]:
# Frequency table for the uspto_no_trust dataset; this run summarises the
# agent columns (no separate catalyst/reagent group).
# Earlier path, kept for reference:
# path = "/Users/dsw46/Projects_local/orderly_07_06/ORDerly/data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet"
path =   "/Users/dsw46/Projects_local/ORDerly_jcim_response/ORDerly/data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet"
threshold = 100
list_of_cols = [reactant_cols, product_cols, solv_cols, agent_cols]
build_overleaf_table(path, list_of_cols, threshold)


        reactant & 503625 // 0 // 12.96 & 
        

        product & 694279 // 0 // 0.0 & 
        

        solvent & 104 // 316 // 45.72 & 
        

        agent & 275 // 24547 // 60.93 & 
        


In [7]:
# Preview the combined catalyst + reagent column group
catalyst_cols+reagent_cols

['catalyst_000', 'reagent_000', 'reagent_001']

In [9]:
# Frequency table for the uspto_with_trust dataset; catalyst and reagent
# columns are pooled into one group, which build_overleaf_table labels after
# the first column in the group ("catalyst").
# Earlier path, kept for reference:
# path = "/Users/dsw46/Projects_local/orderly_07_06/ORDerly/data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet"
path = "/Users/dsw46/Projects_local/ORDerly_jcim_response/ORDerly/data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet"
threshold = 100
list_of_cols = [reactant_cols, product_cols, solv_cols, catalyst_cols+reagent_cols]
build_overleaf_table(path, list_of_cols, threshold)


        reactant & 207066 // 0 // 6.95 & 
        

        product & 253908 // 0 // 0.0 & 
        

        solvent & 59 // 598 // 65.84 & 
        

        catalyst & 50 // 546 // 92.54 & 
        


In [4]:
# Load the filtered uspto_no_trust dataset and show its (rows, columns) shape.
# NOTE(review): this path points at the orderly_07_06 checkout, not the
# ORDerly_jcim_response checkout used by the table cells -- confirm which is intended.
path = "/Users/dsw46/Projects_local/orderly_07_06/ORDerly/data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet"
df = pd.read_parquet(path)
df.shape

(411538, 17)

In [2]:
import pandas as pd

In [6]:
# Number of distinct values in product_000 (value_counts has one entry per distinct value)
df['product_000'].value_counts().shape

(382850,)

# Most common agents in orderly-cond

In [2]:
import pandas as pd

In [3]:
# Load orderly_no_trust_no_min_freq_train.parquet (datasets_v6); `df` is rebound to this file here.
path = "/Users/dsw46/Projects_local/ORDerly-project/ORDerly/data/orderly/datasets_v6/orderly_no_trust_no_min_freq_train.parquet"
df = pd.read_parquet(path)

In [43]:
# Check 1: look for rows where [Pd] is stored in agent_001 rather than agent_000
df[df['agent_001'] == "[Pd]"]
# Bug 1: the query returns rows, i.e. [Pd] is not always placed first among the agents, but it should be.


Unnamed: 0_level_0,original_index,agent_000,agent_001,agent_002,date_of_experiment,extracted_from_file,grant_date,is_mapped,procedure_details,product_000,reactant_000,reactant_001,rxn_str,rxn_time,solvent_000,solvent_001,temperature,yield_000
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
371755,864236,O=C[O-],[Pd],[NH4+],NaT,ord_dataset-b8b98725045d45bdbd73512048f4b47e,2009-01-01 00:02:00,True,A solution of 740 mg of 5-benzyloxy-3-[1-tert-...,CC(C)(C)OC(=O)n1nc(-c2cc3cc(OCCN4CCOCC4)ccc3n2...,CC(C)(C)OC(=O)n1nc(-c2cc3cc(OCCN4CCOCC4)ccc3n2...,,[C:1]([O:5][C:6]([N:8]1[C:16]2[C:11](=[CH:12][...,,CCO,,25.0,83.2
111108,245483,CNC,[Pd],,NaT,ord_dataset-5eb2900a93c842ee98f26c305e657b61,1992-01-01 00:04:00,True,"10 g (0.03 mol) of 2-(2'-hydroxy-3',5'-di-t-bu...",CC(C)(C)c1cc(-n2nc3ccccc3n2)c(O)c(C(C)(C)C)c1,CC(C)(C)c1cc(-n2nc3ccccc3[n+]2[O-])c(O)c(C(C)(...,[H][H],[OH:1][C:2]1[C:7]([C:8]([CH3:11])([CH3:10])[CH...,,Cc1ccccc1,O,50.0,73.2
67346,150389,Cl,[Pd],,NaT,ord_dataset-b9a9e369e9da4413999591aa08f4c3e3,1986-01-01 00:11:00,True,4.36 g. of the resultant compound of Example 5...,COc1ccc2c(c1OC)CC1CNCC21,COc1ccc2c(c1OC)CC1CN(Cc3ccccc3)CC21,,[ClH:1].C([N:9]1[CH2:13][CH:12]2[CH2:14][C:15]...,,CO,,,
121435,269124,[BH4-],[Pd],[Na+],NaT,ord_dataset-a20aed058d7b40bc81fdf50bc5b03f97,1993-01-01 00:06:00,True,"4-(2-hydroxyethoxy)-6,6'-dimethyl-2,2'-bipyrid...",Cc1cccc(-c2cc(OCCO)cc(C)n2)n1,Cc1cccc(-c2cc(OCCO)cc(C)[n+]2[O-])n1,,[OH:1][CH2:2][CH2:3][O:4][C:5]1[CH:6]=[C:7]([C...,,CO,,,
21375,42319,NC(=O)C(=O)O,[Pd],,NaT,ord_dataset-4c8627b52d564809adb9b494879c07c0,1978-01-01 00:07:00,True,A suspension of 10 parts of [[[2-[3-(dimethyla...,CCOC(=O)CN(Cc1ccccc1OCCCN(C)C)C(=O)C=Cc1ccc(N)cc1,CCOC(=O)CN(Cc1ccccc1OCCCN(C)C)C(=O)C=Cc1ccc([N...,[H][H],[C:1]([OH:6])(=[O:5])[C:2]([NH2:4])=[O:3].[CH3...,,CCO,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752297,1767946,c1ccc(P(c2ccccc2)c2ccccc2)cc1,[Pd],,NaT,ord_dataset-0b32b90cc77b4a3db47ad263e0bbc1a8,2016-01-01 00:09:00,True,"A mixture of dimethyl(2S,3S,5S)-5-[(allyloxy)c...",C=CCO[C@H]1C[C@H](C(=O)OC)[C@@H](C(=O)N2CCN(c3...,C1CCOC1,C=CCOC(=O)O[C@H]1C[C@H](C(=O)OC)[C@@H](C(=O)N2...,C(O[C:5]([O:7][C@@H:8]1[CH2:13][N:12]([C:14]([...,,,,,
179187,419812,Cl,[Pd],,NaT,ord_dataset-94e21e9990034c729ea727e7d2ab0eb0,1998-01-01 00:12:00,True,To a solution of 4-amino-3-nitrophenol (25.0 g...,Nc1ccc(O)cc1N,Nc1ccc(O)cc1[N+](=O)[O-],,[NH2:1][C:2]1[CH:7]=[CH:6][C:5]([OH:8])=[CH:4]...,12.0,CO,,,
147296,329677,NN,[Pd],,NaT,ord_dataset-2c460e2ef9934444aaf26fec1f75741f,1996-01-01 00:05:00,True,"To a solution of 5 mmol of 6,7-dihydro-5-(4-ni...",Nc1ccc(C(=O)N2CCc3cccn3-c3ccccc32)cc1,O=C(c1ccc([N+](=O)[O-])cc1)N1CCc2cccn2-c2ccccc21,,[N+:1]([C:4]1[CH:25]=[CH:24][C:7]([C:8]([N:10]...,,CCO,,,
236303,560550,c1ccc(P(c2ccccc2)c2ccccc2)cc1,[Pd],,NaT,ord_dataset-7c28974b7fcf4c9c86d5f2a42ba328a2,2002-01-01 00:09:00,True,"A mixture of 6-(3-bromo-phenyl)-4,4-dimethyl-1...",CC1(C)OC(=O)Nc2ccc(-c3cccc(C#C[Si](C)(C)C)c3)cc21,C#C[Si](C)(C)C,CC1(C)OC(=O)Nc2ccc(-c3cccc(Br)c3)cc21,Br[C:2]1[CH:3]=[C:4]([C:8]2[CH:20]=[CH:19][C:1...,,CCN(CC)CC,,80.0,91.8


In [24]:
# Bug 2: charcoal is being written as the SMILES "C", which reads as methane (CH4).
# Print the procedure text and the index of the first row with agent_001 == "C".
print(df[df['agent_001'] == "C"]['procedure_details'].iloc[0])
print(df[df['agent_001'] == "C"]['procedure_details'].index[0])

To a stirred solution of 3-(4-propyl-2-methoxyphenoxy)-2-nitropyridine (1.1 g, 3.8 mmol) in methanol (15 ml) was added anhydrous Ferric chloride (55 mg, 5% by wt) and activated charcoal (55 mg, 5% by wt). The resulting mixture was heated to reflux and hydrazine hydrate (570 mg, 11.45 mmol) was added dropwise. The reaction was allowed to stir under reflux condition overnight, then filtered through celite. The filtrate was concentrated under reduced pressure, taken in ethyl acetate (150 ml). The organic layer was washed with water followed by brine, dried over anhydrous sodium sulfate and concentrated under reduced pressure. The residue was passed through silica column (eluant ethylacetate/pet ether 1:3 to get 900 mg (91.3%) of title compound as a pale yellow solid.
619281


In [23]:
# Reaction string of the row found above (index 619281) whose agent_001 is "C"
df.loc[619281]['rxn_str']

'[CH2:1]([C:4]1[CH:19]=[CH:18][C:7]([O:8][C:9]2[C:10]([N+:15]([O-])=O)=[N:11][CH:12]=[CH:13][CH:14]=2)=[C:6]([O:20][CH3:21])[CH:5]=1)[CH2:2][CH3:3].C.O.NN>CO>[CH3:21][O:20][C:6]1[CH:5]=[C:4]([CH2:1][CH2:2][CH3:3])[CH:19]=[CH:18][C:7]=1[O:8][C:9]1[C:10]([NH2:15])=[N:11][CH:12]=[CH:13][CH:14]=1'

In [None]:
'[CH2:1]([C:4]1[CH:19]=[CH:18][C:7]([O:8][C:9]2[C:10]([N+:15]([O-])=O)=[N:11][CH:12]=[CH:13][CH:14]=2)=[C:6]([O:20][CH3:21])[CH:5]=1)[CH2:2][CH3:3].C.O.NN>CO>[CH3:21][O:20][C:6]1[CH:5]=[C:4]([CH2:1][CH2:2][CH3:3])[CH:19]=[CH:18][C:7]=1[O:8][C:9]1[C:10]([NH2:15])=[N:11][CH:12]=[CH:13][CH:14]=1'

In [41]:
# Look for rows whose entire procedure text is exactly "zeolite".
# NOTE(review): == is an exact match, so procedures that merely mention zeolite are
# not found; a substring search (e.g. .str.contains) may have been intended -- confirm.
df[df['procedure_details'] == "zeolite"]

Unnamed: 0_level_0,original_index,agent_000,agent_001,agent_002,date_of_experiment,extracted_from_file,grant_date,is_mapped,procedure_details,product_000,reactant_000,reactant_001,rxn_str,rxn_time,solvent_000,solvent_001,temperature,yield_000
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


In [40]:
# Number of rows whose agent_002 is the SMILES "C"
len(df[df['agent_002'] == "C"])

57

In [None]:
'C(O[CH:4](OCC)[CH2:5][NH:6][C:7]1[N:11]([CH3:12])[N:10]=[N:9][N:8]=1)C.[ClH:16].[OH:17][C:18]1[CH:19]=[C:20]([CH:24]=[CH:25][C:26]=1[OH:27])[CH2:21][CH2:22][NH2:23].Cl.C>O.C(O)C>[ClH:16].[CH3:12][N:11]1[C:7]([NH:6][CH2:5][CH:4]2[C:24]3[C:20](=[CH:19][C:18]([OH:17])=[C:26]([OH:27])[CH:25]=3)[CH2:21][CH2:22][NH:23]2)=[N:8][N:9]=[N:10]1'

In [25]:
# The 20 most common values in agent_000, with their row counts
df['agent_000'].value_counts().head(20)

agent_000
Cl                                                                                                                                        63102
[Na+]                                                                                                                                     56010
[K+]                                                                                                                                      25356
[OH-]                                                                                                                                     22763
[Pd]                                                                                                                                      22616
O=C([O-])[O-]                                                                                                                             21063
[H-]                                                                                                                          

In [28]:
# Reaction string of the first row whose agent_000 is "Cl"
df[df['agent_000']=="Cl"]['rxn_str'].iloc[0]

'[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=O)=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:4])([CH3:3])[CH3:2].Cl.[NH2:35][OH:36]>N1C=CC=CC=1.O>[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=[N:35][OH:36])=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:2])([CH3:4])[CH3:3]'

In [51]:
# Procedure text of that same first "Cl" row, to compare against its reaction string
df[df['agent_000']=="Cl"]['procedure_details'].iloc[0]

'Intermediate (15b) (19.5 g, 43.4 mmol) was dissolved in pyridine (100 mL, 1 μmol). Hydroxylamine hydrochloride (9.0 g, 130 mmol) was added, followed by water (50 mL, 3 mol), and the mixture was stirred at room temperature overnight. Water (100 mL) was then added and the mixture was stirred for 20 minutes. The precipitant was filtered off and dried to yield intermediate (15c) (13.5 g). MS m/z: [M+H+] calcd for C26H28FN2O4, 466.2; found 466.4. 1H-NMR (CDCl3): 9.78 (1H, s), 7.81 (1H, d), 7.48 (2H, m), 7.26 (1H, s), 7.0 (4H, m), 6.20 (1H, d), 5.53 (1H, d), 5.50 (2H, s), 4.55 (2H, q), 1.43 (3H, t), 1.25 (9H, s).'

In [None]:
'[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=O)=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:4])([CH3:3])[CH3:2].Cl.[NH2:35][OH:36]>N1C=CC=CC=1.O>[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=[N:35][OH:36])=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:2])([CH3:4])[CH3:3]'


In [29]:
# Reaction string of the first row whose agent_000 is the chloride ion "[Cl-]"
df[df['agent_000']=="[Cl-]"]['rxn_str'].iloc[0]

'[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:29]=[CH:28][C:27]([F:30])=[CH:26][CH:25]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([CH:11]=1)[O:7][CH2:8][C:9]#[N:10].[Cl-].[NH4+].[N-:34]=[N+:35]=[N-:36].[Na+]>CN(C)C=O.C(OCC)(=O)C>[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:25]=[CH:26][C:27]([F:30])=[CH:28][CH:29]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([O:7][CH2:8][C:9]2[N:34]=[N:35][NH:36][N:10]=2)[CH:11]=1'

In [None]:
'[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:29]=[CH:28][C:27]([F:30])=[CH:26][CH:25]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([CH:11]=1)[O:7][CH2:8][C:9]#[N:10].[Cl-].[NH4+].[N-:34]=[N+:35]=[N-:36].[Na+]>CN(C)C=O.C(OCC)(=O)C>[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:25]=[CH:26][C:27]([F:30])=[CH:28][CH:29]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([O:7][CH2:8][C:9]2[N:34]=[N:35][NH:36][N:10]=2)[CH:11]=1'


In [34]:
# Check reactions with H2: first row where "[H][H]" is recorded as agent_000
df[df['agent_000']=="[H][H]"]['rxn_str'].iloc[0]

'[CH3:1][N:2]([CH3:13])[C:3]1[CH:8]=[CH:7][CH:6]=[C:5]([N+:9]([O-])=O)[C:4]=1[CH3:12]>CO.[H][H].[Ni]>[CH3:1][N:2]([CH3:13])[C:3]1[C:4]([CH3:12])=[C:5]([CH:6]=[CH:7][CH:8]=1)[NH2:9]'

In [None]:
# NB: atom mapping does not involve H atoms
'[CH3:1][N:2]([CH3:13])[C:3]1[CH:8]=[CH:7][CH:6]=[C:5]([N+:9]([O-])=O)[C:4]=1[CH3:12]>CO.[H][H].[Ni]>[CH3:1][N:2]([CH3:13])[C:3]1[C:4]([CH3:12])=[C:5]([CH:6]=[CH:7][CH:8]=1)[NH2:9]'


In [35]:
# Same check with H2 recorded as a reactant: first row where reactant_000 is "[H][H]"
df[df['reactant_000']=="[H][H]"]['rxn_str'].iloc[0]

'[N:1]([CH:4]([CH:12]([OH:26])[CH:13]([OH:25])[CH:14]([N:22]=[N+]=[N-])[CH2:15][C:16]1[CH:21]=[CH:20][CH:19]=[CH:18][CH:17]=1)[CH2:5][C:6]1[CH:11]=[CH:10][CH:9]=[CH:8][CH:7]=1)=[N+]=[N-].[H][H]>CO.[Pd]>[NH2:1][CH:4]([CH:12]([OH:26])[CH:13]([OH:25])[CH:14]([NH2:22])[CH2:15][C:16]1[CH:21]=[CH:20][CH:19]=[CH:18][CH:17]=1)[CH2:5][C:6]1[CH:11]=[CH:10][CH:9]=[CH:8][CH:7]=1'

In [50]:
# Reactions whose agent_000 is a bare "[Cl]" atom (neither "Cl" nor "[Cl-]")
print(df[df['agent_000'] == "[Cl]"]['rxn_str'])

index
98490     [OH-:1].[Na+].[CH2:3]([Sn:7](Cl)(Cl)[CH2:8][CH...
297681    [Cl].[Br:2][C:3]1[CH:11]=[CH:10][C:6]([C:7]([O...
719131    [CH:1]1([C:4](=[O:11])[CH2:5][C:6]([O:8][CH2:9...
Name: rxn_str, dtype: object
