In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem

In [2]:
df_hte_bh = pd.read_excel("Dreher_and_Doyle_input_data.xlsx")

In [3]:
df_hte_bh["Aryl halide"].unique()
dict_prod = {'ClC1=NC=CC=C1' : 'CC1=CC=C(C2=NC=CC=C2)C=C1',
          'BrC1=NC=CC=C1' : 'CC1=CC=C(C2=NC=CC=C2)C=C1',
          'IC1=CC=C(CC)C=C1' : 'CC1=CC=C(C2=CC=C(CC)C=C2)C=C1',
          'ClC1=CC=C(C(F)(F)F)C=C1' : 'CC1=CC=C(C2=CC=C(C(F)(F)F)C=C2)C=C1',
          'ClC1=CC=C(OC)C=C1' : 'CC1=CC=C(C2=CC=C(OC)C=C2)C=C1', 
          'BrC1=CN=CC=C1' : 'CC1=CC=C(C2=CN=CC=C2)C=C1',
          'IC1=NC=CC=C1' : 'CC1=CC=C(C2=NC=CC=C2)C=C1',
          'BrC1=CC=C(CC)C=C1' : 'CC1=CC=C(C2=CC=C(CC)C=C2)C=C1',
          'ClC1=CN=CC=C1' : 'CC1=CC=C(C2=CN=CC=C2)C=C1',
          'ClC1=CC=C(CC)C=C1' : 'CC1=CC=C(C2=CC=C(CC)C=C2)C=C1',
          'IC1=CN=CC=C1' : 'CC1=CC=C(C2=CN=CC=C2)C=C1',
          'BrC1=CC=C(OC)C=C1' : 'CC1=CC=C(C2=CC=C(OC)C=C2)C=C1',
          'IC1=CC=C(C(F)(F)F)C=C1' : 'CC1=CC=C(C2=CC=C(C(F)(F)F)C=C2)C=C1',
          'IC1=CC=C(OC)C=C1' : 'CC1=CC=C(C2=CC=C(OC)C=C2)C=C1',
          'BrC1=CC=C(C(F)(F)F)C=C1' : 'CC1=CC=C(C2=CC=C(C(F)(F)F)C=C2)C=C1'}

In [4]:
def make_reaction_smarts(df):
    all_rxn_smarts = []
    for i, row in df.iterrows(): 
        rxn_smarts = str(row["Aryl halide"] + '.' + 'CC1=CC=C(N)C=C1' + '>')
        rxn_smarts = str(rxn_smarts + row["Ligand"] + '.' + row["Additive"] + '.' + row["Base"] + '>')
        rxn_smarts = str(rxn_smarts + dict_prod[row["Aryl halide"]])
        if Chem.CanonSmiles(dict_prod[row["Aryl halide"]]):
            all_rxn_smarts.append(rxn_smarts)
        else:
            print(dict_prod[row["Aryl halide"]])
    return all_rxn_smarts

In [5]:
list_rxn_smarts = make_reaction_smarts(df_hte_bh)

In [6]:
df = pd.DataFrame(data=np.array(list_rxn_smarts), columns=["rxn_smarts"])
df.to_csv('bh_hte_rxn_smarts.csv')

In [7]:
df_hte_suz = pd.read_excel("aap9112_Data_File_S1.xlsx")

In [8]:
name_to_smiles_r = {'6-chloroquinoline' : "C1=CC2=C(C=CC(=C2)Cl)N=C1", 
                    '6-Bromoquinoline' :  "C1=CC2=C(C=CC(=C2)Br)N=C1", 
                    '6-triflatequinoline' : "C1=CC2=C(C=CC(=C2)OS(=O)(=O)C(F)(F)F)N=C1",
                    '6-Iodoquinoline' : "C1=CC2=C(C=CC(=C2)I)N=C1", 
                    '6-quinoline-boronic acid hydrochloride' : "C1=CC2=C(C=CC(=C2)B(O)O)N=C1.Cl",
                    'Potassium quinoline-6-trifluoroborate' : "C1=CC2=C(C=CC(=C2)[B-](F)(F)F)N=C1.[K+]",
                    '6-Quinolineboronic acid pinacol ester' : "C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1"}
df_hte_suz["Reactant_1_smi"] = df_hte_suz["Reactant_1_Name"].map(name_to_smiles_r)

In [9]:
name_to_smiles_reag = {'2a, Boronic Acid' : 'Cc3ccc1c(cnn1C2CCCCO2)c3B(O)O',
                       '2b, Boronic Ester' : 'Cc3ccc1c(cnn1C2CCCCO2)c3B4OC(C)(C)C(C)(C)O4',
                       '2c, Trifluoroborate' : 'Cc3ccc1c(cnn1C2CCCCO2)c3[B-](F)(F)F.[K+]',
                       '2d, Bromide' : 'Cc3ccc1c(cnn1C2CCCCO2)c3Br'}
df_hte_suz["Reactant_2_smi"] = df_hte_suz["Reactant_2_Name"].map(name_to_smiles_reag)

In [10]:
name_to_smiles_lig = {'P(tBu)3' : 'CC(C)(C)P(C(C)(C)C)C(C)(C)C)', 
                      'P(Ph)3 ' : 'C1=CC=CC=C1P(C1=CC=CC=C1)C1=CC=CC=C1',
                      'AmPhos' : 'C1=C(N(C)C)C=CC=C1P(C(C)(C)C)C(C)(C)C)', 
                      'P(Cy)3' : 'C1CCCCC1P(C1CCCCC1)C1CCCCC1',
                      'P(o-Tol)3' : 'C1=CC=CC(C)=C1P(C1=CC=CC=C1C)C1=CC=CC=C1C',
                      'CataCXium A' : 'CCCCP([C@]12C[C@H]3C[C@H](C[C@H](C3)C1)C2)[C@@]45C[C@@H]6C[C@@H](C[C@@H](C6)C4)C5',
                      'SPhos' : 'COc1cccc(c1c2ccccc2P(C3CCCCC3)C4CCCCC4)OC',
                      'dtbpf' : '[Fe].CC(C)(C)P([C]1[CH][CH][CH][CH]1)C(C)(C)C.CC(C)(C)P([C]2[CH][CH][CH][CH]2)C(C)(C)C',
                      'XPhos' : 'CC(C)c1cc(C(C)C)c(c(c1)C(C)C)-c2ccccc2P(C3CCCCC3)C4CCCCC4',
                      'dppf' : '[Fe].[CH]1[CH][CH][C]([CH]1)P(c2ccccc2)c3ccccc3.[CH]4[CH][CH][C]([CH]4)P(c5ccccc5)c6ccccc6',
                      'Xantphos' : 'CC1(C)c2cccc(P(c3ccccc3)c4ccccc4)c2Oc5c(cccc15)P(c6ccccc6)c7ccccc7',
                      'None' : ''}
df_hte_suz["Ligand_smi"] = df_hte_suz["Ligand_Short_Hand"].map(name_to_smiles_lig)

In [11]:
name_to_smiles_base = {'NaOH' : '[Na+].[O-]',
                       'NaHCO3' : '[Na+].[O-]C(=O)=O',
                       'CsF' : '[Cs+].[F-]',
                       'K3PO4' : '[K+].[K+].[K+].[O-]P(=O)([O-])[O-]',
                       'KOH' : '[K+].[O-]',
                       'LiOtBu' : '[Li+].[O-]C(C)(C)C',
                       'Et3N' : 'CCN(CC)CC',
                       'None' : ''}
df_hte_suz["Base_smi"] = df_hte_suz["Reagent_1_Short_Hand"].map(name_to_smiles_base)

In [12]:
df_hte_suz = df_hte_suz.drop(columns=["Reaction_No", "Reactant_1_Name", "Reactant_2_Name", "Reactant_1_Short_Hand", 
                         "Reactant_1_eq", "Reactant_1_mmol", "Reactant_2_eq", "Catalyst_1_Short_Hand",
                         "Catalyst_1_eq", "Ligand_Short_Hand", "Ligand_eq", "Reagent_1_eq", 
                         "Solvent_1_Short_Hand", "Reagent_1_Short_Hand"])


In [21]:
df_hte_suz

Unnamed: 0,Product_Yield_PCT_Area_UV,Product_Yield_Mass_Ion_Count,Reactant_1_smi,Reactant_2_smi,Ligand_smi,Base_smi
0,4.764109,6.262059e+03,C1=CC2=C(C=CC(=C2)Cl)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3B(O)O,CC(C)(C)P(C(C)(C)C)C(C)(C)C),[Na+].[O-]
1,4.120962,1.324557e+04,C1=CC2=C(C=CC(=C2)Cl)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3B(O)O,C1=CC=CC=C1P(C1=CC=CC=C1)C1=CC=CC=C1,[Na+].[O-]
2,2.583837,3.009166e+03,C1=CC2=C(C=CC(=C2)Cl)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3B(O)O,C1=C(N(C)C)C=CC=C1P(C(C)(C)C)C(C)(C)C),[Na+].[O-]
3,4.443171,3.086070e+04,C1=CC2=C(C=CC(=C2)Cl)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3B(O)O,C1CCCCC1P(C1CCCCC1)C1CCCCC1,[Na+].[O-]
4,1.949874,2.486306e+03,C1=CC2=C(C=CC(=C2)Cl)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3B(O)O,C1=CC=CC(C)=C1P(C1=CC=CC=C1C)C1=CC=CC=C1C,[Na+].[O-]
...,...,...,...,...,...,...
5755,47.211431,1.164924e+07,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3Br,[Fe].CC(C)(C)P([C]1[CH][CH][CH][CH]1)C(C)(C)C....,[K+].[K+].[K+].[O-]P(=O)([O-])[O-]
5756,0.000000,1.473563e+07,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3Br,CC(C)c1cc(C(C)C)c(c(c1)C(C)C)-c2ccccc2P(C3CCCC...,[K+].[K+].[K+].[O-]P(=O)([O-])[O-]
5757,31.443681,4.665383e+06,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3Br,[Fe].[CH]1[CH][CH][C]([CH]1)P(c2ccccc2)c3ccccc...,[K+].[K+].[K+].[O-]P(=O)([O-])[O-]
5758,0.000000,1.335187e+06,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1,Cc3ccc1c(cnn1C2CCCCO2)c3Br,CC1(C)c2cccc(P(c3ccccc3)c4ccccc4)c2Oc5c(cccc15...,[K+].[K+].[K+].[O-]P(=O)([O-])[O-]


In [22]:
def make_reaction_smarts_suz(df):
    all_rxn_smarts = []
    for i, row in df.iterrows(): 
        rxn_smarts = str(row["Reactant_1_smi"] + '.' + row["Reactant_2_smi"] + '>')
        rxn_smarts = str(rxn_smarts + row["Ligand_smi"] + '.' + row["Base_smi"] + '>')
        rxn_smarts = str(rxn_smarts + 'Cc3ccc1c(cnn1C2CCCCO2)c3c1ccc2ncccc2c1')
        all_rxn_smarts.append(rxn_smarts)
    return all_rxn_smarts

In [23]:
list_rxn_smarts = make_reaction_smarts_suz(df_hte_suz)
df = pd.DataFrame(data=np.array(list_rxn_smarts), columns=["rxn_smarts"])
df.to_csv('suz_hte_rxn_smarts.csv')

In [24]:
df

Unnamed: 0,rxn_smarts
0,C1=CC2=C(C=CC(=C2)Cl)N=C1.Cc3ccc1c(cnn1C2CCCCO...
1,C1=CC2=C(C=CC(=C2)Cl)N=C1.Cc3ccc1c(cnn1C2CCCCO...
2,C1=CC2=C(C=CC(=C2)Cl)N=C1.Cc3ccc1c(cnn1C2CCCCO...
3,C1=CC2=C(C=CC(=C2)Cl)N=C1.Cc3ccc1c(cnn1C2CCCCO...
4,C1=CC2=C(C=CC(=C2)Cl)N=C1.Cc3ccc1c(cnn1C2CCCCO...
...,...
5755,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1.Cc3...
5756,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1.Cc3...
5757,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1.Cc3...
5758,C1=CC2=C(C=CC(=C2)B3OC(C)(C)C(C)(C)O3)N=C1.Cc3...
