In [1]:
from predict_snar.smiles import ReactionSmilesProcessor
from predict_snar.data import SolventPicker
from analyze_snar.extraction import extract_dataframe
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

fcntl not avaiable on Windows.
resource not available on Windows.


Make functions for joblib parallelization

In [2]:
def get_influential_solvent(solvent):
    """Return the influential solvent that SolventPicker reports for `solvent`.

    A new SolventPicker is built on every call, so the function depends on no
    module-level state and can be handed to joblib workers as-is.
    """
    picker = SolventPicker()
    return picker.get_influential_solvent(solvent)

In [3]:
def get_solvent_smiles(solvent):
    """Return the SMILES string that SolventPicker maps the solvent name to.

    A new SolventPicker is built on every call, so the function depends on no
    module-level state and can be handed to joblib workers as-is.
    """
    picker = SolventPicker()
    solvent_smiles = picker.smiles_from_name(solvent)
    return solvent_smiles

In [4]:
# Module-level SolventPicker instance.
# NOTE(review): no cell visible in this notebook reads `sp`; the joblib helpers build their own picker.
sp = SolventPicker()

# Process kinetic database

In [452]:
# Load the experimental kinetic dataset (path is relative to the notebook's working directory).
data_df = pd.read_excel("../prepare_kinetic_data/kinetic_data_v4.xlsx")

In [453]:
# One ReactionSmilesProcessor per dataset row, built from the raw reaction SMILES.
data_rsmps = data_df["Reaction_SMILES"].apply(ReactionSmilesProcessor)

In [454]:
# Map every solvent name to its influential solvent, spread over 4 joblib workers.
data_solvents = Parallel(n_jobs=4)(delayed(get_influential_solvent)(solvent) for solvent in data_df["Solvent"])

In [416]:
# Translate each influential solvent into SMILES, spread over 4 joblib workers.
# Requires `data_solvents` from the previous cell.
solvent_smiles_data = Parallel(n_jobs=4)(delayed(get_solvent_smiles)(solvent) for solvent in data_solvents)

In [455]:
# Round-trip each processed reaction through RDKit and store the resulting string; this column is
# the reaction key for every later duplicate check and merge.
# NOTE(review): ReactionFromSmarts is called without useSmiles=True here, unlike the later cells
# that parse reaction SMILES. The submitted-data cell does the same, so the two stay comparable,
# but confirm that SMARTS parsing is intended.
data_df["complete_reaction_smiles"] = [AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(rsmp.reaction_smiles)) for rsmp in data_rsmps]

In [456]:
# Attach the resolved solvent (name and SMILES) to the dataset, row-aligned with data_df["Solvent"].
# Both lists must have been computed before any rows are dropped from data_df.
data_df["influential_solvent"] = data_solvents
data_df["solvent_smiles"] = solvent_smiles_data

#### Investigate replicated reactions

In [457]:
# Show every member of each replicate group (same reaction, influential solvent and
# temperature; keep=False flags all of them), restricted to the provenance columns.
data_df.loc[
    data_df.duplicated(["complete_reaction_smiles", "influential_solvent", "Temp (K)"], keep=False),
    ["Reaction_SMILES", 'Exp_Rate_Constant k1 (M-1s-1)', 'Activation free Energy (kcal) Eyring', 'Solvent', 'Temp (K)', 'Reference', 'DOI'],
]

Unnamed: 0,Reaction_SMILES,Exp_Rate_Constant k1 (M-1s-1),Activation free Energy (kcal) Eyring,Solvent,Temp (K),Reference,DOI
26,FC1=CC=C(C=C1N(=O)=O)N(=O)=O.C3CCNCC3>>C3CCN(C...,150.0,14.46,Acetonitrile,298.0,"Eur. J. Org. Chem, 2007, 1378",https://doi.org/10.1002/ejoc.200600968
108,Fc1ccc(cc1N(=O)=O)N(=O)=O.C1CCNCC1>>O=N(=O)c1c...,380.0,13.91,Acetonitrile,298.0,"J. Org. Chem., 2007, 72, 8797.",https://doi.org/doi/10.1021/jo701549h
139,Clc1ccc(cc1N(=O)=O)N(=O)=O.C1CCNCC1>>C2CCN(CC2...,0.0153,19.9,Ethanol,298.0,"J. Chem. Soc., 1950, 507-516",https://doi.org/10.1039/JR9500000507
219,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,0.001738,21.18,TMS,298.0,"J. Am. Chem. Soc.,1973, 95, 408-410",http://dx.doi.org/10.1021/ja00783a016
222,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,0.0795,18.92,HMPT,298.0,"J. Am. Chem. Soc.,1973, 95, 408-410",http://dx.doi.org/10.1021/ja00783a016
467,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,1.2589,17.29,HMPT,298.0,"J. Am. Chem. Soc., 1968, 90, 5049-5069",https://doi.org/10.1021/ja01021a002
468,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,0.00199,21.1,TMS,298.0,"J. Am. Chem. Soc., 1968, 90, 5049-5069",https://doi.org/10.1021/ja01021a002
489,Clc1ccc(cc1N(=O)=O)N(=O)=O.C1CCNCC1>>O=N(=O)c1...,0.018,19.8,Ethanol,298.0,"J. Chem. Soc., Perkin Trans. 2, 1986, 1427-1431",http://dx.doi.org/10.1039/P29860001427


In [458]:
# Keep only the first row of each (reaction, influential solvent, temperature) group.
# Mutates data_df in place, so re-running earlier display cells will show the reduced frame.
data_df.drop_duplicates(["complete_reaction_smiles", "influential_solvent", "Temp (K)"], inplace=True)

#### Drop examples without activation energies

In [459]:
# Remove rows that have no experimental activation free energy (in-place mutation of data_df).
data_df.dropna(subset=["Activation free Energy (kcal) Eyring"], inplace=True)

# Process data which was submitted

In [460]:
# Read the list of submitted calculations: space-separated, with the column names supplied here.
submitted_df = pd.read_csv("submitted_smiles", sep=" ", names=["reaction_smiles", "temperature", "solvent"])
print(submitted_df.shape)

(476, 3)


In [461]:
# One ReactionSmilesProcessor per submitted reaction SMILES.
submitted_rsmps = submitted_df["reaction_smiles"].apply(ReactionSmilesProcessor)

In [462]:
# Build the same RDKit round-tripped reaction key as for the kinetic dataset, then remove repeated
# submissions of the same (reaction, temperature, solvent) in place.
# NOTE(review): ReactionFromSmarts is called without useSmiles=True, mirroring the dataset cell;
# the two must stay in sync or the keys stop matching.
submitted_df["complete_reaction_smiles"] = [AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(rsmp.reaction_smiles)) for rsmp in submitted_rsmps]
submitted_df.drop_duplicates(["complete_reaction_smiles", "temperature", "solvent"], inplace=True)
print(submitted_df.shape)

(452, 4)


In [463]:
# Reduce both tables to the three key columns and give them a common naming scheme
# (reaction_smiles / solvent / temperature) so they can be merged on identical names.
comp_submitted = submitted_df[["complete_reaction_smiles", "solvent", "temperature"]].rename(
    columns={"complete_reaction_smiles": "reaction_smiles"}
)
comp_data = data_df[["complete_reaction_smiles", "influential_solvent", "Temp (K)"]].rename(
    columns={
        "complete_reaction_smiles": "reaction_smiles",
        "influential_solvent": "solvent",
        "Temp (K)": "temperature",
    }
)

In [464]:
# Sanity check: row count should equal the de-duplicated submitted_df.
comp_submitted.shape

(452, 3)

# Find submitted reactions not in the dataset

In [465]:
# Left-merge the submitted reactions against the dataset on the shared key columns.
# indicator=True adds a `_merge` column ("both" / "left_only") telling whether each
# submitted row has a counterpart in the dataset.
# validate="many_to_one" makes pandas raise a MergeError if the dataset keys are not
# unique; duplicated right-hand keys would add rows to the result, and the index
# re-assignment below would then fail or mislabel rows.
sub_in_data = pd.merge(
    comp_submitted,
    comp_data,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# The merge result has a fresh RangeIndex; restore the original labels so `_merge`
# can be used as an aligned boolean mask on comp_submitted.
sub_in_data.index = comp_submitted.index

In [466]:
# Split the submitted reactions by merge outcome: absent from vs. present in the dataset.
left_only_sub = comp_submitted.loc[sub_in_data["_merge"].eq("left_only")]
both_sub = comp_submitted.loc[sub_in_data["_merge"].eq("both")]

In [467]:
# Number of submitted reactions that also occur in the dataset.
len(both_sub)

451

In [468]:
# Display the submitted reactions that have no counterpart in the dataset (same selection as left_only_sub).
comp_submitted[sub_in_data["_merge"] == "left_only"]

Unnamed: 0,reaction_smiles,solvent,temperature
352,CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)...,Acetonitrile,298.0


# Find reactions in dataset that were not submitted

In [469]:
# Reverse comparison: left-merge the dataset against the submitted reactions so that
# `_merge` marks dataset rows that were never submitted ("left_only").
# validate="many_to_one" makes pandas raise a MergeError if the submitted keys are not
# unique; duplicated right-hand keys would add rows to the result, and the index
# re-assignment below would then fail or mislabel rows.
data_in_sub = pd.merge(
    comp_data,
    comp_submitted,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# Restore the dataset's own index labels so `_merge` aligns with comp_data.
data_in_sub.index = comp_data.index

In [470]:
# Split the dataset rows by merge outcome: never submitted vs. submitted.
left_only_data = comp_data.loc[data_in_sub["_merge"].eq("left_only")]
both_data = comp_data.loc[data_in_sub["_merge"].eq("both")]

In [471]:
# Dataset index labels of the reactions that were not submitted.
left_only_data.index

Int64Index([347, 353, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
            480, 481, 482, 483, 484, 485, 486, 487, 488, 490, 491, 492, 493,
            494, 495, 496, 497, 498, 500, 501, 502, 503, 504, 505, 506, 507,
            508, 510, 511, 512, 513, 514, 515, 516, 517],
           dtype='int64')

In [472]:
# Number of dataset reactions that were not submitted.
len(left_only_data)

48

# Analysis of completed reactions

In [489]:
# Load the results of the completed calculations into a DataFrame.
# NOTE(review): the effect of or_lg_correction=True is defined in analyze_snar.extraction, not here.
df = extract_dataframe("../machine_learning/2019-12-15/db", or_lg_correction=True)
print(df.shape)

(475, 55)


In [490]:
# Parse each stored reaction SMILES into an RDKit reaction object (useSmiles=True: SMILES, not SMARTS).
reactions = df["reaction_smiles"].apply(lambda x: AllChem.ReactionFromSmarts(x, useSmiles=True))
# Sanitize every reactant and product template before writing the reaction back out.
# NOTE(review): presumably this makes the strings comparable with the keys built for the
# submitted reactions - confirm.
for reaction in reactions:
    for mol in list(reaction.GetReactants()) + list(reaction.GetProducts()):
        Chem.SanitizeMol(mol)
# Overwrite the column with the re-serialised reaction SMILES.
df["reaction_smiles"] = [AllChem.ReactionToSmiles(reaction) for reaction in reactions]

In [437]:
# Convert the solvent names of the matched submitted reactions to SMILES (4 joblib workers);
# the result is a list in the same row order as both_sub.
solvent_smiles_sub = Parallel(n_jobs=4)(delayed(get_solvent_smiles)(solvent) for solvent in both_sub["solvent"])

In [491]:
# Work on a copy so that both_sub keeps its solvent names.
good_sub = both_sub.copy()

In [492]:
# Replace solvent names by SMILES; the comparison with `df` below joins on this column.
good_sub["solvent"] = solvent_smiles_sub

In [493]:
# Keep the first result per (reaction, solvent, temperature); mutates df in place.
df.drop_duplicates(["reaction_smiles", "solvent", "temperature"], inplace=True)
print(df.shape)

(452, 55)


#### Investigate submitted jobs which were not finished
Turns out that all jobs were finished

In [498]:
# Left-merge the valid submitted reactions against the completed calculations; rows
# flagged "left_only" in `_merge` are submissions without a finished result.
# validate="many_to_one" makes pandas raise a MergeError if the keys in `df` are not
# unique; duplicated right-hand keys would add rows to the result, and the index
# re-assignment below would then fail or mislabel rows.
finished = pd.merge(
    good_sub,
    df,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# Restore good_sub's index labels on the merge result.
finished.index = good_sub.index

In [499]:
# Tally the merge outcome; any "left_only" row is a submitted job without a result.
finished["_merge"].value_counts()

both          451
right_only      0
left_only       0
Name: _merge, dtype: int64

In [500]:
# Submitted reactions with no finished calculation.
left_only = finished[finished["_merge"] == "left_only"]

In [501]:
# Display the unfinished submissions (the recorded output is an empty frame).
left_only

Unnamed: 0,reaction_smiles,solvent,temperature,reaction_smiles_orig,v_av_central_atom,v_av_nu_atom,reaction_energy,reaction_energy_qh_grimme,reaction_energy_qh_truhlar,activation_energy,...,bo_nu_ts,bo_lg_ts,bo_prod_nu,nu_symbol,lg_symbol,inchikey_substrate,inchikey_nu,inchikey_lg,inchikey_product,_merge


In [502]:
# Reverse comparison: left-merge the completed calculations against the valid
# submissions; rows flagged "left_only" were calculated although they do not belong
# to the valid submitted set.
# validate="many_to_one" makes pandas raise a MergeError if the keys in good_sub are
# not unique; duplicated right-hand keys would add rows to the result, and the index
# re-assignment below would then fail or mislabel rows.
extra_submitted = pd.merge(
    df,
    good_sub,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# Restore df's index labels so `_merge` can mask df directly.
extra_submitted.index = df.index

In [503]:
# Tally the merge outcome for the completed calculations.
extra_submitted["_merge"].value_counts()

both          451
left_only       1
right_only      0
Name: _merge, dtype: int64

In [504]:
# Reaction SMILES of the completed calculations that match no valid submission.
df[extra_submitted["_merge"] == "left_only"]["reaction_smiles"]

322    CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)...
Name: reaction_smiles, dtype: object

In [505]:
# Same selection as a NumPy array, so the full string is shown instead of pandas' truncated repr.
df[extra_submitted["_merge"] == "left_only"]["reaction_smiles"].values

array(['CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)(F)F)cc1C#N.Cl'],
      dtype=object)

# Find status for reactions in dataset
* Not submitted
* Submitted
* Failed
* Not treated

Failed reactions:

| index | reason |
| ----- | ------ |
| 201   | TMS    |
| 66    | TS     |
| 185   | TS     |
| 339   | TS     |
| 457   | TS     |
| 364   | C nu   |
| 365   | C nu   |

In [529]:
# Index sets used for the status bookkeeping in the following cells.
# Dataset rows with no matching submission.
non_submitted = left_only_data.index
# NOTE(review): the "Failed reactions" table above also lists index 201 (reason "TMS"), but
# here 201 is classified as non_treated rather than failed - confirm which is intended.
failed = pd.Index([66, 185, 339, 457, 364, 365])
non_treated = pd.Index([201, 474])
# 322 is the index of the `df` row that matched no valid submission in the comparison above.
# NOTE(review): confirm which frame's index the `failed` / `non_treated` labels refer to.
wrongly_submitted = pd.Index([322])

Check that numbers add up

In [536]:
# Derive the headline counts: completed calculations minus the untreated and wrongly
# submitted ones give the "good" set, which splits into failed and successful.
n_failed = len(failed)
n_good = len(df) - (len(non_treated) + len(wrongly_submitted))
n_successful = n_good - n_failed

In [539]:
# Report the counts and the failure rate as a percentage of the good submitted reactions.
print("Good submitted reactions:", n_good)
print("Successful reactions:", n_successful)
print("Failed reactions:", n_failed)
print(f"Percentage failures: {(n_failed / n_good) * 100:.2f}")

Good submitted reactions: 449
Successful reactions: 443
Failed reactions: 6
Percentage failures: 1.34


In [95]:
# Experimental activation free energies recorded for one reaction.
# The comparison originally had no right-hand operand, which is a SyntaxError. It is restored
# with the lookup key the last cell of this notebook uses; that key selects dataset rows 2 and
# 337, the same rows shown in this cell's recorded output.
data_df[data_df["complete_reaction_smiles"] == ReactionSmilesProcessor("O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc([N+](=O)[O-])cn1.Cl").reaction_smiles]['Activation free Energy (kcal) Eyring']

2      22.08
337    20.09
Name: Activation free Energy (kcal) Eyring, dtype: float64

In [84]:
# Reaction SMILES of the submitted rows that have no dataset counterpart, as a NumPy array.
left_only_sub["reaction_smiles"].values

352    CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)...
Name: reaction_smiles, dtype: object

In [170]:
# Raw reaction SMILES as it appears on the submitted side (scratch value for manual comparison).
smiles_submitted = 'O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O'

In [171]:
# Raw reaction SMILES as it appears in the dataset; the string is identical to smiles_submitted.
smiles_data = "O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O"

In [167]:
# Find the dataset row whose raw (unprocessed) Reaction_SMILES equals this literal.
data_df[data_df["Reaction_SMILES"] == 'O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O']["Reaction_SMILES"]

337    O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O
Name: Reaction_SMILES, dtype: object

In [163]:
# Show what the RDKit round trip does to this reaction SMILES (the recorded output lists the reactants in a different order).
AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts('O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc([N+](=O)[O-])cn1.Cl', useSmiles=True))

'CCCCN.O=[N+]([O-])c1ccc(Cl)nc1>>CCCCNc1ccc([N+](=O)[O-])cn1.Cl'

In [97]:
# Completed calculations for this reaction, looked up via the RDKit round-tripped form of the SMILES.
df[df["reaction_smiles"] == AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts('O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc([N+](=O)[O-])cn1.Cl', useSmiles=True))]

Unnamed: 0,reaction_smiles,reaction_smiles_orig,v_av_central_atom,v_av_nu_atom,reaction_energy,reaction_energy_qh_grimme,reaction_energy_qh_truhlar,activation_energy,activation_energy_qh_grimme,activation_energy_qh_truhlar,...,bo_lg_ts,bo_prod_nu,nu_symbol,lg_symbol,solvent,inchikey_substrate,inchikey_nu,inchikey_lg,inchikey_product,temperature
296,CCCCN.O=[N+]([O-])c1ccc(Cl)nc1>>CCCCNc1ccc([N+...,O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O,14.8,-13.3,-19.948616,-18.996684,-18.662221,20.704556,21.868586,22.199283,...,0.889538,1.302474,N,Cl,CS(=O)C,AFJQLCOKPDRRAN-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,AFJQLCOKPDRRAN-UHFFFAOYSA-N,298.0
439,CCCCN.O=[N+]([O-])c1ccc(Cl)nc1>>CCCCNc1ccc([N+...,O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O,14.8,-13.3,-19.142419,-18.463453,-18.317244,21.480323,22.530146,22.78115,...,0.890495,1.303473,N,Cl,C(C)#N,AFJQLCOKPDRRAN-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,AFJQLCOKPDRRAN-UHFFFAOYSA-N,298.0


In [102]:
# List the columns available in the kinetic dataset.
data_df.columns

Index(['ID', 'Exp_Rate_Constant k1 (M-1s-1)', 'Molecule', 'Modelled',
       'Reaction_SMILES', 'Substrate_SMILES', 'Nucleophile_SMILES',
       'Product_SMILES', 'RLPT', 'Original Data', 'Temp(degC)',
       'Activation free Energy (kcal) Eyring', 'Temp (K)', 'Reference',
       'Book_Page', 'Year', 'Title', 'Authors', 'Link', 'DOI', 'Comment',
       'Referenced Data', 'Mechanism', 'Table', 'Substitution Site',
       'Nucleophile', 'Leaving Group', 'Solvent', 'logK',
       'Half_life (1M) (mins)', 'Half_life (IM) (hrs)', 'Index',
       'canonical_Substrate', 'canonical_Nucleophile', 'canonical_Product',
       'canonical_reaction_smiles', 'reaction_mols', 'Duplicates',
       'complete_reaction_smiles', 'influential_solvent'],
      dtype='object')

In [159]:
# Attach the experimental dataset columns to each completed calculation, matching on reaction,
# solvent SMILES and temperature; `_merge` records whether a dataset row was found.
# NOTE(review): the name `merged_df` is re-bound to a different merge further down.
merged_df = pd.merge(df, data_df, how="left", left_on=["reaction_smiles", "solvent", "temperature"], right_on=["complete_reaction_smiles", "solvent_smiles", "Temp (K)"], indicator=True)

In [160]:
# Inspect the combined column set after the merge.
merged_df.columns

Index(['reaction_smiles', 'reaction_smiles_orig', 'v_av_central_atom',
       'v_av_nu_atom', 'reaction_energy', 'reaction_energy_qh_grimme',
       'reaction_energy_qh_truhlar', 'activation_energy',
       'activation_energy_qh_grimme', 'activation_energy_qh_truhlar',
       'sasa_nu', 'sasa_central', 'sasa_ratio_nu', 'sasa_ratio_central',
       'epn_central', 'epn_nu', 'epn_lg', 'epn_central_ts', 'epn_nu_ts',
       'epn_lg_ts', 'nu_charge', 'nu_charge_ts', 'central_charge',
       'central_charge_ts', 'lg_charge', 'lg_charge_ts', 'nu_formed',
       'lg_broken', 'omega', 'n', 'local_electrophilicity_center',
       'local_nucleophilicity_nu', 'is_av', 'es_av', 'concerted', 'rds',
       'flat_pes', 'p_int_central', 'p_int_nu', 'p_int_area_central',
       'p_int_area_nu', 'bo_diff_nu', 'bo_diff_lg', 'bo_sub_lg', 'bo_nu_ts',
       'bo_lg_ts', 'bo_prod_nu', 'nu_symbol', 'lg_symbol', 'solvent',
       'inchikey_substrate', 'inchikey_nu', 'inchikey_lg', 'inchikey_product',
       'tem

In [161]:
# Tally how many calculations found an experimental counterpart.
merged_df["_merge"].value_counts()

both          451
left_only       1
right_only      0
Name: _merge, dtype: int64

In [153]:
# Count merged rows that lack an experimental activation free energy.
merged_df["Activation free Energy (kcal) Eyring"].isna().sum()

0

In [172]:
# Display the completed-calculation table (pandas truncates rows and columns in the output).
df

Unnamed: 0,reaction_smiles,reaction_smiles_orig,v_av_central_atom,v_av_nu_atom,reaction_energy,reaction_energy_qh_grimme,reaction_energy_qh_truhlar,activation_energy,activation_energy_qh_grimme,activation_energy_qh_truhlar,...,bo_lg_ts,bo_prod_nu,nu_symbol,lg_symbol,solvent,inchikey_substrate,inchikey_nu,inchikey_lg,inchikey_product,temperature
0,CCCCN.O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1>>...,ClC1=CC=C(C=C1N(=O)=O)N(=O)=O.CCCCN>>CCCCNC1=C...,21.9,-13.3,-22.311394,-21.284788,-20.891967,18.140663,19.386269,19.762147,...,0.986291,1.382835,N,Cl,C(C)#N,XFLMVRNRVDULNK-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,XFLMVRNRVDULNK-UHFFFAOYSA-N,298.0
1,CCCCN.O=[N+]([O-])c1cccnc1Cl>>CCCCNc1ncccc1[N+...,ClC1=C(C=CC=N1)N(=O)=O.CCCCN>>CCCCNC1=NC=CC=C1...,7.8,-13.3,-22.573525,-21.662381,-21.391297,20.164600,21.250192,21.575242,...,0.898877,1.334886,N,Cl,C(C)#N,ZNWRAFXHMOAZLG-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,ZNWRAFXHMOAZLG-UHFFFAOYSA-N,298.0
2,CCCCN.N#Cc1cc([N+](=O)[O-])ccc1Cl>>CCCCNc1ccc(...,N(=O)(=O)C1=CC=C(Cl)C(=C1)C#N.CCCCN>>CCCCNc1cc...,23.5,-13.3,-19.184607,-18.104664,-17.714353,20.347548,21.935146,22.471667,...,0.960115,1.363791,N,Cl,C(C)#N,JPXHHGMADNZXBT-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,JPXHHGMADNZXBT-UHFFFAOYSA-N,298.0
3,CCCCN.O=[N+]([O-])c1cc(C(F)(F)F)ccc1Cl>>CCCCNc...,FC(F)(F)C1=CC=C(Cl)C(=C1)N(=O)=O.CCCCN>>CCCCNC...,14.6,-13.3,-19.767273,-18.995436,-18.869935,21.422391,22.513002,22.700628,...,0.951981,1.369824,N,Cl,C(C)#N,NDULSBQKYMDLLE-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,NDULSBQKYMDLLE-UHFFFAOYSA-N,298.0
4,CCCCN.O=[N+]([O-])c1cccc([N+](=O)[O-])c1Cl>>CC...,ClC1=C(C=CC=C1N(=O)=O)N(=O)=O.CCCCN>>CCCCNC1=C...,13.2,-13.3,-18.289102,-17.436945,-17.140133,17.186188,18.374691,18.815830,...,0.990552,1.379160,N,Cl,C(C)#N,AVUXCSUXMFUMEO-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,AVUXCSUXMFUMEO-UHFFFAOYSA-N,298.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,CC(C)CN.O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1...,Clc1ccc(cc1N(=O)=O)N(=O)=O.CC(C)CN>>CC(C)CNc1c...,23.3,-13.6,-22.744591,-21.945144,-21.765676,16.829053,17.997476,18.294915,...,0.996301,1.383156,N,Cl,C(C)O,HYHCZHIKPLOGQR-UHFFFAOYSA-N,KDSNLYIMUZNERS-UHFFFAOYSA-N,VEXZGXHMUGYJMC-UHFFFAOYSA-N,HYHCZHIKPLOGQR-UHFFFAOYSA-N,298.0
471,C1CCNCC1.O=[N+]([O-])c1ccc(Oc2ccccc2)c([N+](=O...,c1(Oc2ccccc2)ccc(cc1N(=O)=O)N(=O)=O.C2CCNCC2>>...,23.0,-12.7,-11.775591,-12.600139,-13.021825,19.698838,19.826222,19.540078,...,0.261072,1.277427,N,O,C(C)#N,MPGYUYKSTWTQFS-UHFFFAOYSA-N,NQRYJNQNLNOLGT-UHFFFAOYSA-N,ISWSIDIOOBJBQZ-UHFFFAOYSA-N,MPGYUYKSTWTQFS-UHFFFAOYSA-N,298.0
472,CCCCN.O=[N+]([O-])c1cccc([N+](=O)[O-])c1Oc1ccc...,c1(Oc2ccccc2)c(cccc1N(=O)=O)N(=O)=O.CCCCN>>CCC...,6.4,-13.0,-16.703200,-16.984952,-16.912161,17.093061,18.418361,18.997552,...,1.078376,1.379074,N,O,C(C)#N,AVUXCSUXMFUMEO-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,ISWSIDIOOBJBQZ-UHFFFAOYSA-N,AVUXCSUXMFUMEO-UHFFFAOYSA-N,298.0
473,CCCCN.N#Cc1cc([N+](=O)[O-])ccc1Oc1ccccc1>>CCCC...,N(=O)(=O)c1ccc(Oc2ccccc2)c(c1)C#N.CCCCN>>CCCCN...,23.4,-13.3,-16.621954,-16.898686,-16.898058,24.458697,25.443260,25.659750,...,0.319093,1.363767,N,O,C(C)#N,JPXHHGMADNZXBT-UHFFFAOYSA-N,HQABUPZFAYXKJW-UHFFFAOYSA-N,ISWSIDIOOBJBQZ-UHFFFAOYSA-N,JPXHHGMADNZXBT-UHFFFAOYSA-N,298.0


In [173]:
# Load the kinetic data CSV from the machine-learning directory (path relative to the notebook).
kinetic_df = pd.read_csv("../machine_learning/kinetic_data.csv")

In [226]:
# Check for duplicates when keyed on the original (as-submitted) reaction SMILES instead of the normalised one.
df.duplicated(["reaction_smiles_orig", "solvent", "temperature"]).value_counts()

False    452
dtype: int64

In [219]:
# Comparison frame built from the completed calculations, keyed on the original reaction SMILES.
comp_1 = df[["reaction_smiles_orig", "solvent", "temperature"]].copy()
comp_1.columns = ["reaction_smiles", "solvent", "temperature"]
# Round-trip the reaction SMILES through RDKit so both comparison frames use the same string form.
comp_1["reaction_smiles"] = comp_1["reaction_smiles"].apply(lambda x: AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(x, useSmiles=True)))
# Replace the solvent SMILES by its InChIKey, presumably so that different SMILES spellings of
# the same solvent compare equal.
comp_1["solvent"] = comp_1["solvent"].apply(Chem.MolFromSmiles).apply(Chem.MolToInchiKey)

In [239]:
# Comparison frame built from the kinetic data CSV, using the same key normalisation as the
# frame derived from `df`: RDKit round-tripped reaction SMILES and solvent InChIKey.
comp_2 = kinetic_df[["reaction_smiles", "solvent_smiles", "temperature", "activation_energy"]].copy()
comp_2.columns = ["reaction_smiles", "solvent", "temperature", "activation_energy"]
comp_2["reaction_smiles"] = comp_2["reaction_smiles"].apply(lambda x: AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(x, useSmiles=True)))
comp_2["solvent"] = comp_2["solvent"].apply(Chem.MolFromSmiles).apply(Chem.MolToInchiKey)
# Collapse replicate measurements to one row per (reaction, solvent, temperature) holding the
# MEAN activation energy. The original computed this aggregate but discarded the result, so
# the subsequent drop_duplicates silently kept only the first replicate's value.
# sort=False keeps groups in order of first appearance; as_index=False keeps the keys as
# ordinary columns. The keys are unique afterwards, so no drop_duplicates is needed.
# Note: groupby omits rows whose key columns contain NaN.
comp_2 = comp_2.groupby(
    ["reaction_smiles", "solvent", "temperature"], as_index=False, sort=False
)["activation_energy"].mean()

In [240]:
# Left-merge the completed calculations (comp_1) against the kinetic data (comp_2) to pull
# in the experimental activation energy; `_merge` flags calculations without a counterpart.
# validate="many_to_one" makes pandas raise a MergeError if the comp_2 keys are not unique;
# duplicated right-hand keys would add rows to the result, and the index re-assignment
# below would then fail or mislabel rows.
merged_df = pd.merge(
    comp_1,
    comp_2,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# comp_1 was copied from df, so df's index labels line up row for row with the merge result.
merged_df.index = df.index

In [241]:
# Tally how many calculations have a matching row in the kinetic data CSV.
merged_df["_merge"].value_counts()

both          449
left_only       3
right_only      0
Name: _merge, dtype: int64

In [251]:
# Original reaction SMILES and solvent of the calculations that found no match in the kinetic data CSV.
df[merged_df["_merge"] == "left_only"][["reaction_smiles_orig", "solvent"]].values

array([['FC(F)(F)c1ccc(Cl)c(c1)C#N.CCCCN>>CCCCNc1ccc(cc1C#N)C(F)(F)F',
        'C(C)#N'],
       ['O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O', 'C(C)#N'],
       ['Fc1ccc(cc1N(=O)=O)N(=O)=O.[NH2+]1CCNCC1>>O=N(=O)c1ccc(N2CC[NH2+]CC2)c(c1)N(=O)=O',
        'O']], dtype=object)

In [None]:
# Removed an unfinished `reaction.` attribute access: it is a SyntaxError, and the cell
# was never executed (it has no execution count).

In [274]:
# Manual check of the normalisation pipeline on a single reaction written in the pentavalent-nitro form.
reaction = AllChem.ReactionFromSmarts("CCCCN.c1cc(ncc1N(=O)=O)Cl>>CCCCNc1ccc(cn1)N(=O)=O", useSmiles=True)
# Sanitize each reactant and product template.
for mol in list(reaction.GetReactants()) + list(reaction.GetProducts()):
    Chem.SanitizeMol(mol)
# Reaction-level sanitization restricted to the ADJUST_REACTANTS operation.
AllChem.SanitizeRxn(reaction, Chem.rdChemReactions.SANITIZE_ADJUST_REACTANTS)
# The recorded output shows the nitro group rewritten in charge-separated form.
AllChem.ReactionToSmiles(reaction)

'CCCCN.O=[N+]([O-])c1ccc(Cl)nc1>>CCCCNc1ccc([N+](=O)[O-])cn1'

'O=[N+]([O-])c1ccc(F)cc1.[N-]=[N+]=[N-]>>[F-].[N-]=[N+]=Nc1ccc([N+](=O)[O-])cc1'

In [279]:
# Pass an azide reaction through ReactionSmilesProcessor and the RDKit round trip to see the resulting string.
AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(ReactionSmilesProcessor("O=[N+]([O-])c1ccc(F)cc1.[N-]=[N+]=[N-]>>[N-]=[N+]=Nc1ccc([N+](=O)[O-])cc1.[F-]").reaction_smiles, useSmiles=True))

'O=[N+]([O-])c1ccc(F)cc1.[N-]=[N+]=[N-]>>[F-].[N-]=[N+]=Nc1ccc([N+](=O)[O-])cc1'

In [322]:
# Exact-string lookup with a hand-written reaction SMILES; the recorded result is empty.
# NOTE(review): df["reaction_smiles"] holds RDKit round-tripped strings (this reaction is stored
# with CCCCN listed first), so a raw literal in a different order cannot match. Wrap it in
# ReactionToSmiles(ReactionFromSmarts(..., useSmiles=True)), as the earlier lookup does, if a hit is wanted.
df[df["reaction_smiles"] == "O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc([N+](=O)[O-])cn1.Cl"]

Unnamed: 0,reaction_smiles,reaction_smiles_orig,v_av_central_atom,v_av_nu_atom,reaction_energy,reaction_energy_qh_grimme,reaction_energy_qh_truhlar,activation_energy,activation_energy_qh_grimme,activation_energy_qh_truhlar,...,bo_lg_ts,bo_prod_nu,nu_symbol,lg_symbol,solvent,inchikey_substrate,inchikey_nu,inchikey_lg,inchikey_product,temperature


In [319]:
# Dataset rows for this reaction, keyed by the ReactionSmilesProcessor output; the recorded result
# shows two entries that differ in solvent (Acetonitrile and DMSO).
data_df[data_df["complete_reaction_smiles"] == ReactionSmilesProcessor("O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc([N+](=O)[O-])cn1.Cl").reaction_smiles]

Unnamed: 0,ID,Exp_Rate_Constant k1 (M-1s-1),Molecule,Modelled,Reaction_SMILES,Substrate_SMILES,Nucleophile_SMILES,Product_SMILES,RLPT,Original Data,...,Index,canonical_Substrate,canonical_Nucleophile,canonical_Product,canonical_reaction_smiles,reaction_mols,Duplicates,complete_reaction_smiles,influential_solvent,solvent_smiles
2,3,0.000384,3a,Yes,ClC1=NC=C(C=C1)N(=O)=O.CCCCN>>CCCCNC1=NC=C(C=C...,ClC1=NC=C(C=C1)N(=O)=O,CCCCN,CCCCNC1=NC=C(C=C1)N(=O)=O,,y,...,3,O=[N+]([O-])c1ccc(Cl)nc1,CCCCN,CCCCNc1ccc([N+](=O)[O-])cn1,O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNC1=NC=C(C...,<rdkit.Chem.rdChemReactions.ChemicalReaction o...,[],CCCCN.O=[N+]([O-])c1ccc(Cl)nc1>>CCCCNc1ccc([N+...,Acetonitrile,C(C)#N
337,347,0.011,Table4_2a,No,O=N(=O)c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1)N(=O)=O,O=N(=O)c1ccc(Cl)nc1,CCCCN,CCCCNc1ccc(cn1)N(=O)=O,,,...,347,O=[N+]([O-])c1ccc(Cl)nc1,CCCCN,CCCCNc1ccc([N+](=O)[O-])cn1,O=[N+]([O-])c1ccc(Cl)nc1.CCCCN>>CCCCNc1ccc(cn1...,<rdkit.Chem.rdChemReactions.ChemicalReaction o...,[],CCCCN.O=[N+]([O-])c1ccc(Cl)nc1>>CCCCNc1ccc([N+...,DMSO,CS(=O)C
