In [1]:
from predict_snar.smiles import ReactionSmilesProcessor
from predict_snar.data import SolventPicker
from analyze_snar.extraction import extract_dataframe
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

fcntl not avaiable on Windows.
resource not available on Windows.


Make functions for joblib parallelization

In [2]:
def get_influential_solvent(solvent):
    """Look up the influential solvent for ``solvent`` via ``SolventPicker``.

    A new ``SolventPicker`` is built on every call, so each call is
    self-contained (the notebook dispatches this function through joblib).

    Args:
        solvent: Solvent entry passed straight to
            ``SolventPicker.get_influential_solvent``.

    Returns:
        Whatever ``SolventPicker.get_influential_solvent`` returns for ``solvent``.
    """
    return SolventPicker().get_influential_solvent(solvent)

In [3]:
def get_solvent_smiles(solvent):
    """Translate a solvent name into SMILES via ``SolventPicker.smiles_from_name``.

    A new ``SolventPicker`` is built on every call, so each call is
    self-contained (the notebook dispatches this function through joblib).

    Args:
        solvent: Solvent name passed straight to ``SolventPicker.smiles_from_name``.

    Returns:
        Whatever ``SolventPicker.smiles_from_name`` returns for ``solvent``.
    """
    return SolventPicker().smiles_from_name(solvent)

In [4]:
# Notebook-level SolventPicker instance.
# NOTE(review): no visible cell reads `sp` (the helper functions build their own
# picker) — confirm whether this cell is still needed.
sp = SolventPicker()

# Process kinetic database

In [659]:
# Load the kinetic dataset from the Excel workbook (path is relative to the
# notebook's working directory).
data_df = pd.read_excel("../prepare_kinetic_data/kinetic_data_v4.xlsx")

In [660]:
# Wrap every reaction SMILES of the kinetic dataset in a ReactionSmilesProcessor
# (the class itself is passed as the element-wise callable).
data_rsmps = data_df["Reaction_SMILES"].apply(ReactionSmilesProcessor)

In [454]:
# Resolve the influential solvent for every dataset row on four joblib workers.
data_solvents = Parallel(n_jobs=4)(
    delayed(get_influential_solvent)(solvent_name)
    for solvent_name in data_df["Solvent"]
)

In [416]:
# Convert each influential solvent to SMILES on four joblib workers.
solvent_smiles_data = Parallel(n_jobs=4)(
    delayed(get_solvent_smiles)(solvent_name)
    for solvent_name in data_solvents
)

In [661]:
# Round-trip each processed reaction through RDKit's reaction parser and writer;
# the resulting string is the key used for duplicate detection and merging.
# NOTE(review): the string is parsed without useSmiles=True, unlike the cell that
# post-processes the computed results — confirm this is intended.
data_df["complete_reaction_smiles"] = [
    AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(processor.reaction_smiles))
    for processor in data_rsmps
]

In [662]:
# Attach the resolved solvent and its SMILES as new dataset columns.
data_df = data_df.assign(
    influential_solvent=data_solvents,
    solvent_smiles=solvent_smiles_data,
)

#### Investigate replicated reactions

In [663]:
# Show every row that shares reaction, influential solvent and temperature with
# at least one other row (keep=False flags all members of each duplicate group).
data_df.loc[
    data_df.duplicated(["complete_reaction_smiles", "influential_solvent", "Temp (K)"], keep=False),
    [
        "Reaction_SMILES",
        'Exp_Rate_Constant k1 (M-1s-1)',
        'Activation free Energy (kcal) Eyring',
        'Solvent',
        'Temp (K)',
        'Reference',
        'DOI',
    ],
]

Unnamed: 0,Reaction_SMILES,Exp_Rate_Constant k1 (M-1s-1),Activation free Energy (kcal) Eyring,Solvent,Temp (K),Reference,DOI
26,FC1=CC=C(C=C1N(=O)=O)N(=O)=O.C3CCNCC3>>C3CCN(C...,150.0,14.46,Acetonitrile,298.0,"Eur. J. Org. Chem, 2007, 1378",https://doi.org/10.1002/ejoc.200600968
108,Fc1ccc(cc1N(=O)=O)N(=O)=O.C1CCNCC1>>O=N(=O)c1c...,380.0,13.91,Acetonitrile,298.0,"J. Org. Chem., 2007, 72, 8797.",https://doi.org/doi/10.1021/jo701549h
139,Clc1ccc(cc1N(=O)=O)N(=O)=O.C1CCNCC1>>C2CCN(CC2...,0.0153,19.9,Ethanol,298.0,"J. Chem. Soc., 1950, 507-516",https://doi.org/10.1039/JR9500000507
219,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,0.001738,21.18,TMS,298.0,"J. Am. Chem. Soc.,1973, 95, 408-410",http://dx.doi.org/10.1021/ja00783a016
222,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,0.0795,18.92,HMPT,298.0,"J. Am. Chem. Soc.,1973, 95, 408-410",http://dx.doi.org/10.1021/ja00783a016
467,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,1.2589,17.29,HMPT,298.0,"J. Am. Chem. Soc., 1968, 90, 5049-5069",https://doi.org/10.1021/ja01021a002
468,C1=CC(=CC=C1[N+](=O)[O-])F.[N-]=[N+]=[N-]>>[N-...,0.00199,21.1,TMS,298.0,"J. Am. Chem. Soc., 1968, 90, 5049-5069",https://doi.org/10.1021/ja01021a002
489,Clc1ccc(cc1N(=O)=O)N(=O)=O.C1CCNCC1>>O=N(=O)c1...,0.018,19.8,Ethanol,298.0,"J. Chem. Soc., Perkin Trans. 2, 1986, 1427-1431",http://dx.doi.org/10.1039/P29860001427


In [664]:
# Snapshot of the full dataset taken before duplicates are dropped; it is the
# left side of the final merge that assigns a label to every original row.
all_data_df = data_df.copy()

In [665]:
# Keep only the first occurrence of each (reaction, influential solvent, temperature).
data_df = data_df.drop_duplicates(["complete_reaction_smiles", "influential_solvent", "Temp (K)"])

#### Drop examples without activation energies

In [667]:
# Print the shape before and after removing rows that lack an activation free energy.
print(data_df.shape)
data_df = data_df.dropna(subset=["Activation free Energy (kcal) Eyring"])
print(data_df.shape)

(499, 41)
(499, 41)


# Process data which was submitted

In [668]:
# The submission list is a space-separated file without a header row.
submitted_df = pd.read_csv(
    "submitted_smiles",
    sep=" ",
    names=["reaction_smiles", "temperature", "solvent"],
)
print(submitted_df.shape)

(476, 3)


In [669]:
# Wrap every submitted reaction SMILES in a ReactionSmilesProcessor
# (the class itself is passed as the element-wise callable).
submitted_rsmps = submitted_df["reaction_smiles"].apply(ReactionSmilesProcessor)

In [670]:
# Round-trip each processed reaction through RDKit's reaction parser and writer
# to obtain the string used as the comparison key.
# NOTE(review): parsed without useSmiles=True, unlike the cell that post-processes
# the computed results — confirm this is intended.
submitted_df["complete_reaction_smiles"] = [
    AllChem.ReactionToSmiles(AllChem.ReactionFromSmarts(processor.reaction_smiles))
    for processor in submitted_rsmps
]
# Keep only the first occurrence of each (reaction, temperature, solvent).
submitted_df = submitted_df.drop_duplicates(["complete_reaction_smiles", "temperature", "solvent"])
print(submitted_df.shape)

(452, 4)


In [671]:
# Reduce both frames to the three comparison keys and give them identical
# column names so they can be merged on those names.
comp_submitted = submitted_df[["complete_reaction_smiles", "solvent", "temperature"]].rename(
    columns={"complete_reaction_smiles": "reaction_smiles"}
)
comp_data = data_df[["complete_reaction_smiles", "influential_solvent", "Temp (K)"]].rename(
    columns={
        "complete_reaction_smiles": "reaction_smiles",
        "influential_solvent": "solvent",
        "Temp (K)": "temperature",
    }
)

In [672]:
# Sanity check: row/column count of the renamed submitted-reaction frame.
comp_submitted.shape

(452, 3)

# Find submitted reactions not in the dataset

In [465]:
# Left-join the submitted reactions onto the dataset; `_merge` records whether a
# submitted row has a counterpart in the dataset ("both") or not ("left_only").
# validate="many_to_one" makes pandas raise MergeError if a key combination
# occurs more than once in comp_data: such duplicates would add rows and break
# the positional index re-assignment below.
sub_in_data = pd.merge(
    comp_submitted,
    comp_data,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# Restore comp_submitted's row labels so `_merge` can serve as an aligned mask.
sub_in_data.index = comp_submitted.index

In [466]:
# Split the submitted reactions by merge outcome: absent from vs. present in the dataset.
left_only_sub = comp_submitted.loc[sub_in_data["_merge"] == "left_only"]
both_sub = comp_submitted.loc[sub_in_data["_merge"] == "both"]

In [467]:
# Number of submitted reactions that were matched to a dataset entry.
len(both_sub)

451

In [468]:
# Display the submitted reactions that have no counterpart in the dataset.
comp_submitted.loc[sub_in_data["_merge"] == "left_only"]

Unnamed: 0,reaction_smiles,solvent,temperature
352,CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)...,Acetonitrile,298.0


# Find reactions in dataset that were not submitted

In [469]:
# Left-join the dataset onto the submitted reactions; `_merge` records whether a
# dataset row was submitted ("both") or not ("left_only").
# validate="many_to_one" makes pandas raise MergeError if a key combination
# occurs more than once in comp_submitted: such duplicates would add rows and
# break the positional index re-assignment below.
data_in_sub = pd.merge(
    comp_data,
    comp_submitted,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# Restore comp_data's row labels so `_merge` can serve as an aligned mask.
data_in_sub.index = comp_data.index

In [470]:
# Split the dataset reactions by merge outcome: never submitted vs. submitted.
left_only_data = comp_data.loc[data_in_sub["_merge"] == "left_only"]
both_data = comp_data.loc[data_in_sub["_merge"] == "both"]

In [471]:
# Row labels of the dataset reactions that have no submitted counterpart.
left_only_data.index

Int64Index([347, 353, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
            480, 481, 482, 483, 484, 485, 486, 487, 488, 490, 491, 492, 493,
            494, 495, 496, 497, 498, 500, 501, 502, 503, 504, 505, 506, 507,
            508, 510, 511, 512, 513, 514, 515, 516, 517],
           dtype='int64')

In [472]:
# Count of dataset reactions that were never submitted.
len(left_only_data)

48

# Analysis of completed reactions

In [489]:
# Load the results of the completed reactions into a DataFrame.
# NOTE(review): extract_dataframe is a project helper; the effect of
# or_lg_correction=True is not visible here — confirm against analyze_snar.extraction.
df = extract_dataframe("../machine_learning/2019-12-15/db", or_lg_correction=True)
print(df.shape)

(475, 55)


In [490]:
# Parse every stored reaction SMILES into an RDKit reaction object.
reactions = df["reaction_smiles"].apply(
    lambda smiles: AllChem.ReactionFromSmarts(smiles, useSmiles=True)
)
# Sanitize each reactant and product molecule before writing the reactions back.
for rxn in reactions:
    for template in (*rxn.GetReactants(), *rxn.GetProducts()):
        Chem.SanitizeMol(template)
# Overwrite the column with the SMILES regenerated from the sanitized reactions.
df["reaction_smiles"] = [AllChem.ReactionToSmiles(rxn) for rxn in reactions]

In [493]:
# Keep only the first result for each (reaction, solvent, temperature).
df = df.drop_duplicates(["reaction_smiles", "solvent", "temperature"])
print(df.shape)

(452, 55)


#### Investigate submitted jobs which were not finished
Turns out that all jobs were finished

In [437]:
# Convert the solvent of every matched submission to SMILES on four joblib workers.
solvent_smiles_sub = Parallel(n_jobs=4)(
    delayed(get_solvent_smiles)(solvent_name)
    for solvent_name in both_sub["solvent"]
)

In [559]:
# Keep the original solvent entry under `solvent_name` and replace `solvent` by
# its SMILES, which is the form the results frame `df` is merged on.
good_sub = both_sub.assign(
    solvent_name=both_sub["solvent"],
    solvent=solvent_smiles_sub,
)

In [560]:
# Left-join the matched submissions onto the computed results; `_merge` is
# "left_only" for a submission without a finished result.
# validate="many_to_one" makes pandas raise MergeError if a key combination
# occurs more than once in df: such duplicates would add rows and break the
# positional index re-assignment below.
finished = pd.merge(
    good_sub,
    df,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
    validate="many_to_one",
)
# Restore good_sub's row labels on the merged frame.
finished.index = good_sub.index

In [561]:
# Tally the merge indicator; "left_only" rows would be submissions without a finished result.
finished["_merge"].value_counts()

both          451
right_only      0
left_only       0
Name: _merge, dtype: int64

In [500]:
# Submissions for which no finished result was found.
left_only = finished.loc[finished["_merge"] == "left_only"]

In [501]:
# Display the unfinished submissions (an empty frame means every submitted job finished).
left_only

Unnamed: 0,reaction_smiles,solvent,temperature,reaction_smiles_orig,v_av_central_atom,v_av_nu_atom,reaction_energy,reaction_energy_qh_grimme,reaction_energy_qh_truhlar,activation_energy,...,bo_nu_ts,bo_lg_ts,bo_prod_nu,nu_symbol,lg_symbol,inchikey_substrate,inchikey_nu,inchikey_lg,inchikey_product,_merge


In [502]:
# Reverse check: left-join the computed results onto the matched submissions;
# `_merge` is "left_only" for a result that has no entry in good_sub.
extra_submitted = pd.merge(
    df,
    good_sub,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
    indicator=True,
)
# Restore df's row labels so `_merge` can be used as a mask on df.
extra_submitted.index = df.index

In [503]:
# Tally the merge indicator; "left_only" rows are results with no matching entry in good_sub.
extra_submitted["_merge"].value_counts()

both          451
left_only       1
right_only      0
Name: _merge, dtype: int64

In [504]:
# Reaction SMILES of the computed results that are not among the matched submissions.
df.loc[extra_submitted["_merge"] == "left_only", "reaction_smiles"]

322    CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)...
Name: reaction_smiles, dtype: object

In [505]:
# Same selection as an array, so the SMILES is shown without truncation.
df.loc[extra_submitted["_merge"] == "left_only", "reaction_smiles"].values

array(['CCCCN.N#Cc1cc(C(F)(F)F)ccc1Cl>>CCCCNc1ccc(C(F)(F)F)cc1C#N.Cl'],
      dtype=object)

# Find status for reactions in dataset
* Not submitted
* Submitted
* Failed
* Not treated

Failed reactions:

| index | reason |
| ----- | ------ |
| 201   | TMS    |
| 66    | TS     |
| 185   | TS     |
| 339   | TS     |
| 457   | TS     |
| 364   | C nu   |
| 365   | C nu   |

In [529]:
# Hand-curated row labels for each status. `failed`, `non_treated` and
# `wrongly_submitted` are used further down as row labels of the results frame `df`.
# NOTE(review): `non_submitted` is not read by any visible cell.
non_submitted = left_only_data.index
# NOTE(review): the markdown table above lists index 201 (reason "TMS") among the
# failed reactions, whereas here it is placed in `non_treated` — confirm which is intended.
failed = pd.Index([66, 185, 339, 457, 364, 365])
non_treated = pd.Index([201, 474])
# 322 is the row of `df` reported above as having no matching entry in good_sub.
wrongly_submitted = pd.Index([322])

Check that numbers add up

In [536]:
# Good submissions = all results minus the removed and the wrongly submitted ones;
# successful = good minus failed.
n_failed = len(failed)
n_good = len(df) - (len(non_treated) + len(wrongly_submitted))
n_successful = n_good - n_failed

In [539]:
# Report the three counts, then the failure rate relative to the good submissions.
for caption, count in (
    ("Good submitted reactions:", n_good),
    ("Successful reactions:", n_successful),
    ("Failed reactions:", n_failed),
):
    print(caption, count)
print(f"Percentage failures: {(n_failed / n_good) * 100:.2f}")

Good submitted reactions: 449
Successful reactions: 443
Failed reactions: 6
Percentage failures: 1.34


In [607]:
# Start from "modelled" for every result, then overwrite the hand-curated
# exceptions in the same order as before (failed, removed, wrong).
labels_df = df.assign(label="modelled")
for row_ids, status in (
    (failed, "failed"),
    (non_treated, "removed"),
    (wrongly_submitted, "wrong"),
):
    labels_df.loc[row_ids, "label"] = status

In [608]:
# Tally of the labels assigned to the computed results.
labels_df["label"].value_counts()

modelled    443
failed        6
removed       2
wrong         1
Name: label, dtype: int64

In [609]:
# Attach the labelled results to the matched submissions (solvent compared as SMILES).
labels_merged = pd.merge(
    good_sub,
    labels_df,
    how="left",
    on=["reaction_smiles", "solvent", "temperature"],
)

In [673]:
# Attach the labels to the de-duplicated dataset. The dataset's `solvent` column
# is compared with `solvent_name` on the right-hand side, not with the SMILES
# stored in the right-hand `solvent` column.
data_labels_merged = pd.merge(
    comp_data,
    labels_merged,
    how="left",
    left_on=["reaction_smiles", "solvent", "temperature"],
    right_on=["reaction_smiles", "solvent_name", "temperature"],
    indicator=True,
)

In [674]:
# Tally the merge indicator; "left_only" rows are dataset reactions without a label.
data_labels_merged["_merge"].value_counts()

both          451
left_only      48
right_only      0
Name: _merge, dtype: int64

In [675]:
# Dataset reactions that received no label from the merge are marked "not modelled".
data_labels_merged = data_labels_merged.assign(
    label=data_labels_merged["label"].fillna("not modelled")
)

In [676]:
# Tally of the labels over the de-duplicated dataset.
data_labels_merged["label"].value_counts()

modelled        443
not modelled     48
failed            6
removed           2
Name: label, dtype: int64

In [677]:
# Row count of the dataset snapshot taken before de-duplication.
len(all_data_df)

518

In [678]:
# Row count of the labelled, de-duplicated dataset.
len(data_labels_merged)

499

In [679]:
# NOTE(review): repeats the len(all_data_df) check made two cells earlier.
len(all_data_df)

518

In [702]:
# Propagate the labels back onto every row of the original dataset, replicates
# included. The previous merge indicator is dropped first so a new one can be added;
# `solvent_x` is the left-hand `solvent` column of the earlier merge.
test_df = pd.merge(
    all_data_df,
    data_labels_merged.drop(["_merge"], axis=1),
    how="left",
    left_on=["complete_reaction_smiles", "influential_solvent", "Temp (K)"],
    right_on=["reaction_smiles", "solvent_x", "temperature"],
    indicator=True,
)

In [703]:
# Row count after the merge, for comparison with len(all_data_df).
len(test_df)

518

In [704]:
# Rows that found no match in the labelled frame are marked "missing data".
test_df = test_df.assign(label=test_df["label"].fillna("missing data"))

In [705]:
# Tally of the final labels over all original dataset rows.
test_df["label"].value_counts()

modelled        446
not modelled     48
missing data     15
failed            6
removed           3
Name: label, dtype: int64

In [706]:
# Write the final labels to labels.csv: one label per line, without index or header.
test_df["label"].to_csv("labels.csv", index=False, header=False)

In [708]:
# Keep the label column as a Series for the spot checks in the next cell.
labels = test_df["label"]

In [714]:
# Spot-check the labels of the replicated reaction pairs listed earlier in the notebook.
for replicate_pair in ([26, 108], [139, 489], [219, 468], [222, 467]):
    print(labels.loc[replicate_pair])

26     modelled
108    modelled
Name: label, dtype: object
139    modelled
489    modelled
Name: label, dtype: object
219    removed
468    removed
Name: label, dtype: object
222    modelled
467    modelled
Name: label, dtype: object
