In [1]:
import os
import re

import pandas as pd
import sweetviz as sv

from general_cleaning import *
from verify_by_MW import *
from verify_by_CObond import *

# <span style="color:blue"> Search data on Reaxys </span>

- cyclopentanone → cyclopentanol, map C-carbonyl → gives 60,455 reactions
    - limit to sodium tetrahydroborate, tetrahydrofuran, single-step, article → gives 835 reactions
    - exclude NA yield → **483 reactions**
- cyclohexanone → cyclohexanol, map C-carbonyl → gives 61,693 reactions
    - limit to sodium tetrahydroborate, tetrahydrofuran, single-step, article → gives 1,125 reactions
    - exclude NA yield → **657 reactions**

*Use ‘limit to’ for the document type: using ‘exclude’ on the other document types would also delete reactions that have mixed document types, some of which may include an article.*

# <span style="color:blue"> Import data </span>

- Double check number of reactions
- Concatenate 2 datasets.

In [2]:
# Change working directory to the repository root so that the relative
# 'Reduction of ketone/...' paths used by the loading cells resolve.
# The location can be overridden without editing the notebook by setting the
# REACTIVITY_REPO_ROOT environment variable; when it is unset, the original
# hard-coded checkout location is used, so existing behaviour is unchanged.
os.chdir(
    os.environ.get(
        'REACTIVITY_REPO_ROOT',
        '/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity',
    )
)

In [3]:
# Load the raw Reaxys export for the cyclopentanone query, then report
# its reaction and row counts.
cyclopentanone_xlsx = 'Reduction of ketone/Reaxys_Raw data/hydride_reduction_of_cyclopentanone.xlsx'
cyclopentanone = pd.read_excel(cyclopentanone_xlsx)
count_num_reaction(cyclopentanone)

Number of Reactions: 483
Number of Rows: 2183


In [4]:
# Load the raw Reaxys export for the cyclohexanone query, then report
# its reaction and row counts.
# NOTE(review): the variable is spelled 'cyclohextanone' (sic). The name is
# kept because the concatenation cell refers to it with this spelling.
cyclohexanone_xlsx = 'Reduction of ketone/Reaxys_Raw data/hydride_reduction_of_cyclohexanone.xlsx'
cyclohextanone = pd.read_excel(cyclohexanone_xlsx)
count_num_reaction(cyclohextanone)

Number of Reactions: 657
Number of Rows: 3755


In [5]:
# Columns retained from the raw exports; every other column is dropped.
columns_to_keep = [
    # reaction participants
    'Reaction', 'Reactant', 'Product', 'Reagent', 'Catalyst',
    'Solvent (Reaction Details)',
    # reaction conditions
    'Time (Reaction Details) [h]',
    'Temperature (Reaction Details) [C]',
    # reported yield
    'Yield',
    # identifiers and literature references
    'Reaction ID', 'Links to Reaxys', 'Reaction: Links to Reaxys', 'References',
]

# Stack the two exports row-wise (cyclopentanone first) and keep only the
# columns listed above, then report the combined reaction and row counts.
cyclo_concat = pd.concat([cyclopentanone, cyclohextanone], axis=0)[columns_to_keep]
count_num_reaction(cyclo_concat)

Number of Reactions: 1055
Number of Rows: 5938


# <span style="color:blue"> EDA </span>
#### View the HTML file for the EDA result.

In [6]:
# Optional: build the sweetviz EDA report for the concatenated raw data and
# write it to an HTML file. Both lines are left commented out; uncomment them
# to regenerate the report.
# eda = sv.analyze(cyclo_concat)
# eda.show_html(filepath='Reduction of ketone/Reaxys_Raw data/EDA_Rawdata.html')

#  <span style="color:blue"> Step 1: General Cleaning</span>

In [10]:
# Run the general cleaning step on the concatenated data and report the
# remaining reaction and row counts.
GeneralClean_cyclo = GeneralCleaning(cyclo_concat)
count_num_reaction(GeneralClean_cyclo)

# Add a 'Yield (number)' column holding the result of extract_yield applied
# to each 'Yield' entry. assign() works on a copy, so the frame returned by
# GeneralCleaning is not modified.
GeneralClean_cyclo = GeneralClean_cyclo.assign(
    **{'Yield (number)': GeneralClean_cyclo['Yield'].apply(extract_yield)}
)

Number of Reactions: 992
Number of Rows: 2267


#### Reaction conditions will be analyzed later

#  <span style="color:blue"> Step 2: Verify reaction by MW</span>

In [11]:
# Get reactant and product SMILES from the reaction SMILES of the cleaned data.
# NOTE: when this cell was run it logged
# "Explicit valence for atom # 0 B, 5, is greater than permitted";
# the cause has not been investigated yet.
getReactandProduct_cyclo_concat = getReactandProduct(GeneralClean_cyclo)

# Calculate the change in MW after reaction; the result is the input to the
# MW-based verification cells that follow.
changeMW_cyclo_concat = get_change_MW(getReactandProduct_cyclo_concat)


[21:05:07] Explicit valence for atom # 0 B, 5, is greater than permitted


#### come back to this warning later

### For the single reduction (filtered by MW), only take those having 1 reactant and 1 product for the next step.

In [14]:
# Single reductions as verified by MW: verify_SingleReduc_MW is applied to
# the MW-change table computed in the previous step.
SingleReduction_MW = verify_SingleReduc_MW(changeMW_cyclo_concat)

# Report the reaction and row counts of the verified subset.
count_num_reaction(SingleReduction_MW)

Number of Reactions: 705
Number of Rows: 1776


### For the double reduction (filtered by MW), only take those that give stereoisomeric products.

In [15]:
# Stereoisomer cases selected by verify_SingleReduc_Stereo_MW from the same
# MW-change table (see that helper for the exact criterion).
# .copy() gives an independent frame, matching how GeneralClean_cyclo is
# handled before its column assignment. The assignments below therefore
# cannot write through to, or raise SettingWithCopyWarning about, a slice of
# changeMW_cyclo_concat.
Stereo_SingleReduction_MW = verify_SingleReduc_Stereo_MW(changeMW_cyclo_concat).copy()

# Change the format back to 1 reactant -> 1 product (single reduction):
# keep only the first element of each entry, re-wrapped in a one-element list.
# NOTE(review): assumes every entry is a non-empty sequence -- confirm.
for smiles_col in ('Reactant SMILES', 'Product SMILES'):
    Stereo_SingleReduction_MW[smiles_col] = (
        Stereo_SingleReduction_MW[smiles_col].apply(lambda x: [x[0]])
    )

count_num_reaction(Stereo_SingleReduction_MW)

Number of Reactions: 101
Number of Rows: 187


#### Note that Reactant SMILES still has stereochemistry while Product SMILES does not. Later, if the stereochemistry of Product SMILES is needed, it can be extracted back from the 'Reaction' column


#### Concatenate both data set of single reduction for now

In [16]:
# Stack the two MW-verified sets row-wise (plain single reductions first,
# then the stereoisomer cases) and report the combined counts.
single_reduction_parts = [SingleReduction_MW, Stereo_SingleReduction_MW]
SingleReduction_MW_concat = pd.concat(single_reduction_parts, axis=0)
count_num_reaction(SingleReduction_MW_concat)

Number of Reactions: 806
Number of Rows: 1963


#  <span style="color:blue"> Step 3: Verify reaction by change in C-O bonds</span>

In [17]:
# Step 3: pass the MW-verified reactions through verify_SingleReduc_CObond
# and report how many reactions and rows remain.
SingleReduction_CObond = verify_SingleReduc_CObond(SingleReduction_MW_concat)
count_num_reaction(SingleReduction_CObond)

Number of Reactions: 794
Number of Rows: 1946


###  <span style="color:red"> After 3 steps, the data set now has 1946 rows consisting of 794 reactions </span>

In [None]:
# Use the following call to view the reaction scheme of one reaction:
# uncomment it and put the reaction's number in place of "(number here)".
# view_reactionScheme(SingleReduction_CObond, (number here) )