In [1]:
import os
import pandas as pd
import sweetviz as sv

# <span style="color:blue"> Search data on Reaxys </span>

- cyclopentanone → cyclopentanol, map C-carbonyl → give 35,665 reactions
	- exclude NA yield → 5,747 reactions
	- limit to single-step, article -> 4,885 reactions
	- limit to ‘sodium tetrahydroborate’ -> **2,272 reactions**
- cyclohexanone → cyclohexanol, map C-carbonyl → give 61,693 reactions
	- exclude NA yield → 9,765 reactions
	- limit to single-step, article -> 8,502 reactions
	- limit to ‘sodium tetrahydroborate’ -> **3,487 reactions**

*Use ‘limit to’ rather than ‘exclude’ for document type: excluding the other document types also deletes reactions that have mixed document types, some of which may include an article.*

In [2]:
def count_num_reaction(data):
    """Print the number of distinct reactions and the number of rows in `data`.

    Parameters
    ----------
    data : DataFrame-like
        Table exposing a 'Reaction ID' column. The same ID may appear on
        several rows, so the two counts can differ.

    Returns
    -------
    None
        Both counts are printed rather than returned.
    """
    # Distinct IDs only: repeated 'Reaction ID' values count once.
    unique_reactions = data['Reaction ID'].nunique()
    total_rows = len(data.index)
    print('Number of Reactions:', unique_reactions)
    print('Number of Rows:', total_rows)

# <span style="color:blue"> Import data </span>

- Double check number of reactions
- Concatenate 2 datasets.

In [3]:
# Change working directory to the project's "Reduction of ketone" folder so the
# relative paths used by the other cells ('Reaxys_Raw data/...', 'Notebook/...')
# resolve. The location can be overridden per machine with the
# REDUCTION_KETONE_DIR environment variable; the original author's path stays
# the default, so existing setups behave exactly as before.
os.chdir(os.environ.get(
    'REDUCTION_KETONE_DIR',
    '/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity/Reduction of ketone/',
))

In [13]:
# Read the spreadsheet stored under 'Reaxys_Raw data/' and report its size
# before any filtering is applied.
cyclo_concat = pd.read_excel(io='Reaxys_Raw data/cyclo5,6.xlsx')
count_num_reaction(data=cyclo_concat)

Number of Reactions: 5421
Number of Rows: 11834


In [14]:
# Columns retained for the rest of the notebook, grouped by role.
columns_to_keep = [
    # species and chemical conditions
    'Reaction', 'Reactant', 'Product', 'Reagent', 'Catalyst',
    'Solvent (Reaction Details)',
    # physical conditions
    'Time (Reaction Details) [h]', 'Temperature (Reaction Details) [C]',
    # outcome
    'Yield',
    # identifiers and provenance
    'Reaction ID', 'Links to Reaxys', 'Reaction: Links to Reaxys', 'References',
]

# Select exactly those columns (a missing name raises KeyError), then re-count.
cyclo_concat = cyclo_concat.loc[:, columns_to_keep]
count_num_reaction(data=cyclo_concat)

Number of Reactions: 5421
Number of Rows: 11834


# <span style="color:blue"> EDA </span>
#### View the generated HTML file for the EDA results.

In [6]:
# Run sweetviz's analysis on `cyclo_concat` and save the resulting report as HTML.
eda = sv.analyze(cyclo_concat)
# NOTE(review): the output path is relative to the current working directory —
# confirm the 'Notebook/' folder exists there before running.
eda.show_html(filepath='Notebook/EDA_Rawdata_cyclo5,6.html')

                                             |          | [  0%]   00:00 -> (? left)

Report Notebook/EDA_Rawdata_cyclo5,6.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# <span style="color:blue"> Cleaning</span>

## 1. Drop NA for Reagent, Reaction, Yield, Temperature, Time, Solvent 
## & Remove rows having catalyst
*Rows are kept only when `Catalyst` is NA (no catalyst reported); apply this catalyst filter only when the type of reaction being studied calls for it.*

In [15]:
cleanNA_cyclo_concat = (
    cyclo_concat
    # A row is usable only if every one of these fields is filled in.
    .dropna(subset=[
        'Reagent',
        'Reaction',
        'Yield',
        'Temperature (Reaction Details) [C]',
        'Time (Reaction Details) [h]',
        'Solvent (Reaction Details)',
    ])
    # Keep catalyst-free rows only, i.e. those whose 'Catalyst' cell is empty.
    .loc[lambda frame: frame['Catalyst'].isna()]
)

count_num_reaction(data=cleanNA_cyclo_concat)

Number of Reactions: 2771
Number of Rows: 3919


## 2. Only keep rows having "Article" in "References"

In [16]:
# Keep only the rows whose 'References' text mentions "Article".
# 'References' is not among the columns cleaned by the earlier dropna, so it may
# still hold missing values; without na=False `.str.contains` returns NaN for
# those cells and the boolean mask raises. na=False treats a missing reference
# as "not an article" and drops the row.
article_cyclo_concat = cleanNA_cyclo_concat[
    cleanNA_cyclo_concat['References'].str.contains('Article', na=False)
]
count_num_reaction(article_cyclo_concat)

Number of Reactions: 2710
Number of Rows: 3678


## 3. Remove rows whose 'Time' contains ";" or "-"
#### These correspond to reactions with more than one step.

In [17]:
# Remove rows whose time contains ';' or '-'.
# The cells are cast to text before matching: on a column that mixes text and
# numeric cells, `.str.contains` returns NaN for the numeric ones and
# `NaN == False` is False, so rows with a plain numeric time would be dropped
# silently. Rows with a missing time are still excluded, as before.
oneStep_cyclo_concat = article_cyclo_concat[
    article_cyclo_concat['Time (Reaction Details) [h]'].notna()
    & ~article_cyclo_concat['Time (Reaction Details) [h]'].astype(str).str.contains(';|-', regex=True)
].copy()

# Convert time to numeric; errors='raise' surfaces any remaining non-numeric
# text instead of silently turning it into NaN.
oneStep_cyclo_concat['Time (Reaction Details) [h]'] = pd.to_numeric(
    oneStep_cyclo_concat['Time (Reaction Details) [h]'], errors='raise'
)

count_num_reaction(oneStep_cyclo_concat)

Number of Reactions: 2610
Number of Rows: 3513


## 4. Remove rows whose reported yield is not a percent yield

In [18]:
# Discard yields reported with '>' (e.g. '>99' or '>95'). Cells for which
# .str.contains gives NA are treated as matches (na=True) and therefore
# discarded as well once the mask is inverted.
percent_cyclo_concat = oneStep_cyclo_concat[
    ~oneStep_cyclo_concat['Yield'].str.contains('>', na=True)
].copy()

# Of what remains, retain only yields expressed as 'percent' (any letter case).
percent_cyclo_concat = percent_cyclo_concat[
    percent_cyclo_concat['Yield'].str.contains('percent', case=False)
]

count_num_reaction(data=percent_cyclo_concat)

Number of Reactions: 2593
Number of Rows: 3468


## 5. Remove duplicate rows

In [19]:
# Collapse rows that are identical in every column, keeping the first occurrence.
percent_cyclo_concat = percent_cyclo_concat.drop_duplicates(subset=None, keep='first')
count_num_reaction(data=percent_cyclo_concat)

Number of Reactions: 2593
Number of Rows: 3259


#### Reaction conditions will be analyzed afterwards.

### save data

In [20]:
# Write the filtered table to an Excel file; the row index is not written.
percent_cyclo_concat.to_excel(excel_writer='Notebook/Reduction_Cyclo.xlsx', index=False)