In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv
import os
import matplotlib.pyplot as plt

from count_view_Reaction import *
from step1_to_4 import *

# <span style="color:blue"> Search data on Reaxys </span>
- Draw as shown
<img src="Reaxys_Raw data/draw/ketone_in_chain_search.png" width="400" height="180">
- Choose ‘as substructure’ then ‘On all atoms’
- Start with **131,870 reactions**
- Limit to ‘sodium tetrahydroborate’, single-step, article → **22,995 reactions**
- Exclude NA yield → **10,868 reactions**


# <span style="color:blue"> Import data </span>

- Double check number of reactions

In [2]:
# Change working directory
# All data paths below are relative to the project folder. The default is the
# original author's local checkout; set the KETONE_PROJECT_DIR environment
# variable to run the notebook on another machine without editing this cell.
project_dir = os.environ.get(
    'KETONE_PROJECT_DIR',
    '/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity/Reduction of ketone/',
)
os.chdir(project_dir)

In [3]:
# Read the raw Reaxys export, then print its reaction and row counts so they
# can be compared with the numbers quoted in the search summary above.
raw_export_path = 'Reaxys_Raw data/ketone_in_chain.xlsx'
ketone_in_chain = pd.read_excel(raw_export_path)

count_num_reaction(ketone_in_chain)
count_num_row(ketone_in_chain)

Number of Reactions: 10868
Number of Rows: 42306


# <span style="color:blue"> EDA </span>
#### View the generated HTML file for the EDA results.

In [4]:
# Sweetviz EDA report on the raw table. Left commented out so the notebook does
# not rebuild the report on every run; uncomment both lines to regenerate the
# HTML file at the path below.
#eda = sv.analyze(ketone_in_chain)
#eda.show_html(filepath='Reaxys_Raw data/EDA_Rawdata_ketone_in_chain.html')

###  <span style="color:blue"> Step 1: General Cleaning</span>
### <span style="color:blue"> Step 2: Verify reaction by MW</span>
### <span style="color:blue"> Step 3: Verify reaction by change in C-O bonds</span>
### <span style="color:blue"> Step 4: Ensure consistent reaction conditions</span>

In [5]:
# Run cleaning steps 1-4 (listed in the headings above) on the raw table.
# step_1_to_4 comes from the project module step1_to_4; it prints the
# reaction/row counts after each step (see the cell output below).
ketone_in_chain_clean_1_to_4 = step_1_to_4(ketone_in_chain)

STEP 1 - general cleaning:
Number of Reactions: 4885
---------------------------------------
STEP 2 - Verify reaction by change in MW:
Number of Reactions: 4177
Number of Rows: 8259
---------------------------------------
1    8259
Name: change in C-O single bond, dtype: int64
-1    8259
Name: change in C=O double bond, dtype: int64
STEP 3 - Verify reaction by change in C-O bond:
Number of Reactions: 4177
Number of Rows: 8259
---------------------------------------
STEP 4 - Ensure consistent reaction conditions:
 - After filtering by reagent:
Number of Reactions: 2986
 - After filtering by solvent:
Number of Reactions: 1730
 - After filtering by temperature:
Number of Reactions: 629
 - After filtering by time:
Number of Reactions: 628
Number of Rows: 644


### <span style="color:blue"> Step 5: Inspect duplicate rows </span>

In [6]:
# Count rows per Reaction ID (most frequent first); any ID with a count above 1
# still has duplicate records that need a manual decision.
reaction_id_counts = ketone_in_chain_clean_1_to_4['Reaction ID'].value_counts()
reaction_id_counts.iloc[:20]

997369      3
855977      2
2532675     2
34623882    2
2163086     2
2104996     2
1985375     2
34623878    2
34623890    2
4551209     2
39265034    2
46506389    2
38244374    2
782191      2
578384      2
5048629     1
673876      1
35423632    1
48152818    1
48152829    1
Name: Reaction ID, dtype: int64

In [7]:
#### USE THIS CODE TO EXAMINE THE REACTION
# Inspection helper, kept commented out: set Reaction_ID to one of the
# duplicated IDs listed above and uncomment the lines below to draw the
# reaction scheme and display all of that reaction's records.

# Reaction_ID = 997369
# view_reactionScheme(ketone_in_chain_clean_1_to_4[ketone_in_chain_clean_1_to_4['Reaction ID'] == Reaction_ID], 2, 'keep duplicate')
# ketone_in_chain_clean_1_to_4[ketone_in_chain_clean_1_to_4['Reaction ID'] == Reaction_ID]

<span style="color:salmon"> Reaction ID = 578384 </span>

Different references.
The reaction with 64% yield was determined by LCMS.
The one with 95% yield has no purification step.

=> keep the one of 64% yield

<span style="color:salmon"> Reaction ID = 782191 </span>

Different references.
The reaction with 27% yield was determined by LCMS.
The one with 63% yield could not be accessed, even by requesting it via the MIT library.

=> Keep the one of 27%

<span style="color:salmon"> Reaction ID = 39265034 </span>

Different references.
The reaction with 90% yield has the wrong temperature.
The one with 100% yield has no purification step.

=> Keep the one having 100 % for now

<span style="color:salmon"> Reaction ID = 1985375 </span>
Different references; the right reaction is 0.5 h and 80 %. The other one has no procedure reported.

=> Keep the one having 80 %

<span style="color:salmon"> Reaction ID = 2104996 </span>
Different references; the right reaction is 1.5 h and 95 %. The other one has no procedure reported.

=> Keep the one having 95 %

<span style="color:salmon"> Reaction ID = 2163086 </span>
Same reference
The right reaction is 1h and 67%

=> Keep the one having 67%

<span style="color:salmon"> Reaction ID = 997369 </span>

For the record with 0.5 h and 90% yield, the time was wrongly reported; it is actually 2/3 h.

=> Keep the other two, then take the average of yield and time.

<span style="color:salmon"> Reaction ID = 855977 </span>

The record with 960% yield has no detailed procedure reported.

=> Keep the one having 100%

<span style="color:salmon"> Reaction ID = 2532675 </span>
 
Exact same reference.
=> Keep the one having the right yield which is 47 percent; 45 percent

<span style="color:salmon"> Reaction ID =  34623882, 34623878, 34623890, 4551209,46506389, 38244374  </span> 

Same reaction, conditions, yield, but different journals

=> keep 1, no matter which


In [8]:
# Step 5: resolve the duplicated Reaction IDs according to the manual review
# written up in the markdown notes above. The result, `chain_remove_duplicate`,
# holds exactly one row per Reaction ID.

# Duplicates that agree on reaction, conditions and yield (only the journal
# differs): keep one row per Reaction ID, no matter which.
id_tokeep1_nomatter = [34623882, 34623878, 34623890, 4551209, 46506389, 38244374]

# Duplicates where one specific record was judged to be the right one:
# (Reaction ID, column that identifies the record to keep, value in that column).
conditions = [
    (855977, 'Yield (number)', 100),
    (2163086, 'Yield (number)', 67),
    (2104996, 'Yield (number)', 95),
    (1985375, 'Yield (number)', 80),
    (39265034, 'Yield (number)', 100),
    (578384, 'Yield (number)', 64),
    (782191, 'Yield (number)', 27),
    (2532675, 'Yield', '47 percent; 45 percent') ## update yield(number)
]

# Derived from `conditions` so the two can never disagree. The hand-typed list
# used previously contained 78219 instead of 782191, which left both records of
# reaction 782191 in the "untouched" part and let the final drop_duplicates
# pick one of them arbitrarily instead of the reviewed 27 % record.
id_tokeep1_condition = [reaction_id for reaction_id, _, _ in conditions]

# Keep 1 no matter what condition
is_nomatter = ketone_in_chain_clean_1_to_4['Reaction ID'].isin(id_tokeep1_nomatter)
chain_remove_duplicate = pd.concat([
    ketone_in_chain_clean_1_to_4[is_nomatter].drop_duplicates(subset=['Reaction ID']),
    ketone_in_chain_clean_1_to_4[~is_nomatter],
])

# Keep 1 with condition
not_conditions = chain_remove_duplicate[
    ~chain_remove_duplicate['Reaction ID'].isin(id_tokeep1_condition)
]

condition_rows = []
unmatched_conditions = []
for reaction_id, column, value in conditions:
    kept = chain_remove_duplicate[
        (chain_remove_duplicate['Reaction ID'] == reaction_id)
        & (chain_remove_duplicate[column] == value)
    ].copy()
    if kept.empty:
        unmatched_conditions.append((reaction_id, column, value))
    if column == 'Yield (number)':
        # Align the text 'Yield' column with the numeric yield that was kept.
        # This write is applied to the kept rows themselves; it previously
        # targeted a frame that was overwritten right afterwards and so had no
        # effect.
        # NOTE(review): the Reaxys-style text above reads 'N percent' (with a
        # space); the original 'Npercent' format is kept here - confirm which
        # one downstream code expects.
        kept['Yield'] = str(value) + 'percent'
    condition_rows.append(kept)

# A condition that matches no row would silently remove that reaction from the
# data set, so fail loudly instead.
assert not unmatched_conditions, (
    f'No row matches these keep-conditions: {unmatched_conditions}'
)

chain_remove_duplicate = pd.concat([not_conditions] + condition_rows)


# Update the 'Yield' and 'Time' column for 'Reaction ID' 997369
#note: remove one of 0.5h and 90%, take average of the other 2.
averaged_id = 997369
is_misreported = (
    (chain_remove_duplicate['Reaction ID'] == averaged_id)
    & (chain_remove_duplicate['Yield (number)'] == 90)
    & (chain_remove_duplicate['Time (Reaction Details) [h]'] == 0.5)
)
# Filter with the boolean mask rather than dropping by index label, so rows
# that merely share an index label with the removed record are never affected.
chain_remove_duplicate = chain_remove_duplicate[~is_misreported]

# Collapse every remaining duplicate to a single row per Reaction ID.
chain_remove_duplicate = chain_remove_duplicate.drop_duplicates(subset=['Reaction ID'])

# Write the averaged values to reaction 997369 itself. They were previously
# written to Reaction ID 2227545, so the averages never reached this reaction.
# NOTE(review): the numbers being averaged are taken unchanged from the
# original cell - confirm them against the two retained records.
is_averaged = chain_remove_duplicate['Reaction ID'] == averaged_id
chain_remove_duplicate.loc[is_averaged, 'Yield (number)'] = np.average([90, 98])
chain_remove_duplicate.loc[is_averaged, 'Yield'] = str(np.average([90, 98]))
chain_remove_duplicate.loc[is_averaged, 'Time (Reaction Details) [h]'] = np.average([0.5, 2/3])


count_num_reaction(chain_remove_duplicate)
count_num_row(chain_remove_duplicate)


Number of Reactions: 628
Number of Rows: 628


#### <span style="color:salmon"> Verify duplicated rows by reaction SMILES </span> 

Two reactions are identical but were assigned different IDs: 48228963 and 48228690.
Everything else is the same -> keep 1

In [26]:
# Reactions 48228963 and 48228690 have the same reaction SMILES (see the note
# above), so only one of them is kept. The removal is done here explicitly so
# the result does not depend on earlier kernel state; it is idempotent and only
# removes a row of this pair whose SMILES already appears earlier in the table.
same_smiles_ids = [48228963, 48228690]
is_redundant_pair_row = (
    chain_remove_duplicate['Reaction ID'].isin(same_smiles_ids)
    & chain_remove_duplicate.duplicated(subset=['Reaction'])
)
chain_remove_duplicate = chain_remove_duplicate[~is_redundant_pair_row]

# Every reaction SMILES should now occur exactly once.
chain_remove_duplicate['Reaction'].value_counts().head(3)

COC(OC)C1=CC=CC=C1C(CC#N)C(=O)C1=CC=CC=C1>>COC(OC)C1=CC=CC=C1[C@H](CC#N)[C@H](O)C1=CC=CC=C1                                1
COC(=O)C1=CC=C(C=C1)C(=O)CCN1CCC(=CC1)C1=CC=C(F)C=C1>>COC(=O)C1=CC=C(C=C1)C(O)CCN1CCC(=CC1)C1=CC=C(F)C=C1                  1
CC1=CC=C(C=C1)S(=O)(=O)NC1=CC=C(F)C=C1C(=O)\C=C\C1=CC=CC=C1>>CC1=CC=C(C=C1)S(=O)(=O)NC1=CC=C(F)C=C1C(O)\C=C\C1=CC=CC=C1    1
Name: Reaction, dtype: int64

### <span style="color:blue"> Step 6: Manually verify reaction by checking literature </span>

----------

## <span style="color:blue"> Histogram of Yield (number) </span>

---------
#### Save data - cyclo5,6ketone