In [1]:
import ast
import os

import numpy as np
import pandas as pd
import sweetviz as sv
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdChemReactions

In [2]:
from IPython.display import display
from PIL import Image
import io

In [3]:
def count_num_reaction(data):
    '''Print the number of distinct 'Reaction ID' values and the number of rows in ``data``.

    Nothing is returned; the two counts are only written to standard output.
    '''
    distinct_reactions = data['Reaction ID'].nunique()
    total_rows = len(data.index)
    print('Number of Reactions:', distinct_reactions)
    print('Number of Rows:', total_rows)

def view_reactionScheme(data, NumReaction_to_view, duplicate):
    '''Draw a random run of consecutive reactions and print their conditions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Reaction ID', 'Reaction' (parsed as a
        reaction SMILES), 'Links to Reaxys' and the condition columns that
        are printed for every drawn reaction.
    NumReaction_to_view : int
        Number of consecutive rows to show, between 0 and the number of
        candidate rows.
    duplicate : str
        'drop duplicate' keeps only the first row of every 'Reaction ID';
        'keep duplicate' uses every row of ``data``.

    Raises
    ------
    ValueError
        If ``duplicate`` is not one of the two accepted strings, or if
        ``NumReaction_to_view`` is negative or larger than the number of
        candidate rows.
    '''
    if duplicate == 'drop duplicate':
        Reaction_data = data.drop_duplicates(subset=['Reaction ID'], keep='first')
    elif duplicate == 'keep duplicate':
        Reaction_data = data
    else:
        raise ValueError('Invalid input')
    Reaction_data = Reaction_data.reset_index(drop=True)

    # A negative count used to slip through: with a random start of 0 the slice
    # [0:-k] silently selected almost the whole frame.
    if NumReaction_to_view < 0:
        raise ValueError('Number of reactions to view must not be negative')
    if NumReaction_to_view > Reaction_data.shape[0]:
        raise ValueError('Number of reactions to view is more than the total number of reactions in the dataset')

    # Pick the first row of the window so that the whole window fits in the frame.
    random_start = np.random.randint(0, (Reaction_data.shape[0] - NumReaction_to_view + 1), size=1)[0]
    window = Reaction_data.iloc[random_start: random_start + NumReaction_to_view]

    for _, row in window.iterrows():
        rxn = rdChemReactions.ReactionFromSmarts(row['Reaction'], useSmiles=True)
        if rxn:
            # Read the ID from the row itself rather than re-indexing the frame
            # with the iterrows label.
            print('Reaction ID:', row['Reaction ID'])
            img = Draw.ReactionToImage(rxn)

            # display image in notebook
            with io.BytesIO() as output:
                img.save(output, format="PNG")
                display(Image.open(output))
            # Return conditions
            print(row[['Largest Reactant MW', 'Reagent', 'Solvent (Reaction Details)', 'Time (Reaction Details) [h]', 'Temperature (Reaction Details) [C]', 'Yield']])
            print(f'Link: {row["Links to Reaxys"]}\n')
                
def count_C_O_bonds(molecule_SMILES, bond_type):
    '''Count the bonds of ``bond_type`` that join a carbon atom and an oxygen atom.

    Parameters
    ----------
    molecule_SMILES : str
        SMILES string of the molecule to inspect.
    bond_type
        Bond type to count; it is compared against ``bond.GetBondType()``
        (e.g. ``Chem.rdchem.BondType.SINGLE``).

    Returns
    -------
    int
        Number of C/O bonds whose type equals ``bond_type``.

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed into a molecule.
    '''
    mol = Chem.MolFromSmiles(molecule_SMILES)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with a
        # message that names the offending input instead of an AttributeError.
        raise ValueError(f'Could not parse SMILES: {molecule_SMILES!r}')

    carbon_and_oxygen = {6, 8}  # atomic numbers of C and O
    return sum(
        1
        for bond in mol.GetBonds()
        if {bond.GetBeginAtom().GetAtomicNum(), bond.GetEndAtom().GetAtomicNum()} == carbon_and_oxygen
        and bond.GetBondType() == bond_type
    )

def change_C_O_bonds(reactant_SMILES, product_SMILES, bond_type):
    '''Return how many more C/O bonds of ``bond_type`` the product has than the reactant.

    The result is negative when the product has fewer such bonds.
    '''
    bonds_in_product = count_C_O_bonds(product_SMILES, bond_type)
    bonds_in_reactant = count_C_O_bonds(reactant_SMILES, bond_type)
    return bonds_in_product - bonds_in_reactant

def change_single_and_double_C_O_bond(data):
    '''Add the change in C-O single and C=O double bond counts to ``data``.

    For every row, only the first entry of 'Reactant SMILES' and of
    'Product SMILES' is compared. The results are stored in the columns
    'change in C-O single bond' and 'change in C=O double bond'. ``data`` is
    modified in place and also returned.
    '''
    bond_columns = (
        ('change in C-O single bond', Chem.rdchem.BondType.SINGLE),
        ('change in C=O double bond', Chem.rdchem.BondType.DOUBLE),
    )
    for column_name, bond_type in bond_columns:
        def bond_change(row, bond_type=bond_type):
            first_reactant = row['Reactant SMILES'][0]
            first_product = row['Product SMILES'][0]
            return change_C_O_bonds(first_reactant, first_product, bond_type)

        data[column_name] = data.apply(bond_change, axis=1)
    return data



In [4]:
# Work from the notebook folder so the relative Excel/HTML paths below resolve.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
NOTEBOOK_DIR = '/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity/Reduction of ketone/Notebook/'
os.chdir(NOTEBOOK_DIR)

## <span style="color:blue"> Import data after checking by MW </span>

In [5]:
# Load the single-reduction data set produced by the molecular-weight check
# and report its size.
SingleReduction_byMW = pd.read_excel(io='SingleReduction_byMW.xlsx')
count_num_reaction(data=SingleReduction_byMW)

Number of Reactions: 2020
Number of Rows: 2742


In [6]:
# Load the companion "Stereo" data set from the molecular-weight check and
# report its size.
Stereo_SingleReduction_byMW = pd.read_excel(io='Stereo_SingleReduction_byMW.xlsx')
count_num_reaction(data=Stereo_SingleReduction_byMW)

Number of Reactions: 213
Number of Rows: 285


## <span style="color:blue"> Concatenate both data sets </span>

In [7]:
# Stack the two data sets row-wise. ignore_index=True gives the combined frame
# a fresh 0..n-1 index; without it both inputs contribute their own row labels,
# so the result carries duplicate labels and label-based lookups become ambiguous.
SingleReduc_byMW_all = pd.concat(
    [SingleReduction_byMW, Stereo_SingleReduction_byMW],
    axis=0,
    ignore_index=True,
)
count_num_reaction(SingleReduc_byMW_all)

Number of Reactions: 2233
Number of Rows: 3027


In [8]:
# Turn the SMILES columns into real lists of strings. The cells are expected to
# be strings that spell out a Python list (presumably like "['CCO']" - confirm
# against the spreadsheet). ast.literal_eval only accepts Python literals, so
# unlike eval() it cannot run code embedded in the file. Values that are
# already lists are passed through, which keeps this cell safe to re-run.
for smiles_column in ('Reactant SMILES', 'Product SMILES'):
    SingleReduc_byMW_all[smiles_column] = SingleReduc_byMW_all[smiles_column].apply(
        lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
    )

## <span style="color:blue">  Only keep reactions that gain one C-O single bond and lose one C=O double bond</span>

In [9]:
# Get the bond change: adds the columns 'change in C-O single bond' and
# 'change in C=O double bond', computed from the first reactant and first
# product SMILES of every row. The frame is modified in place and returned.
SingleReduc_byMW_all = change_single_and_double_C_O_bond(SingleReduc_byMW_all)

In [10]:
# Keep only the rows whose reaction gains exactly one C-O single bond and
# loses exactly one C=O double bond.
gains_one_single_bond = SingleReduc_byMW_all['change in C-O single bond'] == 1
loses_one_double_bond = SingleReduc_byMW_all['change in C=O double bond'] == -1
SingleReduc_byCObond = SingleReduc_byMW_all[gains_one_single_bond & loses_one_double_bond]
count_num_reaction(SingleReduc_byCObond)

Number of Reactions: 2210
Number of Rows: 3000


### Examples of removed reactions

In [11]:
# Distribution of the C-O single-bond change over all rows (before filtering).
SingleReduc_byMW_all['change in C-O single bond'].value_counts()

1    3000
2      11
3       8
0       8
Name: change in C-O single bond, dtype: int64

In [12]:
# Distribution of the C=O double-bond change over all rows (before filtering).
SingleReduc_byMW_all['change in C=O double bond'].value_counts()

-1    3011
-2       8
 0       8
Name: change in C=O double bond, dtype: int64

In [13]:
# Rows rejected by the C/O bond filter: everything that does NOT show exactly
# +1 C-O single bond together with -1 C=O double bond.
has_expected_bond_change = (
    (SingleReduc_byMW_all['change in C-O single bond'] == 1)
    & (SingleReduc_byMW_all['change in C=O double bond'] == -1)
)
removed_rxn = SingleReduc_byMW_all[~has_expected_bond_change]
count_num_reaction(removed_rxn)

Number of Reactions: 23
Number of Rows: 27


In [14]:
# Uncomment to draw three randomly chosen removed reactions (one per Reaction ID):
# view_reactionScheme(removed_rxn, 3, 'drop duplicate')

## <span style="color:blue">  Save data </span>

In [15]:
# Write the filtered reactions to Excel, leaving out the DataFrame index.
SingleReduc_byCObond.to_excel(excel_writer='SingleReduc_byCObond.xlsx', index=False)

### EDA after 3 steps

In [16]:
# Columns that hold at least one Python list are dropped before the frame is
# handed to sweetviz; the remaining columns are profiled and the report is
# written to EDA_CObond.html.
columns_with_lists = []
for column_name in SingleReduc_byCObond.columns:
    holds_a_list = any(isinstance(value, list) for value in SingleReduc_byCObond[column_name])
    if holds_a_list:
        columns_with_lists.append(column_name)

EDA_CObond = SingleReduc_byCObond.drop(columns=columns_with_lists)
eda = sv.analyze(EDA_CObond)
eda.show_html(filepath='EDA_CObond.html')

                                             |          | [  0%]   00:00 -> (? left)

  value_counts_without_nan = pd.Series()


Report Notebook/EDA_CObond.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
