In [1]:
import pandas as pd
import os
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdChemReactions
from rdkit.Chem import Descriptors
import re

from IPython.display import display
from PIL import Image
import io

In [2]:
def getReactandProduct(data):
    '''Split reaction SMILES into lists of reactant and product SMILES.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Reaction' column holding reaction SMILES of the form
        'reactants>>products', with individual molecules separated by '.'.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with two added (or overwritten) columns,
        'Reactant SMILES' and 'Product SMILES', each holding a list of SMILES
        strings. When a reaction is missing, or has no '>>' separator and
        therefore no product side, the affected column holds NaN so that a
        later `isna()`/`dropna()` can remove the row.
    '''
    data = data.copy()  # to avoid SettingWithCopyWarning / mutating the caller's frame

    # n=1 yields at most two parts; reindex guarantees exactly two columns
    # even when no row contains '>>' (or the frame is empty).
    sides = (data['Reaction'].str.split('>>', n=1, expand=True)
             .reindex(columns=[0, 1]))

    def _split_molecules(side):
        # Keep missing values missing: casting them to str would create the
        # bogus SMILES 'nan'/'None', which isna()/dropna() cannot detect.
        return side.split('.') if isinstance(side, str) else np.nan

    data['Reactant SMILES'] = sides[0].apply(_split_molecules)
    data['Product SMILES'] = sides[1].apply(_split_molecules)
    return data

def calculate_mw(smiles):
    '''Return the molecular weight (RDKit `Descriptors.MolWt`) of one SMILES string.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles` (`Chem.MolFromSmiles` returns None).
    '''
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending SMILES instead of an opaque argument error
        # from MolWt(None).
        raise ValueError(f'Cannot parse SMILES: {smiles!r}')
    # Use the directly imported module rather than relying on Chem.Descriptors
    # happening to be set as an attribute of rdkit.Chem.
    return Descriptors.MolWt(mol)

def calculate_change_MW(reactants_list, products_list):
    '''Return (total MW of the products) - (total MW of the reactants).

    Both arguments are iterables of SMILES strings; each molecule is weighed
    with `calculate_mw`.
    '''
    def _total_mw(smiles_list):
        # Plain left-to-right accumulation starting from 0.
        total = 0
        for smiles in smiles_list:
            total += calculate_mw(smiles)
        return total

    # Reactants are weighed first, then products.
    mass_in = _total_mw(reactants_list)
    mass_out = _total_mw(products_list)
    return mass_out - mass_in

def get_largest_reactant_MW(reactants_list):
    '''Return the molecular weight of the heaviest reactant.

    `reactants_list` is an iterable of SMILES strings. Returns 0 when it is
    empty (or when no reactant has a weight above 0).
    '''
    largest_mw = 0
    for reactant in reactants_list:
        # Parse and weigh each SMILES once; the result is reused for both the
        # comparison and the assignment.
        mw = calculate_mw(reactant)
        if largest_mw < mw:
            largest_mw = mw
    return largest_mw

def view_reactionScheme(data, NumReaction_to_view, duplicate):
    '''Draw a randomly positioned run of consecutive reactions with their conditions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Reaction ID', 'Reaction' (reaction SMILES),
        'Largest Reactant MW', 'Reagent', 'Solvent (Reaction Details)',
        'Time (Reaction Details) [h]', 'Temperature (Reaction Details) [C]',
        'Yield' and 'Links to Reaxys'.
    NumReaction_to_view : int
        Number of consecutive rows to show.
    duplicate : str
        'drop duplicate' keeps only the first row of every 'Reaction ID';
        'keep duplicate' uses every row.

    Raises
    ------
    ValueError
        If `duplicate` is not one of the two accepted strings, or if
        `NumReaction_to_view` exceeds the number of available rows.
    '''
    if duplicate == 'drop duplicate':
        Reaction_data = data.drop_duplicates(subset=['Reaction ID'], keep='first')
    elif duplicate == 'keep duplicate':
        Reaction_data = data
    else:
        raise ValueError('Invalid input')
    Reaction_data = Reaction_data.reset_index(drop=True)

    total_rows = Reaction_data.shape[0]
    if NumReaction_to_view > total_rows:
        raise ValueError('Number of reactions to view is more than the total number of reactions in the dataset')

    # Pick the start so that the whole window fits inside the frame.
    random_start = np.random.randint(0, (total_rows - NumReaction_to_view + 1), size=1)[0]
    window = Reaction_data.iloc[random_start: random_start + NumReaction_to_view]

    for _, row in window.iterrows():
        rxn = rdChemReactions.ReactionFromSmarts(row['Reaction'], useSmiles=True)
        if not rxn:
            continue

        # Read the ID from the row being drawn instead of feeding the index
        # label from iterrows() into a positional iloc lookup.
        print('Reaction ID:', row['Reaction ID'])
        img = Draw.ReactionToImage(rxn)

        # display image in notebook
        with io.BytesIO() as output:
            img.save(output, format="PNG")
            display(Image.open(output))

        # Return conditions
        print(row[['Largest Reactant MW', 'Reagent', 'Solvent (Reaction Details)', 'Time (Reaction Details) [h]', 'Temperature (Reaction Details) [C]', 'Yield']])
        print(f'Link: {row["Links to Reaxys"]}\n')


def count_num_reaction(data):
    '''Print the number of distinct 'Reaction ID' values and the number of rows in `data`.'''
    n_rows = len(data)
    n_reactions = data['Reaction ID'].nunique()
    print(f'Number of Reactions: {n_reactions}')
    print(f'Number of Rows: {n_rows}')

def extract_yield(string):
    '''Extract the numeric yield from a yield description.

    Parameters
    ----------
    string : str
        Text containing the reported yield number(s).

    Returns
    -------
    float or None
        - exactly one number found: that number;
        - exactly two numbers found: their sum (!!!: only meaningful for
          stereoisomers, whose yields are reported separately);
        - anything else (no number, more than two numbers, or a value that is
          not a string such as NaN/None): None.
    '''
    # A missing yield arrives as NaN/None; re.findall would raise TypeError on it.
    if not isinstance(string, str):
        return None

    # Decimal numbers are tried first so that '45.5' is not split into '45' and '5'.
    numbers = re.findall(r'\d+\.\d+|\d+', string)
    if len(numbers) == 2:
        return sum(float(num) for num in numbers)
    if len(numbers) == 1:
        return float(numbers[0])
    return None


# <span style="color:blue"> Import data after general cleaning </span>

In [3]:
# Change working directory
# The default is the original author's local checkout. Set the environment
# variable REDUCTION_NOTEBOOK_DIR to run the notebook from another location
# without editing this cell.
NOTEBOOK_DIR = os.environ.get(
    'REDUCTION_NOTEBOOK_DIR',
    '/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity/Reduction of ketone/Notebook/')
os.chdir(NOTEBOOK_DIR)

In [4]:
# Read 'Reduction_Cyclo.xlsx' from the current working directory, then print
# how many distinct reactions and how many rows it contains.
Reduction_cyclo = pd.read_excel('Reduction_Cyclo.xlsx')
count_num_reaction(Reduction_cyclo)

Number of Reactions: 2593
Number of Rows: 3468


# <span style="color:blue"> Calculate change in MW</span>

In [5]:
# Process the reaction SMILES: add the list-valued 'Reactant SMILES' and
# 'Product SMILES' columns (the input frame itself is not modified)
getReactandProduct_cyclo_concat = getReactandProduct(Reduction_cyclo)

In [6]:
# Remove rows whose reactant or product list is missing, if there are any
getReactandProduct_cyclo_concat = getReactandProduct_cyclo_concat.dropna(
    subset=['Reactant SMILES', 'Product SMILES'])

In [7]:
# Calculate MW change
# .copy() makes an independent frame, so adding columns below cannot trigger
# SettingWithCopyWarning (same precaution as in the other cells).
changeMW_cyclo_concat = getReactandProduct_cyclo_concat.dropna(
    subset=['Reactant SMILES', 'Product SMILES']).copy()
changeMW_cyclo_concat['Change_MW'] = [
    calculate_change_MW(reactants, products)
    for reactants, products in zip(changeMW_cyclo_concat['Reactant SMILES'],
                                   changeMW_cyclo_concat['Product SMILES'])
]

# Add a new column 'Largest Reactant MW' based on 'Reactant SMILES' (rounded to 3 decimals)
changeMW_cyclo_concat['Largest Reactant MW'] = changeMW_cyclo_concat['Reactant SMILES'].apply(
    lambda reactants: round(get_largest_reactant_MW(reactants), 3))

count_num_reaction(changeMW_cyclo_concat)

Number of Reactions: 2593
Number of Rows: 3468


#### The change in MW is rounded to 3 decimal places because the computed values differ slightly beyond the third decimal place. For example, these are the value counts without rounding:

In [8]:
# Five most frequent Change_MW values before rounding; values that display
# identically are counted separately because the underlying floats differ.
changeMW_cyclo_concat['Change_MW'].value_counts().head(5)

2.016    1012
2.016     759
2.016     681
4.032     206
4.032     158
Name: Change_MW, dtype: int64

In [9]:
# Round Change_MW to 3 decimal places, then recount the most frequent values
changeMW_cyclo_concat['Change_MW'] = changeMW_cyclo_concat['Change_MW'].round(3)

changeMW_cyclo_concat['Change_MW'].value_counts().head(5)

 2.016     2746
 4.032      549
 6.048       18
-40.021      17
 8.064       17
Name: Change_MW, dtype: int64

# <span style="color:blue"> Inspect double and single reduction by MW </span>

#### A single reduction has a change in MW of 2.016; a double reduction has a change in MW of 4.032.

## 1. Single reduction by MW

In [10]:
# Rows whose (rounded) change in MW equals 2.016
MWSingleReduction_cyclo_concat = changeMW_cyclo_concat.loc[
    changeMW_cyclo_concat['Change_MW'].eq(2.016)
]
count_num_reaction(MWSingleReduction_cyclo_concat)

Number of Reactions: 2024
Number of Rows: 2746


### a. single reduction contains 1 reactant and 1 product

In [11]:
# Keep rows that have exactly 1 reactant AND exactly 1 product
OneReactAndProduct_Single = MWSingleReduction_cyclo_concat.loc[
    MWSingleReduction_cyclo_concat['Reactant SMILES'].map(len).eq(1)
    & MWSingleReduction_cyclo_concat['Product SMILES'].map(len).eq(1)
]

count_num_reaction(OneReactAndProduct_Single)

Number of Reactions: 2020
Number of Rows: 2742


use the following code for viewing reaction scheme if needed

In [12]:
#view 3 random reactions

#view_reactionScheme(OneReactAndProduct_Single, 3 , 'drop duplicate')

### b. single reduction contains multiple reactants or multiple products


In [13]:
# Rows with more than 1 reactant or more than 1 product
Multi_ReactOrProduct_Single = MWSingleReduction_cyclo_concat.loc[
    MWSingleReduction_cyclo_concat['Reactant SMILES'].map(len).gt(1)
    | MWSingleReduction_cyclo_concat['Product SMILES'].map(len).gt(1)
]

# Report how many reactions and rows match
count_num_reaction(Multi_ReactOrProduct_Single)


Number of Reactions: 4
Number of Rows: 4


use the following code for viewing reaction scheme if needed

In [14]:
# #view reaction

#view_reactionScheme(Multi_ReactOrProduct_Single,3, 'drop duplicate')


#### Remove rows of single reduction having >1 reactant or >1 product

#### For the single reduction (filtered by MW), only take those having 1 reactant and 1 product for the next step.

## 2. Double reduction by MW

In [15]:
# Rows whose (rounded) change in MW equals 4.032, i.e. twice the single-reduction change
MWDoubleReduction_cyclo_concat = changeMW_cyclo_concat.loc[
    changeMW_cyclo_concat['Change_MW'].eq(4.032)
]

count_num_reaction(MWDoubleReduction_cyclo_concat)

Number of Reactions: 424
Number of Rows: 549


### a. double reduction contains 1 reactant and 1 product

In [16]:
# Keep rows that have exactly 1 reactant and exactly 1 product
OneReactAndProduct_Double = MWDoubleReduction_cyclo_concat.loc[
    MWDoubleReduction_cyclo_concat['Reactant SMILES'].map(len).eq(1)
    & MWDoubleReduction_cyclo_concat['Product SMILES'].map(len).eq(1)
]

count_num_reaction(OneReactAndProduct_Double)

Number of Reactions: 163
Number of Rows: 201


#### No need to inspect these, as we only keep single reductions of ketones. Reactions with a change in MW of 4.032 and a scheme of 1 reactant -> 1 product are clearly not single reductions of a ketone. => Remove

Some of these reactions are double reductions of ketones (mostly), $\alpha,\beta$-unsaturated ketone reductions, or ketone and imine reductions.

In [17]:
# #view 3 reactions

# view_reactionScheme(OneReactAndProduct_Double,3, 'drop duplicate')

### b. double reduction contains multiple reactants or multiple products

In [18]:

# Rows with more than 1 reactant or more than 1 product
Multi_ReactOrProduct_Double = MWDoubleReduction_cyclo_concat.loc[
    MWDoubleReduction_cyclo_concat['Reactant SMILES'].map(len).gt(1)
    | MWDoubleReduction_cyclo_concat['Product SMILES'].map(len).gt(1)
]

count_num_reaction(Multi_ReactOrProduct_Double)

Number of Reactions: 261
Number of Rows: 348


In [19]:
# Distribution of the number of reactants / products per row
print('Stats of Reactants:', Multi_ReactOrProduct_Double['Reactant SMILES'].map(len).value_counts())
print('Stats of Products:', Multi_ReactOrProduct_Double['Product SMILES'].map(len).value_counts())

Stats of Reactants: 2    348
Name: Reactant SMILES, dtype: int64
Stats of Products: 2    348
Name: Product SMILES, dtype: int64


In [20]:
# Check whether the first two reactant SMILES of each row are identical strings.
# Indexing x[1] requires at least 2 entries per row.
Multi_ReactOrProduct_Double['Reactant SMILES'].apply(lambda x: x[0]== x[1]).value_counts()


True    348
Name: Reactant SMILES, dtype: int64

#### All rows with multiple reactants or products and a change in MW of 4.032 have exactly 2 reactant SMILES and 2 product SMILES.

#### The 2 reactant SMILES are identical in every one of these rows. This means each row is actually 1 reactant -> 2 isomeric products (constitutional isomers or stereoisomers).

In [21]:
# #view 3 reactions

# view_reactionScheme(Multi_ReactOrProduct_Double,3, 'drop duplicate')

#### Only keep the reaction making stereoisomers

In [22]:
# Strip the '@' chirality marks from every product SMILES
# NOTE(review): '/' and '\' double-bond stereo marks are left in place — confirm that is intended.
Stereo_Multi_ReactOrProduct_Double = Multi_ReactOrProduct_Double.copy() #to avoid SettingWithCopyWarning
Stereo_Multi_ReactOrProduct_Double['Product SMILES'] = Stereo_Multi_ReactOrProduct_Double['Product SMILES'].map(
    lambda products: [smiles.replace('@', '') for smiles in products])

# Keep a row only when its first two stripped product SMILES are identical strings
Stereo_Multi_ReactOrProduct_Double = Stereo_Multi_ReactOrProduct_Double.loc[
    Stereo_Multi_ReactOrProduct_Double['Product SMILES'].map(lambda products: products[0] == products[1])
]

#### Only keep those having 2 yield numbers reported

In [23]:
# Keep only rows whose 'Yield' text contains the word 'percent' exactly twice,
# i.e. rows reporting two yield numbers

Stereo_Multi_ReactOrProduct_Double = Stereo_Multi_ReactOrProduct_Double.loc[
    Stereo_Multi_ReactOrProduct_Double['Yield'].str.count('percent').eq(2)
]

# Rebuild the SMILES lists from 'Reaction' to restore the product stereochemistry stripped earlier
Stereo_Multi_ReactOrProduct_Double = getReactandProduct(Stereo_Multi_ReactOrProduct_Double)

count_num_reaction(Stereo_Multi_ReactOrProduct_Double)

Number of Reactions: 213
Number of Rows: 285


<div style="color:red">
    
#### For the single reduction (filtered by MW), only take those having 1 reactant and 1 product for the next step.

#### For the double reduction (filtered by MW), only take those giving stereoisomeric products. The two datasets are stored separately.

</div>

# <span style="color:blue"> Extract Yield (number)</span>

In [24]:
# Numeric yield for the stereoisomer dataset
Stereo_Multi_ReactOrProduct_Double['Yield (number)'] = pd.to_numeric(
    Stereo_Multi_ReactOrProduct_Double['Yield'].apply(extract_yield), errors='raise')

# Numeric yield for the single-reduction dataset (copy first, as it is a filtered view)
OneReactAndProduct_Single = OneReactAndProduct_Single.copy()
OneReactAndProduct_Single['Yield (number)'] = pd.to_numeric(
    OneReactAndProduct_Single['Yield'].apply(extract_yield), errors='raise')

In [25]:
# Save data
# Both files are written to the current working directory without the index column.
# NOTE(review): the second frame is derived from the Change_MW == 4.032 subset
# but is saved under a 'SingleReduction' file name — confirm the name is intended.
OneReactAndProduct_Single.to_excel('SingleReduction_byMW.xlsx', index=False)
Stereo_Multi_ReactOrProduct_Double.to_excel('Stereo_SingleReduction_byMW.xlsx', index=False)