In [1]:
import pandas as pd
import re
import os
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import rdChemReactions
from rdkit.Chem import Descriptors

In [2]:
def getReactandProduct(data):
    '''Split the reaction SMILES in column 'Reaction' into reactant and product lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Reaction' column holding reaction SMILES of the form
        'reactants>>products', with individual species separated by '.'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with two added columns, 'Reactant SMILES' and
        'Product SMILES'. Each cell is a list of SMILES strings, or None when
        that side is missing (empty 'Reaction' cell, or no '>>' in the string),
        so that a later ``dropna(subset=[...])`` can actually remove such rows.

    Raises
    ------
    ValueError
        If any reaction string contains more than one '>>' separator.
    '''

    data = data.copy()  # work on a copy so the caller's frame is left untouched

    # Column 0 = reactant side, column 1 = product side
    sides = data['Reaction'].str.split('>>', expand=True)
    if sides.shape[1] > 2:
        raise ValueError("each 'Reaction' must contain at most one '>>' separator")
    # Guarantee both columns exist even when no row contains '>>'
    sides = sides.reindex(columns=[0, 1])

    def _to_species_list(side):
        # Do NOT cast to str first: that would turn a missing side into the
        # literal text 'None'/'nan' and hide it from dropna().
        return side.split('.') if isinstance(side, str) else None

    data['Reactant SMILES'] = sides[0].apply(_to_species_list)
    data['Product SMILES'] = sides[1].apply(_to_species_list)
    return data

def calculate_MW(reactants_list,products_list):
    '''Calculate the change in molecular weight over a reaction.

    Parameters
    ----------
    reactants_list, products_list : list of str
        SMILES strings of the reactants and of the products.

    Returns
    -------
    float
        Sum of product MWs minus sum of reactant MWs. NaN is returned when any
        SMILES cannot be parsed (``Chem.MolFromSmiles`` gives None): skipping
        such a species would yield a plausible-looking but wrong number.
    '''
    def _total_MW(smiles_list):
        # Sum the MW of every species; None signals an unparsable SMILES
        total = 0
        for smiles in smiles_list:
            molecule = Chem.MolFromSmiles(smiles)
            if molecule is None:
                return None
            total += Descriptors.MolWt(molecule)
        return total

    reactants_MW = _total_MW(reactants_list)
    products_MW = _total_MW(products_list)
    if reactants_MW is None or products_MW is None:
        # Mark the row as "not computable" so isna()/dropna() can find it
        return float('nan')
    # Calculate change
    return products_MW - reactants_MW

def view_reactionScheme(data, NumReaction_to_view, random_state=None):
    '''Randomly pick reactions and show their reaction schemes.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Reaction ID', 'Reaction' (reaction SMILES) and
        'Links to Reaxys'.
    NumReaction_to_view : int
        Number of distinct reactions (one row per 'Reaction ID') to draw.
    random_state : optional
        Passed to ``DataFrame.sample``; set it to get the same reactions on
        every run. The default (None) keeps the sampling random.

    Raises
    ------
    ValueError
        If more reactions are requested than there are distinct reaction IDs.
    '''
    # Get 1 sample for each reaction ID, remove duplicated IDs
    Reaction_data = data.drop_duplicates(subset=['Reaction ID'], keep='first')

    if NumReaction_to_view > Reaction_data.shape[0]:
        raise ValueError('Number of reactions to view is more than the total number of reactions in the dataset')

    sampled = Reaction_data.sample(n=NumReaction_to_view, random_state=random_state)
    # Draw
    for _, row in sampled.iterrows():
        rxn = rdChemReactions.ReactionFromSmarts(row['Reaction'], useSmiles=True)
        if rxn:
            # Read the ID from the sampled row itself, so no positional
            # lookup (and no index reset) is needed
            print('Reaction ID:', row['Reaction ID'])
            Draw.ReactionToImage(rxn).show() # image pop up
            print(f'Link: {row["Links to Reaxys"]}\n')

def extract_yield(string):
    '''Extract the yield number(s) from a string and add them up.

    Parameters
    ----------
    string : str or missing value
        Yield text; a missing cell (None / NaN / pd.NA) is accepted.

    Returns
    -------
    float or None
        The single number found, or the sum when exactly 2 numbers are found
        (!!!: summing is only meaningful for stereoisomers). None when the
        value is missing or when 0 or more than 2 numbers are found.
    '''
    # Missing cells are not strings; re.findall would raise TypeError on them
    if string is None or (not isinstance(string, str) and pd.isna(string)):
        return None
    # Extract yield: decimals first so '40.5' is not split into '40' and '5'
    numbers = re.findall(r'\d+\.\d+|\d+', string)
    if len(numbers) == 1:
        return float(numbers[0])
    if len(numbers) == 2:
        return sum(float(num) for num in numbers)
    return None

def count_num_reaction(data):
    '''Print how many distinct reactions and how many rows ``data`` holds.

    Distinct reactions are counted as unique values of the 'Reaction ID'
    column. Nothing is returned.
    '''
    n_rows = data.shape[0]
    n_reactions = data['Reaction ID'].nunique()
    print('Number of Reactions:', n_reactions)
    print('Number of Rows:', n_rows)
    

# <span style="color:blue"> Import data after general cleaning </span>

In [3]:
# Change working directory to the repository root.
# The location is machine-specific, so it can be overridden with the
# REACTIVITY_REPO_DIR environment variable; the original path is the fallback.
os.chdir(os.environ.get(
    'REACTIVITY_REPO_DIR',
    '/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity'))

In [4]:
# Load the dataset (path is relative to the working directory) and report its size
reduction_cyclo_path = 'Reduction of ketone/Notebook/Reduction_Cyclo.xlsx'
Reduction_cyclo = pd.read_excel(reduction_cyclo_path)
count_num_reaction(Reduction_cyclo)

Number of Reactions: 590
Number of Rows: 1308


# <span style="color:blue"> Calculate change in MW</span>

In [5]:
# Add the 'Reactant SMILES' / 'Product SMILES' list columns parsed from 'Reaction'
getReactandProduct_cyclo_concat = Reduction_cyclo.pipe(getReactandProduct)

In [6]:
# Calculate MW change for every row that has both a reactant and a product side.
# .assign() returns a new frame instead of writing a column into the result of
# dropna(), which can trigger pandas' SettingWithCopyWarning.
changeMW_cyclo_concat = (
    getReactandProduct_cyclo_concat
    .dropna(subset=['Reactant SMILES', 'Product SMILES'])
    .assign(Change_MW=lambda df: df.apply(
        lambda x: calculate_MW(x['Reactant SMILES'], x['Product SMILES']), axis=1))
)


#### I'm not sure what the error was. It seems the change was calculated for all the reactions, as there is no NA value in the *Change_MW* column.

In [7]:
# Number of rows whose MW change is NA (none expected after the calculation)
int(changeMW_cyclo_concat['Change_MW'].isna().sum())

0

#### I rounded the change to 3 decimal places because the values vary beyond the third decimal place. For example, below are the value counts if I don't round:

In [8]:
# Five most frequent MW changes before rounding
changeMW_cyclo_concat['Change_MW'].value_counts().iloc[:5]

2.016    594
2.016    239
2.016    157
4.032     77
4.032     51
Name: Change_MW, dtype: int64

In [9]:
# Keep 3 decimal places so that equal MW changes are counted together
changeMW_cyclo_concat['Change_MW'] = changeMW_cyclo_concat['Change_MW'].round(3)

# Five most frequent MW changes after rounding
changeMW_cyclo_concat['Change_MW'].value_counts().iloc[:5]

 2.016     1062
 4.032      197
-40.021      11
-15.999       6
-13.983       4
Name: Change_MW, dtype: int64

# <span style="color:blue"> Inspect double and single reduction by MW </span>

#### A single reduction has a change in MW of 2.016. A double reduction has a change in MW of 4.032.

## 1. Single reduction by MW

In [10]:
# Rows whose MW change matches a single reduction.
# Compare within half of the 3-decimal rounding step rather than with exact
# float equality, so the filter does not depend on how the value was rounded.
SINGLE_REDUCTION_MW = 2.016
MWSingleReduction_cyclo_concat = changeMW_cyclo_concat[
    (changeMW_cyclo_concat['Change_MW'] - SINGLE_REDUCTION_MW).abs() < 0.0005]
count_num_reaction(MWSingleReduction_cyclo_concat)

Number of Reactions: 432
Number of Rows: 1062


### a. single reduction contains 1 reactant and 1 product

In [11]:
# Keep rows that have exactly 1 reactant AND exactly 1 product
n_reactants_single = MWSingleReduction_cyclo_concat['Reactant SMILES'].map(len)
n_products_single = MWSingleReduction_cyclo_concat['Product SMILES'].map(len)
OneReactAndProduct_Single = MWSingleReduction_cyclo_concat[
    (n_reactants_single == 1) & (n_products_single == 1)]

count_num_reaction(OneReactAndProduct_Single)

Number of Reactions: 431
Number of Rows: 1061


use the following code for viewing reaction scheme if needed

In [12]:
# #view 3 random reactions

# view_reactionScheme(OneReactAndProduct_Single,3)

### b. single reduction contains multiple reactants or multiple products


In [13]:
# Keep rows that have more than 1 reactant OR more than 1 product
has_multi_reactants_single = MWSingleReduction_cyclo_concat['Reactant SMILES'].map(len).gt(1)
has_multi_products_single = MWSingleReduction_cyclo_concat['Product SMILES'].map(len).gt(1)
Multi_ReactOrProduct_Single = MWSingleReduction_cyclo_concat[
    has_multi_reactants_single | has_multi_products_single]

# Report how many reactions / rows this covers
count_num_reaction(Multi_ReactOrProduct_Single)


Number of Reactions: 1
Number of Rows: 1


use the following code for viewing reaction scheme if needed

In [14]:
# #view all reaction

#view_reactionScheme(Multi_ReactOrProduct_Single,3)


#### Remove rows of single reduction having >1 reactant or >1 product

### <span style="color:red"> For the single reduction (filtered by MW), only take those having 1 reactant and 1 product for the next step. </span>

In [15]:
# Build a new frame with the numeric yield parsed from the 'Yield' text
SingleReduction_byMW = OneReactAndProduct_Single.assign(
    **{'Yield (number)': OneReactAndProduct_Single['Yield'].apply(extract_yield)})
count_num_reaction(SingleReduction_byMW)

Number of Reactions: 431
Number of Rows: 1061


In [16]:
# Write the single-reduction dataset to Excel, without the row index
single_reduction_path = 'Reduction of ketone/Notebook/SingleReduction_byMW.xlsx'
SingleReduction_byMW.to_excel(single_reduction_path, index=False)

## 2. Double reduction by MW

In [17]:
# Filter rows whose MW change matches a double reduction.
# Compare within half of the 3-decimal rounding step rather than with exact
# float equality, so the filter does not depend on how the value was rounded.
DOUBLE_REDUCTION_MW = 4.032
MWDoubleReduction_cyclo_concat = changeMW_cyclo_concat[
    (changeMW_cyclo_concat['Change_MW'] - DOUBLE_REDUCTION_MW).abs() < 0.0005]

count_num_reaction(MWDoubleReduction_cyclo_concat)

Number of Reactions: 123
Number of Rows: 197


### a. double reduction contains 1 reactant and 1 product

In [18]:
# 1 reactant and 1 product
# Both sides are checked, as in the single-reduction filter; checking only the
# reactant count would also let rows with several products through.
OneReactAndProduct_Double = MWDoubleReduction_cyclo_concat[
    (MWDoubleReduction_cyclo_concat['Reactant SMILES'].apply(len) == 1) &
    (MWDoubleReduction_cyclo_concat['Product SMILES'].apply(len) == 1)]

count_num_reaction(OneReactAndProduct_Double)

Number of Reactions: 48
Number of Rows: 65


#### No need to check as we only take single reduction of ketone. Reactions which have change in MW of 4 and have scheme of 1 reactant to 1 product are obviously not single reduction of ketone. => Remove

Some reactions are double reductions of ketones (mostly), or $\alpha,\beta$-unsaturated ketone reductions, or ketone and imine reductions.

In [19]:
# #view 3 reactions

# view_reactionScheme(OneReactAndProduct_Double,3)

### b. double reduction contains multiple reactants or multiple products

In [20]:

# Keep rows that have more than 1 reactant OR more than 1 product
has_multi_reactants_double = MWDoubleReduction_cyclo_concat['Reactant SMILES'].map(len).gt(1)
has_multi_products_double = MWDoubleReduction_cyclo_concat['Product SMILES'].map(len).gt(1)
Multi_ReactOrProduct_Double = MWDoubleReduction_cyclo_concat[
    has_multi_reactants_double | has_multi_products_double]

count_num_reaction(Multi_ReactOrProduct_Double)

Number of Reactions: 75
Number of Rows: 132


In [21]:
# How many reactant / product SMILES each row lists, tallied over all rows
reactant_count_stats = Multi_ReactOrProduct_Double['Reactant SMILES'].map(len).value_counts()
product_count_stats = Multi_ReactOrProduct_Double['Product SMILES'].map(len).value_counts()
print('Stats of Reactants:', reactant_count_stats)
print('Stats of Products:', product_count_stats)

Stats of Reactants: 2    132
Name: Reactant SMILES, dtype: int64
Stats of Products: 2    132
Name: Product SMILES, dtype: int64


In [22]:
# Tally rows by whether their first two reactant SMILES are the same string
Multi_ReactOrProduct_Double['Reactant SMILES'].map(lambda pair: pair[0] == pair[1]).value_counts()


True    132
Name: Reactant SMILES, dtype: int64

#### All the rows that contain multiple reactants or products with a change in MW of 4.032 have 2 reactant SMILES and 2 product SMILES.

#### The 2 reactant SMILES in each of these rows are identical. This means all 132 rows (75 reactions) are actually 1 reactant -> 2 isomers (constitutional or stereoisomers)

In [23]:
# #view 3 reactions

# view_reactionScheme(Multi_ReactOrProduct_Double,3)

#### Only keep the reaction making stereoisomers

In [24]:
# Strip the '@' chirality marks from every product SMILES (on a new frame, so
# Multi_ReactOrProduct_Double itself is not modified).
# NOTE(review): only '@' is removed; double-bond stereo marks ('/' and '\') are
# left in place, and the comparison below is a plain string comparison, so it
# presumably relies on both products being written with the same atom order - confirm.
products_without_stereo = Multi_ReactOrProduct_Double['Product SMILES'].apply(
    lambda smiles_list: [smiles.replace('@', '') for smiles in smiles_list])
stereo_stripped_double = Multi_ReactOrProduct_Double.assign(
    **{'Product SMILES': products_without_stereo})

# Keep a row only when its first two stripped product SMILES are identical
products_match = stereo_stripped_double['Product SMILES'].apply(lambda pair: pair[0] == pair[1])
Stereo_Multi_ReactOrProduct_Double = stereo_stripped_double[products_match]

####  Change the format back to 1 reactant -> 1 product (single reduction)


<div style="color:red">
    Note that Reactant SMILES still has stereochemistry while Product SMILES does not.
    Later, if the stereochemistry of Product SMILES is needed, it can be extracted back from 'Reaction' column
</div>

In [25]:
# Change the format back to 1 reactant -> 1 product (single reduction):
# keep only the first SMILES of each list.
# .assign() returns a new frame instead of writing columns into a
# boolean-filtered slice, which can trigger pandas' SettingWithCopyWarning.
Stereo_Multi_ReactOrProduct_Double = Stereo_Multi_ReactOrProduct_Double.assign(**{
    'Reactant SMILES': Stereo_Multi_ReactOrProduct_Double['Reactant SMILES'].apply(lambda x: [x[0]]),
    'Product SMILES': Stereo_Multi_ReactOrProduct_Double['Product SMILES'].apply(lambda x: [x[0]]),
})
count_num_reaction(Stereo_Multi_ReactOrProduct_Double)

Number of Reactions: 71
Number of Rows: 126


#### Sum 2 yield numbers reported

In [26]:
# Keep only rows whose 'Yield' text mentions 'percent' exactly twice,
# i.e. rows that report 2 yield numbers
reports_two_yields = Stereo_Multi_ReactOrProduct_Double['Yield'].str.count('percent').eq(2)
Stereo_Multi_ReactOrProduct_Double = Stereo_Multi_ReactOrProduct_Double[reports_two_yields]

# Sum the yields of the stereoisomers into a numeric column on a new frame
Stereo_SingleReduction_byMW = Stereo_Multi_ReactOrProduct_Double.assign(
    **{'Yield (number)': Stereo_Multi_ReactOrProduct_Double['Yield'].apply(extract_yield)})
count_num_reaction(Stereo_SingleReduction_byMW)

Number of Reactions: 60
Number of Rows: 105


In [27]:
# Write the stereoisomer dataset to Excel, without the row index
stereo_single_reduction_path = 'Reduction of ketone/Notebook/Stereo_SingleReduction_byMW.xlsx'
Stereo_SingleReduction_byMW.to_excel(stereo_single_reduction_path, index=False)

<div style="color:red">
    
### For the single reduction (filtered by MW), only take those having 1 reactant and 1 product for the next step.

### For the double reduction (filtered by MW), only take those making stereoisomeric products. This dataset is stored separately.

</div>