In [1]:
# Imports come first so that `os` is defined before it is used below
# (the original cell called os.chdir without ever importing os).
import os

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Change working directory
# NOTE: machine-specific absolute path; the relative file paths used in the
# later cells are resolved against this directory.
os.chdir('/Users/suongsuong/Documents/GitHub/Reactivity-based-metric-of-complexity/Reduction of ketone/')

In [2]:
def generate_3D(data, folder_path, random_seed=-1):
    """Embed and UFF-optimise a 3D structure for every reactant and save it as XYZ.

    For each row the reactant SMILES is parsed, explicit hydrogens are added,
    one conformer is embedded and then relaxed with UFF, and the result is
    written to ``folder_path + str(reaction_id) + '.xyz'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Reactant smiles map' and 'Reaction ID'.
    folder_path : str
        Prefix prepended to every file name (e.g. '3D_structures/'). Its
        directory part is created if it does not exist yet.
    random_seed : int, optional
        Forwarded to ``EmbedMolecule(randomSeed=...)``. The default -1 is
        RDKit's own default (RNG not seeded); pass a fixed non-negative value
        to get the same coordinates on every run.

    Returns
    -------
    list
        Reaction IDs for which no XYZ file was written: unparsable or missing
        SMILES, failed embedding, or a ValueError raised while optimising or
        writing the structure.
    """
    import os  # local import: the function must not depend on a file-level `import os`

    # Make sure the target directory exists before any file is written.
    out_dir = os.path.dirname(folder_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    id_toremove = []
    for _, row in data.iterrows():
        smiles = row['Reactant smiles map']
        reaction_id = row['Reaction ID']

        # MolFromSmiles returns None for an invalid SMILES, and an empty
        # spreadsheet cell is not a string at all; either case used to crash
        # in AddHs with an exception that the ValueError handler does not catch.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            print(f'Invalid SMILES for Reaction ID: {reaction_id}')
            id_toremove.append(reaction_id)
            continue

        mol = Chem.AddHs(mol)
        embed_success = AllChem.EmbedMolecule(
            mol,
            useBasicKnowledge=False,
            useSmallRingTorsions=True,
            useRandomCoords=True,
            maxAttempts=500,
            randomSeed=random_seed,
        )
        # EmbedMolecule signals failure by returning -1 instead of a conformer id.
        if embed_success == -1:
            print(f'Embedding failed for Reaction ID: {reaction_id}')
            id_toremove.append(reaction_id)
            continue

        try:
            AllChem.UFFOptimizeMolecule(mol)
            xyz_filename = folder_path + str(reaction_id) + '.xyz'
            Chem.MolToXYZFile(mol, xyz_filename)
        except ValueError as e:
            print(f'Optimization failed for Reaction ID: {reaction_id}, Error: {e}')
            id_toremove.append(reaction_id)
            continue
    return id_toremove

In [23]:
# Load the atom-mapping spreadsheet (path is relative to the working directory
# set in the first cell); the bare `.shape` displays (n_rows, n_columns).
atomMap_df = pd.read_excel('ReductionKetone_atomMapping.xlsx')
atomMap_df.shape

(962, 26)

------ 
#### generate 3D structure

Note: the cell below took 20 min to run.

In [4]:
# Uncomment to try the pipeline on the first 100 rows only:
# atomMap_df = atomMap_df.head(100)
# Write one .xyz file per reaction under '3D_structures/' and collect the
# Reaction IDs for which no structure could be produced.
id_remove = generate_3D(atomMap_df, '3D_structures/')
df_3D = atomMap_df[~atomMap_df['Reaction ID'].isin(id_remove)]  # remove unsuccessful 3D

Embedding failed for Reaction ID: 48314851
Embedding failed for Reaction ID: 9348888
Embedding failed for Reaction ID: 1739720
Embedding failed for Reaction ID: 2236430
Embedding failed for Reaction ID: 3695571
Embedding failed for Reaction ID: 4444115
Embedding failed for Reaction ID: 5041068
Embedding failed for Reaction ID: 8838507
Embedding failed for Reaction ID: 8843079
Embedding failed for Reaction ID: 8961682
Embedding failed for Reaction ID: 8961684
Embedding failed for Reaction ID: 8977587
Embedding failed for Reaction ID: 8978127
Embedding failed for Reaction ID: 9475237
Embedding failed for Reaction ID: 27941777
Embedding failed for Reaction ID: 27941778
Embedding failed for Reaction ID: 27941779
Embedding failed for Reaction ID: 29204447
Embedding failed for Reaction ID: 29659655
Embedding failed for Reaction ID: 29659656
Embedding failed for Reaction ID: 29659677
Embedding failed for Reaction ID: 31809453
Embedding failed for Reaction ID: 32414566
Embedding failed for Rea

In [5]:
# Report how many reactions were returned by generate_3D as having no 3D structure.
print('There are',len(id_remove),'rxn not having generated 3D structures')

There are 54 rxn not having generated 3D structures


In [15]:
def generate_3D_tryembed(data, folder_path, random_seed=-1):
    """Embed, UFF-optimise and save a 3D structure for every reactant.

    The embedding step had been commented out, which left each molecule
    without a conformer: ``UFFOptimizeMolecule`` then raised ValueError for
    every row, so every reaction was reported as failed and no file was ever
    written. The step is restored here with the same embedding settings that
    ``generate_3D`` uses. The disabled call also passed ``maxIters``, which is
    not an ``EmbedMolecule`` keyword; ``maxAttempts`` is used instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Reactant smiles map' and 'Reaction ID'.
    folder_path : str
        Prefix prepended to every file name (e.g. '3D_structures/'). Its
        directory part is created if it does not exist yet.
    random_seed : int, optional
        Forwarded to ``EmbedMolecule(randomSeed=...)``. The default -1 is
        RDKit's own default (RNG not seeded); pass a fixed non-negative value
        to get the same coordinates on every run.

    Returns
    -------
    list
        Reaction IDs for which no XYZ file was written: unparsable or missing
        SMILES, failed embedding, or a ValueError raised while optimising or
        writing the structure.
    """
    import os  # local import: the function must not depend on a file-level `import os`

    # Make sure the target directory exists before any file is written.
    out_dir = os.path.dirname(folder_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    id_toremove = []
    for _, row in data.iterrows():
        smiles = row['Reactant smiles map']
        reaction_id = row['Reaction ID']

        # MolFromSmiles returns None for an invalid SMILES, and an empty
        # spreadsheet cell is not a string at all; guard both before AddHs.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            print(f'Invalid SMILES for Reaction ID: {reaction_id}')
            id_toremove.append(reaction_id)
            continue

        mol = Chem.AddHs(mol)
        embed_success = AllChem.EmbedMolecule(
            mol,
            useBasicKnowledge=False,
            useSmallRingTorsions=True,
            useRandomCoords=True,
            maxAttempts=500,
            randomSeed=random_seed,
        )
        # EmbedMolecule signals failure by returning -1 instead of a conformer id.
        if embed_success == -1:
            print(f'Embedding failed for Reaction ID: {reaction_id}')
            id_toremove.append(reaction_id)
            continue

        try:
            AllChem.UFFOptimizeMolecule(mol)
            xyz_filename = folder_path + str(reaction_id) + '.xyz'
            Chem.MolToXYZFile(mol, xyz_filename)
        except ValueError as e:
            print(f'Optimization failed for Reaction ID: {reaction_id}, Error: {e}')
            id_toremove.append(reaction_id)
            continue
    return id_toremove

------ 
### make sure the right index was pulled out

#### read xyz file to see the index 
The first index (Atom Num 1) corresponds to the first atom in the XYZ file.

In [6]:
def read_xyz_file_to_dataframe(file_path):
    """Read the atom records of an XYZ file into a DataFrame.

    The first two lines of the file (header lines) are skipped. Every
    following non-blank line is split on whitespace and read as
    ``symbol x y z``; any further fields on the line are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the XYZ file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Atom Num' (1-based position of the atom in the file),
        'Atom', 'X', 'Y', 'Z'. A file without atom records yields an empty
        frame that still has these columns.

    Raises
    ------
    IndexError
        If a non-blank atom line has fewer than four fields.
    ValueError
        If a coordinate field cannot be converted to float.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    rows = []
    # Skip the first two lines (header lines)
    for line in lines[2:]:
        parts = line.split()
        # Blank or whitespace-only lines (e.g. trailing newlines) carry no
        # atom; indexing parts[0] on them used to raise IndexError.
        if not parts:
            continue
        rows.append((
            len(rows) + 1,  # 1-based atom number, counted over atom lines only
            parts[0],
            float(parts[1]),
            float(parts[2]),
            float(parts[3]),
        ))

    return pd.DataFrame(rows, columns=['Atom Num', 'Atom', 'X', 'Y', 'Z'])
# Display the atoms of one generated structure (Reaction ID 40358815) in file order.
read_xyz_file_to_dataframe('3D_structures/40358815.xyz'  )

Unnamed: 0,Atom Num,Atom,X,Y,Z
0,1,C,-1.572715,0.592792,-0.735253
1,2,C,-1.795534,-0.950955,-0.888157
2,3,C,-0.541506,-1.799546,-1.0915
3,4,C,0.622023,-1.301759,-0.289482
4,5,C,1.512055,-2.241508,0.277841
5,6,C,2.680225,-1.77433,0.865013
6,7,C,2.988637,-0.434863,0.870106
7,8,C,2.147484,0.520088,0.311117
8,9,C,0.906352,0.095394,-0.239445
9,10,C,-0.068765,1.091858,-0.887416


In [7]:
# Show the df_3D row for the same Reaction ID, so its columns can be compared
# with the atom numbering of the XYZ file displayed above.
df_3D[df_3D['Reaction ID']==40358815]

Unnamed: 0,Reaction,Reactant,Product,Reagent,Catalyst,Solvent (Reaction Details),Time (Reaction Details) [h],Temperature (Reaction Details) [C],Yield,Reaction ID,...,Change_MW,Reactant MW,change in C-O single bond,change in C=O double bond,Procedure,Reaction Map,Reactant smiles map,Product smiles map,C_idx,O_idx
9,[H][C@]12CCC3=CC4=C(C=C3[C@]1(C)CCC(=O)[C@@]2(...,C24H25NO3,C24H27NO3,sodium tetrahydroborate,,methanol,3.0,0,72 percent,40358815,...,2.016,375.468,1,-1,"Alcohol S6. Carbazole 18 (100 mg, 0.27 mmol, 1...",[C@H:1]12[CH2:2][CH2:3][c:4]3[cH:5][c:6]4[c:7]...,[C@@H:1]12[CH2:2][CH2:3][c:4]3[cH:5][c:6]4[c:7...,[C@@H:1]12[CH2:2][CH2:3][c:4]3[cH:5][c:6]4[c:7...,14,15


In [8]:
# # USE THIS CODE, combined with view_reactionScheme() with mapping, to check if the right index was pulled out
# df = get_sterimol[get_sterimol['Reaction ID'] == 52106508]
# for idx, row in df.iterrows(): 
#     smiles_map = row['Reactant smiles map']
#     reactant_mol = Chem.MolFromSmiles(smiles_map)

#     C_idx = row['C_idx'] - 1 # - 1 because the idx in the df is the order of atom counting from 1
#     O_idx = row['O_idx'] - 1
#     atom_C = reactant_mol.GetAtomWithIdx(C_idx)
#     atom_O = reactant_mol.GetAtomWithIdx(O_idx)
#     print('this supposed to be C:', atom_C.GetSymbol(), atom_C.GetAtomMapNum())
#     print('this supposed to be O:', atom_O.GetSymbol(), atom_O.GetAtomMapNum())

----

In [9]:
# Save the rows kept in df_3D (reactions whose ID is not in id_remove) to Excel,
# without writing the DataFrame index as a column.
df_3D.to_excel('ReductionKetone_have3D.xlsx',index=False)