In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from deepchem.utils import conformers
from rdkit.Chem.rdMolTransforms import GetDihedralDeg
from rdkit.Chem.rdMolTransforms import SetDihedralDeg
import numpy as np
import pandas as pd
import os



# Generate a dataset based on torsional configurations

This notebook generates data files (.sdf and corresponding .sdf.csv) to be read by DeepChem. The specific example explored here is the energy of different polyethylene conformations. Torsional degrees of freedom play a dominant role in determining conformations, so all torsional combinations were enumerated for a short polyethylene chain (3 monomers, 6 total carbons, and 3 unique torsion angles). The energy of each conformer was calculated using the UFF potential implemented in RDKit.

In [2]:
# list of all torsion angles

In [3]:
# 36 evenly spaced torsion angles, 10 degrees apart, running from -170 to +180
# (-180 is not listed separately; as a dihedral it would duplicate +180)
torsion_angles = np.linspace(start=-170.0, stop=180.0, num=36)

In [4]:
torsion_angles

array([-170., -160., -150., -140., -130., -120., -110., -100.,  -90.,
        -80.,  -70.,  -60.,  -50.,  -40.,  -30.,  -20.,  -10.,    0.,
         10.,   20.,   30.,   40.,   50.,   60.,   70.,   80.,   90.,
        100.,  110.,  120.,  130.,  140.,  150.,  160.,  170.,  180.])

In [5]:
# array of all combinations for 3 torsion angles ~46,000
# this includes all combinations, not all will be unique because of symmetry

In [6]:
# one row per (angle_1, angle_2, angle_3) combination; the row count is derived
# from the angle grid (36**3 = 46656 for the grid above) so the array cannot
# silently go out of step if torsion_angles is changed
tor_combinations = np.zeros((len(torsion_angles) ** 3, 3))

In [7]:
# fill the pre-allocated array with every (i, j, k) triple of torsion angles;
# "ij" indexing followed by a C-order reshape keeps the first angle varying
# slowest and the last angle varying fastest, i.e. the same row order as
# three nested loops over torsion_angles
angle_grid = np.meshgrid(torsion_angles, torsion_angles, torsion_angles, indexing="ij")
tor_combinations[:] = np.stack(angle_grid, axis=-1).reshape(-1, 3)

In [8]:
# load the short polyethylene molecule from file, keeping the hydrogens
pe_n6_mol = Chem.MolFromMolFile('../data/pe_n6.mol', removeHs=False)
# MolFromMolFile hands back None (rather than raising) when the file contents
# cannot be parsed, so fail here instead of later inside the torsion helpers
if pe_n6_mol is None:
    raise ValueError("could not parse a molecule from '../data/pe_n6.mol'")

In [9]:
# atom indices (four per torsion) for each of the three torsions;
# consecutive torsions share three atoms
# the numbering can be checked by viewing the molecule in Avogadro
pe_n6_tor_atoms = [
    [0, 1, 5, 8],
    [1, 5, 8, 11],
    [5, 8, 11, 14],
]

In [10]:
# define a function to randomly select a few thousand torsion combinations,
# rotate the base molecule, calculates the energy, and put into a pandas dataframe  

In [11]:
# this function randomly selects a number of torsional conformations
# it returns three datasets to do sequential learning
def random_tor_array(tor_array, num, seed=None):
    """Draw three non-overlapping random samples of rows from ``tor_array``.

    A copy of ``tor_array`` is shuffled along its first axis and the first
    ``3 * num`` rows are cut into three consecutive slices, so no row position
    of the shuffled copy ends up in more than one sample. ``tor_array`` itself
    is left untouched.

    Parameters
    ----------
    tor_array : array-like
        Torsion combinations, one combination per row.
    num : int
        Number of rows in each of the three returned samples.
    seed : int or None, optional
        When given, the shuffle uses its own ``np.random.RandomState(seed)``
        so the draw is reproducible. When None (the default) the global
        NumPy random state is used, as before.

    Returns
    -------
    tuple of three numpy.ndarray
        ``(rand_tor_1, rand_tor_2, rand_tor_3)``, each with ``num`` rows.

    Raises
    ------
    ValueError
        If ``num`` is negative or ``tor_array`` has fewer than ``3 * num`` rows.
    """
    # without this check the slices below would silently come back short
    # (too few rows) or overlapping (negative num)
    if num < 0 or 3 * num > len(tor_array):
        raise ValueError(
            "need 0 <= 3 * num <= len(tor_array); got num={n} for {rows} rows".format(
                n=num, rows=len(tor_array)))
    tor_copy = np.copy(tor_array)
    if seed is None:
        np.random.shuffle(tor_copy)
    else:
        np.random.RandomState(seed).shuffle(tor_copy)
    rand_tor_1 = tor_copy[:num]
    rand_tor_2 = tor_copy[num:(2 * num)]
    rand_tor_3 = tor_copy[(2 * num):(3 * num)]
    return rand_tor_1, rand_tor_2, rand_tor_3

In [12]:
# this function rotates all the torsion angles in a conformation
def rotate_all_torsions(base_mol, tor_atoms, tor_angles):
    """Return a copy of ``base_mol`` with every listed torsion set to its angle.

    ``tor_atoms`` holds one group of four atom indices per torsion and
    ``tor_angles`` holds the matching dihedral value (degrees, as taken by
    ``SetDihedralDeg``) at the same position. The rotation is applied to
    conformer 0 of a copy made with ``Chem.Mol(base_mol)``.
    """
    rotated = Chem.Mol(base_mol)
    for position, quartet in enumerate(tor_atoms):
        target_angle = tor_angles[position]
        SetDihedralDeg(
            rotated.GetConformer(0),
            quartet[0],
            quartet[1],
            quartet[2],
            quartet[3],
            target_angle,
        )
    return rotated

In [None]:
# Energy units are kcal/mol

In [13]:
def calc_uff_energy(mol):
    """Return the UFF energy of ``mol`` as computed by RDKit.

    Builds the molecule's UFF force field and evaluates its energy once;
    no minimisation is performed. Per the notebook's note, energies are in
    kcal/mol.
    """
    return AllChem.UFFGetMoleculeForceField(mol).CalcEnergy()

In [14]:
def generate_conform_data(base_mol, tors_array, tor_atoms, sample_num, energy_cutoff=200):
    """Build conformers from torsion combinations and collect their UFF energies.

    Each row of ``tors_array`` is applied to a copy of ``base_mol`` with
    ``rotate_all_torsions`` and scored with ``calc_uff_energy``. Conformers
    whose energy is not below ``energy_cutoff`` are discarded. Collection stops
    as soon as ``sample_num`` conformers have been kept, so fewer than
    ``sample_num`` are returned when ``tors_array`` runs out first.

    Parameters
    ----------
    base_mol : rdkit molecule
        Template molecule; it is copied for every conformer.
    tors_array : iterable
        Torsion-angle combinations, one per conformer to try.
    tor_atoms : list
        Atom-index groups defining each torsion, passed to ``rotate_all_torsions``.
    sample_num : int
        Number of accepted conformers at which to stop.
    energy_cutoff : float, optional
        Upper energy bound (exclusive) for keeping a conformer. Defaults to
        200, the original hard-coded limit chosen to reject conformations
        with overlapping atoms.

    Returns
    -------
    tuple (mol_list, energy_list)
        The kept molecules and their energies, in matching order. Each kept
        molecule carries the properties ``_Name`` (``pe_<row index>``) and
        ``energy`` so they appear as tags in the .sdf file.
    """
    mol_list = []
    energy_list = []
    for i, angles in enumerate(tors_array):
        rot_mol = rotate_all_torsions(base_mol, tor_atoms, angles)
        energy = calc_uff_energy(rot_mol)
        # written as "not <" so a NaN energy is rejected, exactly as before
        if not energy < energy_cutoff:
            continue
        # set properties to tag molecules in the .sdf file
        rot_mol.SetProp("_Name", "pe_{id}".format(id=i))
        rot_mol.SetProp("energy", "{e}".format(e=energy))
        mol_list.append(rot_mol)
        energy_list.append(energy)
        if len(mol_list) == sample_num:
            break
    return mol_list, energy_list

In [15]:
def write_sdf(filename, mol_list):
    """Write every molecule in ``mol_list`` to the SDF file ``filename``.

    The writer is closed explicitly, even if writing a molecule fails, so the
    output is flushed to disk and the file handle is released.
    """
    w = Chem.SDWriter(filename)
    try:
        for m in mol_list:
            w.write(m)
    finally:
        w.close()

In [16]:
def write_csv(filename, energy_list):
    """Save ``energy_list`` to ``filename`` as a CSV with one column, ``energy``.

    The values are written one per row under the header, without an index column.
    """
    pd.DataFrame({"energy": energy_list}).to_csv(filename, index=False)

In [17]:
# generate unique torsion combinations for all data files
# sample number in the tor_array needs to be higher than the desired sample number because
# of downselection by the energy cutoff

In [18]:
# seed the global NumPy random state that random_tor_array's shuffle draws
# from, so a fresh Restart & Run All regenerates the same three datasets
np.random.seed(42)
pe_rand_tor_array_1, pe_rand_tor_array_2, pe_rand_tor_array_3 = random_tor_array(tor_combinations, 1000)

In [19]:
# rotate molecules, calculate energies, and write sdf/csv files for DeepChem

In [20]:
# dataset number 1: try the first random torsion set and stop once
# 500 conformers have passed the energy cutoff
pe_mol_list_1, pe_energy_list_1 = generate_conform_data(
    base_mol=pe_n6_mol,
    tors_array=pe_rand_tor_array_1,
    tor_atoms=pe_n6_tor_atoms,
    sample_num=500,
)

In [21]:
# write the sdf file and corresponding csv
# create the output directory first: the writers below do not create
# missing parent directories themselves
if not os.path.isdir("../data/medium_dataset"):
    os.makedirs("../data/medium_dataset")
write_sdf("../data/medium_dataset/pe_conformer_1.sdf", pe_mol_list_1)
write_csv("../data/medium_dataset/pe_conformer_1.sdf.csv", pe_energy_list_1)

In [22]:
# dataset number 2 

In [23]:
# try the second random torsion set and stop once 500 conformers
# have passed the energy cutoff
pe_mol_list_2, pe_energy_list_2 = generate_conform_data(
    base_mol=pe_n6_mol,
    tors_array=pe_rand_tor_array_2,
    tor_atoms=pe_n6_tor_atoms,
    sample_num=500,
)

In [24]:
# write the second sdf file and its matching energy csv
write_sdf(filename="../data/medium_dataset/pe_conformer_2.sdf", mol_list=pe_mol_list_2)
write_csv(filename="../data/medium_dataset/pe_conformer_2.sdf.csv", energy_list=pe_energy_list_2)

In [25]:
# dataset number 3 

In [26]:
# try the third random torsion set and stop once 500 conformers
# have passed the energy cutoff
pe_mol_list_3, pe_energy_list_3 = generate_conform_data(
    base_mol=pe_n6_mol,
    tors_array=pe_rand_tor_array_3,
    tor_atoms=pe_n6_tor_atoms,
    sample_num=500,
)

In [27]:
# write the third sdf file and its matching energy csv
write_sdf(filename="../data/medium_dataset/pe_conformer_3.sdf", mol_list=pe_mol_list_3)
write_csv(filename="../data/medium_dataset/pe_conformer_3.sdf.csv", energy_list=pe_energy_list_3)