### Mordred + RS 12 - 100 + Smallest RS + Largest RS + # Sugars + # Core Esters

In [1]:
import itertools
import pandas as pd
from mordred import Calculator, descriptors
from mordred.RingCount import RingCount
from rdkit import Chem




In [2]:
def convert_time(second):
    '''
    Format a duration given in seconds as a human-readable string.

    The result has the form
    '<d> DAYS: <h> HOURS: <m> MINUTES: <s> SECONDS', where days, hours and
    minutes are whole numbers and seconds is a float rounded to 4 decimals.

    The split is done with divmod on the total number of seconds rather than
    by repeatedly multiplying fractional parts, so float rounding error cannot
    push a whole unit just below its boundary (e.g. report 59.9999 seconds
    instead of one full minute).

    second: duration in seconds (int or float). For a negative duration every
            non-zero component carries the minus sign.
    '''
    sign = -1 if second < 0 else 1
    # Round first so that a value such as 59.99996 carries into the minutes
    # instead of being printed as "60.0 SECONDS".
    total = round(abs(second), 4)
    days, remainder = divmod(total, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds_left = divmod(remainder, 60)
    # "+ 0.0" normalises a possible -0.0 to 0.0 for negative inputs.
    seconds_left = sign * round(float(seconds_left), 4) + 0.0
    return (str(sign * int(days)) + ' DAYS: '
            + str(sign * int(hours)) + ' HOURS: '
            + str(sign * int(minutes)) + ' MINUTES: '
            + str(seconds_left) + ' SECONDS')


### Compute Mordred + Macrocycle Descriptors

In [3]:
class Macrocycle_Descriptors:
    '''
    Compute mordred descriptors plus macrocycle-specific descriptors
    (per-size ring counts, smallest/largest large ring, sugar count and
    core ester count) for a list of molecules.

    Every public method returns a pandas DataFrame with one row per molecule,
    in the same order as the molecules passed to the constructor.
    '''

    # Ring sizes scanned by compute_ringsize: 3, 4, ..., 99.
    # NOTE(review): the notebook title mentions ring size 100, but the scanned
    # range stops at 99; kept as is so the output columns do not change.
    _MIN_RING_SIZE = 3
    _MAX_RING_SIZE = 99
    # Smallest ring size considered for SmallestRS / LargestRS.
    _MACRO_MIN_RING_SIZE = 12

    # SMARTS queries used by sugar_count. Each one is a single-atom query on a
    # two-connected oxygen whose ring environment is given by recursive SMARTS.
    _SUGAR_SMARTS = (
        '[OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)]',
        '[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]',
        '[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C(O)@C1)]',
        '[OX2;$([r5]1@C(!@[OX2H1])@C@C@C1),$([r6]1@C(!@[OX2H1])@C@C@C@C1)]',
        '[OX2;$([r5]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]',
        '[OX2;$([r5]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]',
    )

    # Ester group whose single-bonded oxygen is joined by a ring bond to a ring
    # atom that is not in any ring of size 3 to 11.
    _CORE_ESTER_SMARTS = '[CX3](=[OX1])O@[r;!r3;!r4;!r5;!r6;!r7;!r8;!r9;!r10;!r11]'

    def __init__(self, mols):
        '''
        mols: sequence of molecule objects (main() builds them with
              Chem.MolFromSmiles).
        '''
        self.mols = mols

    def compute_ringsize(self, mol):
        '''
        Count the rings of each size from 3 to 99 in one molecule.

        Returns a list of 97 counts ordered by ring size:
        [RS3, RS4, ..., RS99], e.g. [0, 0, 0, ..., 1, ..., 0].
        '''
        sizes = range(self._MIN_RING_SIZE, self._MAX_RING_SIZE + 1)
        return [RingCount(order=size)(mol) for size in sizes]

    def macrolide_ring_info(self):
        '''
        Build the large-ring descriptor table.

        Columns: 'n13Ring' ... 'n99Ring' (ring counts per size) followed by
        'SmallestRS' and 'LargestRS', the smallest and largest ring size in
        the range 12..99 that occurs in the molecule. When a molecule has no
        ring of size 12 or more, both of these cells hold an empty string.
        '''
        # Counts up to nR12 are already provided by mordred, so the per-size
        # columns start at 13.
        first_column_size = self._MACRO_MIN_RING_SIZE + 1
        headers = ['n{}Ring'.format(size)
                   for size in range(first_column_size, self._MAX_RING_SIZE + 1)]
        headers += ['SmallestRS', 'LargestRS']

        # Position of ring size 12 inside the list returned by compute_ringsize.
        macro_start = self._MACRO_MIN_RING_SIZE - self._MIN_RING_SIZE

        rows = []
        for mol in self.mols:
            counts = self.compute_ringsize(mol)       # sizes 3..99
            macro_counts = counts[macro_start:]       # sizes 12..99
            # Offset 0 is ring size 12, so add 12 to recover the actual size.
            sizes_present = [self._MACRO_MIN_RING_SIZE + offset
                             for offset, count in enumerate(macro_counts)
                             if count != 0]
            if sizes_present:
                extremes = [sizes_present[0], sizes_present[-1]]
            else:
                extremes = ['', '']
            # Size 12 is used for the extremes above but has no column of its own.
            rows.append(macro_counts[1:] + extremes)
        return pd.DataFrame(rows, columns=headers)

    def sugar_count(self):
        '''
        Returns pandas frame with one column, 'nSugars': the number of distinct
        atoms in each molecule matched by at least one of the sugar SMARTS
        patterns. An atom matched by several patterns is counted once.
        '''
        sugar_queries = [Chem.MolFromSmarts(smarts) for smarts in self._SUGAR_SMARTS]
        sugar_counts = []
        for mol in self.mols:
            matched_atoms = set()
            for query in sugar_queries:
                # Every match is a tuple of atom indices; the set removes
                # atoms that several patterns hit.
                for match in mol.GetSubstructMatches(query):
                    matched_atoms.update(match)
            sugar_counts.append(len(matched_atoms))
        return pd.DataFrame(sugar_counts, columns=['nSugars'])

    def core_ester_count(self):
        '''
        Returns pandas frame containing the count of esters in core rings of >=12 membered macrocycles.

        The single column is 'core_ester'; each value is the number of
        substructure matches of the core-ester SMARTS in the molecule.
        '''
        ester_query = Chem.MolFromSmarts(self._CORE_ESTER_SMARTS)
        core_ester = [len(mol.GetSubstructMatches(ester_query)) for mol in self.mols]
        return pd.DataFrame(core_ester, columns=['core_ester'])

    def mordred_compute(self):
        '''
        Run the mordred Calculator (all descriptor modules, ignore_3D=True) on
        every molecule and return the resulting DataFrame.
        '''
        calc = Calculator(descriptors, ignore_3D=True)
        return calc.pandas(self.mols)

    def compute_mrc(self):
        '''
        Compute all descriptor tables and join them column-wise, in the order
        mordred, ring info, sugar count, core ester count.
        '''
        frames = [
            self.mordred_compute(),
            self.macrolide_ring_info(),
            self.sugar_count(),
            self.core_ester_count(),
        ]
        return pd.concat(frames, axis=1)


In [4]:
def main():
    '''
    Compute the full descriptor table for three sample SMILES and write it,
    with the SMILES as the first column, to 'mordred_mrc.csv'.
    '''
    # Raw strings: SMILES uses '\' for bond stereochemistry, and sequences
    # such as '\C' or '\[' are invalid escapes in a normal string literal.
    sample_smiles = [
        r'CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]2O[C@H](C)C[C@@H]([C@H]2O)N(C)C)[C@](C)(O)C[C@@H](C)C(=O)[C@H](C)[C@@H](O)[C@]1(C)O',
        r'CC[C@@H](C)[C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]1C[C@@H](C\C=C(C)\[C@@H](O[C@H]3C[C@H](OC)[C@@H](O[C@H]4C[C@H](OC)[C@@H](O)[C@H](C)O4)[C@H](C)O3)[C@@H](C)\C=C\C=C3/CO[C@@H]4[C@H](O)C(C)=C[C@@H](C(=O)O1)[C@]34O)O2',
        r'CC[C@H]1OC(=O)[C@@](C)(F)C(=O)[C@H](C)[C@@H](O[C@@H]2O[C@H](C)C[C@@H]([C@H]2O)N(C)C)[C@@](C)(C[C@@H](C)C(=O)[C@H](C)[C@H]2N(CCCCN3C=C(N=N3)C3=CC=CC(N)=C3)C(=O)O[C@]12C)OC',
    ]
    mols = [Chem.MolFromSmiles(smi) for smi in sample_smiles]
    descriptor = Macrocycle_Descriptors(mols)
    desc_frame = descriptor.compute_mrc()
    # Keep the input SMILES next to the descriptors as the leading column.
    desc_frame.insert(loc=0, column='smiles', value=sample_smiles)
    desc_frame.to_csv('mordred_mrc.csv', index=False)


# Still runs when executed as a notebook cell or script (where __name__ is
# '__main__'), but no longer fires when the code is imported as a module.
if __name__ == '__main__':
    main()

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.00s/it]
