### Get the Murcko scaffold of compounds with biological source information (aglycones and non-glycosides)

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import AllChem as ch
from rdkit.Chem import Draw as d
import pandas as pd
from rdkit import DataStructs
import collections
from rdkit.Chem.MolStandardize import rdMolStandardize

# Load the compounds that have biological source information together with their
# deglycosylation results, and drop the rows whose deglycosylated SMILES is the
# '[empty]' placeholder (presumably: nothing was left after sugar removal, so
# there is no structure to derive a scaffold from -- confirm against the tool
# that produced the CSV).
df = pd.read_csv('../5 Biological Source and Glycosylation/Species_SugarResults.csv')
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so columns can be added to it later without pandas raising
# SettingWithCopyWarning.
df = df[df['deglycosylatedMoleculeSMILES'] != '[empty]'].copy()

# Define the function to calculate the Murcko scaffold
def Murcko_Skeleton(df,filename):
    """Compute the Murcko scaffold of every deglycosylated structure and save it.

    For each value in ``df['deglycosylatedMoleculeSMILES']`` the SMILES is
    parsed, the fragment with the most atoms is kept (the first one on ties)
    and its Murcko scaffold is written out as a SMILES string.

    Args:
        df: DataFrame with a ``deglycosylatedMoleculeSMILES`` column.
        filename: Prefix for the two output CSV files.

    Returns:
        None.

    Side effects:
        * Adds (or overwrites) the column ``Scaffold_Smiles`` on ``df`` itself,
          one value per row. Rows whose SMILES is missing or cannot be parsed
          get ``None`` so that the column always stays aligned with the rows.
        * Writes ``<filename>_Scaffold.csv``: ``df`` including the new column.
        * Writes ``<filename>_Murcko.csv``: columns ``Smiles`` and ``Num``,
          one row per distinct scaffold, sorted by ``Num`` descending.
          Rows without a scaffold are not counted.
    """
    import warnings

    scaffold_smiles = []
    n_unparsed = 0
    for smiles in df.deglycosylatedMoleculeSMILES.values:
        # Missing values arrive as NaN (a float), and MolFromSmiles returns
        # None for strings it cannot parse; neither may reach GetMolFrags.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            n_unparsed += 1
            scaffold_smiles.append(None)
            continue

        # Keep only the largest fragment; default=mol covers a molecule for
        # which no fragments are reported.
        fragments = Chem.rdmolops.GetMolFrags(mol, asMols=True)
        largest = max(fragments, default=mol, key=lambda m: m.GetNumAtoms())

        # Only the scaffold SMILES is needed for the outputs, so no 2D
        # coordinates and no generic (graph) scaffold are generated here.
        scaffold = MurckoScaffold.GetScaffoldForMol(largest)
        scaffold_smiles.append(Chem.MolToSmiles(scaffold))

    if n_unparsed:
        warnings.warn(
            f'{n_unparsed} of {len(scaffold_smiles)} SMILES could not be parsed; '
            'their Scaffold_Smiles is left empty and they are not counted.'
        )

    # One entry per row, in row order, so positional assignment is safe
    # whatever the index of df looks like.
    df['Scaffold_Smiles'] = scaffold_smiles
    df.to_csv(f'{filename}_Scaffold.csv', index=False)

    # Frequency table of the scaffolds that could be computed.
    counter = collections.Counter(s for s in scaffold_smiles if s is not None)
    results = pd.DataFrame(list(counter.items()), columns=['Smiles', 'Num'])
    results = results.sort_values(by=['Num'], ascending=False)
    results = results.reset_index(drop=True)
    results.to_csv(f'{filename}_Murcko.csv', index=False)

# ① The Murcko scaffolds of all compounds with biological source information.
# Note: Murcko_Skeleton adds a 'Scaffold_Smiles' column to the frame it is
# given, so df carries that column from here on and the subsets below inherit
# it (each call then overwrites it with the values for its own rows).
Murcko_Skeleton(df, 'All_HasSpecies')

# Explicit comparisons with True / False (not truthiness or negation) so that
# rows with a missing 'hadOrHasSugars' value end up in neither group.
had_sugars = df['hadOrHasSugars'] == True
had_no_sugars = df['hadOrHasSugars'] == False

# Every subset below is materialised as an independent frame
# (reset_index(drop=True) / copy() return new objects) instead of being a
# slice of df that is then modified in place; this avoids pandas'
# SettingWithCopyWarning when Murcko_Skeleton adds its column.

# ② The Murcko scaffolds of aglycones (compounds that had or have sugars)
df_aglycone = df[had_sugars].reset_index(drop=True)
Murcko_Skeleton(df_aglycone, 'aglycone')

# ③ The Murcko scaffolds of non-glycosides
df_nonglycoside = df[had_no_sugars].reset_index(drop=True)
Murcko_Skeleton(df_nonglycoside, 'nonglycoside')

# ④ The Murcko scaffolds of the aglycones of each kingdom (Animal, Plant,
# Bacteria, Fungi). Only rows with hadOrHasSugars == True are included, i.e.
# non-glycosides are NOT part of these per-kingdom files. The original row
# index of df is kept for these subsets.
df_Animal = df[(df['Animal'] == 1) & had_sugars].copy()
df_Plant = df[(df['Plant'] == 1) & had_sugars].copy()
df_Bacteria = df[(df['Bacteria'] == 1) & had_sugars].copy()
df_Fungi = df[(df['Fungi'] == 1) & had_sugars].copy()
Murcko_Skeleton(df_Animal, 'Animal')
Murcko_Skeleton(df_Plant, 'Plant')
Murcko_Skeleton(df_Bacteria, 'Bacteria')
Murcko_Skeleton(df_Fungi, 'Fungi')