### Glycoside types (monosaccharides, oligosaccharides)

In [None]:
import pandas as pd
from rdkit import rdBase, Chem
from rdkit.Chem import AllChem
from rdkit import Chem
import numpy as np
# Load the input table; the cells below read its ID, SMILES and sugar-count columns.
df = pd.read_csv('../7.3 Sugar Type/Sugar_Type.csv')

# Defines the function for obtaining the largest molecular fragment
def Largetst_Smiles(smiles):
    """Return the SMILES of the fragment with the most atoms.

    The input SMILES is parsed and split into its disconnected fragments.
    The first fragment with the highest atom count is written back out as
    SMILES. If the parsed molecule yields no fragments at all, the whole
    parsed molecule is written out instead.
    """
    parsed = Chem.MolFromSmiles(smiles)
    fragments = Chem.rdmolops.GetMolFrags(parsed, asMols=True)
    if not fragments:
        # Nothing to compare: fall back to the molecule as parsed.
        return Chem.MolToSmiles(parsed)
    biggest = fragments[0]
    for candidate in fragments[1:]:
        # Strict comparison keeps the earliest fragment on ties.
        if candidate.GetNumAtoms() > biggest.GetNumAtoms():
            biggest = candidate
    return Chem.MolToSmiles(biggest)

# Define a function that removes the deglycosylated (aglycone) fragments from the glycoside step by step, largest fragment first
def Delete_Aglycones(ori_smiles,deg_smiles):
    """Strip every fragment of ``deg_smiles`` out of ``ori_smiles``.

    Each disconnected fragment of ``deg_smiles`` is used, largest first, as
    the core passed to ``Chem.ReplaceCore``; what is left after all
    fragments have been removed is returned as SMILES.

    Fragments are taken from the parsed molecule rather than matched back
    against the ``'.'``-separated input text, so a fragment written in
    non-canonical SMILES no longer fails with a spurious ``ValueError``.

    Parameters:
        ori_smiles: SMILES of the complete molecule.
        deg_smiles: SMILES of the part(s) to remove; may contain several
            ``'.'``-separated fragments.

    Returns:
        SMILES of the remainder after all fragments have been removed.

    Raises:
        ValueError: if either SMILES cannot be parsed, or a fragment cannot
            be removed from the remaining structure.
    """
    tmp = Chem.MolFromSmiles(ori_smiles)
    if tmp is None:
        raise ValueError('Cannot parse original SMILES: %r' % (ori_smiles,))
    deg_mol = Chem.MolFromSmiles(deg_smiles)
    if deg_mol is None:
        raise ValueError('Cannot parse deglycosylated SMILES: %r' % (deg_smiles,))

    fragments = Chem.rdmolops.GetMolFrags(deg_mol, asMols=True)
    # sorted() is stable, also with reverse=True, so fragments of equal size
    # are processed in their original order.
    for fragment in sorted(fragments, key=lambda m: m.GetNumAtoms(), reverse=True):
        # Round-trip through canonical SMILES so the core has the same atom
        # ordering as a pattern built from the canonical fragment SMILES.
        patt = Chem.MolFromSmiles(Chem.MolToSmiles(fragment))
        tmp = Chem.ReplaceCore(tmp, patt)
        # NOTE(review): ReplaceCore appears to return None when the core is
        # not found in the molecule - confirm against the RDKit docs.
        if tmp is None:
            raise ValueError(
                'Fragment %r could not be removed from %r'
                % (Chem.MolToSmiles(fragment), ori_smiles)
            )
    return Chem.MolToSmiles(tmp)

# Get the structure of the sugar
# For every row one of three rules applies, in this order:
#   1. 'SugarMoietySMILES' has no comma (a single entry): use that entry with
#      its first and last two characters stripped
#      (presumably the "['" / "']" of a stringified list - verify against the CSV).
#   2. The deglycosylated SMILES is the literal '[empty]': the whole original
#      molecule is taken as the sugar.
#   3. Otherwise the deglycosylated fragments are removed from the original
#      molecule with Delete_Aglycones; a failure is recorded as 'error' and
#      the row ID is collected in Structure_Error_ID.
Sugar_Smiles=[]
Structure_Error_ID=[]
for mol_id, ori_smiles, degly_smiles, moiety_field in zip(
        df['ID'],
        df['originalMoleculeSMILES'],
        df['deglycosylatedMoleculeSMILES'],
        df['SugarMoietySMILES']):
    if len(moiety_field.split(',')) == 1:
        sugar_smiles = moiety_field[2:-2]
    elif degly_smiles == '[empty]':
        sugar_smiles = ori_smiles
    else:
        try:
            sugar_smiles = Delete_Aglycones(ori_smiles, degly_smiles)
        except Exception:
            # Exception (not a bare except) so that Ctrl-C / SystemExit still
            # stop the run instead of being recorded as a structure error.
            sugar_smiles = 'error'
            Structure_Error_ID.append(mol_id)
    Sugar_Smiles.append(sugar_smiles)
df['sugar_smiles'] = Sugar_Smiles
df.to_csv('Sugar_Smiles.csv', index=False)

# Get the number of sugars in each molecule
# For every '.'-separated fragment of 'sugar_smiles' the number of ring oxygen
# atoms is counted; the per-fragment counts are stored as a list. Rows already
# marked 'error', or containing a fragment that cannot be parsed, get 'error'.
ring_oxygen_patt = Chem.MolFromSmarts('[$([OR])]')  # compiled once, reused for every fragment
Sugar_Ring_Num=[]
for sugar_smiles in df['sugar_smiles']:
    if sugar_smiles == 'error':
        sugar_ring_num = 'error'
    else:
        sugar_ring_num = []
        for frag_smiles in sugar_smiles.split('.'):
            frag_mol = Chem.MolFromSmiles(frag_smiles)
            if frag_mol is None:
                # Unparsable fragment: flag the row instead of crashing the cell.
                sugar_ring_num = 'error'
                break
            sugar_ring_num.append(len(frag_mol.GetSubstructMatches(ring_oxygen_patt)))
    Sugar_Ring_Num.append(sugar_ring_num)
df['sugar_ring_num'] = Sugar_Ring_Num

# Get the sugar type
# Each per-fragment count in 'sugar_ring_num' is mapped to a label:
#   0 or 1 -> 'mono', 2..10 -> 'oli', anything else -> 'poly'.
# Rows marked 'error' stay 'error'.
Sugar_Class = []
for sugar_ring_num in df['sugar_ring_num']:
    if sugar_ring_num == 'error':
        sugar_class = 'error'
    else:
        sugar_class = [
            'mono' if count in (0, 1)
            else 'oli' if 2 <= count <= 10
            else 'poly'
            for count in sugar_ring_num
        ]
    Sugar_Class.append(sugar_class)
df['sugar_class'] = Sugar_Class
df.to_csv('Sugar_Class.csv', index=False)
df

### Check whether the results are correct

In [None]:
# Compare the computed sugar total of each row with 'sugar_num_count'.
# A row is 'right' when the totals agree and 'error' otherwise; rows whose
# ring counts are already 'error' stay 'error'.
Results = []
for ring_counts, expected_total in zip(df['sugar_ring_num'], df['sugar_num_count']):
    if ring_counts == 'error':
        Results.append('error')
        continue
    # A fragment with a count of 0 is still counted as one sugar.
    computed_total = sum(1 if count == 0 else count for count in ring_counts)
    Results.append('right' if expected_total == computed_total else 'error')
df['Results'] = Results
df.to_csv('Glycosides_Sites.csv', index=False)
df

### Output wrong results and check manually

In [None]:
# Collect the rows flagged 'error', write them out for manual checking,
# then renumber them from 0 for display.
error_mask = df['Results'] == 'error'
df_error = df.loc[error_mask]
df_error.to_csv('Error.csv', index=False)
df_error = df_error.reset_index(drop=True)
df_error

### Types of sugar combinations (final results)

In [None]:
# Merge the manually corrected sugar classes back into the result table.
# Rows whose 'Results' is 'error' take the 'sugar_class' listed for their ID
# in Error_Right.csv; all other rows are left untouched.
df = pd.read_csv('Glycosides_Sites.csv')
df_error_right = pd.read_csv('Error_Right.csv')

# ID -> corrected class. If an ID occurs more than once the last entry wins,
# as it did with the original full scan of the correction table.
corrected_class = dict(zip(df_error_right['ID'], df_error_right['sugar_class']))

missing_ids = []
for i in df.index[df['Results'] == 'error']:
    mol_id = df.at[i, 'ID']
    if mol_id in corrected_class:
        # .at writes into df directly; chained df[col][i] = ... assignment is
        # not guaranteed to modify the original frame.
        df.at[i, 'sugar_class'] = corrected_class[mol_id]
    else:
        # No correction available: keep the row unchanged rather than reusing
        # the class of a previously processed row.
        missing_ids.append(mol_id)
if missing_ids:
    print('No manual correction found for IDs:', missing_ids)

df.to_csv('Results.csv', index=False)
df['sugar_class'].value_counts()