## Glycosylation ratio of each subclass of lignans

In [None]:
from rdkit import Chem
import numpy as np
import pandas as pd

### 1. Calculate the glycosylation ratio of lignans in the Dictionary of Natural Products (DNP)

In [None]:
# Load the structure/classification table. The 'codes' column holds the
# type codes of each compound as a single string.
df = pd.read_csv('Structure_Codes.csv')

# Keep rows whose codes mention any of the selected 'V?' families.
# One regex character class replaces the chain of seventeen separate
# str.contains() calls; na=False drops rows with a missing 'codes' value
# instead of letting NaN leak into the boolean mask.
df = df[df['codes'].str.contains('V[ACEFGHIKMOQSTVXYZ]', na=False)]

# Lignans are the rows carrying a 'VO' code. reset_index() without
# inplace=True returns a fresh frame, so columns added to df_lignans later
# are not written onto a filtered slice of df.
df_lignans = df[df['codes'].str.contains('VO', na=False)].reset_index(drop=True)

# Fraction of lignans flagged as glycosylated (shown as the cell output).
len(df_lignans[df_lignans['hadOrHasSugars']==True])/len(df_lignans)

### 2. Calculate the glycosylation ratio of each subclass of lignans

### 2.1 Get the lignan (VO) class codes of each compound

In [None]:
# For every compound, split its comma-separated code string and keep only
# the entries that start with 'VO'.
VO_class_all = [
    [code for code in code_string.split(', ') if code.startswith('VO')]
    for code_string in df_lignans['codes']
]
# Number of 'VO' codes found for each compound.
VO_class_all_count = [len(vo_codes) for vo_codes in VO_class_all]

df_lignans['class_code'] = VO_class_all
df_lignans['count'] = VO_class_all_count

### 2.2 Classification of lignans based on the annotation files of DNP

In [None]:
# Numeric part of a 'VO…' code -> subclass name. Codes that share a name
# are listed individually; the 8000-8050 range is handled in the helper.
VO_SUBCLASS_NAMES = {
    20: 'Lignan monomers',
    50: 'Dibenzylbutane lignans',
    100: 'Dibenzylbutane lignans',
    150: 'Saturated dibenzylbutyrolactone and lactol lignans',
    200: 'Unsaturated dibenzylbutyrolactone and lactol lignans',
    250: 'Epoxyfuranoid lignans',
    280: 'Epoxyfuranoid lignans',
    300: 'Epoxyfuranoid lignans',
    350: 'Epoxyfuranoid lignans',
    400: 'Furofuranoid lignans',
    450: 'Furofuranoid lignans',
    470: 'Furofuranoid lignans',
    500: 'Simple 2,7’-cyclolignans arylnaphthalenes',
    550: '2,7’-Cyclolignans side-chain oxygenated',
    600: '2,7’-Cyclo-9,9’-epoxylignans',
    620: '2,7’-Cyclolignan-9,9’-olides',
    630: '2,7’-Cyclolignan-9’,9-olides',
    640: 'Miscellaneous naphthalenoid lignans',
    670: '7’,7’/7’,8’-Cyclolignans cyclobutanes',
    750: '2,2’-Cyclolignans dibenzocyclooctadienes',
    800: 'Norlignans',
    850: 'Homolignans',
    1000: 'Sesquilignans',
    1200: 'Bilignans',
    1500: 'Neolignans',
    1600: 'Flavonolignans',
    1650: 'Stilbenolignans',
}


def _vo_subclass_name(code):
    """Return the subclass name for a 'VO…' code, or None if it is unmapped."""
    number = int(code[2:])
    if 8000 <= number <= 8050:
        return 'Diarylheptanoids'
    return VO_SUBCLASS_NAMES.get(number)


# Translate each compound's list of 'VO' codes into subclass names.
# Codes without a mapping are skipped, as in the original if/elif chain.
VO_name_all = []
for class_codes in df_lignans['class_code']:
    looked_up = (_vo_subclass_name(code) for code in class_codes)
    VO_name_all.append([name for name in looked_up if name is not None])
df_lignans['Classify'] = VO_name_all

# Export a copy without the helper columns, with the names joined into one
# space-separated string per compound. The column is replaced in a single
# assignment: the element-wise chained form df1['Classify'][i] = ... warns on
# pandas 2.x and has no effect when Copy-on-Write is enabled.
df1 = df_lignans.drop(columns=['class_code', 'count'])
df1['Classify'] = [' '.join(names) for names in df1['Classify']]
df1.to_csv('DNP_Lignans.csv', index=False)

### 2.3 Calculate the glycosylation ratio

In [None]:
# Subclass names to report, in display order.
classify = [
    'Lignan monomers',
    'Dibenzylbutane lignans',
    'Saturated dibenzylbutyrolactone and lactol lignans',
    'Unsaturated dibenzylbutyrolactone and lactol lignans',
    'Epoxyfuranoid lignans',
    'Furofuranoid lignans',
    'Simple 2,7’-cyclolignans arylnaphthalenes',
    '2,7’-Cyclolignans side-chain oxygenated',
    '2,7’-Cyclo-9,9’-epoxylignans',
    '2,7’-Cyclolignan-9,9’-olides',
    '2,7’-Cyclolignan-9’,9-olides',
    'Miscellaneous naphthalenoid lignans',
    '7’,7’/7’,8’-Cyclolignans cyclobutanes',
    '2,2’-Cyclolignans dibenzocyclooctadienes',
    'Norlignans',
    'Homolignans',
    'Sesquilignans',
    'Bilignans',
    'Neolignans',
    'Flavonolignans',
    'Stilbenolignans',
    'Diarylheptanoids',
]

# Rows flagged as glycosylated; computed once and reused for every subclass.
has_sugar = df1['hadOrHasSugars'] == True

rows = []
for name in classify:
    # Literal substring match (regex=False) so punctuation in a class name
    # can never be interpreted as a regex pattern.
    in_class = df1['Classify'].str.contains(name, regex=False, na=False)
    n_all = int(in_class.sum())
    if n_all == 0:
        # A subclass with no members is left out of the table. This replaces
        # the bare `except: pass`, which skipped it via ZeroDivisionError but
        # also hid every other error.
        continue
    n_sugar = int((in_class & has_sugar).sum())
    rows.append((name, n_all, n_sugar, round(n_sugar / n_all * 100, 2)))

data = pd.DataFrame(rows, columns=['Category', 'All', 'Sugar', 'Percentages(%)'])
data