## Glycosylation ratio of each subclass of flavonoids

In [None]:
from rdkit import Chem
import numpy as np
import pandas as pd

### 1. Calculate the glycosylation ratio of flavonoids in DNP

In [None]:
# Load the structure-code table. The code below reads two of its columns:
# 'codes' (a string of class codes) and 'hadOrHasSugars'.
df = pd.read_csv('Structure_Codes.csv')

# Two-letter code prefixes to keep; a row is retained if its 'codes' string
# contains at least one of them.
code_prefixes = ['VA', 'VC', 'VE', 'VF', 'VG', 'VH', 'VI', 'VK', 'VM', 'VO',
                 'VQ', 'VS', 'VT', 'VV', 'VX', 'VY', 'VZ']
# A single regex alternation replaces one str.contains() call per prefix.
# na=False makes rows with a missing 'codes' value count as non-matching
# instead of producing a non-boolean mask.
df = df[df['codes'].str.contains('|'.join(code_prefixes), na=False)]

# Flavonoids are the rows carrying a VK code. Take an explicit copy with a
# fresh 0..n-1 index so that later cells can add columns to df_flavonoids
# without writing into a slice of df (SettingWithCopyWarning).
df_flavonoids = df[df['codes'].str.contains('VK', na=False)].copy()
df_flavonoids = df_flavonoids.reset_index(drop=True)

# Overall glycosylation ratio: fraction of flavonoid rows whose
# 'hadOrHasSugars' value equals True (element-wise comparison, hence `==`).
len(df_flavonoids[df_flavonoids['hadOrHasSugars'] == True]) / len(df_flavonoids)

### 2. Calculate the glycosylation ratio of each subclass of flavonoids

### 2.1 Extract the VK class codes of each flavonoid

In [None]:
# For every flavonoid row, split its 'codes' string on ', ' and keep only the
# entries that start with 'VK'; also record how many such entries there are.
VK_class_all = []
VK_class_all_count = []
for code_string in df_flavonoids['codes']:
    vk_codes = [code for code in code_string.split(', ') if code.startswith('VK')]
    VK_class_all.append(vk_codes)
    VK_class_all_count.append(len(vk_codes))

# Store the per-row list of VK codes and its length as new columns.
df_flavonoids['class_code'] = VK_class_all
df_flavonoids['count'] = VK_class_all_count

### 2.2 Classification of flavonoids based on the annotation files of DNP

In [None]:
# Inclusive (low, high, subclass) ranges for the numeric part of a VK code
# (the digits after the 'VK' prefix). Ranges are checked in order and the
# first match wins; a number that falls in none of them gets no label.
VK_SUBCLASS_RANGES = [
    (10, 95, 'Anthocyanidins'),
    (1500, 1500, 'Anthocyanidins'),
    (1000, 1000, 'Flavans, Flavanols and Leucoanthocyanidins'),
    (1100, 1100, 'Flavans, Flavanols and Leucoanthocyanidins'),
    (1200, 1200, 'Flavans, Flavanols and Leucoanthocyanidins'),
    (1250, 1250, 'Flavans, Flavanols and Leucoanthocyanidins'),
    (2000, 2000, 'Biflavonoids and polyflavonoids'),
    (3000, 3100, 'Isoflavonoids'),
    (3200, 3300, 'Rotenoid flavonoids'),
    (3400, 3550, 'Pterocarpans'),
    (3600, 3700, 'Isoflavans'),
    (3750, 3750, 'Coumestan flavonoids'),
    (4000, 4000, 'Neoflavonoids'),
    (5000, 5290, 'Flavones and Flavonols'),
    (6010, 6095, 'Chalcones and dihydrochalcones'),
    (6200, 6200, 'Chalcones and dihydrochalcones'),
    (6100, 6100, 'Aurone flavonoids'),
    (6300, 6390, 'Flavanones'),
    (6410, 6490, 'Dihydroflavonols'),
    (1300, 1300, 'Others'),
    (3720, 3720, 'Others'),
    (3770, 3770, 'Others'),
    (3800, 3800, 'Others'),
    (3820, 3820, 'Others'),
    (6500, 7000, 'Others'),
    (8300, 8300, 'Others'),
    (3850, 3850, 'Others'),
]


def _vk_subclass(code):
    """Return the subclass name for a code such as 'VK5000'.

    The two-character prefix is dropped and the remainder is parsed as an
    integer (raises ValueError if it is not numeric). Returns None when the
    number is not covered by any range in VK_SUBCLASS_RANGES.
    """
    number = int(code[2:])
    for low, high, name in VK_SUBCLASS_RANGES:
        if low <= number <= high:
            return name
    return None


# One list of subclass names per flavonoid row, in the order of its VK codes.
# Codes without a matching range contribute nothing to the list.
VK_name_all = []
for vk_codes in df_flavonoids['class_code']:
    subclass_names = (_vk_subclass(code) for code in vk_codes)
    VK_name_all.append([name for name in subclass_names if name is not None])
df_flavonoids['Classify'] = VK_name_all

# Export table: drop the helper columns and flatten each list of names into a
# single space-separated string. The whole column is replaced in one
# assignment; element-wise `df1['Classify'][i] = ...` is chained assignment,
# which warns and does not update df1 when pandas Copy-on-Write is active.
df1 = df_flavonoids.drop(columns=['class_code', 'count'])
df1['Classify'] = df1['Classify'].map(' '.join)
df1.to_csv('DNP_Flavonoids.csv', index=False)

### 2.3 Calculate the glycosylation ratio

In [None]:
# Subclass labels to report, one output row per label.
classify = ['Anthocyanidins', 'Flavans, Flavanols and Leucoanthocyanidins',
            'Biflavonoids and polyflavonoids', 'Isoflavonoids',
            'Rotenoid flavonoids', 'Pterocarpans', 'Isoflavans',
            'Coumestan flavonoids', 'Neoflavonoids', 'Flavones and Flavonols',
            'Chalcones and dihydrochalcones', 'Aurone flavonoids', 'Flavanones',
            'Dihydroflavonols', 'Others']

data = []
for category in classify:
    # A row belongs to a category when its 'Classify' string contains the
    # label; regex=False matches the label literally rather than as a pattern.
    in_category = df1['Classify'].str.contains(category, regex=False)
    n_all = int(in_category.sum())
    n_sugar = int((in_category & (df1['hadOrHasSugars'] == True)).sum())
    # Guard against a category with no members, which would otherwise raise
    # ZeroDivisionError; such a category is reported as 0.0 %.
    glycosylation_ratio = round(n_sugar / n_all * 100, 2) if n_all else 0.0
    data.append((category, n_all, n_sugar, glycosylation_ratio))

# Summary table: total members, glycosylated members and percentage per category.
data = pd.DataFrame(data, columns=['Category', 'All', 'Sugar', 'Percentages(%)'])
data