## Glycosylation ratio of each subclass of tannins

In [None]:
from rdkit import Chem
import numpy as np
import pandas as pd

### 1. Calculate the glycosylation ratio of tannins in DNP

In [None]:
# Load the DNP structure/code table. The steps below read its 'codes' column
# (a string of class codes) and its 'hadOrHasSugars' column.
df = pd.read_csv('Structure_Codes.csv')

# Two-letter class prefixes to keep. A row is retained when its 'codes' string
# contains at least one of them.
V_CLASS_PREFIXES = ['VA', 'VC', 'VE', 'VF', 'VG', 'VH', 'VI', 'VK', 'VM',
                    'VO', 'VQ', 'VS', 'VT', 'VV', 'VX', 'VY', 'VZ']

# One regex alternation replaces seventeen OR-ed str.contains() calls.
# na=False treats a missing 'codes' value as "no match", which is the result
# the OR-ed masks produced for such rows.
df = df[df['codes'].str.contains('|'.join(V_CLASS_PREFIXES), na=False)]

# Tannins are the rows carrying a 'VM' code. Take an explicit copy: later cells
# add columns to df_tannins, and assigning into a filtered selection of df
# triggers pandas' SettingWithCopyWarning.
df_tannins = df[df['codes'].str.contains('VM')].copy()
df_tannins.reset_index(drop=True, inplace=True)

# Fraction of tannins flagged as glycosylated (shown as the cell output).
len(df_tannins[df_tannins['hadOrHasSugars'] == True]) / len(df_tannins)

### 2. Calculate the glycosylation ratio of each subclass of tannins

### 2.1 Extract the VM class codes of each tannin

In [None]:
# For every tannin, split its 'codes' string on ', ' and keep only the entries
# that start with 'VM'. The column values are iterated directly rather than
# looked up by label (df_tannins['codes'][k]), so the result does not depend on
# the frame having a 0..n-1 index.
VM_class_all = [
    [code for code in codes.split(', ') if code.startswith('VM')]
    for codes in df_tannins['codes']
]
# Number of 'VM' codes found for each tannin.
VM_class_all_count = [len(vm_codes) for vm_codes in VM_class_all]

# List assignment is positional, so each row receives its own code list/count.
df_tannins['class_code'] = VM_class_all
df_tannins['count'] = VM_class_all_count

### 2.2 Classification of tannins based on the annotation files of DNP

In [None]:
# Numeric part of a 'VM####' code -> tannin subclass name.
VM_SUBCLASS_NAMES = {
    6000: 'Simple gallate ester tannins',
    6050: 'Galloylgalloyl tannins',
    6100: 'Hexahydroxydiphenoyl ester tannins',
    6200: 'Dehydrohexahydroxydiphenoyl ester tannins',
    6300: 'Elaeocarpusinoyl ester tannins',
    6500: 'Chebuloyl tannins',
    6600: 'Brevifoloyl ester tannins',
    6700: 'Dehydrodigalloyl ester tannins',
    6800: 'Valoneoyl tannins',
    6850: 'Valoneoyl dilactone tannins',
    6900: 'Sanguisorbyl ester tannins',
    7000: 'Flavogallonoyl ester tannins',
    7050: 'Flavogallonoyl dilactone tannins',
    7100: 'Tetrahydroxybenzofuran dicarboxylate tannins',
    7200: 'Macaranoyl ester tannins',
    7300: 'Tergalloyl ester tannins',
    7350: 'Tergalloyl monolactone tannins',
    7400: 'Trilloyl ester tannins',
    7500: 'Euphorbinoyl ester tannins',
    7600: 'Gallagyl ester tannins',
    7650: 'Terchebuloyl ester tannins',
    7790: 'Miscellaneous ellagitannins',
}


def _vm_codes_to_names(vm_codes):
    """Translate a list of 'VM####' codes into subclass names.

    The text after the two-character prefix is parsed with int(), so a code
    whose suffix is not an integer raises ValueError. Codes whose number has
    no entry in VM_SUBCLASS_NAMES are skipped, so the returned list may be
    shorter than the input.
    """
    numbers = (int(code[2:]) for code in vm_codes)
    return [VM_SUBCLASS_NAMES[number] for number in numbers
            if number in VM_SUBCLASS_NAMES]


# One list of subclass names per tannin, in row order.
VM_name_all = [_vm_codes_to_names(vm_codes)
               for vm_codes in df_tannins['class_code']]
df_tannins['Classify'] = VM_name_all

# df1 is the exported table: the helper columns are dropped and each list of
# names is flattened to one space-separated string. The column is replaced in a
# single assignment; the element-wise form df1['Classify'][i] = ... is chained
# assignment, which warns on older pandas and does not update df1 when
# Copy-on-Write is enabled.
df1 = df_tannins.drop(columns=['class_code', 'count'])
df1['Classify'] = [' '.join(names) for names in VM_name_all]
df1.to_csv('DNP_Tannins.csv', index=False)

### 2.3 Calculate the glycosylation ratio

In [None]:
# Subclass names to report, in output order.
classify = [
    'Simple gallate ester tannins',
    'Galloylgalloyl tannins',
    'Hexahydroxydiphenoyl ester tannins',
    'Dehydrohexahydroxydiphenoyl ester tannins',
    'Elaeocarpusinoyl ester tannins',
    'Chebuloyl tannins',
    'Brevifoloyl ester tannins',
    'Dehydrodigalloyl ester tannins',
    'Valoneoyl tannins',
    'Valoneoyl dilactone tannins',
    'Sanguisorbyl ester tannins',
    'Flavogallonoyl ester tannins',
    'Flavogallonoyl dilactone tannins',
    'Tetrahydroxybenzofuran dicarboxylate tannins',
    'Macaranoyl ester tannins',
    'Tergalloyl ester tannins',
    'Tergalloyl monolactone tannins',
    'Trilloyl ester tannins',
    'Euphorbinoyl ester tannins',
    'Gallagyl ester tannins',
    'Terchebuloyl ester tannins',
    'Miscellaneous ellagitannins',
]

# The sugar flag does not depend on the category, so compute it once.
has_sugar = df1['hadOrHasSugars'] == True

data = []
for category in classify:
    # A compound belongs to a category when the name occurs in its 'Classify'
    # string; regex=False matches the name literally.
    in_category = df1['Classify'].str.contains(category, regex=False)
    n_all = len(df1[in_category])
    if n_all == 0:
        # No compound in this subclass: the ratio is undefined, so the
        # category is left out of the table. This replaces a bare
        # `except: pass`, which also hid every other error.
        continue
    n_sugar = len(df1[in_category & has_sugar])
    glycosylation_ratio = round(n_sugar / n_all * 100, 2)
    data.append((category, n_all, n_sugar, glycosylation_ratio))

data = pd.DataFrame(data, columns=['Category', 'All', 'Sugar', 'Percentages(%)'])
data