In [11]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdmolops

In [12]:
# Load the ChemTastesDB workbook. Cells holding "*" are read as missing values
# (on top of pandas' default NA markers), the last three rows of the sheet are
# dropped (presumably non-data footer rows -- TODO confirm against the workbook),
# and only the three columns used by the rest of the notebook are kept.
chemtastesdb_df = (
    pd.read_excel(
        "ChemTastesDB_database.xlsx",
        na_values=["*"],
        keep_default_na=True,
    )
    .iloc[:-3]
    .loc[:, ["Name", "canonical SMILES", "Class taste"]]
)

In [13]:
# Preview the first five rows of the trimmed table.
chemtastesdb_df.head(5)

Unnamed: 0,Name,canonical SMILES,Class taste
0,(-)-Haematoxylin,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweetness
1,(+)-4β-hydroxyhernandulcin,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweetness
2,(+)-Dihydroquercetin 3-acetate,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweetness
3,(+)-Dihydro-6-methoxy-luteolin 3-acetate,COc1c(cc2c(c1O)C(=O)C(OC(C)=O)C(O2)c1ccc(c(c1)...,Sweetness
4,(+)-Haematoxylin,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweetness


In [14]:
def count_peptide_bonds(mol):
    """Count matches of a peptide-backbone SMARTS pattern in an RDKit molecule.

    The pattern is ``N - C - C(=O) - N`` where:

    * both nitrogens have three connections and carry one or two hydrogens,
    * the middle carbon is aliphatic and carries one or two hydrogens,
    * the carbonyl carbon has three connections and a double-bonded oxygen.

    Because of the hydrogen-count constraints, fully substituted nitrogens and
    hydrogen-free alpha carbons are not matched.

    Args:
        mol: Molecule object exposing ``GetSubstructMatches`` (as returned by
            ``Chem.MolFromSmiles``), or ``None`` when parsing failed.

    Returns:
        int: Number of substructure matches reported by RDKit; ``0`` when
        ``mol`` is ``None``.
    """
    # A failed SMILES parse yields None; treat it as "no peptide bonds" instead
    # of failing with AttributeError on the method call below.
    if mol is None:
        return 0

    peptide_smarts = "[NX3;H1,H2][CH1,CH2][CX3](=O)[NX3;H1,H2]"
    patt = Chem.MolFromSmarts(peptide_smarts)
    return len(mol.GetSubstructMatches(patt))

In [15]:
def count_amide_bonds(smiles):
    """Count ``C(=O)N`` substructure matches in the molecule described by a SMILES string.

    The SMARTS used is the bare ``C(=O)N`` motif, so every carbonyl carbon bonded
    to an aliphatic nitrogen is counted, not only peptide-type amides.

    Args:
        smiles: SMILES string. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for missing cells) are accepted and counted as zero.

    Returns:
        int: Number of matches; ``0`` when the input is not a string or cannot
        be parsed into a molecule.
    """
    # Missing spreadsheet cells arrive as NaN (a float); MolFromSmiles would
    # raise on them, so handle them the same way as an unparsable string.
    if not isinstance(smiles, str):
        return 0

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0

    amide_bond = Chem.MolFromSmarts("C(=O)N")
    return len(mol.GetSubstructMatches(amide_bond))

def is_peptide(smiles, max_mol_wt=500):
    """Decide whether a SMILES string describes a small peptide.

    A molecule qualifies when it parses successfully, its molecular weight
    (``Descriptors.MolWt``) is strictly below ``max_mol_wt``, and
    ``count_peptide_bonds`` finds at least one backbone match.

    Args:
        smiles: SMILES string. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for missing cells) are treated as "not a peptide".
        max_mol_wt: Exclusive upper bound on the molecular weight. Defaults to
            500, the cutoff this function has always used.

    Returns:
        bool: ``True`` if all criteria are met, otherwise ``False``.
    """
    # The loader maps "*" cells to NaN, so a SMILES cell may be a float;
    # MolFromSmiles would raise on it and abort the whole column-wise apply.
    if not isinstance(smiles, str):
        return False

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    # Molecules at or above the weight cutoff are rejected.
    if Descriptors.MolWt(mol) >= max_mol_wt:
        return False

    return count_peptide_bonds(mol) >= 1

# Flag each compound using is_peptide on its SMILES string.
chemtastesdb_df['is_peptide'] = chemtastesdb_df['canonical SMILES'].apply(is_peptide)

# Independent copy of the flagged rows, so later column additions do not
# touch chemtastesdb_df.
peptides_df = chemtastesdb_df.loc[chemtastesdb_df['is_peptide']].copy()

# Preview the identified peptides (original three columns only).
peptides_df.loc[:, ['Name', 'canonical SMILES', 'Class taste']].head()

Unnamed: 0,Name,canonical SMILES,Class taste
19,"(S)-Aspartyl-(7,7-dimethylnorborn-2R-yl)-(S)-a...",COC(=O)C(CC1CC2CCC1C2)NC(=O)C(N)CC(O)=O,Sweetness
20,"(S)-Aspartyl-(7,7-dimethylnorborn-2S-yl)-(S)-a...",COC(=O)C(CC1CC2CCC1C2)NC(=O)C(N)CC(O)=O,Sweetness
22,[L-(αMe)Phe]2-aspartame,COC(=O)C(C)(Cc1ccccc1)NC(=O)C(N)CC(O)=O,Sweetness
188,2-Amino-2-{[1-oxo-1-(propan-2-yloxy)propan-2-y...,CC(C)OC(=O)C(C)NC(=O)C(N)C(O)=O,Sweetness
207,3-({[(4-cyanophenyl)amino]methyl}amino)-3-[(1-...,COC(=O)C(Cc1ccccc1)NC(=O)C(CC(O)=O)NCNc1ccc(cc...,Sweetness


In [16]:
# Write the identified peptides to disk without the index column.
# NOTE: the same file is written again further down, after the taste-label
# columns have been added to peptides_df.
peptides_df.to_csv(path_or_buf="identified_peptides.csv", index=False)

In [17]:
def get_taste_label(val):
    """Normalize a taste-class cell to a trimmed, capitalized label.

    Args:
        val: Raw cell value.

    Returns:
        The string with surrounding whitespace removed, its first character
        upper-cased and the rest lower-cased; ``None`` for any non-string
        input (e.g. NaN).
    """
    if not isinstance(val, str):
        return None
    trimmed = val.strip()
    return trimmed.capitalize()

# Store a cleaned-up taste label next to the raw 'Class taste' value.
peptides_df['normalized_taste'] = peptides_df['Class taste'].map(get_taste_label)

In [18]:
# True for rows labelled 'Bitterness', False for everything else (including missing labels).
peptides_df['is_bitter'] = peptides_df['normalized_taste'].eq('Bitterness')
def bitter_vs_sweet(val):
    """Map a normalized taste label to a bitter-versus-sweet flag.

    Args:
        val: Normalized taste label.

    Returns:
        ``True`` for ``'Bitterness'``, ``False`` for ``'Sweetness'``, and
        ``None`` for any other value.
    """
    if val == 'Sweetness':
        return False
    if val == 'Bitterness':
        return True
    return None
# Three-valued flag: True = bitter, False = sweet, None = any other taste class.
peptides_df['is_bitter_vs_sweet'] = peptides_df['normalized_taste'].apply(bitter_vs_sweet)

# Leave the frame as the cell's last expression so the notebook renders it as
# a table; print() produced a wrapped plain-text dump that split the columns
# across several chunks.
peptides_df.head()

                                                  Name  \
19   (S)-Aspartyl-(7,7-dimethylnorborn-2R-yl)-(S)-a...   
20   (S)-Aspartyl-(7,7-dimethylnorborn-2S-yl)-(S)-a...   
22                             [L-(αMe)Phe]2-aspartame   
188  2-Amino-2-{[1-oxo-1-(propan-2-yloxy)propan-2-y...   
207  3-({[(4-cyanophenyl)amino]methyl}amino)-3-[(1-...   

                                      canonical SMILES Class taste  \
19             COC(=O)C(CC1CC2CCC1C2)NC(=O)C(N)CC(O)=O   Sweetness   
20             COC(=O)C(CC1CC2CCC1C2)NC(=O)C(N)CC(O)=O   Sweetness   
22             COC(=O)C(C)(Cc1ccccc1)NC(=O)C(N)CC(O)=O   Sweetness   
188                    CC(C)OC(=O)C(C)NC(=O)C(N)C(O)=O   Sweetness   
207  COC(=O)C(Cc1ccccc1)NC(=O)C(CC(O)=O)NCNc1ccc(cc...   Sweetness   

     is_peptide normalized_taste  is_bitter is_bitter_vs_sweet  
19         True        Sweetness      False              False  
20         True        Sweetness      False              False  
22         True        Sweetness   

In [19]:
# Rewrite the peptide CSV so it also contains the taste-label columns
# added to peptides_df above; the index is not written.
peptides_df.to_csv(path_or_buf="identified_peptides.csv", index=False)

In [20]:
# Rows not flagged as peptides, as an independent copy of the main table.
non_peptides_df = chemtastesdb_df.loc[~chemtastesdb_df['is_peptide']].copy()

# Export only the original three columns; the helper 'is_peptide' flag and the index are left out.
non_peptides_df.loc[:, ['Name', 'canonical SMILES', 'Class taste']].to_csv("non_peptides.csv", index=False)