In [5]:

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [6]:
def get_features(smiles):
    """Compute molecular descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict or None
        ``{"MW": Descriptors.MolWt(mol)}`` for a parseable molecule, or
        ``None`` when ``smiles`` is not a string (e.g. a missing value read
        from a CSV) or when RDKit cannot parse it.
    """
    # Missing cells in a CSV arrive as float NaN / None. Passing those to
    # MolFromSmiles raises instead of returning None, which would abort a
    # whole ``Series.apply``; treat them like any other unusable entry.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        return None

    return {"MW": Descriptors.MolWt(mol)}

In [7]:
def is_peptide(smiles):
    """Heuristically decide whether a SMILES string describes a peptide.

    A molecule is reported as a peptide when it contains at least one amide
    linkage (``C(=O)N``) and at least two matches of the generic amino-acid
    backbone pattern ``[NX3][CX4H][CX3](=O)``.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    bool
        ``True`` if both substructure criteria are met. ``False`` otherwise,
        including when ``smiles`` is not a string or cannot be parsed.
    """
    # Non-string input (NaN / None from missing data) cannot be a peptide.
    if not isinstance(smiles, str):
        return False

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: without this guard the substructure call below
        # would fail with AttributeError on None.
        return False

    amide_linkage = Chem.MolFromSmarts("C(=O)N")
    if not mol.HasSubstructMatch(amide_linkage):
        return False

    # NOTE(review): "[CX4H]" appears to require exactly one hydrogen on the
    # alpha carbon, so glycine-type residues (CH2) would not be counted --
    # confirm this exclusion is intended.
    backbone_pattern = Chem.MolFromSmarts("[NX3][CX4H][CX3](=O)")  # Generic AA
    aa_matches = mol.GetSubstructMatches(backbone_pattern)
    return len(aa_matches) >= 2

In [8]:
# Load the raw table; it must provide a "smiles" column.
chemtaste_df = pd.read_csv(r"chemtastes_families.csv")

# Step 1: one descriptor dict (or None) per SMILES string.
feature_dicts = chemtaste_df["smiles"].apply(get_features)
# Step 2: expand those dicts into descriptor columns, aligned on the row index.
chemtatste_descriptors = feature_dicts.apply(pd.Series)
# Step 3: attach the descriptor columns and drop every row that has a missing
# value in any column.
chemtaste_df = chemtaste_df.join(chemtatste_descriptors).dropna()

chemtaste_df.head()

Unnamed: 0,smiles,taste,superclass,class,subclass,MW
0,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweetness,Organoheterocyclic compounds,Benzopyrans,1-benzopyrans,302.282
1,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweetness,Lipids and lipid-like molecules,Prenol lipids,Sesquiterpenoids,252.354
2,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweetness,Phenylpropanoids and polyketides,Flavonoids,Flavans,346.291
3,COc1c(cc2c(c1O)C(=O)C(OC(C)=O)C(O2)c1ccc(c(c1)...,Sweetness,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,376.317
4,COc1ccc(cc1O)C1Cc2cccc(c2C(=O)O1)OC1OC(CO)C(O)...,Sweetness,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates,448.424


In [9]:
# Order molecules from the largest to the smallest "MW" value.
chemtaste_mw_sorted = chemtaste_df.sort_values("MW", ascending=False)

# Keep only rows whose MW does not exceed the 500 cutoff. Despite its name,
# this frame is just the MW-filtered set at this point.
within_mw_cutoff = chemtaste_mw_sorted["MW"] <= 500
chemtaste_peptides = chemtaste_mw_sorted.loc[within_mw_cutoff]

print(chemtaste_peptides.shape[0])
chemtaste_peptides

2932


Unnamed: 0,smiles,taste,superclass,class,subclass,MW
1462,CCN1CC2(COC)C(O)CC(OC)C34C5CC6(O)C(O)C5C(O)(C(...,Bitterness,Lipids and lipid-like molecules,Prenol lipids,Diterpenoids,499.601
1703,CC1C2C(CC3(C)C4CC=C5C(C=C(O)C(=O)C5(C)C)C4(C)C...,Bitterness,Lipids and lipid-like molecules,Steroids and steroid derivatives,Oxosteroids,498.660
925,COC(=O)C(CSC(C)(C)C)NC(=O)C(CC(O)=O)NCCC(C)(C)...,Sweetness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",498.642
2564,CC1(O)CC23CCC4C(C)(CCCC4(C)C(O)=O)C3CCC1(C2)OC...,Bitterness,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides,498.613
3207,CC12CCCC(C)(C2CCC23CC(CCC12)C(O)(CO)C3)C(=O)OC...,Tastelessness,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides,498.613
...,...,...,...,...,...,...
3014,[NH4+].[Cl-],Saltiness,Homogeneous non-metal compounds,Other non-metal halides,Unknown,53.492
3332,CCO,Non-sweetness,Organic oxygen compounds,Organooxygen compounds,Alcohols and polyols,46.069
2980,OC=O,Sourness,Organic acids and derivatives,Carboxylic acids and derivatives,Carboxylic acids,46.025
2983,Cl,Sourness,Homogeneous non-metal compounds,Halogen organides,Halogen hydrides,36.461


In [10]:
# Boolean mask: True where is_peptide() accepts the row's SMILES string.
check_peptide = chemtaste_peptides["smiles"].apply(is_peptide)

# Retain only the rows flagged by the mask.
chemtaste_peptides_true = chemtaste_peptides.loc[check_peptide]

chemtaste_peptides_true

Unnamed: 0,smiles,taste,superclass,class,subclass,MW
925,COC(=O)C(CSC(C)(C)C)NC(=O)C(CC(O)=O)NCCC(C)(C)...,Sweetness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",498.642
923,COC(=O)C(CC1CCCCC1)NC(=O)C(CC(O)=O)NCCC(C)(C)c...,Sweetness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",492.613
2865,CCC(C)C(N)C(=O)NC(CC(N)=O)C(=O)NC(CCC(O)=O)C(=...,Umaminess,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",487.554
230,COC(=O)C(Cc1ccccc1)NC(=O)C(CC(O)=O)NC(=S)Nc1cc...,Sweetness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",487.534
2459,NC(CCCN=C(N)N)C(=O)NC(CCCN=C(N)N)C(=O)NC(CCCN=...,Bitterness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",486.582
...,...,...,...,...,...,...
2228,CC(NC(=O)C1CCCN1)C(O)=O,Bitterness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",186.211
1635,CCC(C)C1NC(=O)C(C)NC1=O,Bitterness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",184.239
1636,CC(C)CC1NC(=O)C(C)NC1=O,Bitterness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",184.239
1640,CC(C)C1NC(=O)C(C)NC1=O,Bitterness,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",170.212


In [11]:
# Non-peptides are every row of chemtaste_df that was not flagged as a peptide.
# chemtaste_peptides_true is derived from chemtaste_df by sorting and boolean
# filtering only, so its index labels still identify the original rows and an
# index-based anti-join is exact. The previous column-wise
# DataFrame.isin(dict).all(axis=1) test checked each column independently
# rather than whole rows, so it was only correct while some column happened to
# be unique per row.
# NOTE(review): rows with MW > 500 were never passed to is_peptide, so they all
# land in this set -- confirm that is intended.
is_flagged_peptide = chemtaste_df.index.isin(chemtaste_peptides_true.index)
chemtaste_non_peptides = chemtaste_df[~is_flagged_peptide]
chemtaste_non_peptides


Unnamed: 0,smiles,taste,superclass,class,subclass,MW
0,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweetness,Organoheterocyclic compounds,Benzopyrans,1-benzopyrans,302.282
1,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweetness,Lipids and lipid-like molecules,Prenol lipids,Sesquiterpenoids,252.354
2,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweetness,Phenylpropanoids and polyketides,Flavonoids,Flavans,346.291
3,COc1c(cc2c(c1O)C(=O)C(OC(C)=O)C(O2)c1ccc(c(c1)...,Sweetness,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,376.317
4,COc1ccc(cc1O)C1Cc2cccc(c2C(=O)O1)OC1OC(CO)C(O)...,Sweetness,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates,448.424
...,...,...,...,...,...,...
3553,CC(CCC(O)C(C)(C)O)C1CCC2(C)C3CC=C4C(CCC(O)C4(C...,Non-bitterness,Lipids and lipid-like molecules,Steroids and steroid derivatives,Cucurbitacins,474.726
3554,OCC(O)C(O)C(=O)CO,Non-bitterness,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates,150.130
3555,CC1OC(C)C(OS(O)(=O)=O)C(O)C1O,Non-bitterness,Organoheterocyclic compounds,Oxanes,Unknown,242.249
3556,CC(=O)OC(C)(C)C1Cc2cc3c(cc2O1)OC(=O)C(=C3)C(C)...,Non-bitterness,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,356.418


In [12]:
# chemtaste_non_peptides.head()

In [13]:
# Write the non-peptide subset to disk; index=False leaves the DataFrame index
# out of the file.
chemtaste_non_peptides.to_csv(path_or_buf='ChemTaste_nonpeptides.csv', index=False)