In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('identified_peptides.csv')
df = df.dropna(subset=["canonical SMILES", "Class taste"])

In [2]:
def smiles_to_morgan(smiles, radius=2, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)

Bitterness vs Non-Bitterness

In [None]:
df_bitter = df.dropna(subset=["is_bitter"])
X_bitter = df_bitter["canonical SMILES"].apply(smiles_to_morgan).dropna()
df_bitter = df_bitter.loc[X_bitter.index]
X_bitter = np.array([np.array(fp) for fp in X_bitter])
y_bitter = df_bitter["is_bitter"].astype(int)  # 1 - bitter, 0 - sweet

X_train, X_test, y_train, y_test = train_test_split(X_bitter, y_bitter, test_size=0.2, random_state=42)

clf_bitter = RandomForestClassifier(n_estimators=250, random_state=42)
clf_bitter.fit(X_train, y_train)

y_pred = clf_bitter.predict(X_test)
print("Classification Report (Bitter vs Non-Bitter):")
print(classification_report(y_test, y_pred))

Classification Report (Bitter vs Non-Bitter):
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       117
           1       0.89      0.65      0.75        37

    accuracy                           0.90       154
   macro avg       0.89      0.81      0.84       154
weighted avg       0.90      0.90      0.89       154



Bitterness vs Sweetness

In [None]:
df_bitter_sweet = df.dropna(subset=["is_bitter_vs_sweet"])
X_bs = df_bitter_sweet["canonical SMILES"].apply(smiles_to_morgan).dropna()
df_bitter_sweet = df_bitter_sweet.loc[X_bs.index]
X_bs = np.array([np.array(fp) for fp in X_bs])
y_bs = df_bitter_sweet["is_bitter_vs_sweet"].astype(int)  # True/False to 1/0 1-bitter, 0 -sweet

X_train_bs, X_test_bs, y_train_bs, y_test_bs = train_test_split(X_bs, y_bs, test_size=0.2, random_state=42)

clf_bs = RandomForestClassifier(n_estimators=100, random_state=42)
clf_bs.fit(X_train_bs, y_train_bs)

y_pred_bs = clf_bs.predict(X_test_bs)
print("Classification Report (Bitter vs Sweet):")
print(classification_report(y_test_bs, y_pred_bs))

Classification Report (Bitter vs Sweet):
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        79
           1       1.00      0.93      0.96        41

    accuracy                           0.97       120
   macro avg       0.98      0.96      0.97       120
weighted avg       0.98      0.97      0.97       120

