In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load the peptide table. Later cells read its 'sequence', 'is_bitter' and
# 'is_bitter_vs_sweet' columns.
df = pd.read_csv("identified_peptides_with_sequences.csv")
# One-letter residue codes, in the order used for the composition features
# (compute_aac emits one entry per letter in this list).
aa_list = list("ACDEFGHIKLMNPQRSTVWY")

In [17]:
def compute_aac(sequence, alphabet=None):
    """Return the amino-acid composition of a peptide sequence.

    Parameters
    ----------
    sequence : str, None or NaN
        Peptide sequence in one-letter codes. Missing values (``None`` or a
        float NaN, which is what pandas produces for blank CSV cells) are
        treated like an empty sequence.
    alphabet : iterable of str, optional
        Residue letters to report. Defaults to the module-level ``aa_list``.

    Returns
    -------
    dict
        Maps each letter of ``alphabet`` to ``count(letter) / len(sequence)``.
        Every value is ``0.0`` for an empty or missing sequence. Characters
        outside ``alphabet`` still count toward the length, so the fractions
        need not sum to 1.
    """
    if alphabet is None:
        # Resolved at call time so the notebook-level list is always used.
        alphabet = aa_list

    # A blank CSV cell arrives as float NaN, not ""; without this guard
    # len() raises TypeError and the empty-sequence branch is never reached.
    if sequence is None or (isinstance(sequence, float) and sequence != sequence):
        sequence = ""

    length = len(sequence)
    if length == 0:
        return {aa: 0.0 for aa in alphabet}
    return {aa: sequence.count(aa) / length for aa in alphabet}

In [18]:
# Amino-acid-composition feature table: one row per peptide (same index as
# `df`), one column per residue letter returned by compute_aac.
# The frame is built once from the list of per-row dicts; `.apply(pd.Series)`
# would create a Series object for every row, which is very slow on large tables.
aac_df = pd.DataFrame(df['sequence'].apply(compute_aac).tolist(), index=df.index)
X_full = aac_df
# NOTE(review): the scaler is fit on every row before any train/test split, so
# held-out rows contribute to the mean/variance used for scaling. Consider
# fitting it on the training fold only (e.g. inside a sklearn Pipeline).
X_scaled = StandardScaler().fit_transform(X_full)

In [22]:
# Classifier A: bitter vs non-bitter.
# Keep only the rows whose 'is_bitter' value is True or False; this drops
# rows with a missing label.
mask_a = df['is_bitter'].isin([True, False])
df_a = df[mask_a].copy()
y_a = df_a['is_bitter'].astype(int)
# X_scaled is a NumPy array, so it has to be indexed by position. Indexing it
# with df_a.index (labels) is only correct while df has a default 0..n-1
# index; the boolean mask selects the same rows regardless of the index.
X_a = X_scaled[mask_a.to_numpy()]

# Stratified 80/20 split with a fixed seed so the report is reproducible.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_a, y_a, test_size=0.2, random_state=42, stratify=y_a
)

clf_a = RandomForestClassifier(n_estimators=100, random_state=42)
clf_a.fit(X_train_a, y_train_a)
y_pred_a = clf_a.predict(X_test_a)

print("=== Classifier A: Bitter vs Non-Bitter ===")
# target_names follow the sorted label order: 0 -> non-bitter, 1 -> bitter.
print(classification_report(
    y_test_a, y_pred_a, target_names=["non-bitter", "bitter"], digits=2
))

=== Classifier A: Bitter vs Non-Bitter ===
              precision    recall  f1-score   support

  non-bitter       0.91      0.94      0.92       116
      bitter       0.79      0.71      0.75        38

    accuracy                           0.88       154
   macro avg       0.85      0.83      0.84       154
weighted avg       0.88      0.88      0.88       154



In [None]:
# Classifier B: bitter vs sweet.
# Keep only the rows whose 'is_bitter_vs_sweet' value is True or False; this
# drops rows with a missing label.
mask_b = df['is_bitter_vs_sweet'].isin([True, False])
df_b = df[mask_b].copy()
y_b = df_b['is_bitter_vs_sweet'].astype(int)
# X_scaled is a NumPy array, so it has to be indexed by position. Indexing it
# with df_b.index (labels) is only correct while df has a default 0..n-1
# index; the boolean mask selects the same rows regardless of the index.
X_b = X_scaled[mask_b.to_numpy()]

# Stratified 80/20 split with a fixed seed so the report is reproducible.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.2, random_state=42, stratify=y_b
)

clf_b = RandomForestClassifier(n_estimators=100, random_state=42)
clf_b.fit(X_train_b, y_train_b)
y_pred_b = clf_b.predict(X_test_b)

# Header restored so the cell's output matches the saved report and mirrors
# the Classifier A cell.
print("=== Classifier B: Bitter vs Sweet (Only) ===")
# target_names follow the sorted label order: 0 -> sweet, 1 -> bitter.
print(classification_report(
    y_test_b, y_pred_b, target_names=["sweet", "bitter"], digits=2
))


=== Classifier B: Bitter vs Sweet (Only) ===
              precision    recall  f1-score   support

       sweet       0.92      0.96      0.94        82
      bitter       0.91      0.82      0.86        38

    accuracy                           0.92       120
   macro avg       0.92      0.89      0.90       120
weighted avg       0.92      0.92      0.92       120

