In [34]:
import pandas as pd

# Read the gut-health CSV from the working directory, then print a preview of
# the first rows followed by the frame's shape.
df = pd.read_csv("gut_health_synthetic_dataset.csv")
print(df.head(), df.shape, sep="\n")

   stool_type stool_color water_intake fiber_intake   food_type  stress_level  \
0           6      yellow          low         high       dairy             1   
1           3      yellow          low          low       spicy             0   
2           2       brown       medium         high  fiber_rich             2   
3           4       green       medium       medium        oily             2   
4           1       brown         high          low      normal             1   

   time_since_meal_hours                 symptoms        label  
0                      7                   cramps     Diarrhea  
1                      8                   nausea  Dehydration  
2                      6  abdominal_pain|bloating      Healthy  
3                      5          bloating|nausea  Indigestion  
4                      4           abdominal_pain      Healthy  
(10000, 9)


In [35]:
# Pull the target column out first; every remaining column is a predictor.
y = df["label"]
X = df.drop(columns="label")

In [36]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the pipe-delimited "symptoms" column.
# A missing or empty value becomes an empty list, i.e. an all-zero row.
# The list is kept in its own Series so no scratch columns are written into X.
symptoms_list = X["symptoms"].fillna("").apply(lambda x: x.split("|") if x else [])

mlb = MultiLabelBinarizer()
symptoms_encoded = mlb.fit_transform(symptoms_list)

# index=X.index is required: pd.concat(axis=1) aligns rows by index label, so
# a frame carrying a fresh 0..n-1 index would be misaligned (and padded with
# NaN) whenever X does not itself have the default RangeIndex.
symptoms_df = pd.DataFrame(
    symptoms_encoded,
    columns=[f"symptom_{s}" for s in mlb.classes_],
    index=X.index,
)

# Replace the raw text column with one indicator column per symptom.
X = pd.concat([X.drop(columns=["symptoms"]), symptoms_df], axis=1)

In [37]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the string-valued categorical predictors.
categorical_cols = [
    "stool_color",
    "water_intake",
    "fiber_intake",
    "food_type"
]

# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros at transform time instead of raising.
# NOTE(review): the encoder is fitted on the full frame, before the train/test
# split performed later in the notebook.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_cat = encoder.fit_transform(X[categorical_cols])

# index=X.index is required: pd.concat(axis=1) aligns rows by index label, so
# a frame carrying a fresh 0..n-1 index would be misaligned (and padded with
# NaN) whenever X does not itself have the default RangeIndex.
encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,
)

# Swap the raw categorical columns for their indicator columns.
X = pd.concat([X.drop(columns=categorical_cols), encoded_cat_df], axis=1)


In [38]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the target, then apply it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Show the class names; position i is the label encoded as integer i.
print(label_encoder.classes_)

['Constipation' 'Dehydration' 'Diarrhea' 'Food_Intolerance' 'Healthy'
 'Indigestion']


In [39]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split, stratified on the encoded label and seeded so the
# partition is reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)


In [40]:
from sklearn.model_selection import train_test_split

# This cell used to repeat the train/test split from the previous cell
# verbatim (same inputs, same seed, same result). The split is done once
# there; here we only verify the partition it produced.
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"
assert len(X_train) == len(y_train), "train features/labels length mismatch"
assert len(X_test) == len(y_test), "test features/labels length mismatch"
# No row may appear on both sides of the split.
assert X_train.index.intersection(X_test.index).empty, "train/test rows overlap"


In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters, gathered in one place for easy tuning.
rf_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    random_state=42,
    class_weight="balanced",
)
model = RandomForestClassifier(**rf_params)

# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,15
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [42]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out split: overall accuracy plus a per-class report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(report)


Accuracy: 0.859

Classification Report:

                  precision    recall  f1-score   support

    Constipation       0.94      1.00      0.97        64
     Dehydration       0.71      0.99      0.83       134
        Diarrhea       0.80      1.00      0.89       457
Food_Intolerance       0.76      1.00      0.87        13
         Healthy       0.98      0.78      0.87      1193
     Indigestion       0.58      0.90      0.70       139

        accuracy                           0.86      2000
       macro avg       0.80      0.94      0.85      2000
    weighted avg       0.89      0.86      0.86      2000



In [43]:
# Sanity check on a slice of the test set: rows with stool type 1 or 2 and low
# water intake are expected to be predicted mostly as constipation.
type_1_or_2 = X_test["stool_type"].isin([1, 2])
low_water = X_test["water_intake_low"] == 1
subset = X_test[type_1_or_2 & low_water]

# Predict on the slice and map the integer codes back to label names.
preds = model.predict(subset)
pred_labels = label_encoder.inverse_transform(preds)

# Share of each predicted label within the slice (displayed as cell output).
pd.Series(pred_labels).value_counts(normalize=True)


Healthy         0.632432
Constipation    0.367568
Name: proportion, dtype: float64

In [15]:
import joblib

# Persist the fitted classifier and the three fitted encoders to the working
# directory, one joblib file per object.
joblib.dump(model, "gut_model.pkl")
joblib.dump(encoder, "categorical_encoder.pkl")
joblib.dump(mlb, "symptom_encoder.pkl")
# Last expression of the cell, so its return value (the list of written
# filenames) is what the cell displays.
joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']

In [13]:
import numpy as np

# For each test row take the highest class probability, then report the
# fraction of rows where it reaches the 0.65 threshold.
probs = model.predict_proba(X_test)
max_probs = probs.max(axis=1)

confident_share = (max_probs >= 0.65).mean()
print("Confident predictions:", confident_share)


Confident predictions: 0.9385
