In [1]:
# =========================================
# TRAINING MODEL KLASIFIKASI STATUS GIZI
# Output:
# - pipeline_status_gizi.joblib (preprocess + model)
# - label_encoder.joblib (mapping label)
# =========================================

import os

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# -----------------------------------------
# 1) Load data
# -----------------------------------------
DATA_PATH = "malnutrition_data.csv"  # adjust to where the dataset lives
df = pd.read_csv(DATA_PATH)

# -----------------------------------------
# 2) Separate features and target
# -----------------------------------------
TARGET_COL = "nutrition_status"

# Rows without a label cannot be used for supervised training, and a missing
# label would otherwise reach LabelEncoder as NaN, which is not a real class.
# Such rows are excluded here; `df` itself is left untouched.
labelled = df.dropna(subset=[TARGET_COL])
n_unlabelled = len(df) - len(labelled)
if n_unlabelled:
    print(f"Dropped {n_unlabelled} row(s) with missing '{TARGET_COL}'.")

X = labelled.drop(columns=[TARGET_COL])
y = labelled[TARGET_COL]

# -----------------------------------------
# 3) Encode the target labels
# -----------------------------------------
# The fitted encoder is kept so that integer predictions can be mapped back
# to the original labels through label_encoder.classes_.
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

# -----------------------------------------
# 4) Numeric features
# -----------------------------------------
numerical_cols = [
    "age_months",
    "weight_kg",
    "height_cm",
    "muac_cm",
    "bmi",
]

# -----------------------------------------
# 5) Preprocessor (imputation + scaling)
# -----------------------------------------
# Missing values are filled with the median strategy first, then the
# columns are passed through StandardScaler.
median_imputer = SimpleImputer(strategy="median")
standard_scaler = StandardScaler()
numeric_pipe = Pipeline(
    steps=[("imputer", median_imputer), ("scaler", standard_scaler)]
)

# Only the columns listed in numerical_cols are transformed and passed on;
# every other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_pipe, numerical_cols)],
    remainder="drop",
)

# -----------------------------------------
# 6) Split the data
# -----------------------------------------
# 20% of the rows are held out for testing. The split is stratified on the
# encoded labels and uses a fixed seed so it can be reproduced.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_enc}
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)

# -----------------------------------------
# 7) Two candidate models to compare
# -----------------------------------------
# A Pipeline keeps the step objects it is given without copying them, so
# handing `preprocessor` itself to both pipelines would make them share one
# ColumnTransformer: fitting one pipeline would refit the transformer used by
# the other. Each pipeline therefore gets its own unfitted copy via clone().
pipe_logreg = Pipeline(steps=[
    ("prep", clone(preprocessor)),
    ("clf", LogisticRegression(max_iter=2000)),
])

pipe_rf = Pipeline(steps=[
    ("prep", clone(preprocessor)),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=42)),
])

# -----------------------------------------
# 8-9) Train + evaluate both candidates
# -----------------------------------------
def _fit_and_report(title, pipeline):
    """Fit *pipeline* on the training split and print its test-split metrics.

    Prints the accuracy, the confusion matrix, and the classification report
    (with the original label names) under a ``=== title ===`` header.

    Args:
        title: Human-readable model name used in the printed header.
        pipeline: Unfitted pipeline to train; it is fitted in place.

    Returns:
        A ``(predictions, accuracy)`` tuple for the test split.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(f"\n=== {title} ===")
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(
        y_test, predictions, target_names=label_encoder.classes_
    ))
    return predictions, accuracy


# Both candidates go through exactly the same fit / predict / report steps.
pred_logreg, acc_logreg = _fit_and_report("Logistic Regression", pipe_logreg)
pred_rf, acc_rf = _fit_and_report("Random Forest", pipe_rf)

# -----------------------------------------
# 10) Pick the best model
# -----------------------------------------
# Each candidate is (accuracy, name, pipeline). Logistic Regression is listed
# first and max() returns the first of several equal maxima, so Random Forest
# is chosen only when its accuracy is strictly higher.
# NOTE(review): the accuracies compared here were measured on the test split,
# so the winning score is not an independent estimate - consider selecting
# with cross-validation on the training split instead.
candidates = [
    (acc_logreg, "LogisticRegression", pipe_logreg),
    (acc_rf, "RandomForest", pipe_rf),
]
best_acc, best_name, best_pipeline = max(candidates, key=lambda entry: entry[0])

print(f"\nModel terbaik: {best_name} | Accuracy: {best_acc}")




=== Logistic Regression ===
Accuracy: 0.946
Confusion Matrix:
 [[187  23  10]
 [  8 702   0]
 [ 13   0  57]]
Classification Report:
               precision    recall  f1-score   support

    moderate       0.90      0.85      0.87       220
      normal       0.97      0.99      0.98       710
      severe       0.85      0.81      0.83        70

    accuracy                           0.95      1000
   macro avg       0.91      0.88      0.89      1000
weighted avg       0.94      0.95      0.95      1000


=== Random Forest ===
Accuracy: 0.951
Confusion Matrix:
 [[193  15  12]
 [  7 703   0]
 [ 15   0  55]]
Classification Report:
               precision    recall  f1-score   support

    moderate       0.90      0.88      0.89       220
      normal       0.98      0.99      0.98       710
      severe       0.82      0.79      0.80        70

    accuracy                           0.95      1000
   macro avg       0.90      0.88      0.89      1000
weighted avg       0.95      0.