In [2]:
# =========================================
# TRAINING FINAL MODEL - RANDOM FOREST
# CHILD NUTRITIONAL STATUS
# =========================================

import os
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# -----------------------------------------
# 1. Load the dataset
# -----------------------------------------
DATA_PATH = "malnutrition_data.csv"  # adjust to the location of the CSV
df = pd.read_csv(DATA_PATH)

# -----------------------------------------
# 2. Separate the target column from the features
# -----------------------------------------
TARGET = "nutrition_status"
y = df[TARGET]
X = df.drop(columns=TARGET)

# -----------------------------------------
# 3. Encode the target labels
# -----------------------------------------
# The fitted encoder is kept (and saved in step 10) so that predictions
# can be mapped back to the original label values.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# -----------------------------------------
# 4. Numeric feature columns fed to the model
# -----------------------------------------
num_cols = ["age_months", "weight_kg", "height_cm", "muac_cm", "bmi"]

# -----------------------------------------
# 5. Preprocessing: median imputation, then standardisation
# -----------------------------------------
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipe = Pipeline(numeric_steps)

# Only the columns in num_cols are routed through the transformer; with
# ColumnTransformer's default `remainder` setting every other column of X
# is dropped.
preprocessor = ColumnTransformer([("num", numeric_pipe, num_cols)])

# -----------------------------------------
# 6. Full pipeline: preprocessing + Random Forest
# -----------------------------------------
rf_classifier = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=300,
    random_state=42,
)
rf_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", rf_classifier),
])

# -----------------------------------------
# 7. Train/test split (80/20, stratified on the encoded target)
# -----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# -----------------------------------------
# 8. Fit the pipeline on the training split
# -----------------------------------------
rf_pipeline.fit(X_train, y_train)

# -----------------------------------------
# 9. Evaluate on the held-out test split
# -----------------------------------------
y_pred = rf_pipeline.predict(X_test)

print("=== RANDOM FOREST FINAL MODEL ===")

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

print("Classification Report:")
# target_names makes the report show the original label values instead of
# the integer codes produced by the encoder.
report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)
print(report)

# -----------------------------------------
# 10. Persist the fitted pipeline and the label encoder
# -----------------------------------------
os.makedirs("model", exist_ok=True)

artifacts = (
    (rf_pipeline, "model/pipeline_status_gizi.joblib"),
    (label_encoder, "model/label_encoder.joblib"),
)
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("\nModel Random Forest berhasil disimpan.")


=== RANDOM FOREST FINAL MODEL ===
Accuracy: 0.953
Confusion Matrix:
[[195  16   9]
 [  5 705   0]
 [ 17   0  53]]
Classification Report:
              precision    recall  f1-score   support

    moderate       0.90      0.89      0.89       220
      normal       0.98      0.99      0.99       710
      severe       0.85      0.76      0.80        70

    accuracy                           0.95      1000
   macro avg       0.91      0.88      0.89      1000
weighted avg       0.95      0.95      0.95      1000


Model Random Forest berhasil disimpan.
