**Implementasi Machine Learning, Rule-Based System, dan Decision Support System untuk Klasifikasi Keseimbangan Nutrisi Bahan Makanan**

In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/content/nutrition.csv")

print("Total data asli:", len(df))
df.head()


Total data asli: 8789


Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [2]:
features = [
    'calories',
    'protein',
    'fat',
    'carbohydrate',
    'sugars',
    'fiber',
    'saturated_fat'
]

df = df[features].copy()

# Pastikan numerik
for col in features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Isi NaN dengan 0
df = df.fillna(0)

print("Data setelah preprocessing:", len(df))
df.describe()


Data setelah preprocessing: 8789


Unnamed: 0,calories,protein,fat,carbohydrate,sugars,fiber,saturated_fat
count,8789.0,8789.0,8789.0,8789.0,8789.0,8789.0,8789.0
mean,226.283878,0.0,0.0,0.0,0.0,0.0,0.0
std,169.862001,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,91.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,191.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,337.0,0.0,0.0,0.0,0.0,0.0,0.0
max,902.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
def generate_combinations(data, n_samples=5000):
    data_values = data.to_numpy(dtype=float)
    n_rows = data_values.shape[0]

    if n_rows < 3:
        raise ValueError("Data tidak cukup untuk kombinasi")

    combos = []

    for _ in range(n_samples):
        idx = np.random.choice(n_rows, 3, replace=False)
        items = data_values[idx]

        weights = np.random.dirichlet([1, 1, 1])
        combined = weights @ items

        combos.append(combined)

    return pd.DataFrame(combos, columns=data.columns)


In [4]:
combo_df = generate_combinations(df, n_samples=6000)

print("Hasil simulasi:", len(combo_df))
combo_df.head()


Hasil simulasi: 6000


Unnamed: 0,calories,protein,fat,carbohydrate,sugars,fiber,saturated_fat
0,195.459991,0.0,0.0,0.0,0.0,0.0,0.0
1,252.214956,0.0,0.0,0.0,0.0,0.0,0.0
2,462.536629,0.0,0.0,0.0,0.0,0.0,0.0
3,102.424486,0.0,0.0,0.0,0.0,0.0,0.0
4,403.171927,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
combo_df = combo_df.copy()

combo_df['protein_ratio'] = (combo_df['protein'] * 4) / (combo_df['calories'] + 1e-6)
combo_df['fat_ratio']     = (combo_df['fat'] * 9) / (combo_df['calories'] + 1e-6)
combo_df['carb_ratio']    = (combo_df['carbohydrate'] * 4) / (combo_df['calories'] + 1e-6)

combo_df['sugar_ratio']   = combo_df['sugars'] / (combo_df['carbohydrate'] + 1e-6)
combo_df['satfat_ratio']  = combo_df['saturated_fat'] / (combo_df['fat'] + 1e-6)

combo_df['imbalance_index'] = abs(
    combo_df['protein_ratio']
    - combo_df['fat_ratio']
    - combo_df['carb_ratio']
)

combo_df.head()


Unnamed: 0,calories,protein,fat,carbohydrate,sugars,fiber,saturated_fat,protein_ratio,fat_ratio,carb_ratio,sugar_ratio,satfat_ratio,imbalance_index
0,195.459991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,252.214956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,462.536629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,102.424486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,403.171927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
def label_nutrition(row):
    if (
        0.15 <= row['protein_ratio'] <= 0.30 and
        0.20 <= row['fat_ratio'] <= 0.35 and
        0.40 <= row['carb_ratio'] <= 0.65 and
        row['sugar_ratio'] < 0.30 and
        row['satfat_ratio'] < 0.35
    ):
        return 0   # Seimbang
    elif row['sugar_ratio'] > 0.45 or row['satfat_ratio'] > 0.45:
        return 2   # Risiko Tinggi
    else:
        return 1   # Tidak Seimbang


In [7]:
import numpy as np

# Komposisi gizi ideal (referensi WHO / AKG)
ideal = {
    'protein_ratio': 0.20,
    'fat_ratio': 0.30,
    'carb_ratio': 0.50
}

# Hitung jarak ke komposisi ideal
combo_df['distance'] = np.sqrt(
    (combo_df['protein_ratio'] - ideal['protein_ratio'])**2 +
    (combo_df['fat_ratio']     - ideal['fat_ratio'])**2 +
    (combo_df['carb_ratio']    - ideal['carb_ratio'])**2
)

# Bagi berdasarkan kuantil (data-driven)
q1 = combo_df['distance'].quantile(0.33)
q2 = combo_df['distance'].quantile(0.66)

def label_nutrition_distance(d):
    if d <= q1:
        return 0   # Seimbang
    elif d <= q2:
        return 1   # Tidak seimbang
    else:
        return 2   # Risiko tinggi

combo_df['label'] = combo_df['distance'].apply(label_nutrition_distance)

combo_df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,6000


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

features_ml = [
    'calories', 'protein', 'fat', 'carbohydrate',
    'fiber', 'sugars', 'saturated_fat',
    'protein_ratio', 'fat_ratio', 'carb_ratio',
    'sugar_ratio', 'satfat_ratio', 'imbalance_index'
]

X = combo_df[features_ml].values
y = combo_df['label'].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape :", X_test.shape)


X_train shape: (4800, 13)
X_test shape : (1200, 13)


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Safety check (biar ga NameError lagi)
assert 'X_train' in globals(), "X_train belum ada, jalankan CELL 8 dulu"

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=14,
    min_samples_split=5,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train, y_train)

print("Model training selesai")


Model training selesai


In [10]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)


Accuracy: 1.0


In [11]:
from sklearn.metrics import confusion_matrix
import pandas as pd

labels = [0, 1, 2]

cm = confusion_matrix(
    y_test,
    y_pred,
    labels=labels
)

cm_df = pd.DataFrame(
    cm,
    index=['Seimbang', 'Tidak Seimbang', 'Risiko Tinggi'],
    columns=['Seimbang', 'Tidak Seimbang', 'Risiko Tinggi']
)

cm_df


Unnamed: 0,Seimbang,Tidak Seimbang,Risiko Tinggi
Seimbang,1200,0,0
Tidak Seimbang,0,0,0
Risiko Tinggi,0,0,0


In [12]:
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    min_samples_leaf=10,
    random_state=42,
    class_weight='balanced_subsample'
)

model.fit(X_train, y_train)


In [13]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


CLASSIFICATION REPORT

In [14]:
import pandas as pd

print("Distribusi y_test:")
print(pd.Series(y_test).value_counts())

print("\nDistribusi y_pred:")
print(pd.Series(y_pred).value_counts())


Distribusi y_test:
0    1200
Name: count, dtype: int64

Distribusi y_pred:
0    1200
Name: count, dtype: int64


In [15]:
from sklearn.metrics import classification_report

print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1, 2],
    target_names=['Seimbang', 'Tidak Seimbang', 'Risiko Tinggi'],
    zero_division=0
))


                precision    recall  f1-score   support

      Seimbang       1.00      1.00      1.00      1200
Tidak Seimbang       0.00      0.00      0.00         0
 Risiko Tinggi       0.00      0.00      0.00         0

      accuracy                           1.00      1200
     macro avg       0.33      0.33      0.33      1200
  weighted avg       1.00      1.00      1.00      1200



CONFUSION MATRIX

In [16]:
from sklearn.metrics import confusion_matrix
import pandas as pd

labels = [0, 1, 2]

cm = confusion_matrix(
    y_test,
    y_pred,
    labels=labels
)

cm_df = pd.DataFrame(
    cm,
    index=['Seimbang', 'Tidak Seimbang', 'Risiko Tinggi'],
    columns=['Seimbang', 'Tidak Seimbang', 'Risiko Tinggi']
)

cm_df


Unnamed: 0,Seimbang,Tidak Seimbang,Risiko Tinggi
Seimbang,1200,0,0
Tidak Seimbang,0,0,0
Risiko Tinggi,0,0,0


AKURASI PER KELAS

In [17]:
import numpy as np

class_accuracy = {}

for label in labels:
    idx = (y_test == label)
    if idx.sum() > 0:
        class_accuracy[label] = (y_pred[idx] == y_test[idx]).mean()
    else:
        class_accuracy[label] = None

class_accuracy


{0: np.float64(1.0), 1: None, 2: None}

FEATURE IMPORTANCE (FINAL)

In [18]:
importance = pd.DataFrame({
    'Feature': features_ml,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

importance


Unnamed: 0,Feature,Importance
0,calories,0.0
1,protein,0.0
2,fat,0.0
3,carbohydrate,0.0
4,fiber,0.0
5,sugars,0.0
6,saturated_fat,0.0
7,protein_ratio,0.0
8,fat_ratio,0.0
9,carb_ratio,0.0


In [19]:
import joblib

joblib.dump(model, "model_gizi_random_forest.pkl")
joblib.dump(scaler, "scaler_gizi.pkl")


['scaler_gizi.pkl']

In [20]:

sample = combo_df[features_ml].sample(1, random_state=1)

sample_scaled = scaler.transform(sample)
pred = model.predict(sample_scaled)

label_map = {
    0: "Seimbang",
    1: "Tidak Seimbang",
    2: "Risiko Tinggi"
}

print("Prediksi menu:", label_map[pred[0]])


Prediksi menu: Seimbang




##**Pengembangan Sistem Pendukung Keputusan untuk Evaluasi Keseimbangan Nutrisi Makanan Berbasis Machine Learning**

**NAMA : AGIO PRIMA KARDANA**

**NIM : 23.11.5834**

Import Library

In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


Load Dataset

In [22]:
df = pd.read_csv("/content/nutrition.csv")

df.head()


Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


Cek Struktur Dataset

In [23]:
import re

def clean_numeric(value):
    if pd.isna(value):
        return np.nan

    # Ambil angka (boleh desimal)
    match = re.search(r"[-+]?\d*\.?\d+", str(value))
    return float(match.group()) if match else np.nan


Cleaning Data

In [24]:
nutrient_cols = [
    "calories",
    "protein",
    "fat",
    "carbohydrate",
    "fiber",
    "sugars"
]


for col in nutrient_cols:
    df[col] = df[col].apply(clean_numeric)


Rule-Based Labeling

In [25]:
def classify_nutrition(row):
    protein = row["protein"]
    fat = row["fat"]
    carb = row["carbohydrate"]

    total = protein + fat + carb
    if total == 0:
        return "Tidak Seimbang"

    protein_ratio = protein / total
    fat_ratio = fat / total
    carb_ratio = carb / total

    if protein_ratio >= 0.2 and fat_ratio <= 0.35 and carb_ratio <= 0.6:
        return "Seimbang"
    elif fat_ratio > 0.45 or carb_ratio > 0.7:
        return "Risiko Tinggi"
    else:
        return "Tidak Seimbang"


In [26]:
X = df[nutrient_cols]


Generate Label

In [27]:
df["label"] = df.apply(classify_nutrition, axis=1)
df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Risiko Tinggi,4429
Seimbang,2636
Tidak Seimbang,1724


Feature & Target

In [28]:
nutrition_cols = [
    col for col in df.columns
    if col.lower() in [
        "calories",
        "protein",
        "fat",
        "carbohydrate",
        "carbohydrates"
    ]
]

print("Kolom nutrisi dipakai:", nutrition_cols)

X = df[nutrition_cols]
y = df["label"]


Kolom nutrisi dipakai: ['calories', 'protein', 'carbohydrate', 'fat']


Train-Test Split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


Scaling

In [30]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Training Model ML

In [31]:
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train_scaled, y_train)


Evaluasi Model

In [32]:
y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.9618885096700797

Classification Report:

                precision    recall  f1-score   support

 Risiko Tinggi       0.97      0.98      0.97       886
      Seimbang       0.98      0.98      0.98       527
Tidak Seimbang       0.92      0.88      0.90       345

      accuracy                           0.96      1758
     macro avg       0.96      0.95      0.95      1758
  weighted avg       0.96      0.96      0.96      1758



kombinasi gabugan

In [33]:
def get_nutrition(df, ingredient_name):
    rows = df[
        df["name"]
        .astype(str)
        .str.lower()
        .str.contains(ingredient_name.lower(), regex=False)
    ]
    if rows.empty:
        raise ValueError(f"Bahan '{ingredient_name}' tidak ditemukan")
    return rows.iloc[0][nutrition_cols].astype(float)


def combine_ingredients(df, ingredients):
    total = np.zeros(len(nutrition_cols))
    for item in ingredients:
        total += get_nutrition(df, item).values
    return pd.Series(total, index=nutrition_cols)


def predict_ml_combination(df, ingredients, scaler, model):
    nutrition = combine_ingredients(df, ingredients)
    scaled = scaler.transform([nutrition])
    label = model.predict(scaled)[0]
    return label, nutrition


def predict_rule_combination(df, ingredients):
    nutrition = combine_ingredients(df, ingredients)
    label = classify_nutrition(nutrition)
    return label, nutrition


def compare_ml_vs_rule(df, ingredients, scaler, model):
    ml_label, nutrition = predict_ml_combination(
        df, ingredients, scaler, model
    )
    rule_label, _ = predict_rule_combination(df, ingredients)

    return {
        "Bahan": ", ".join(ingredients),
        "ML": ml_label,
        "Rule-Based": rule_label,
        "Calories": nutrition["calories"],
        "Protein": nutrition["protein"],
        "Fat": nutrition["fat"],
        "Carbohydrate": nutrition["carbohydrate"]
    }


Test Kombinasi SEIMBANG

In [34]:
compare_ml_vs_rule(
    df,
    ["Milk", "Banana", "Peanut"],
    scaler,
    model
)




{'Bahan': 'Milk, Banana, Peanut',
 'ML': 'Tidak Seimbang',
 'Rule-Based': 'Seimbang',
 'Calories': np.float64(988.0),
 'Protein': np.float64(38.56999999999999),
 'Fat': np.float64(56.94),
 'Carbohydrate': np.float64(93.78999999999999)}

In [35]:
compare_ml_vs_rule(
    df,
    ["Sugar", "Butter", "White Rice"],
    scaler,
    model
)




{'Bahan': 'Sugar, Butter, White Rice',
 'ML': 'Tidak Seimbang',
 'Rule-Based': 'Tidak Seimbang',
 'Calories': np.float64(1600.0),
 'Protein': np.float64(2.7800000000000002),
 'Fat': np.float64(115.29),
 'Carbohydrate': np.float64(171.74)}

Multiple Test Case (DSS Style)

In [36]:
test_cases = [
    ["Milk", "Banana"],
    ["Milk", "Peanut"],
    ["Sugar", "Butter"],
    ["White Rice", "Sugar"]
]

results = []

for case in test_cases:
    results.append(compare_ml_vs_rule(
        df, case, scaler, model
    ))

pd.DataFrame(results)




Unnamed: 0,Bahan,ML,Rule-Based,Calories,Protein,Fat,Carbohydrate
0,"Milk, Banana",Tidak Seimbang,Tidak Seimbang,560.0,4.77,35.04,62.52
1,"Milk, Peanut",Seimbang,Seimbang,469.0,36.27,23.34,35.39
2,"Sugar, Butter",Tidak Seimbang,Risiko Tinggi,1144.0,0.28,99.88,94.8
3,"White Rice, Sugar",Risiko Tinggi,Risiko Tinggi,724.0,2.5,15.81,171.74
