In [None]:
from google.colab import drive
# Locations of the two CSV inputs on the mounted Drive
# (presumably the scaled and unscaled exports, judging by the file names).
file_path1 = "/content/drive/MyDrive/Colab Notebooks/DM/csv/df_final_unscale.csv"
file_path = "/content/drive/MyDrive/Colab Notebooks/DM/csv/df_final.csv"

# Attach Google Drive so the paths above become readable.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read both CSV inputs into DataFrames: file_path1 -> df_raw, file_path -> df.
df_raw = pd.read_csv(file_path1)
df = pd.read_csv(file_path)

In [None]:
# A bare expression is rendered only when it is the last statement of a cell,
# so the original first line (`df`) never produced output. Use display() (a
# built-in in IPython/Colab kernels) to show both frames, and limit each
# preview to the first rows instead of dumping the whole table.
display(df.head())
display(df_raw.head())

In [None]:
# Per-100g nutrient columns shared as model inputs by the cells below.
features_base = [
    'energy_100g',
    'fat_100g',
    'saturated-fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
]


In [None]:
# Learn the per-column mean/std of the base features from the unscaled table
# and persist the fitted scaler to disk.
# Only fit() is needed: the array returned by fit_transform() was discarded,
# so the transform step was wasted work.
scaler = StandardScaler()
scaler.fit(df_raw[features_base])
joblib.dump(scaler, "nutri_scaler.pkl")

In [None]:
def evaluate_model(y_test, y_pred, labels, model_name):
    """Print a classification report and plot a row-normalised confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True class codes. Expected to be the integer codes
        ``0 .. len(labels) - 1`` (as produced by ``LabelEncoder`` at every
        call site in this notebook).
    y_pred : array-like
        Predicted class codes, using the same encoding as ``y_test``.
    labels : sequence of str
        Display name of each class, in code order.
    model_name : str
        Name shown in the printed header and in the plot title.

    Returns
    -------
    None
        The report is printed and the heatmap is shown as side effects.
    """
    # Pin the class set explicitly so that the report rows, the matrix shape
    # and the tick labels stay aligned even when a class is absent from this
    # particular split / set of predictions.
    class_ids = np.arange(len(labels))

    print(f"\n===== {model_name} REPORT =====")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=labels))

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Normalise each row by its number of true samples. Rows with no true
    # samples are left at 0 instead of producing NaN from a 0/0 division.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_percent, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=labels, yticklabels=labels)

    plt.title(f'Confusion Matrix (Percentage): {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

In [None]:
print("\n>>> Evaluation: nutriscore_grade (a, b, c, d, e)")

# Inputs: base nutrient columns with missing values replaced by 0.
# Target: nutriscore_grade cast to str and label-encoded.
le1 = LabelEncoder()
X1 = df[features_base].fillna(0)
y1 = le1.fit_transform(df['nutriscore_grade'].astype(str))
labels1 = le1.classes_

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

# Baseline configurations of the three classifiers.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(random_state=42)

# Fit and evaluate each model in turn, in the same order as before.
for report_title, estimator in (
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
):
    estimator.fit(X_train, y_train)
    evaluate_model(y_test, estimator.predict(X_test), labels1, report_title)

In [None]:
print("\n>>> Evaluation: nutriscore_grade (a, b, c, d, e)")

# Same data preparation as the previous evaluation cell, repeated here so the
# split is rebuilt before the re-configured models are trained.
le1 = LabelEncoder()
X1 = df[features_base].fillna(0)
y1 = le1.fit_transform(df['nutriscore_grade'].astype(str))
labels1 = le1.classes_

X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

# Adjusted hyperparameters: deeper tree; larger, class-balanced forest.
# XGBoost keeps its default settings.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=20)
rf_model = RandomForestClassifier(n_estimators=400, random_state=42, class_weight='balanced')
xgb_model = XGBClassifier(random_state=42)

# Fit and evaluate each model in turn, in the same order as before.
for report_title, estimator in (
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
):
    estimator.fit(X_train, y_train)
    evaluate_model(y_test, estimator.predict(X_test), labels1, report_title)

#### Sau khi tinh chỉnh các siêu tham số của mô hình (max_depth của Decision Tree tăng từ 10 lên 20), độ chính xác của Decision Tree tăng lên 82%

### Mô hình random forest cho ra độ chính xác cao nhất => lưu mô hình

In [None]:
### Save the random forest model because its overall accuracy is the highest
joblib.dump(value=rf_model, filename="nutriscore_grade_model.pkl")

In [None]:
# Persist the nutriscore_grade label encoder so the saved model's integer
# predictions can be mapped back to grade labels.
joblib.dump(value=le1, filename="nutrigrade_label.pkl")

In [None]:
# The binary encoder `le4` is only created in the Healthy/Unhealthy cell
# further down, so referencing it here raised NameError on a fresh
# top-to-bottom run. Build the encoder from the two fixed class names instead;
# LabelEncoder sorts its classes, so this matches an encoder fitted on a
# column that contains both 'Healthy' and 'Unhealthy'.
healthy_label_encoder = LabelEncoder().fit(['Healthy', 'Unhealthy'])
joblib.dump(healthy_label_encoder, "healthy_label.pkl")

In [None]:
print("\n>>> Evaluation: nova_group")

# Inputs: base nutrient columns plus the additive count, missing values -> 0.
# Target: nova_group cast to str and label-encoded.
features_nova = [*features_base, 'additives_n']
le2 = LabelEncoder()
X2 = df[features_nova].fillna(0)
y2 = le2.fit_transform(df['nova_group'].astype(str))
labels2 = le2.classes_

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

dt_nova = DecisionTreeClassifier(random_state=42)
rf_nova = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_nova = XGBClassifier(random_state=42)

# Fit and evaluate each model in turn, in the same order as before.
for report_title, estimator in (
    ("Decision Tree", dt_nova),
    ("Random Forest", rf_nova),
    ("XGBoost", xgb_nova),
):
    estimator.fit(X_train, y_train)
    evaluate_model(y_test, estimator.predict(X_test), labels2, report_title)

In [None]:
### Save the random forest model because its overall accuracy is the highest
joblib.dump(value=rf_nova, filename="nova_group_model.pkl")

In [None]:
print("\n>>> Evaluation: Healthy and Unhealthy")

# Binary target: grades 'a' and 'b' count as Healthy, everything else
# (including missing grades) as Unhealthy. Stored as a new column on df.
grade_is_good = df['nutriscore_grade'].isin(['a', 'b'])
df['is_healthy'] = grade_is_good.map({True: 'Healthy', False: 'Unhealthy'})

le4 = LabelEncoder()
X4 = df[features_base].fillna(0)
y4 = le4.fit_transform(df['is_healthy'])
labels4 = le4.classes_

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X4, y4, test_size=0.2, random_state=42
)

dt_bin = DecisionTreeClassifier(random_state=42)
rf_bin = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_bin = XGBClassifier(random_state=42)

# 1. Decision Tree, 2. Random Forest, 3. XGBoost -- fit and evaluate in order.
for report_title, estimator in (
    ("Decision Tree", dt_bin),
    ("Random Forest", rf_bin),
    ("XGBoost", xgb_bin),
):
    estimator.fit(X_train, y_train)
    evaluate_model(y_test, estimator.predict(X_test), labels4, report_title)

In [None]:
# Persist the random forest trained on the Healthy/Unhealthy target.
joblib.dump(value=rf_bin, filename="healthy_model.pkl")