In [None]:
import pandas as pd

# Load datasets
final_data = pd.read_csv("../datasets/Final_Data_Set.csv")
meal_suggestions = pd.read_csv("../datasets/Meal_Suggestions.csv")
nutrients = pd.read_csv("../datasets/Micro_and_Macro_Nutrients.csv")

# Merge on common key (Daily_Calories)
data = pd.merge(final_data, meal_suggestions, on="Daily_Calories")
data = pd.merge(data, nutrients, on="Daily_Calories")

In [35]:
# Drop duplicates
data = data.drop_duplicates()

# Check missing values
print(data.isnull().sum())

# Handle missing values (example: fill with median for numerical columns)
data.fillna(data.median(numeric_only=True), inplace=True)

# Remove impossible values (e.g., negative calories)
data = data[data["Daily_Calories"] > 0]
print(data.head(10))

Age                        0
Gender                     0
Weight_kg                  0
Height_cm                  0
BMI                        0
Diet_Preference            0
Activity_Level             0
Weekly_Activity_Days       0
Disease                 2513
Food_Allergies          3390
Health_Goal                0
Daily_Calories             0
Breakfast                  0
Lunch                      0
Dinner                     0
Snacks                     0
Water_Intake_L             0
Protein_g                  0
Carbs_g                    0
Fat_g                      0
Fiber_g                    0
Sugar_g                    0
Vitamin_A_mcg              0
Vitamin_C_mg               0
Vitamin_D_mcg              0
Calcium_mg                 0
Iron_mg                    0
Potassium_mg               0
Magnesium_mg               0
Zinc_mg                    0
dtype: int64
   Age  Gender  Weight_kg  Height_cm   BMI           Diet_Preference  \
0   27  Female         65        155  27.1  M

In [36]:
# BMI Category (as before)
data["BMI_Category"] = data["BMI"].apply(lambda x: "Underweight" if x < 18.5 else "Normal" if x < 25 else "Overweight" if x < 30 else "Obese")

# Caloric Needs Adjustment based on Goal
def adjust_calories(row):
    if row["Health_Goal"] == "Weight Loss":
        return row["Daily_Calories"] * 0.9  # 10% deficit
    elif row["Health_Goal"] == "Muscle Gain":
        return row["Daily_Calories"] * 1.1  # 10% surplus
    else:
        return row["Daily_Calories"]

data["Adjusted_Calories"] = data.apply(adjust_calories, axis=1)

# One-Hot Encoding for Categorical Variables
categorical_cols = ["Gender", "Diet_Preference", "Activity_Level", "Disease", "Food_Allergies", "Health_Goal"]
data = pd.get_dummies(data, columns=categorical_cols)

In [37]:
# Bin Age into groups
data["Age_Group"] = pd.cut(data["Age"], bins=[18, 30, 45, 60, 70], labels=["18-29", "30-44", "45-59", "60+"])

# Bin Weekly_Activity_Days
data["Activity_Frequency"] = pd.cut(data["Weekly_Activity_Days"], bins=[-1, 2, 4, 7], labels=["Low", "Medium", "High"])

In [38]:
from sklearn.preprocessing import StandardScaler

# Select numerical features to scale
numerical_cols = ["Age", "Weight_kg", "Height_cm", "BMI", "Daily_Calories", "Protein_g", "Carbs_g", "Fat_g"]
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [39]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


data = pd.merge(
    pd.merge(final_data, meal_suggestions, on="Daily_Calories"),
    nutrients, on="Daily_Calories"
)

# 2. Define target (e.g., recommending Breakfast meals)
y = data["Breakfast"]  # Categorical target

# 3. Prepare features (X)
X = data.drop(columns=["Breakfast", "Lunch", "Dinner", "Snacks"])

# 4. Handle categorical features (ONE-HOT ENCODING)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 5. Handle missing values (IMPUTATION)
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X_encoded), columns=X_encoded.columns)

# 6. Encode target variable (if categorical)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 7. Feature Selection (SelectKBest with ANOVA F-value)
selector = SelectKBest(score_func=f_classif, k=25)  # Select top 10 features
X_selected = selector.fit_transform(X_imputed, y_encoded)

# 8. Get selected feature names
selected_features = X_encoded.columns[selector.get_support()]
print("TOP 10 SELECTED FEATURES:")
print(selected_features.tolist())

# 9. Create new DataFrame with selected features
selected_data = X_imputed[selected_features]
selected_data['Target_Breakfast'] = y_encoded  # Add encoded target if needed

TOP 10 SELECTED FEATURES:
['Weight_kg', 'Daily_Calories', 'Water_Intake_L', 'Protein_g', 'Carbs_g', 'Fat_g', 'Fiber_g', 'Sugar_g', 'Diet_Preference_Flexitarian', 'Diet_Preference_Non-Vegetarian', 'Diet_Preference_Raw Vegan', 'Diet_Preference_Vegan', 'Diet_Preference_Vegetarian', 'Disease_Diabetes Type 2', 'Disease_Hypertension', 'Disease_Hypothyroidism', 'Disease_IBS', 'Disease_Iron Deficiency', 'Disease_Obesity', 'Food_Allergies_Fish', 'Food_Allergies_Milk', 'Food_Allergies_Peanuts', 'Food_Allergies_Shellfish', 'Food_Allergies_Tree Nuts', 'Food_Allergies_Wheat']


  f = msb / msw
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Target_Breakfast'] = y_encoded  # Add encoded target if needed


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2)

# Train a classifier (e.g., RandomForest for demonstration)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.9046573336993321


In [41]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

Allergy-safe breakfast: Oatmeal with seeds       0.86      0.83      0.85      1019
  Low-carb breakfast: Eggs with vegetables       0.73      0.67      0.70       935
         Protein breakfast: Eggs and toast       0.93      0.95      0.94      8040
            Vegan breakfast: Tofu scramble       0.88      0.81      0.85       533
 Vegetarian breakfast: Oatmeal with fruits       0.81      0.76      0.78       402

                                  accuracy                           0.90     10929
                                 macro avg       0.84      0.81      0.82     10929
                              weighted avg       0.90      0.90      0.90     10929



In [None]:
import joblib 
joblib.dump(clf, 'models/meal_classifier_model.pkl')

['meal_classifier_model.pkl']