In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load and preprocess data (same as before)
def load_and_preprocess(data_dir="../datasets"):
    """Load the three source CSV files and build the preprocessed table.

    The files ``Final_Data_Set.csv``, ``Meal_Suggestions.csv`` and
    ``Micro_and_Macro_Nutrients.csv`` are read from *data_dir* and
    inner-joined on ``Daily_Calories``.  The merged frame is then cleaned,
    extended with derived columns, one-hot encoded and standardised.

    Args:
        data_dir: Directory containing the three CSV files.  Defaults to the
            location the original script used.

    Returns:
        A ``(data, scaler)`` tuple: the preprocessed DataFrame and the
        ``StandardScaler`` fitted on the numerical columns.
    """
    final_data = pd.read_csv(f"{data_dir}/Final_Data_Set.csv")
    meal_suggestions = pd.read_csv(f"{data_dir}/Meal_Suggestions.csv")
    nutrients = pd.read_csv(f"{data_dir}/Micro_and_Macro_Nutrients.csv")

    data = pd.merge(
        pd.merge(final_data, meal_suggestions, on="Daily_Calories"),
        nutrients, on="Daily_Calories"
    )

    # Data cleaning: drop exact duplicates, fill numeric gaps with the column
    # median and keep only rows with a positive calorie target.
    data = data.drop_duplicates()
    data = data.fillna(data.median(numeric_only=True))
    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than on a slice of the filtered one.
    data = data[data["Daily_Calories"] > 0].copy()

    # Feature engineering (computed on the raw, unscaled values)
    data["BMI_Category"] = data["BMI"].apply(
        lambda x: "Underweight" if x < 18.5
        else "Normal" if x < 25
        else "Overweight" if x < 30
        else "Obese"
    )

    data["Adjusted_Calories"] = data.apply(
        lambda row: row["Daily_Calories"] * 0.9 if row["Health_Goal"] == "Weight Loss"
        else row["Daily_Calories"] * 1.1 if row["Health_Goal"] == "Muscle Gain"
        else row["Daily_Calories"],
        axis=1
    )

    # One-Hot Encoding
    categorical_cols = ["Gender", "Diet_Preference", "Activity_Level",
                        "Disease", "Food_Allergies", "Health_Goal"]
    data = pd.get_dummies(data, columns=categorical_cols)

    # Binning.  Left-closed intervals ([18, 30), [30, 45), ...) are what the
    # labels describe; the default right-closed bins would put age 30 into
    # "18-29" and turn age 18 into NaN.  The last edge is open-ended so that
    # "60+" really covers every age from 60 upwards.
    data["Age_Group"] = pd.cut(data["Age"],
                               bins=[18, 30, 45, 60, np.inf],
                               right=False,
                               labels=["18-29", "30-44", "45-59", "60+"])

    # Right-closed bins: (-1, 2] -> Low, (2, 4] -> Medium, (4, 7] -> High.
    data["Activity_Frequency"] = pd.cut(data["Weekly_Activity_Days"],
                                        bins=[-1, 2, 4, 7],
                                        labels=["Low", "Medium", "High"])

    # Scaling must come after the binning above, which needs the raw "Age".
    # NOTE(review): the scaler is fitted on the full table, i.e. before any
    # train/test split performed downstream.
    numerical_cols = ["Age", "Weight_kg", "Height_cm", "BMI",
                      "Daily_Calories", "Protein_g", "Carbs_g", "Fat_g"]
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    return data, scaler

# Feature selection and preparation
def prepare_features(data, k=25):
    """Build the feature matrix and encoded target for the breakfast model.

    ``Breakfast`` is the target; the four meal columns (``Breakfast``,
    ``Lunch``, ``Dinner``, ``Snacks``) are removed from the features.  Any
    remaining object/category columns are one-hot encoded (first level
    dropped) and the best features by ANOVA F-score are kept.

    Args:
        data: Preprocessed DataFrame containing the four meal columns.
        k: Maximum number of features to keep.  Clamped to the number of
            encoded columns so a narrow table does not make ``SelectKBest``
            raise.

    Returns:
        ``(X_selected, y_encoded, selected_features, le)``: the selected
        feature array, the integer-encoded target, the names of the selected
        columns and the fitted ``LabelEncoder``.
    """
    y = data["Breakfast"]
    X = data.drop(columns=["Breakfast", "Lunch", "Dinner", "Snacks"])

    # Handle categorical features
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    # Encode the target labels as integers
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Feature selection.  SelectKBest rejects k larger than the number of
    # available columns, so never ask for more than exist.
    # NOTE(review): selection is fitted on all rows, before the train/test
    # split done by the caller.
    n_keep = min(k, X_encoded.shape[1])
    selector = SelectKBest(f_classif, k=n_keep)
    X_selected = selector.fit_transform(X_encoded, y_encoded)
    selected_features = X_encoded.columns[selector.get_support()]

    return X_selected, y_encoded, selected_features, le

# Model training and evaluation
def train_and_evaluate_models(X, y, model_dir="../models"):
    """Fit a fixed set of classifiers, persist each one and report metrics.

    The data is split 80/20 with ``random_state=42``.  Every model is fitted
    on the training part, evaluated on the test part, saved with ``joblib``
    under *model_dir* and its accuracy and classification report printed.

    Args:
        X: Feature matrix.
        y: Encoded target labels.
        model_dir: Directory the fitted models are written to.  Created if
            missing.  Defaults to the location the original script used.

    Returns:
        Dict mapping model name to ``{"accuracy": float, "report": dict}``.
    """
    import os

    # Create the output directory up front; otherwise the first joblib.dump
    # fails only after a full model fit has already been paid for.
    os.makedirs(model_dir, exist_ok=True)

    # NOTE(review): the split is not stratified; with imbalanced classes
    # consider passing stratify=y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(random_state=42, eval_metric='mlogloss'),
        "LightGBM": LGBMClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "KNN": KNeighborsClassifier(),
        "SVM": SVC(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Save model, e.g. "Random Forest" -> random_forest_model.pkl
        joblib.dump(model, f'{model_dir}/{name.replace(" ", "_").lower()}_model.pkl')

        # Store results
        results[name] = {
            "accuracy": accuracy_score(y_test, y_pred),
            "report": classification_report(y_test, y_pred, output_dict=True)
        }

        print(f"\n{name} Performance:")
        print(f"Accuracy: {results[name]['accuracy']:.4f}")
        print(classification_report(y_test, y_pred))

    return results

# Main execution
if __name__ == "__main__":
    import os

    # The scaler, label encoder and models are all written to ../models;
    # make sure it exists before the first dump instead of failing after
    # preprocessing has already run.
    os.makedirs('../models', exist_ok=True)

    # Load and preprocess data, persisting the fitted scaler
    data, scaler = load_and_preprocess()
    joblib.dump(scaler, '../models/scaler.pkl')

    # Prepare features, persisting the fitted label encoder
    X_selected, y_encoded, selected_features, label_encoder = prepare_features(data)
    joblib.dump(label_encoder, '../models/label_encoder.pkl')

    # Train and evaluate models
    results = train_and_evaluate_models(X_selected, y_encoded)

    # Find best model: the one with the highest test accuracy
    best_model_name = max(results, key=lambda x: results[x]['accuracy'])
    print(f"\nBest Model: {best_model_name} with accuracy {results[best_model_name]['accuracy']:.4f}")


Random Forest Performance:
Accuracy: 0.8273
              precision    recall  f1-score   support

           0       0.72      0.71      0.72       642
           1       0.58      0.49      0.53       649
           2       0.88      0.91      0.90      4533
           3       0.73      0.64      0.68       269
           4       0.70      0.62      0.65       235

    accuracy                           0.83      6328
   macro avg       0.72      0.67      0.70      6328
weighted avg       0.82      0.83      0.82      6328


XGBoost Performance:
Accuracy: 0.8123
              precision    recall  f1-score   support

           0       0.76      0.41      0.53       642
           1       0.77      0.47      0.59       649
           2       0.82      0.96      0.89      4533
           3       0.83      0.42      0.55       269
           4       0.72      0.40      0.51       235

    accuracy                           0.81      6328
   macro avg       0.78      0.53      0.61    