In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE

# -----------------------------
# Load datasets
# -----------------------------
genetic_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\new_genetic_profiles.csv")
meal_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\train.csv")

# -----------------------------
# Preprocess genetic data
# -----------------------------
# Handle missing values for numeric columns in genetic_df
numeric_cols = genetic_df.select_dtypes(include=['float64', 'int64']).columns
genetic_df[numeric_cols] = genetic_df[numeric_cols].fillna(genetic_df[numeric_cols].mean())

# Handle missing values for categorical columns in genetic_df
categorical_cols = genetic_df.select_dtypes(include=['object']).columns
genetic_df[categorical_cols] = genetic_df[categorical_cols].fillna('Unknown')

# Convert Obesity_Risk_Score into categories (Low, Medium, High)
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.3, 0.6, 1],
    labels=['Low', 'Medium', 'High']
)

# Select relevant features
features = ['BMI', 'Physical_Activity', 'Diet_Type', 'MC4R_Variant', 
            'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant', 'Obesity_Risk_Score']
genetic_df = genetic_df[features + ['Obesity_Risk_Category']]

# One-hot encode categorical features (Diet_Type and genetic variants)
genetic_df = pd.get_dummies(genetic_df, columns=['Diet_Type', 'MC4R_Variant', 
                                                 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)

# Prepare features and target
X = genetic_df.drop(['Obesity_Risk_Category'], axis=1)
y = genetic_df['Obesity_Risk_Category']

# Convert Obesity_Risk_Category to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Ensure all columns in X are numeric
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')  # Convert to numeric, coercing errors to NaN
        X[col] = X[col].fillna(0)  # Fill NaN values with 0

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split resampled data into training (60%), validation (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# -----------------------------
# Hyperparameter tuning with XGBoost
# -----------------------------
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Initialize XGBClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Grid Search CV for XGBoost
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_xgb = grid_search_xgb.best_params_
print(f'Best Parameters for XGBoost: {best_params_xgb}')

best_xgb = grid_search_xgb.best_estimator_
best_xgb.fit(X_train, y_train)

# -----------------------------
# Evaluation of XGBoost model
# -----------------------------
# Evaluate on the validation set
y_val_pred_xgb = best_xgb.predict(X_val)
val_accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb) * 100
print(f'Validation Accuracy (XGBoost): {val_accuracy_xgb:.2f}%')

# Evaluate on the test set
y_test_pred_xgb = best_xgb.predict(X_test)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb) * 100
print(f'Test Accuracy (XGBoost): {test_accuracy_xgb:.2f}%')

# Evaluate on noisy data (optional)
X_noisy = X_resampled + np.random.normal(0, 0.1, X_resampled.shape)  # Add Gaussian noise
cv_scores_noisy_xgb = cross_val_score(best_xgb, X_noisy, y_resampled, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy on Noisy Data (XGBoost): {cv_scores_noisy_xgb.mean() * 100:.2f}%')

# Print classification report and confusion matrix for the test set
print("\nClassification Report (Test Set - XGBoost):")
print(classification_report(y_test, y_test_pred_xgb, target_names=label_encoder.classes_))

print("\nConfusion Matrix (Test Set - XGBoost):")
print(confusion_matrix(y_test, y_test_pred_xgb))

# -----------------------------
# Preprocess nutritional data for meal recommendations
# -----------------------------
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler = StandardScaler()
nutritional_features_scaled = scaler.fit_transform(nutritional_features)

# Increase the number of meal clusters
num_clusters = 10  # Using 10 clusters for more diversity
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
meal_df['Meal_Cluster'] = kmeans.fit_predict(nutritional_features_scaled)

# -----------------------------
# Functions for Meal Recommendations
# -----------------------------
def display_meal_cluster(meals, cluster_name, num_meals=5):
    print(f"\n=== Recommended Meals for {cluster_name} ===")
    print(f"Total Meals in Cluster: {len(meals)}")
    
    # Display a subset of meals
    print(f"\nTop {num_meals} Meals:")
    print(f"{'Meal Description':<50} {'Energy (kcal)':<15} {'Protein (g)':<15} {'Fat (g)':<15} {'Carbs (g)':<15}")
    print("-" * 110)
    for _, meal in meals.head(num_meals).iterrows():
        print(f"{meal['Descrip']:<50} {meal['Energy_kcal']:<15} {meal['Protein_g']:<15} {meal['Fat_g']:<15} {meal['Carb_g']:<15}")
    
    # Display summary of nutritional values
    avg_energy = meals['Energy_kcal'].mean()
    avg_protein = meals['Protein_g'].mean()
    avg_fat = meals['Fat_g'].mean()
    avg_carbs = meals['Carb_g'].mean()
    
    print("\n=== Nutritional Summary ===")
    print(f"Average Energy (kcal): {avg_energy:.2f}")
    print(f"Average Protein (g): {avg_protein:.2f}")
    print(f"Average Fat (g): {avg_fat:.2f}")
    print(f"Average Carbs (g): {avg_carbs:.2f}")

def recommend_meals(user_profile, meal_df, model, label_encoder, num_meals=5):
    # Convert the user profile to a DataFrame
    user_profile_df = pd.DataFrame([user_profile])
    
    # One-hot encode categorical features (same as during training)
    user_profile_df = pd.get_dummies(user_profile_df, columns=['Diet_Type', 'MC4R_Variant', 
                                                               'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)
    
    # Ensure the user profile has the same columns as the training data
    missing_cols = set(X_train.columns) - set(user_profile_df.columns)
    for col in missing_cols:
        user_profile_df[col] = 0  # Add missing columns with default value 0
    
    # Reorder columns to match the training data
    user_profile_df = user_profile_df[X_train.columns]
    
    # Predict the obesity risk category for the user profile
    predicted_risk_category = model.predict(user_profile_df)
    predicted_risk_category_label = label_encoder.inverse_transform(predicted_risk_category)[0]
    print(f'\nPredicted Obesity Risk Category: {predicted_risk_category_label}')
    
    # Define cluster preferences based on risk category
    if predicted_risk_category_label == 'Low':
        # Low-risk users: Focus on high-protein, balanced meals
        preferred_clusters = [0, 1, 2]  # Example clusters
        sort_by = 'Protein_g'  # Sort by highest protein
    elif predicted_risk_category_label == 'Medium':
        # Medium-risk users: Focus on moderate-calorie, balanced meals
        preferred_clusters = [3, 4, 5]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by moderate calories
    else:
        # High-risk users: Focus on low-calorie, nutrient-dense meals
        preferred_clusters = [6, 7, 8, 9]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by lowest calories
    
    # Recommend meals from the preferred clusters
    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    
    # Sort meals based on the user's risk category
    if predicted_risk_category_label == 'High':
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=True)  # Low calories
    else:
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=False)  # High protein or moderate calories
    
    # Display recommended meals in a user-friendly format
    display_meal_cluster(recommended_meals, f"{predicted_risk_category_label}-Risk", num_meals)

# -----------------------------
# Example: Recommend Meals for a New Genetic Profile
# -----------------------------
new_profile = {
    'BMI': 28.5,                 # Example value
    'Physical_Activity': 2,      # Example value (1: Low, 2: Moderate, 3: High)
    'Diet_Type': 'High-Fat',     # Example value
    'MC4R_Variant': 'rs17782313_CT',  # Example value
    'PPARG_Variant': 'rs1801282_CG',   # Example value
    'FTO_Variant': 'rs9939609_AT',      # Example value
    'LEPR_Variant': 'rs1137101_AG',      # Example value
    'Obesity_Risk_Score': 0.45   # Example value
}

# Use best_xgb for prediction and meal recommendation
recommend_meals(new_profile, meal_df, best_xgb, label_encoder)


Best Parameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
Validation Accuracy (XGBoost): 100.00%
Test Accuracy (XGBoost): 100.00%
Cross-Validation Accuracy on Noisy Data (XGBoost): 84.29%

Classification Report (Test Set - XGBoost):
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       194
         Low       1.00      1.00      1.00       202
      Medium       1.00      1.00      1.00       167

    accuracy                           1.00       563
   macro avg       1.00      1.00      1.00       563
weighted avg       1.00      1.00      1.00       563


Confusion Matrix (Test Set - XGBoost):
[[194   0   0]
 [  0 202   0]
 [  0   0 167]]

Predicted Obesity Risk Category: Medium

=== Recommended Meals for Medium-Risk ===
Total Meals in Cluster: 3637

Top 5 Meals:
Meal Description                                   Energy (kcal)   Protein (g)     Fat (g)        

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
import joblib  # Added for saving models
import os

# -----------------------------
# Load datasets
# -----------------------------
genetic_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\new_genetic_profiles.csv")
meal_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\train.csv")

# -----------------------------
# Preprocess genetic data
# -----------------------------
# Handle missing values for numeric columns in genetic_df
numeric_cols = genetic_df.select_dtypes(include=['float64', 'int64']).columns
genetic_df[numeric_cols] = genetic_df[numeric_cols].fillna(genetic_df[numeric_cols].mean())

# Handle missing values for categorical columns in genetic_df
categorical_cols = genetic_df.select_dtypes(include=['object']).columns
genetic_df[categorical_cols] = genetic_df[categorical_cols].fillna('Unknown')

# Convert Obesity_Risk_Score into categories (Low, Medium, High)
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.3, 0.6, 1],
    labels=['Low', 'Medium', 'High']
)

# Select relevant features
features = ['BMI', 'Physical_Activity', 'Diet_Type', 'MC4R_Variant', 
            'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant', 'Obesity_Risk_Score']
genetic_df = genetic_df[features + ['Obesity_Risk_Category']]

# One-hot encode categorical features (Diet_Type and genetic variants)
genetic_df = pd.get_dummies(genetic_df, columns=['Diet_Type', 'MC4R_Variant', 
                                                 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)

# Prepare features and target
X = genetic_df.drop(['Obesity_Risk_Category'], axis=1)
y = genetic_df['Obesity_Risk_Category']

# Convert Obesity_Risk_Category to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Ensure all columns in X are numeric
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')  # Convert to numeric, coercing errors to NaN
        X[col] = X[col].fillna(0)  # Fill NaN values with 0

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split resampled data into training (60%), validation (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# -----------------------------
# Hyperparameter tuning with XGBoost
# -----------------------------
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Initialize XGBClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Grid Search CV for XGBoost
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_xgb = grid_search_xgb.best_params_
print(f'Best Parameters for XGBoost: {best_params_xgb}')

best_xgb = grid_search_xgb.best_estimator_
best_xgb.fit(X_train, y_train)

# -----------------------------
# Save the XGBoost model and preprocessing artifacts
# -----------------------------
save_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

# Save the model and preprocessing objects
joblib.dump(best_xgb, os.path.join(save_dir, "xgboost_model.pkl"))  # Save the XGBoost model
joblib.dump(label_encoder, os.path.join(save_dir, "xgboost_label_encoder.pkl"))  # Save the label encoder
joblib.dump(smote, os.path.join(save_dir, "xgboost_smote.pkl"))  # Save the SMOTE object
joblib.dump(X_train.columns, os.path.join(save_dir, "xgboost_feature_columns.pkl"))  # Save the feature columns

print(f"\nXGBoost model and preprocessing artifacts saved to: {save_dir}")

# -----------------------------
# Evaluation of XGBoost model
# -----------------------------
# Evaluate on the validation set
y_val_pred_xgb = best_xgb.predict(X_val)
val_accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb) * 100
print(f'Validation Accuracy (XGBoost): {val_accuracy_xgb:.2f}%')

# Evaluate on the test set
y_test_pred_xgb = best_xgb.predict(X_test)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb) * 100
print(f'Test Accuracy (XGBoost): {test_accuracy_xgb:.2f}%')

# Evaluate on noisy data (optional)
X_noisy = X_resampled + np.random.normal(0, 0.1, X_resampled.shape)  # Add Gaussian noise
cv_scores_noisy_xgb = cross_val_score(best_xgb, X_noisy, y_resampled, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy on Noisy Data (XGBoost): {cv_scores_noisy_xgb.mean() * 100:.2f}%')

# Print classification report and confusion matrix for the test set
print("\nClassification Report (Test Set - XGBoost):")
print(classification_report(y_test, y_test_pred_xgb, target_names=label_encoder.classes_))

print("\nConfusion Matrix (Test Set - XGBoost):")
print(confusion_matrix(y_test, y_test_pred_xgb))

# -----------------------------
# Preprocess nutritional data for meal recommendations
# -----------------------------
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler = StandardScaler()
nutritional_features_scaled = scaler.fit_transform(nutritional_features)

# Increase the number of meal clusters
num_clusters = 10  # Using 10 clusters for more diversity
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
meal_df['Meal_Cluster'] = kmeans.fit_predict(nutritional_features_scaled)

# -----------------------------
# Functions for Meal Recommendations
# -----------------------------
def display_meal_cluster(meals, cluster_name, num_meals=5):
    print(f"\n=== Recommended Meals for {cluster_name} ===")
    print(f"Total Meals in Cluster: {len(meals)}")
    
    # Display a subset of meals
    print(f"\nTop {num_meals} Meals:")
    print(f"{'Meal Description':<50} {'Energy (kcal)':<15} {'Protein (g)':<15} {'Fat (g)':<15} {'Carbs (g)':<15}")
    print("-" * 110)
    for _, meal in meals.head(num_meals).iterrows():
        print(f"{meal['Descrip']:<50} {meal['Energy_kcal']:<15} {meal['Protein_g']:<15} {meal['Fat_g']:<15} {meal['Carb_g']:<15}")
    
    # Display summary of nutritional values
    avg_energy = meals['Energy_kcal'].mean()
    avg_protein = meals['Protein_g'].mean()
    avg_fat = meals['Fat_g'].mean()
    avg_carbs = meals['Carb_g'].mean()
    
    print("\n=== Nutritional Summary ===")
    print(f"Average Energy (kcal): {avg_energy:.2f}")
    print(f"Average Protein (g): {avg_protein:.2f}")
    print(f"Average Fat (g): {avg_fat:.2f}")
    print(f"Average Carbs (g): {avg_carbs:.2f}")

def recommend_meals(user_profile, meal_df, model, label_encoder, num_meals=5):
    # Convert the user profile to a DataFrame
    user_profile_df = pd.DataFrame([user_profile])
    
    # One-hot encode categorical features (same as during training)
    user_profile_df = pd.get_dummies(user_profile_df, columns=['Diet_Type', 'MC4R_Variant', 
                                                               'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)
    
    # Ensure the user profile has the same columns as the training data
    missing_cols = set(X_train.columns) - set(user_profile_df.columns)
    for col in missing_cols:
        user_profile_df[col] = 0  # Add missing columns with default value 0
    
    # Reorder columns to match the training data
    user_profile_df = user_profile_df[X_train.columns]
    
    # Predict the obesity risk category for the user profile
    predicted_risk_category = model.predict(user_profile_df)
    predicted_risk_category_label = label_encoder.inverse_transform(predicted_risk_category)[0]
    print(f'\nPredicted Obesity Risk Category: {predicted_risk_category_label}')
    
    # Define cluster preferences based on risk category
    if predicted_risk_category_label == 'Low':
        # Low-risk users: Focus on high-protein, balanced meals
        preferred_clusters = [0, 1, 2]  # Example clusters
        sort_by = 'Protein_g'  # Sort by highest protein
    elif predicted_risk_category_label == 'Medium':
        # Medium-risk users: Focus on moderate-calorie, balanced meals
        preferred_clusters = [3, 4, 5]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by moderate calories
    else:
        # High-risk users: Focus on low-calorie, nutrient-dense meals
        preferred_clusters = [6, 7, 8, 9]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by lowest calories
    
    # Recommend meals from the preferred clusters
    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    
    # Sort meals based on the user's risk category
    if predicted_risk_category_label == 'High':
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=True)  # Low calories
    else:
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=False)  # High protein or moderate calories
    
    # Display recommended meals in a user-friendly format
    display_meal_cluster(recommended_meals, f"{predicted_risk_category_label}-Risk", num_meals)

# -----------------------------
# Example: Recommend Meals for a New Genetic Profile
# -----------------------------
new_profile = {
    'BMI': 28.5,                 # Example value
    'Physical_Activity': 2,      # Example value (1: Low, 2: Moderate, 3: High)
    'Diet_Type': 'High-Fat',     # Example value
    'MC4R_Variant': 'rs17782313_CT',  # Example value
    'PPARG_Variant': 'rs1801282_CG',   # Example value
    'FTO_Variant': 'rs9939609_AT',      # Example value
    'LEPR_Variant': 'rs1137101_AG',      # Example value
    'Obesity_Risk_Score': 0.45   # Example value
}

# Use best_xgb for prediction and meal recommendation
recommend_meals(new_profile, meal_df, best_xgb, label_encoder)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

KeyboardInterrupt: 