In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE

# Load datasets
genetic_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\genetic_profiles.csv")
meal_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\train.csv")

# Handle missing values for numeric columns in genetic_df
numeric_cols = genetic_df.select_dtypes(include=['float64', 'int64']).columns
genetic_df[numeric_cols] = genetic_df[numeric_cols].fillna(genetic_df[numeric_cols].mean())

# Handle missing values for categorical columns in genetic_df
categorical_cols = genetic_df.select_dtypes(include=['object']).columns
genetic_df[categorical_cols] = genetic_df[categorical_cols].fillna('Unknown')

# Convert Obesity_Risk_Score into categories (low, medium, high)
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.3, 0.6, 1],
    labels=['Low', 'Medium', 'High']
)

# Select relevant features
features = ['BMI', 'Physical_Activity', 'Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant', 'Obesity_Risk_Score']
genetic_df = genetic_df[features + ['Obesity_Risk_Category']]

# One-hot encode categorical features (Diet_Type and genetic variants)
genetic_df = pd.get_dummies(genetic_df, columns=['Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)

# Prepare features and target
X = genetic_df.drop(['Obesity_Risk_Category'], axis=1)
y = genetic_df['Obesity_Risk_Category']

# Convert Obesity_Risk_Category to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Ensure all columns in X are numeric
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')  # Convert to numeric, coercing errors to NaN
        X[col] = X[col].fillna(0)  # Fill NaN values with 0

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split resampled data into training (60%), validation (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],  # Limit tree depth
    'min_samples_split': [10, 20],  # Increase minimum samples to split
    'min_samples_leaf': [2, 4],  # Increase minimum samples per leaf
    'max_features': ['sqrt', 'log2']
}

# Use RandomForestClassifier for classification
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train the model with the best parameters
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = best_rf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred) * 100
print(f'Validation Accuracy: {val_accuracy:.2f}%')

# Evaluate on the test set
y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred) * 100

# Evaluate on noisy data
X_noisy = X_resampled + np.random.normal(0, 0.1, X_resampled.shape)  # Add Gaussian noise
cv_scores_noisy = cross_val_score(best_rf, X_noisy, y_resampled, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy (Noisy Data): {cv_scores_noisy.mean() * 100:.2f}%')

# Print classification report and confusion matrix
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

# Preprocess nutritional data for meal recommendations
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler = StandardScaler()
nutritional_features_scaled = scaler.fit_transform(nutritional_features)

# Increase the number of meal clusters
num_clusters = 10  # Use 10 clusters for more diversity
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
meal_df['Meal_Cluster'] = kmeans.fit_predict(nutritional_features_scaled)

# Function to display meal clusters in a user-friendly format
def display_meal_cluster(meals, cluster_name, num_meals=5):
    print(f"\n=== Recommended Meals for {cluster_name} ===")
    print(f"Total Meals in Cluster: {len(meals)}")
    
    # Display a subset of meals
    print(f"\nTop {num_meals} Meals:")
    print(f"{'Meal Description':<50} {'Energy (kcal)':<15} {'Protein (g)':<15} {'Fat (g)':<15} {'Carbs (g)':<15}")
    print("-" * 110)
    for _, meal in meals.head(num_meals).iterrows():
        print(f"{meal['Descrip']:<50} {meal['Energy_kcal']:<15} {meal['Protein_g']:<15} {meal['Fat_g']:<15} {meal['Carb_g']:<15}")
    
    # Display summary of nutritional values
    avg_energy = meals['Energy_kcal'].mean()
    avg_protein = meals['Protein_g'].mean()
    avg_fat = meals['Fat_g'].mean()
    avg_carbs = meals['Carb_g'].mean()
    
    print("\n=== Nutritional Summary ===")
    print(f"Average Energy (kcal): {avg_energy:.2f}")
    print(f"Average Protein (g): {avg_protein:.2f}")
    print(f"Average Fat (g): {avg_fat:.2f}")
    print(f"Average Carbs (g): {avg_carbs:.2f}")

# Function to recommend meals dynamically based on user profile
def recommend_meals(user_profile, meal_df, model, label_encoder, num_meals=5):
    # Convert the user profile to a DataFrame
    user_profile_df = pd.DataFrame([user_profile])
    
    # One-hot encode categorical features (same as during training)
    user_profile_df = pd.get_dummies(user_profile_df, columns=['Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)
    
    # Ensure the user profile has the same columns as the training data
    missing_cols = set(X_train.columns) - set(user_profile_df.columns)
    for col in missing_cols:
        user_profile_df[col] = 0  # Add missing columns with default value 0
    
    # Reorder columns to match the training data
    user_profile_df = user_profile_df[X_train.columns]
    
    # Predict the obesity risk category for the user profile
    predicted_risk_category = model.predict(user_profile_df)
    predicted_risk_category_label = label_encoder.inverse_transform(predicted_risk_category)[0]
    print(f'Predicted Obesity Risk Category: {predicted_risk_category_label}')
    
    # Define cluster preferences based on risk category
    if predicted_risk_category_label == 'Low':
        # Low-risk users: Focus on high-protein, balanced meals
        preferred_clusters = [0, 1, 2]  # Example clusters
        sort_by = 'Protein_g'  # Sort by highest protein
    elif predicted_risk_category_label == 'Medium':
        # Medium-risk users: Focus on moderate-calorie, balanced meals
        preferred_clusters = [3, 4, 5]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by moderate calories
    else:
        # High-risk users: Focus on low-calorie, nutrient-dense meals
        preferred_clusters = [6, 7, 8, 9]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by lowest calories
    
    # Recommend meals from the preferred clusters
    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    
    # Sort meals based on the user's risk category
    if predicted_risk_category_label == 'High':
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=True)  # Low calories
    else:
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=False)  # High protein or moderate calories
    
    # Display recommended meals in a user-friendly format
    display_meal_cluster(recommended_meals, f"{predicted_risk_category_label}-Risk", num_meals)

# Example new genetic profile
new_profile = {
    'BMI': 28.5,  # Example value
    'Physical_Activity': 2,  # Example value (1: Low, 2: Moderate, 3: High)
    'Diet_Type': 'High-Fat',  # Example value
    'MC4R_Variant': 'rs17782313_CT',  # Example value
    'PPARG_Variant': 'rs1801282_CG',  # Example value
    'FTO_Variant': 'rs9939609_AT',  # Example value
    'LEPR_Variant': 'rs1137101_AG',  # Example value
    'Obesity_Risk_Score': 0.45  # Example value
}

# Recommend meals for the new profile
recommend_meals(new_profile, meal_df, best_rf, label_encoder)

Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Validation Accuracy: 100.00%
Cross-Validation Accuracy (Noisy Data): 82.51%

Classification Report (Test Set):
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       182
         Low       1.00      1.00      1.00       154
      Medium       1.00      1.00      1.00       173

    accuracy                           1.00       509
   macro avg       1.00      1.00      1.00       509
weighted avg       1.00      1.00      1.00       509


Confusion Matrix (Test Set):
[[182   0   0]
 [  0 154   0]
 [  0   0 173]]
Predicted Obesity Risk Category: Medium

=== Recommended Meals for Medium-Risk ===
Total Meals in Cluster: 3637

Top 5 Meals:
Meal Description                                   Energy (kcal)   Protein (g)     Fat (g)         Carbs (g)      
----------------------------------------------------------------

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
import pickle

# Load datasets
genetic_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\genetic_profiles.csv")
meal_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\train.csv")

# Handle missing values for numeric columns in genetic_df
numeric_cols = genetic_df.select_dtypes(include=['float64', 'int64']).columns
genetic_df[numeric_cols] = genetic_df[numeric_cols].fillna(genetic_df[numeric_cols].mean())

# Handle missing values for categorical columns in genetic_df
categorical_cols = genetic_df.select_dtypes(include=['object']).columns
genetic_df[categorical_cols] = genetic_df[categorical_cols].fillna('Unknown')

# Convert Obesity_Risk_Score into categories (low, medium, high)
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.3, 0.6, 1],
    labels=['Low', 'Medium', 'High']
)

# Select relevant features
features = ['BMI', 'Physical_Activity', 'Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant', 'Obesity_Risk_Score']
genetic_df = genetic_df[features + ['Obesity_Risk_Category']]

# One-hot encode categorical features (Diet_Type and genetic variants)
genetic_df = pd.get_dummies(genetic_df, columns=['Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)

# Prepare features and target
X = genetic_df.drop(['Obesity_Risk_Category'], axis=1)
y = genetic_df['Obesity_Risk_Category']

# Convert Obesity_Risk_Category to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Ensure all columns in X are numeric
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')  # Convert to numeric, coercing errors to NaN
        X[col] = X[col].fillna(0)  # Fill NaN values with 0

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split resampled data into training (60%), validation (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],  # Limit tree depth
    'min_samples_split': [10, 20],  # Increase minimum samples to split
    'min_samples_leaf': [2, 4],  # Increase minimum samples per leaf
    'max_features': ['sqrt', 'log2']
}

# Use RandomForestClassifier for classification
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train the model with the best parameters
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = best_rf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred) * 100
print(f'Validation Accuracy: {val_accuracy:.2f}%')

# Evaluate on the test set
y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred) * 100
print(f'Test Accuracy: {test_accuracy:.2f}%')

# Evaluate on noisy data
X_noisy = X_resampled + np.random.normal(0, 0.1, X_resampled.shape)  # Add Gaussian noise
cv_scores_noisy = cross_val_score(best_rf, X_noisy, y_resampled, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy (Noisy Data): {cv_scores_noisy.mean() * 100:.2f}%')

# Print classification report and confusion matrix
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

# Preprocess nutritional data for meal recommendations
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler = StandardScaler()
nutritional_features_scaled = scaler.fit_transform(nutritional_features)

# Increase the number of meal clusters
num_clusters = 10  # Use 10 clusters for more diversity
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
meal_df['Meal_Cluster'] = kmeans.fit_predict(nutritional_features_scaled)

# Function to display meal clusters in a user-friendly format
def display_meal_cluster(meals, cluster_name, num_meals=5):
    print(f"\n=== Recommended Meals for {cluster_name} ===")
    print(f"Total Meals in Cluster: {len(meals)}")
    
    # Display a subset of meals
    print(f"\nTop {num_meals} Meals:")
    print(f"{'Meal Description':<50} {'Energy (kcal)':<15} {'Protein (g)':<15} {'Fat (g)':<15} {'Carbs (g)':<15}")
    print("-" * 110)
    for _, meal in meals.head(num_meals).iterrows():
        print(f"{meal['Descrip']:<50} {meal['Energy_kcal']:<15} {meal['Protein_g']:<15} {meal['Fat_g']:<15} {meal['Carb_g']:<15}")
    
    # Display summary of nutritional values
    avg_energy = meals['Energy_kcal'].mean()
    avg_protein = meals['Protein_g'].mean()
    avg_fat = meals['Fat_g'].mean()
    avg_carbs = meals['Carb_g'].mean()
    
    print("\n=== Nutritional Summary ===")
    print(f"Average Energy (kcal): {avg_energy:.2f}")
    print(f"Average Protein (g): {avg_protein:.2f}")
    print(f"Average Fat (g): {avg_fat:.2f}")
    print(f"Average Carbs (g): {avg_carbs:.2f}")

# Function to recommend meals dynamically based on user profile
def recommend_meals(user_profile, meal_df, model, label_encoder, num_meals=5):
    # Convert the user profile to a DataFrame
    user_profile_df = pd.DataFrame([user_profile])
    
    # One-hot encode categorical features (same as during training)
    user_profile_df = pd.get_dummies(user_profile_df, columns=['Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)
    
    # Ensure the user profile has the same columns as the training data
    missing_cols = set(X_train.columns) - set(user_profile_df.columns)
    for col in missing_cols:
        user_profile_df[col] = 0  # Add missing columns with default value 0
    
    # Reorder columns to match the training data
    user_profile_df = user_profile_df[X_train.columns]
    
    # Predict the obesity risk category for the user profile
    predicted_risk_category = model.predict(user_profile_df)
    predicted_risk_category_label = label_encoder.inverse_transform(predicted_risk_category)[0]
    print(f'Predicted Obesity Risk Category: {predicted_risk_category_label}')
    
    # Define cluster preferences based on risk category
    if predicted_risk_category_label == 'Low':
        # Low-risk users: Focus on high-protein, balanced meals
        preferred_clusters = [0, 1, 2]  # Example clusters
        sort_by = 'Protein_g'  # Sort by highest protein
    elif predicted_risk_category_label == 'Medium':
        # Medium-risk users: Focus on moderate-calorie, balanced meals
        preferred_clusters = [3, 4, 5]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by moderate calories
    else:
        # High-risk users: Focus on low-calorie, nutrient-dense meals
        preferred_clusters = [6, 7, 8, 9]  # Example clusters
        sort_by = 'Energy_kcal'  # Sort by lowest calories
    
    # Recommend meals from the preferred clusters
    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    
    # Sort meals based on the user's risk category
    if predicted_risk_category_label == 'High':
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=True)  # Low calories
    else:
        recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=False)  # High protein or moderate calories
    
    # Display recommended meals in a user-friendly format
    display_meal_cluster(recommended_meals, f"{predicted_risk_category_label}-Risk", num_meals)

# Example new genetic profile
new_profile = {
    'BMI': 28.5,  # Example value
    'Physical_Activity': 2,  # Example value (1: Low, 2: Moderate, 3: High)
    'Diet_Type': 'High-Fat',  # Example value
    'MC4R_Variant': 'rs17782313_CT',  # Example value
    'PPARG_Variant': 'rs1801282_CG',  # Example value
    'FTO_Variant': 'rs9939609_AT',  # Example value
    'LEPR_Variant': 'rs1137101_AG',  # Example value
    'Obesity_Risk_Score': 0.45  # Example value
}

# Recommend meals for the new profile
recommend_meals(new_profile, meal_df, best_rf, label_encoder)

# -------------------------------
# Save the Trained Model
# -------------------------------
# Define the save directory and filename based on the implementation method used
save_model_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
if not os.path.exists(save_model_dir):
    os.makedirs(save_model_dir)

# Filename includes the method name: here "RandomForest_GridSearch_SMOTE.pkl"
model_filename = os.path.join(save_model_dir, "RandomForest_GridSearch_SMOTE.pkl")

# Save the model using pickle
with open(model_filename, "wb") as f:
    pickle.dump(best_rf, f)
    
print(f"\nTrained model saved at: {model_filename}")


Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Validation Accuracy: 100.00%
Test Accuracy: 100.00%
Cross-Validation Accuracy (Noisy Data): 83.14%

Classification Report (Test Set):
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       182
         Low       1.00      1.00      1.00       154
      Medium       1.00      1.00      1.00       173

    accuracy                           1.00       509
   macro avg       1.00      1.00      1.00       509
weighted avg       1.00      1.00      1.00       509


Confusion Matrix (Test Set):
[[182   0   0]
 [  0 154   0]
 [  0   0 173]]


[WinError 2] The system cannot find the file specified
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                

Predicted Obesity Risk Category: Medium

=== Recommended Meals for Medium-Risk ===
Total Meals in Cluster: 3637

Top 5 Meals:
Meal Description                                   Energy (kcal)   Protein (g)     Fat (g)         Carbs (g)      
--------------------------------------------------------------------------------------------------------------
Snacks, popcorn, oil-popped, microwave, regular flavor, no trans fat 583.0           7.29            43.55           45.06          
Chocolate, dark, 60-69% cacao solids               579.0           6.12            38.31           52.42          
Candies, HERSHEY'S POT OF GOLD Almond Bar          577.0           12.82           38.46           46.15          
Candies, MARS SNACKFOOD US, COCOAVIA Chocolate Covered Almonds 573.0           9.51            37.07           50.22          
Candies, HERSHEY'S MILK CHOCOLATE WITH ALMOND BITES 568.0           9.76            35.73           51.72          

=== Nutritional Summary ===
Average Energ