In [2]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans

#############################
# Load and Preprocess Data
#############################
# Reads the genetic-profile CSV, derives the three-level risk target from
# Obesity_Risk_Score, label-encodes the categorical inputs, and produces the
# scaled train/test matrices used by the classifier below.

genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()

# Columns that are label-encoded further down.
categorical_columns = ["Diet_Type", "Physical_Activity"]
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]
encoded_columns = categorical_columns + variant_columns

# Give missing categorical entries the placeholder category "None".
# Only the encoded columns are filled: a frame-wide fillna("None") would put
# strings into numeric columns (score, Age, BMI, ...) and turn a missing
# number into an unrelated type/convert error later on.
genetic_df[encoded_columns] = genetic_df[encoded_columns].fillna("None")

# Convert Obesity_Risk_Score into categories.
# include_lowest=True closes the first bin on the left so a score of exactly
# 0 is labelled 'Low' instead of becoming NaN.
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

# Rows whose score is missing or outside [0, 1] get no category. Drop them so
# the target encoder cannot pick up a spurious extra class for them.
unlabeled_rows = genetic_df['Obesity_Risk_Category'].isna()
if unlabeled_rows.any():
    print(
        f"Dropping {int(unlabeled_rows.sum())} row(s) whose Obesity_Risk_Score "
        "is missing or outside [0, 1]."
    )
    genetic_df = genetic_df.loc[~unlabeled_rows].reset_index(drop=True)

# One fitted LabelEncoder per categorical column, kept for inference time.
label_encoders = {}

for col in encoded_columns:
    if col in variant_columns:
        # Variant values are normalised to str before encoding.
        genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Define features and target
features = [
    "Age", "BMI", "Physical_Activity", "Diet_Type",
    "MC4R_Present", "MC4R_Variant",
    "PPARG_Present", "PPARG_Variant",
    "FTO_Present", "FTO_Variant",
    "LEPR_Present", "LEPR_Variant"
]
target = "Obesity_Risk_Category"

X_gen = genetic_df[features]
y_gen = genetic_df[target]

# Encode target labels
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y_gen)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_gen, y_encoded, test_size=0.2, random_state=42)

# Scale features (important for Neural Networks).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#############################
# Train Neural Network (MLP)
#############################

# Build and fit the classifier in one expression (fit() returns the estimator).
# Architecture: two hidden layers (100 then 50 units), ReLU activations, Adam
# optimiser, at most 500 iterations. Early stopping holds out 20% of the
# training data and halts once the validation score stops improving.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.2,
).fit(X_train_scaled, y_train)

# Score the held-out split and print both summaries, one item per line.
y_pred = mlp_model.predict(X_test_scaled)
print(
    "Classification Report:",
    classification_report(y_test, y_pred, target_names=target_le.classes_),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

#############################
# Save Model and Preprocessing Artifacts as Pickle Files
#############################

# Target directory for every pickle; created on first run.
save_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
os.makedirs(save_dir, exist_ok=True)

# (file name, object) pairs, written in this order: the classifier, the
# per-column label encoders, the target encoder, and the feature scaler.
for artifact_name, artifact in (
    ("mlp_model.pkl", mlp_model),
    ("label_encoders.pkl", label_encoders),
    ("target_encoder.pkl", target_le),
    ("scaler.pkl", scaler),
):
    with open(os.path.join(save_dir, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

print(f"Model and preprocessing artifacts saved to: {save_dir}")

#############################
# Pipeline 2: Meal Recommendation
#############################
# Groups meals into clusters by their macro-nutrient columns and stores the
# labelled table next to the model artifacts.

meal_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\train.csv"
meal_df = pd.read_csv(meal_file_path)

# Columns the clustering is computed on (used as-is, without rescaling).
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]

num_clusters = 10
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it out makes the cluster assignment depend on the
# installed version (and triggers a FutureWarning on some of them).
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
# NOTE: the integer ids returned here are arbitrary labels; they carry no
# ordering by calories or any other nutrient.
meal_df['Meal_Cluster'] = kmeans.fit_predict(nutritional_features)

# Save the meal dataset (including the new Meal_Cluster column)
with open(os.path.join(save_dir, "meals.pkl"), "wb") as meal_file:
    pickle.dump(meal_df, meal_file)

print("Meal dataset saved.")

# Define a meal recommendation function
def recommend_meals(user_profile, meal_df, mlp_model, target_le, scaler, num_meals=5):
    """Predict a user's obesity-risk category and sample meals for it.

    The profile is encoded with the module-level ``label_encoders``, aligned
    to the module-level ``features`` list, scaled, and classified. Meals are
    then drawn at random from the clusters associated with the predicted
    category.

    Args:
        user_profile: Mapping of feature name to raw value (categorical
            fields as their original strings, not encoded integers).
        meal_df: DataFrame containing a 'Meal_Cluster' column plus
            'Descrip', 'Energy_kcal', 'Protein_g', 'Fat_g' and 'Carb_g'.
        mlp_model: Fitted classifier exposing ``predict``.
        target_le: Fitted encoder exposing ``inverse_transform`` for the
            risk labels.
        scaler: Fitted scaler exposing ``transform``.
        num_meals: Maximum number of meals to return.

    Returns:
        Tuple ``(predicted_label, meals)`` where ``meals`` is a DataFrame of
        up to ``num_meals`` randomly ordered rows.

    Raises:
        ValueError: If a categorical field holds a value the corresponding
            encoder was not fitted on.
    """
    import warnings  # local import: only needed for the missing-feature notice

    user_profile_df = pd.DataFrame([user_profile])

    # Encode every categorical field that is present in the profile.
    for col in ["Diet_Type", "Physical_Activity"] + list(variant_columns):
        if col not in user_profile_df.columns:
            continue
        raw_values = user_profile_df[col]
        if col in variant_columns:
            # Variants were cast to str before fitting; mirror that here.
            raw_values = raw_values.astype(str)
        encoder = label_encoders[col]
        try:
            user_profile_df[col] = encoder.transform(raw_values)
        except ValueError as err:
            # Re-raise with the column name so the caller can see which
            # field was rejected and what the encoder accepts.
            raise ValueError(
                f"Unrecognised value {raw_values.tolist()} for '{col}'; "
                f"expected one of {list(encoder.classes_)}"
            ) from err

    # Absent features are still defaulted to 0 (best effort), but the caller
    # is told, because a zero-filled field can make the prediction meaningless.
    missing_cols = [col for col in features if col not in user_profile_df.columns]
    if missing_cols:
        warnings.warn(
            f"user_profile is missing {missing_cols}; using 0 for these features",
            stacklevel=2,
        )
    # reindex adds the missing columns (filled with 0), drops extras, and puts
    # the columns in the order the scaler and model were fitted with.
    user_profile_df = user_profile_df.reindex(columns=features, fill_value=0)

    # Scale the user profile
    user_profile_scaled = scaler.transform(user_profile_df)

    predicted_category = mlp_model.predict(user_profile_scaled)[0]
    predicted_label = target_le.inverse_transform([predicted_category])[0]

    # NOTE(review): in this file 'Meal_Cluster' comes from KMeans.fit_predict,
    # whose ids are arbitrary, so these id ranges do not by themselves reflect
    # any nutritional ordering. Confirm the intended risk -> cluster mapping.
    clusters_by_risk = {
        'Low': [0, 1, 2, 3],
        'Medium': [4, 5, 6, 7],
    }
    # Any other label (i.e. 'High') falls through to the last two clusters.
    preferred_clusters = clusters_by_risk.get(predicted_label, [8, 9])

    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    # Shuffle so repeated calls surface different meals.
    recommended_meals = recommended_meals.sample(frac=1).reset_index(drop=True)

    return predicted_label, recommended_meals[['Descrip', 'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']].head(num_meals)

# Example user profile: lifestyle fields first, then one presence flag and
# one variant string per gene.
new_profile = {
    "Age": 35, "BMI": 28.5,
    "Physical_Activity": "Low", "Diet_Type": "High-Fat",
    "MC4R_Present": 1, "MC4R_Variant": "rs17782313_TT",
    "PPARG_Present": 0, "PPARG_Variant": "rs1801282_CG",
    "FTO_Present": 1, "FTO_Variant": "rs9939609_AT",
    "LEPR_Present": 1, "LEPR_Variant": "rs1137101_AG",
}

# Run the full pipeline for the example profile and show the result.
predicted_risk, recommended_meals = recommend_meals(
    new_profile,
    meal_df,
    mlp_model,
    target_le,
    scaler,
    num_meals=5,
)
print(
    f"Predicted Obesity Risk Category: {predicted_risk}",
    "Recommended Meals:",
    recommended_meals,
    sep="\n",
)

Classification Report:
              precision    recall  f1-score   support

        High       0.88      1.00      0.94        88
         Low       0.75      0.99      0.86       182
      Medium       0.98      0.45      0.62       130

    accuracy                           0.82       400
   macro avg       0.87      0.82      0.81       400
weighted avg       0.86      0.82      0.80       400

Confusion Matrix:
[[ 88   0   0]
 [  0 181   1]
 [ 12  59  59]]
Model and preprocessing artifacts saved to: C:\Users\trejan\Desktop\GNN\Saved models
Meal dataset saved.
Predicted Obesity Risk Category: High
Recommended Meals:
                                             Descrip  Energy_kcal  Protein_g  \
0                             Nuts, beechnuts, dried        576.0       6.20   
1                     Babyfood, meat, veal, strained         81.0      13.12   
2         Soup, chicken with rice, canned, condensed         68.0       1.84   
3  Infant formula, ABBOTT NUTRITION, SIMILAC, SEN.