In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (classification_report, confusion_matrix, roc_curve, auc, 
                             precision_recall_curve, average_precision_score)
from sklearn.cluster import KMeans

#############################
# Load and Preprocess Data
#############################

genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()

# Columns holding categorical text that is label-encoded further down.
lifestyle_columns = ["Diet_Type", "Physical_Activity"]
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]

# Replace missing values with the literal "None" in the categorical columns only.
# Filling the whole frame would also write that string into any numeric column
# containing a gap, silently turning it into an object column.
categorical_columns = lifestyle_columns + variant_columns
genetic_df[categorical_columns] = genetic_df[categorical_columns].fillna("None")

# Convert Obesity_Risk_Score into categories:
#   [0, 0.5] -> Low, (0.5, 0.8] -> Medium, (0.8, 1] -> High.
# include_lowest=True keeps a score of exactly 0 inside the first bin; without it
# pd.cut treats the first interval as (0, 0.5] and yields NaN for 0.
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Fail early if any row could not be binned, rather than letting a NaN target
# reach the label encoder.
unlabelled_rows = genetic_df['Obesity_Risk_Category'].isna()
if unlabelled_rows.any():
    raise ValueError(
        f"{int(unlabelled_rows.sum())} row(s) have an Obesity_Risk_Score that is "
        "missing or outside [0, 1]; clean these before training."
    )

# Initialize LabelEncoders (column name -> fitted encoder, reused at inference time)
label_encoders = {}

# Encode categorical features
for col in lifestyle_columns:
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Encode genetic variants (cast to str first so every value has a uniform type)
for col in variant_columns:
    genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Define features and target
features = [
    "Age", "BMI", "Physical_Activity", "Diet_Type",
    "MC4R_Present", "MC4R_Variant",
    "PPARG_Present", "PPARG_Variant",
    "FTO_Present", "FTO_Variant",
    "LEPR_Present", "LEPR_Variant"
]
target = "Obesity_Risk_Category"

X_gen = genetic_df[features]
y_gen = genetic_df[target]

# Encode target labels (LabelEncoder orders classes alphabetically: High, Low, Medium)
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y_gen)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_gen, y_encoded, test_size=0.2, random_state=42)

# Scale features (important for Neural Networks); the scaler is fit on the
# training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#############################
# Train Neural Network (MLP)
#############################

# Hyper-parameters of the MLP, gathered in one place for readability.
mlp_params = dict(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 then 50 neurons
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
    early_stopping=True,           # stop once the validation score stops improving
    validation_fraction=0.2,       # share of the training data held out for that check
)
mlp_model = MLPClassifier(**mlp_params)

# Fit on the scaled training split.
mlp_model.fit(X_train_scaled, y_train)

# Score the held-out test split and print the text reports.
y_pred = mlp_model.predict(X_test_scaled)
test_report = classification_report(y_test, y_pred, target_names=target_le.classes_)
test_confusion = confusion_matrix(y_test, y_pred)
print("Classification Report:")
print(test_report)
print("Confusion Matrix:")
print(test_confusion)

#############################
# Save Model and Preprocessing Artifacts as Pickle Files
#############################

# Output directory for every artifact; created on first use.
save_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
os.makedirs(save_dir, exist_ok=True)

# File name -> object to pickle. Written in this order: model, feature encoders,
# target encoder, scaler.
artifacts_to_save = {
    "mlp_model.pkl": mlp_model,
    "label_encoders.pkl": label_encoders,
    "target_encoder.pkl": target_le,
    "scaler.pkl": scaler,
}
for artifact_filename, artifact in artifacts_to_save.items():
    with open(os.path.join(save_dir, artifact_filename), "wb") as f:
        pickle.dump(artifact, f)

print(f"Model and preprocessing artifacts saved to: {save_dir}")

#############################
# Additional Evaluation Metrics and Graphs for MLP Model
#############################

# One-vs-rest inputs shared by the ROC and precision-recall plots:
# a binarized copy of the test labels and the per-class predicted probabilities.
n_classes = len(target_le.classes_)
y_test_bin = label_binarize(y_test, classes=list(range(n_classes)))
y_score = mlp_model.predict_proba(X_test_scaled)

# 1. ROC curve per class, drawn on a single figure
colors = ['blue', 'red', 'green']  # one entry per class; extend if more classes are present
plt.figure()
for i, class_name in enumerate(target_le.classes_):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    curve_label = f"ROC curve for {class_name} (area = {roc_auc:.2f})"
    plt.plot(fpr, tpr, color=colors[i], lw=2, label=curve_label)
# Diagonal reference line (chance level)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.title('ROC Curves for MLP Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

roc_path = os.path.join(save_dir, 'mlp_roc_curves.png')
plt.savefig(roc_path)
plt.close()
print(f"ROC curves saved to: {roc_path}")

# 2. Precision-recall curve per class (one-vs-rest), all on one figure
plt.figure()
for i, class_name in enumerate(target_le.classes_):
    class_truth = y_test_bin[:, i]
    class_scores = y_score[:, i]
    precision, recall, _ = precision_recall_curve(class_truth, class_scores)
    avg_precision = average_precision_score(class_truth, class_scores)
    curve_label = f"PR curve for {class_name} (AP = {avg_precision:.2f})"
    plt.plot(recall, precision, lw=2, label=curve_label)
plt.title('Precision-Recall Curves for MLP Classifier')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='upper right')

pr_path = os.path.join(save_dir, 'mlp_precision_recall_curves.png')
plt.savefig(pr_path)
plt.close()
print(f"Precision-Recall curves saved to: {pr_path}")

# 3. Confusion matrix rendered as an annotated heatmap (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, y_pred)
class_tick_labels = target_le.classes_

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=class_tick_labels,
    yticklabels=class_tick_labels,
)
plt.title('Confusion Matrix for MLP Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')

cm_path = os.path.join(save_dir, 'mlp_confusion_matrix.png')
plt.savefig(cm_path)
plt.close()
print(f"Confusion Matrix heatmap saved to: {cm_path}")

# 4. Training loss per iteration; only drawn when the fitted model exposes loss_curve_
if hasattr(mlp_model, 'loss_curve_'):
    loss_curve_path = os.path.join(save_dir, 'mlp_loss_curve.png')

    plt.figure()
    plt.plot(mlp_model.loss_curve_, marker='o')
    plt.title('MLP Loss Curve')
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.savefig(loss_curve_path)
    plt.close()

    print(f"Loss curve saved to: {loss_curve_path}")

# 5. Learning curve: 5-fold cross-validated score at ten training-set sizes
#    (10% .. 100% of the scaled training split), folds evaluated in parallel.
train_sizes, train_scores, valid_scores = learning_curve(
    mlp_model,
    X_train_scaled,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=-1,
)

# Average across folds so each training size contributes one point per series.
mean_scores_by_series = {
    'Training score': np.mean(train_scores, axis=1),
    'Cross-validation score': np.mean(valid_scores, axis=1),
}

plt.figure()
for series_label, series_means in mean_scores_by_series.items():
    plt.plot(train_sizes, series_means, 'o-', label=series_label)
plt.title('Learning Curve for MLP Classifier')
plt.xlabel('Training Set Size')
plt.ylabel('Score')
plt.legend(loc='best')

learning_curve_path = os.path.join(save_dir, 'mlp_learning_curve.png')
plt.savefig(learning_curve_path)
plt.close()
print(f"Learning curve saved to: {learning_curve_path}")

#############################
# Pipeline 2: Meal Recommendation
#############################

# Load the meal table and group meals by their macro-nutrient profile.
meal_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\train.csv"
meal_df = pd.read_csv(meal_file_path)

# Only these four columns drive the clustering.
nutrient_columns = ['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']
nutritional_features = meal_df[nutrient_columns]

# Fit KMeans and store each meal's cluster id back on the frame.
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
meal_cluster_ids = kmeans.fit_predict(nutritional_features)
meal_df['Meal_Cluster'] = meal_cluster_ids

# Persist the clustered meal table next to the other artifacts.
meals_pickle_path = os.path.join(save_dir, "meals.pkl")
with open(meals_pickle_path, "wb") as meal_file:
    pickle.dump(meal_df, meal_file)

print("Meal dataset saved.")

# Define a meal recommendation function
def recommend_meals(user_profile, meal_df, mlp_model, target_le, scaler, num_meals=5,
                    *, random_state=None, encoders=None, variant_cols=None, feature_names=None):
    """Predict a user's obesity-risk category and sample meals for that category.

    Parameters
    ----------
    user_profile : dict
        Raw (un-encoded) feature values keyed by feature name. Features that are
        absent from the dict are filled with 0 before scaling.
    meal_df : pandas.DataFrame
        Meal table containing a 'Meal_Cluster' column plus 'Descrip',
        'Energy_kcal', 'Protein_g', 'Fat_g' and 'Carb_g'.
    mlp_model : object
        Fitted classifier exposing ``predict``.
    target_le : object
        Fitted target encoder exposing ``inverse_transform``.
    scaler : object
        Fitted scaler exposing ``transform``.
    num_meals : int, default 5
        Maximum number of meals to return.
    random_state : int or None, keyword-only
        Seed forwarded to ``DataFrame.sample``; ``None`` keeps the draw random.
    encoders : dict or None, keyword-only
        Column name -> fitted label encoder. Defaults to the notebook-level
        ``label_encoders``.
    variant_cols : list or None, keyword-only
        Columns cast to ``str`` before encoding. Defaults to the notebook-level
        ``variant_columns``.
    feature_names : list or None, keyword-only
        Ordered model input columns. Defaults to the notebook-level ``features``.

    Returns
    -------
    tuple
        ``(predicted_label, meals)`` where ``meals`` is a DataFrame of at most
        ``num_meals`` rows with the description and macro-nutrient columns.

    Raises
    ------
    ValueError
        If a categorical value in ``user_profile`` was not seen when the
        corresponding encoder was fitted.
    """
    # Fall back to the objects built earlier in the notebook when no override is given.
    encoders = label_encoders if encoders is None else encoders
    variant_cols = variant_columns if variant_cols is None else variant_cols
    feature_names = features if feature_names is None else feature_names

    user_profile_df = pd.DataFrame([user_profile])

    # Apply the fitted encoders to whichever categorical columns were supplied.
    for col, le in encoders.items():
        if col not in user_profile_df.columns:
            continue
        values = user_profile_df[col]
        if col in variant_cols:
            # Variants were cast to str before fitting, so mirror that here.
            values = values.astype(str)
        known_labels = set(le.classes_)
        unseen = [v for v in values.unique() if v not in known_labels]
        if unseen:
            raise ValueError(
                f"Unknown value(s) {unseen} for '{col}'; expected one of {sorted(map(str, known_labels))}"
            )
        user_profile_df[col] = le.transform(values)

    # Any feature the caller left out is filled with 0, then columns are put in model order.
    for col in feature_names:
        if col not in user_profile_df.columns:
            user_profile_df[col] = 0
    user_profile_df = user_profile_df[list(feature_names)]

    # Scale the user profile
    user_profile_scaled = scaler.transform(user_profile_df)

    predicted_category = mlp_model.predict(user_profile_scaled)[0]
    predicted_label = target_le.inverse_transform([predicted_category])[0]

    # NOTE(review): KMeans cluster ids are arbitrary labels, so this fixed id -> risk
    # mapping carries no nutritional meaning by itself (e.g. ids 8-9 are not guaranteed
    # to be the leanest meals). Consider deriving the mapping from per-cluster statistics.
    clusters_by_risk = {
        'Low': [0, 1, 2, 3],
        'Medium': [4, 5, 6, 7],
    }
    preferred_clusters = clusters_by_risk.get(predicted_label, [8, 9])

    candidates = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]

    # Draw only as many rows as needed instead of shuffling the whole candidate set.
    n_pick = max(0, min(num_meals, len(candidates)))
    if n_pick:
        picked = candidates.sample(n=n_pick, random_state=random_state)
    else:
        picked = candidates.iloc[:0]
    picked = picked.reset_index(drop=True)

    return predicted_label, picked[['Descrip', 'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]

# Example user profile: raw, un-encoded values keyed by feature name
new_profile = dict(
    Age=35,
    BMI=28.5,
    Physical_Activity="Low",
    Diet_Type="High-Fat",
    MC4R_Present=1,
    MC4R_Variant="rs17782313_TT",
    PPARG_Present=0,
    PPARG_Variant="rs1801282_CG",
    FTO_Present=1,
    FTO_Variant="rs9939609_AT",
    LEPR_Present=1,
    LEPR_Variant="rs1137101_AG",
)

# Run the recommender for that profile and show the outcome.
predicted_risk, recommended_meals = recommend_meals(
    new_profile, meal_df, mlp_model, target_le, scaler, num_meals=5
)
print(
    f"Predicted Obesity Risk Category: {predicted_risk}",
    "Recommended Meals:",
    recommended_meals,
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

        High       0.88      1.00      0.94        88
         Low       0.75      0.99      0.86       182
      Medium       0.98      0.45      0.62       130

    accuracy                           0.82       400
   macro avg       0.87      0.82      0.81       400
weighted avg       0.86      0.82      0.80       400

Confusion Matrix:
[[ 88   0   0]
 [  0 181   1]
 [ 12  59  59]]
Model and preprocessing artifacts saved to: C:\Users\trejan\Desktop\GNN\Saved models
ROC curves saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_roc_curves.png
Precision-Recall curves saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_precision_recall_curves.png
Confusion Matrix heatmap saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_confusion_matrix.png
Loss curve saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_loss_curve.png
Learning curve saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_learnin

[WinError 2] The system cannot find the file specified
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                

Meal dataset saved.
Predicted Obesity Risk Category: High
Recommended Meals:
                                             Descrip  Energy_kcal  Protein_g  \
0  Infant formula, MEAD JOHNSON, ENFAMIL, Enfagro...         65.0       2.17   
1  Fast foods, salad, vegetables tossed, without ...         82.0       7.98   
2        Peanuts, spanish, oil-roasted, without salt        579.0      28.01   
3     Sweet potato, frozen, cooked, baked, with salt        100.0       1.71   
4              Peanut butter, chunk style, with salt        589.0      24.06   

   Fat_g  Carb_g  
0   2.89    7.64  
1   4.93    1.45  
2  49.04   17.45  
3   0.12   23.40  
4  49.94   21.57  


In [2]:
from sklearn.inspection import permutation_importance

# Compute permutation importance on the held-out test split
# (10 shuffles per feature, seeded for reproducibility).
result = permutation_importance(mlp_model, X_test_scaled, y_test, n_repeats=10, random_state=42, n_jobs=-1)

# Visualize: features ordered from most to least important.
sorted_idx = result.importances_mean.argsort()[::-1]
plt.figure(figsize=(10, 6))
plt.barh(
    np.array(features)[sorted_idx],
    result.importances_mean[sorted_idx],
    xerr=result.importances_std[sorted_idx],  # spread across the repeats
)
# barh draws the first entry at the bottom; flip the axis so the most
# important feature is read first, at the top.
plt.gca().invert_yaxis()
plt.xlabel("Mean Importance")
plt.title("Permutation Feature Importance for MLP Classifier")
perm_path = os.path.join(save_dir, 'mlp_permutation_importance.png')
plt.tight_layout()
plt.savefig(perm_path)
plt.close()
print(f"Permutation Importance saved to: {perm_path}")


Permutation Importance saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_permutation_importance.png


In [4]:
from sklearn.inspection import PartialDependenceDisplay
import warnings
# The model was fitted on a plain array, so feeding it a DataFrame makes scikit-learn
# warn about feature names on every prediction. Silence only that message instead of
# hiding every warning raised for the rest of the session.
warnings.filterwarnings("ignore", message="X has feature names", category=UserWarning)

X_train_df = pd.DataFrame(X_train_scaled, columns=features)

# Select most important features from permutation
important_features = [features[i] for i in sorted_idx[:5]]  # Top 5 features

# Position of the class to plot within mlp_model.classes_ (0 = first class)
target_class_index = 0  # You can change this to 1 or 2 if needed

# `target` expects a class label as stored in mlp_model.classes_, not a position,
# so look the label up explicitly.
target_class_label = mlp_model.classes_[target_class_index]
# Human-readable name of that encoded label for the figure title.
target_class_name = target_le.inverse_transform([target_class_label])[0]

display = PartialDependenceDisplay.from_estimator(
    mlp_model,
    X_train_df,
    features=important_features,
    feature_names=features,
    kind='average',
    target=target_class_label
)

display.figure_.suptitle(f"Partial Dependence Plots for Class {target_class_name}")
plt.subplots_adjust(hspace=0.6)
pdp_path = os.path.join(save_dir, f'mlp_partial_dependence_class_{target_class_index}.png')
plt.savefig(pdp_path)
plt.close()
print(f"PDP saved to: {pdp_path}")


PDP saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_partial_dependence_class_0.png


In [5]:
import lime
import lime.lime_tabular

# Build the LIME tabular explainer from the scaled training data.
lime_explainer_options = dict(
    feature_names=features,
    class_names=target_le.classes_,
    discretize_continuous=True,
    verbose=True,
    random_state=42,
)
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, **lime_explainer_options)

# Explain one test-set prediction (index 0 = first test sample) and save it as HTML.
i = 0
sample_to_explain = X_test_scaled[i]
exp = explainer.explain_instance(sample_to_explain, mlp_model.predict_proba, num_features=10)

lime_path = os.path.join(save_dir, f"lime_explanation_sample_{i}.html")
exp.save_to_file(lime_path)
print(f"LIME explanation for sample {i} saved to: {lime_path}")


Intercept -0.03351194969935145
Prediction_local [0.32155442]
Right: 0.602373424093634
LIME explanation for sample 0 saved to: C:\Users\trejan\Desktop\GNN\Saved models\lime_explanation_sample_0.html


In [7]:
# Draw one partial-dependence figure per class. `target` takes a class label as
# stored in mlp_model.classes_, so iterate over the labels themselves rather than
# over positions (the two only coincide when labels are exactly 0..n-1).
for target_class in mlp_model.classes_:
    display = PartialDependenceDisplay.from_estimator(
        mlp_model,
        X_train_df,
        features=important_features,
        feature_names=features,
        kind="average",
        target=target_class  # class label to compute the dependence for
    )

    # Work on the figure returned by the display instead of relying on the
    # implicit "current figure".
    pdp_figure = display.figure_
    pdp_figure.suptitle(f"Partial Dependence Plots for MLP Classifier (Class {target_class})")
    pdp_figure.subplots_adjust(hspace=0.6)  # Increase space between subplots
    pdp_path = os.path.join(save_dir, f'mlp_partial_dependence_plots_class_{target_class}.png')
    pdp_figure.savefig(pdp_path)
    plt.close(pdp_figure)
    print(f"PDPs for Class {target_class} saved to: {pdp_path}")


PDPs for Class 0 saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_partial_dependence_plots_class_0.png
PDPs for Class 1 saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_partial_dependence_plots_class_1.png
PDPs for Class 2 saved to: C:\Users\trejan\Desktop\GNN\Saved models\mlp_partial_dependence_plots_class_2.png
