In [6]:
import datetime
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import learning_curve, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Folder that receives every artifact written by this cell; created on first use
output_dir = r"C:\Users\trejan\Desktop\GNN\XGB"
os.makedirs(output_dir, exist_ok=True)

# Run identifier (YYYYMMDD_HHMMSS) embedded in each output filename
timestamp = format(datetime.datetime.now(), "%Y%m%d_%H%M%S")

def save_plot(fig, filename):
    """Write ``fig`` into ``output_dir`` under ``filename`` and close it.

    The figure is saved at 300 dpi with a tight bounding box. Closing it
    afterwards releases the figure from pyplot; the destination path is
    printed once the file has been written.
    """
    destination = os.path.join(output_dir, filename)
    fig.savefig(destination, bbox_inches='tight', dpi=300)
    plt.close(fig)
    print("Saved: " + destination)

def save_metrics(metrics_dict, filename):
    """Dump ``metrics_dict`` as JSON (4-space indent) to ``output_dir/filename``.

    Prints the destination path after the file has been written.
    """
    destination = os.path.join(output_dir, filename)
    with open(destination, 'w') as handle:
        json.dump(metrics_dict, handle, indent=4)
    print("Saved: " + destination)

def save_model(model, scaler, label_encoders, target_le, filename):
    """Pickle the estimator together with its preprocessing objects via joblib.

    The four objects are stored in a single dict under the keys ``'model'``,
    ``'scaler'``, ``'label_encoders'`` and ``'target_encoder'`` and written to
    ``output_dir/filename``. Prints the destination path when done.
    """
    bundle = dict(
        model=model,
        scaler=scaler,
        label_encoders=label_encoders,
        target_encoder=target_le,
    )
    destination = os.path.join(output_dir, filename)
    joblib.dump(bundle, destination)
    print("Saved: " + destination)

# The steps below rely on objects created by the k-NN training cell
# (knn_model, scaler, label_encoders, target_le, y_test, y_pred_knn, ...).

# 1. Persist the fitted model together with its scaler and encoders
save_model(
    knn_model,
    scaler,
    label_encoders,
    target_le,
    f"knn_obesity_model_{timestamp}.pkl",
)

# 2. Test-set accuracy plus the per-class report (as a nested dict), saved as JSON
metrics = dict(
    accuracy=float(accuracy_score(y_test, y_pred_knn)),
    classification_report=classification_report(
        y_test,
        y_pred_knn,
        target_names=target_le.classes_,
        output_dict=True,
    ),
)
save_metrics(metrics, f"model_metrics_{timestamp}.json")

# 3. Confusion-matrix heatmap: rows are actual classes, columns are predictions
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred_knn)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_le.classes_,
    yticklabels=target_le.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
save_plot(fig, f"confusion_matrix_{timestamp}.png")

# 4. Feature importance via permutation importance on the scaled test split:
#    each feature is shuffled 10 times (fixed seed) and the mean importance is kept.
from sklearn.inspection import permutation_importance

perm_importance = permutation_importance(
    knn_model, X_test_scaled, y_test, n_repeats=10, random_state=42
)
feature_importance = pd.DataFrame(
    {'Feature': features, 'Importance': perm_importance.importances_mean}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature first
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
ax.set_title('Feature Importance (Permutation-based)')
fig.tight_layout()
save_plot(fig, f"feature_importance_{timestamp}.png")

# 5. Generate and save learning curves
# knn_model was fitted on standardised features, while X_gen holds the raw feature
# columns. k-NN is distance-based, so scoring it on unscaled data would describe a
# different model. The scaler is therefore placed in a pipeline: every CV split
# standardises with statistics from its own training part only, and the curve
# reflects the same preprocessing + hyperparameters as the trained model.
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(**knn_model.get_params()))

train_sizes = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, test_scores = learning_curve(
    scaled_knn, X_gen, y_encoded, train_sizes=train_sizes, cv=5, scoring='accuracy')

# Mean and spread across the CV folds for each training-set size
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 8))
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', marker='+', markersize=5, label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.title('Learning Curves')
plt.legend(loc='lower right')
plt.grid(True)
save_plot(plt.gcf(), f"learning_curve_{timestamp}.png")

# 6. Generate and save cross-validation scores
# X_gen is unscaled, but knn_model was trained on standardised features. Wrapping the
# scaler and a k-NN with the same hyperparameters in a pipeline makes each fold fit
# the scaler on its own training part and evaluates the preprocessing actually used.
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(**knn_model.get_params()))

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(scaled_knn, X_gen, y_encoded, cv=cv, scoring='accuracy')

# Bar positions follow the number of scores returned instead of a hard-coded 1..5
fold_ids = range(1, len(cv_scores) + 1)

plt.figure(figsize=(10, 6))
plt.bar(fold_ids, cv_scores, color='skyblue')
plt.axhline(y=cv_scores.mean(), color='red', linestyle='-', label=f'Mean Accuracy: {cv_scores.mean():.4f}')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.ylim(0, 1.0)
plt.title('5-Fold Cross-Validation Scores')
plt.xticks(fold_ids)
plt.legend()
save_plot(plt.gcf(), f"cross_validation_scores_{timestamp}.png")

# 7. Generate and save ROC curves (for multi-class, one-vs-rest approach)
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# Binarize the output for ROC calculation; the class list is computed once so the
# train and test indicator matrices are guaranteed to share the same column order
class_labels = np.unique(y_encoded)
y_test_bin = label_binarize(y_test, classes=class_labels)
n_classes = y_test_bin.shape[1]

# Fit OneVsRest classifier for ROC calculation. The base estimator copies the
# hyperparameters of the trained knn_model instead of repeating a hard-coded k,
# so the curves keep describing the saved model if it is ever retuned.
ovr_clf = OneVsRestClassifier(KNeighborsClassifier(**knn_model.get_params()))
ovr_clf.fit(X_train_scaled, label_binarize(y_train, classes=class_labels))
y_score = ovr_clf.predict_proba(X_test_scaled)

# Compute ROC curve and ROC area for each class
plt.figure(figsize=(12, 8))
roc_auc = {}
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'ROC curve (class {target_le.classes_[i]}) (AUC = {roc_auc[i]:.2f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
save_plot(plt.gcf(), f"roc_curves_{timestamp}.png")

# 8. Generate and save k-value optimization plot (if not already chosen)
# Odd k from 1 to 29. X_gen is unscaled, so each candidate is evaluated inside a
# StandardScaler pipeline; otherwise the sweep would rank k for raw-feature
# distances rather than for the standardised features the final model is trained on.
k_range = list(range(1, 31, 2))
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        make_pipeline(StandardScaler(), knn), X_gen, y_encoded, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())

plt.figure(figsize=(10, 6))
plt.plot(k_range, k_scores, marker='o')
plt.xlabel('Value of K')
plt.ylabel('Cross-Validated Accuracy')
plt.title('k-NN: Accuracy for Different Values of k')
plt.grid(True)
save_plot(plt.gcf(), f"k_optimization_{timestamp}.png")

# 9. Grouped count plot: for every actual class, how the predictions were distributed
pred_df = pd.DataFrame({
    column: target_le.inverse_transform(encoded)
    for column, encoded in (('Actual', y_test), ('Predicted', y_pred_knn))
})

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Actual', hue='Predicted', data=pred_df, palette='viridis', ax=ax)
ax.set_title('Distribution of Predictions by Actual Class')
ax.set_xlabel('Actual Class')
ax.set_ylabel('Count')
save_plot(fig, f"prediction_distribution_{timestamp}.png")

# 10. Save a summary report
# Model type and k are read from the fitted estimator rather than hard-coded, so
# the report cannot drift from the model that was actually evaluated.
summary = {
    'model_type': type(knn_model).__name__,
    'n_neighbors': int(knn_model.n_neighbors),
    'accuracy': float(metrics['accuracy']),
    'mean_cv_accuracy': float(cv_scores.mean()),
    'std_cv_accuracy': float(cv_scores.std()),
    # feature_importance is already sorted by descending importance
    'top_features': feature_importance['Feature'].iloc[:5].tolist(),
    'timestamp': timestamp
}
save_metrics(summary, f"model_summary_{timestamp}.json")

# Print completion message
print(f"\nEvaluation complete! All metrics and visualizations saved to {output_dir}")

k-NN Test Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

        High       0.88      0.98      0.92        88
         Low       0.75      0.96      0.84       182
      Medium       0.86      0.46      0.60       130

    accuracy                           0.80       400
   macro avg       0.83      0.80      0.79       400
weighted avg       0.81      0.80      0.78       400

Confusion Matrix:
[[ 86   0   2]
 [  0 174   8]
 [ 12  58  60]]

Predicted Obesity Risk Category: High

Recommended Meals:
                                                Descrip  Energy_kcal  \
1633        Leavening agents, baking powder, low-sodium         97.0   
826                   Peas, mature seeds, sprouted, raw        124.0   
847   Turkey from whole, enhanced, light meat, meat ...        127.0   
1662  Turkey, breast, from whole bird, enhanced, mea...        127.0   
373   Turkey, wing, from whole bird, enhanced, meat ...        127.0   

      Protein_g 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
import random
import os
import joblib  # Added for model saving

# Set environment variables and parallel processing configuration
# LOKY_MAX_CPU_COUNT is consulted by joblib's loky backend when it counts CPUs.
os.environ['LOKY_MAX_CPU_COUNT'] = '4'

#############################
# Pipeline 1: k-NN for Obesity Risk Prediction
#############################

# Load genetic dataset and strip stray whitespace from the column names
genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()
# NOTE(review): this fills missing values in EVERY column with the string "None",
# numeric columns included; presumably intended only for the categorical/variant
# columns - confirm against the data.
genetic_df.fillna("None", inplace=True)

# Convert Obesity_Risk_Score into categories.
# include_lowest=True closes the first bin on the left, so a score of exactly 0 is
# labelled 'Low' instead of falling outside every bin and becoming NaN.
# Scores below 0 or above 1 still map to NaN.
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# One fitted LabelEncoder per categorical column, kept so the same mapping can be
# re-applied to new profiles at prediction time
label_encoders = {}

# Gene variant columns are cast to str before encoding; the lifestyle columns are not
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]

for col in ["Diet_Type", "Physical_Activity"] + variant_columns:
    if col in variant_columns:
        genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Model inputs: four demographic/lifestyle columns followed by a
# "<gene>_Present" / "<gene>_Variant" pair for each of the four genes
features = ["Age", "BMI", "Physical_Activity", "Diet_Type"] + [
    f"{gene}_{suffix}"
    for gene in ("MC4R", "PPARG", "FTO", "LEPR")
    for suffix in ("Present", "Variant")
]
target = "Obesity_Risk_Category"

X_gen, y_gen = genetic_df[features], genetic_df[target]

# Integer-encode the risk categories; target_le maps predictions back to labels
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y_gen)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_gen, y_encoded, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the k-NN classifier (k = 11) on the scaled training data
knn_model = KNeighborsClassifier(n_neighbors=11).fit(X_train_scaled, y_train)

# Report accuracy, per-class metrics and the confusion matrix on the test split
y_pred_knn = knn_model.predict(X_test_scaled)
print("k-NN Test Accuracy:", accuracy_score(y_test, y_pred_knn))
print(
    "Classification Report:",
    classification_report(y_test, y_pred_knn, target_names=target_le.classes_),
    sep="\n",
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_knn), sep="\n")

# Persist the model and each preprocessing object as its own joblib file
save_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
os.makedirs(save_dir, exist_ok=True)  # no error if the folder already exists

for artifact_name, artifact in (
    ("knn_model.pkl", knn_model),
    ("scaler.pkl", scaler),
    ("label_encoders.pkl", label_encoders),
    ("target_encoder.pkl", target_le),
):
    joblib.dump(artifact, os.path.join(save_dir, artifact_name))

print(f"\nModel and preprocessing artifacts saved to: {save_dir}")

#############################
# Pipeline 2: Meal Recommendation
#############################

# Read the meal table
meal_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\train.csv"
meal_df = pd.read_csv(meal_file_path)

# Standardise the four nutritional columns used for clustering
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler_meal = StandardScaler().fit(nutritional_features)
nutritional_features_scaled = scaler_meal.transform(nutritional_features)

# Group meals into 10 k-means clusters (fixed seed) and store each meal's cluster id
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(nutritional_features_scaled)
meal_df['Meal_Cluster'] = kmeans.labels_

def recommend_meals(user_profile, meal_df, knn_model, target_le, scaler, num_meals=5):
    """Predict a user's obesity-risk category and print matching meals.

    Args:
        user_profile: dict of raw feature values for one user. Categorical
            entries are given as their original labels.
        meal_df: meal table containing a ``Meal_Cluster`` column plus the
            ``Descrip`` and nutritional columns printed at the end.
        knn_model: fitted classifier used for the prediction.
        target_le: encoder that maps the predicted class back to its label.
        scaler: fitted scaler applied to the profile before prediction.
        num_meals: number of rows to print.

    Returns:
        None. The predicted label and the meal table are printed.

    Notes:
        Reads the module-level ``label_encoders``, ``variant_columns`` and
        ``features``. The sort column and direction are picked with
        ``random`` and the shuffle passes no seed, so repeated calls can
        print different meals.
        NOTE(review): the preferred cluster ids are hard-coded integers;
        whether those ids carry a stable meaning depends on how
        ``Meal_Cluster`` was produced - confirm.
    """
    profile = pd.DataFrame([user_profile])

    # Re-apply the training-time encoders; variant values are cast to str first
    for col in ["Diet_Type", "Physical_Activity", *variant_columns]:
        if col not in profile.columns:
            continue
        raw_values = profile[col].astype(str) if col in variant_columns else profile[col]
        profile[col] = label_encoders[col].transform(raw_values)

    # Features absent from the profile are filled with 0, then the columns are
    # put into the order the model was trained with
    for col in set(features) - set(profile.columns):
        profile[col] = 0
    profile = profile[features]

    predicted_category = knn_model.predict(scaler.transform(profile))[0]
    predicted_label = target_le.inverse_transform([predicted_category])[0]
    print(f"\nPredicted Obesity Risk Category: {predicted_label}")

    # Per-category (cluster ids, candidate sort columns); any label other than
    # 'Low' or 'Medium' uses the fallback entry
    strategies = {
        'Low': ([0, 1, 2, 3], ['Protein_g', 'Energy_kcal']),
        'Medium': ([4, 5, 6, 7], ['Energy_kcal', 'Fat_g']),
    }
    fallback = ([8, 9, 0, 1], ['Energy_kcal', 'Carb_g'])
    preferred_clusters, sort_options = strategies.get(predicted_label, fallback)
    sort_by = random.choice(sort_options)
    ascending = random.choice([True, False])

    # Shuffle the matching meals, then order them by the chosen column
    in_cluster = meal_df['Meal_Cluster'].isin(preferred_clusters)
    recommended_meals = (
        meal_df[in_cluster]
        .sample(frac=1)
        .reset_index(drop=True)
        .sort_values(by=sort_by, ascending=ascending)
    )

    print("\nRecommended Meals:")
    print(recommended_meals[['Descrip', 'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']].head(num_meals))

# Example: one hand-written profile run through the recommender
new_profile = dict(
    Age=35,
    BMI=28.5,
    Physical_Activity="Low",
    Diet_Type="High-Fat",
    MC4R_Present=0,
    MC4R_Variant="rs17782313_TT",
    PPARG_Present=0,
    PPARG_Variant="rs1801282_CG",
    FTO_Present=1,
    FTO_Variant="rs9939609_AT",
    LEPR_Present=1,
    LEPR_Variant="rs1137101_AG",
)

recommend_meals(
    user_profile=new_profile,
    meal_df=meal_df,
    knn_model=knn_model,
    target_le=target_le,
    scaler=scaler,
    num_meals=5,
)

k-NN Test Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

        High       0.88      0.98      0.92        88
         Low       0.75      0.96      0.84       182
      Medium       0.86      0.46      0.60       130

    accuracy                           0.80       400
   macro avg       0.83      0.80      0.79       400
weighted avg       0.81      0.80      0.78       400

Confusion Matrix:
[[ 86   0   2]
 [  0 174   8]
 [ 12  58  60]]

Model and preprocessing artifacts saved to: C:\Users\trejan\Desktop\GNN\Saved models

Predicted Obesity Risk Category: High

Recommended Meals:
                                            Descrip  Energy_kcal  Protein_g  \
2088     Nuts, pecans, oil roasted, with salt added        715.0       9.20   
1806  Nuts, pecans, oil roasted, without salt added        715.0       9.20   
1997     Nuts, pecans, dry roasted, with salt added        710.0       9.50   
1079  Nuts, pecans, dry roasted, without salt a

[WinError 2] The system cannot find the file specified
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                