In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cluster import KMeans  # Import KMeans
import random  # For introducing randomness in recommendations

# Define the model save path
model_save_path = r"C:\Users\trejan\Desktop\GNN\Saved models\LogisticRegression.pkl"
# Companion artifact written next to the model: the fitted scaler and encoders.
# The pickled model alone cannot score new profiles, because its inputs must be
# label-encoded and standardized exactly as they were during training.
preprocessing_save_path = model_save_path.replace(".pkl", "_preprocessing.pkl")

#############################
# Pipeline 1: Logistic Regression for Obesity Risk Prediction
#############################

# Load genetic dataset (assumes comma-delimited)
genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()

categorical_columns = ["Diet_Type", "Physical_Activity"]
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]

# Fill missing values only in the categorical columns. A frame-wide
# fillna("None") would also write the *string* "None" into numeric columns,
# which then cannot be binned or scaled.
fill_columns = categorical_columns + variant_columns
genetic_df[fill_columns] = genetic_df[fill_columns].fillna("None")

# Convert Obesity_Risk_Score into categories (Low, Medium, High).
# include_lowest=True puts a score of exactly 0 into 'Low'; without it the
# first bin is left-open and such rows would get a NaN category.
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Fail loudly on scores that could not be binned instead of letting a NaN
# category reach the target encoder.
unbinned_rows = genetic_df['Obesity_Risk_Category'].isna()
if unbinned_rows.any():
    raise ValueError(
        f"{int(unbinned_rows.sum())} row(s) have a missing Obesity_Risk_Score "
        "or one outside [0, 1] and cannot be assigned a risk category."
    )

# Initialize dictionary to store LabelEncoders (one fitted encoder per column)
label_encoders = {}

# Encode categorical variables (Diet_Type, Physical_Activity)
for col in categorical_columns:
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Encode gene variant columns as strings (so that "None" is encoded too)
for col in variant_columns:
    genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Define features and target for the genetic model
features = [
    "Age", "BMI", "Physical_Activity", "Diet_Type",
    "MC4R_Present", "MC4R_Variant",
    "PPARG_Present", "PPARG_Variant",
    "FTO_Present", "FTO_Variant",
    "LEPR_Present", "LEPR_Variant"
]
target = "Obesity_Risk_Category"

X_gen = genetic_df[features]
y_gen = genetic_df[target]

# Encode target labels (Low, Medium, High)
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y_gen)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_gen, y_encoded, test_size=0.2, random_state=42)

# Scale features (important for Logistic Regression); the scaler is fitted on
# the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model.
# The multi_class argument is intentionally omitted: it is deprecated in recent
# scikit-learn releases, and the later cell of this notebook already trains
# without it.
logreg_model = LogisticRegression(
    solver='lbfgs',            # Suitable for small datasets
    max_iter=1000,             # Increase iterations for convergence
    random_state=42
)
logreg_model.fit(X_train_scaled, y_train)

# Save the trained model
with open(model_save_path, 'wb') as model_file:
    pickle.dump(logreg_model, model_file)

# Save the fitted preprocessing objects so the model can be used after reload
with open(preprocessing_save_path, 'wb') as preprocessing_file:
    pickle.dump(
        {
            "scaler": scaler,
            "label_encoders": label_encoders,
            "target_le": target_le,
            "features": features,
        },
        preprocessing_file,
    )

print(f"Model saved successfully at {model_save_path}")

# Evaluate the model
y_pred_logreg = logreg_model.predict(X_test_scaled)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Classification Report:")
print(classification_report(y_test, y_pred_logreg, target_names=target_le.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))


Model saved successfully at C:\Users\trejan\Desktop\GNN\Saved models\LogisticRegression.pkl
Logistic Regression Test Accuracy: 0.8075
Classification Report:
              precision    recall  f1-score   support

        High       0.89      0.94      0.92        88
         Low       0.75      1.00      0.85       182
      Medium       0.92      0.45      0.60       130

    accuracy                           0.81       400
   macro avg       0.85      0.80      0.79       400
weighted avg       0.83      0.81      0.79       400

Confusion Matrix:
[[ 83   0   5]
 [  0 182   0]
 [ 10  62  58]]




In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
import random  # For introducing randomness in recommendations
import os  # To set environment variables

# Suppress joblib warning about physical cores
os.environ['LOKY_MAX_CPU_COUNT'] = '4'

#############################
# Pipeline 1: Logistic Regression for Obesity Risk Prediction
#############################

# Load genetic dataset (assumes comma-delimited)
genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()

categorical_columns = ["Diet_Type", "Physical_Activity"]
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]

# Fill missing values only in the categorical columns. A frame-wide
# fillna("None") would also write the *string* "None" into numeric columns,
# which then cannot be binned or scaled.
fill_columns = categorical_columns + variant_columns
genetic_df[fill_columns] = genetic_df[fill_columns].fillna("None")

# Convert Obesity_Risk_Score into categories (Low, Medium, High).
# include_lowest=True puts a score of exactly 0 into 'Low'; without it the
# first bin is left-open and such rows would get a NaN category.
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Fail loudly on scores that could not be binned instead of letting a NaN
# category reach the target encoder.
unbinned_rows = genetic_df['Obesity_Risk_Category'].isna()
if unbinned_rows.any():
    raise ValueError(
        f"{int(unbinned_rows.sum())} row(s) have a missing Obesity_Risk_Score "
        "or one outside [0, 1] and cannot be assigned a risk category."
    )

# Initialize dictionary to store LabelEncoders (one fitted encoder per column);
# recommend_meals() reuses these to encode a new user profile.
label_encoders = {}

# Encode categorical variables (Diet_Type, Physical_Activity)
for col in categorical_columns:
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Encode gene variant columns as strings (so that "None" is encoded too)
for col in variant_columns:
    genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Define features and target for the genetic model
features = [
    "Age", "BMI", "Physical_Activity", "Diet_Type",
    "MC4R_Present", "MC4R_Variant",
    "PPARG_Present", "PPARG_Variant",
    "FTO_Present", "FTO_Variant",
    "LEPR_Present", "LEPR_Variant"
]
target = "Obesity_Risk_Category"

X_gen = genetic_df[features]
y_gen = genetic_df[target]

# Encode target labels (Low, Medium, High)
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y_gen)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_gen, y_encoded, test_size=0.2, random_state=42)

# Scale features (important for Logistic Regression); the scaler is fitted on
# the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model
logreg_model = LogisticRegression(
    solver='lbfgs',            # Suitable for small datasets
    max_iter=1000,             # Increase iterations for convergence
    random_state=42
)
logreg_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_logreg = logreg_model.predict(X_test_scaled)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Classification Report:")
print(classification_report(y_test, y_pred_logreg, target_names=target_le.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))

#############################
# Pipeline 2: Meal Recommendation (Same as Before)
#############################

# Read the meal table from disk (expected to be a comma-delimited CSV)
meal_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\train.csv"
meal_df = pd.read_csv(meal_file_path)

# Select the four nutrient columns used for clustering (they must be present
# in the meal file) and standardize them with a scaler dedicated to meals.
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler_meal = StandardScaler()
scaler_meal.fit(nutritional_features)
nutritional_features_scaled = scaler_meal.transform(nutritional_features)

# Group the meals into a fixed number of KMeans clusters and store each
# meal's cluster id in a new 'Meal_Cluster' column.
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(nutritional_features_scaled)
meal_df['Meal_Cluster'] = kmeans.labels_

# Define a meal recommendation function that uses the predicted obesity risk category
def _encode_profile_column(profile_df, col, encoder, as_str=False):
    """Encode one column of the single-row profile frame with its fitted encoder.

    Re-raises the encoder's ValueError with the column name and the known
    classes so an unseen category is easy to diagnose.
    """
    values = profile_df[col].astype(str) if as_str else profile_df[col]
    try:
        return encoder.transform(values)
    except ValueError as exc:
        known = ", ".join(map(str, encoder.classes_))
        raise ValueError(
            f"Unknown value {list(values)!r} for '{col}'; expected one of: {known}"
        ) from exc


def recommend_meals(user_profile, meal_df, logreg_model, target_le, scaler, num_meals=5,
                    encoders=None, variant_cols=None, feature_names=None, random_state=None):
    """
    Predict the obesity risk category for one user and print matching meals.

    user_profile: dict with genetic feature values (original, unencoded)
    meal_df: meal table with a 'Meal_Cluster' column plus 'Descrip',
        'Energy_kcal', 'Protein_g', 'Fat_g' and 'Carb_g'
    logreg_model: trained logistic regression model
    target_le: LabelEncoder for the target risk category
    scaler: StandardScaler fitted on genetic features
    num_meals: number of meals to print
    encoders: optional dict mapping column name -> fitted encoder; defaults to
        the module-level `label_encoders`
    variant_cols: optional list of gene-variant column names; defaults to the
        module-level `variant_columns`
    feature_names: optional ordered list of model input columns; defaults to
        the module-level `features`
    random_state: optional seed; when given, the random sort choice and the
        shuffle are reproducible. None keeps the unseeded behaviour.

    Prints the predicted category and the top `num_meals` meals; returns None.
    Raises ValueError when a categorical value was not seen during training.
    """
    # Fall back to the notebook-level objects when no explicit ones are passed
    if encoders is None:
        encoders = label_encoders
    if variant_cols is None:
        variant_cols = variant_columns
    if feature_names is None:
        feature_names = features

    # Convert user_profile into a single-row DataFrame
    user_profile_df = pd.DataFrame([user_profile])

    # Encode categorical features using stored encoders
    for col in ["Diet_Type", "Physical_Activity"]:
        if col in user_profile_df.columns:
            user_profile_df[col] = _encode_profile_column(user_profile_df, col, encoders[col])
    for col in variant_cols:
        if col in user_profile_df.columns:
            # Variants were cast to str before fitting, so cast here as well
            user_profile_df[col] = _encode_profile_column(
                user_profile_df, col, encoders[col], as_str=True
            )

    # Keep exactly the model's features in training order; absent ones become 0
    user_profile_df = user_profile_df.reindex(columns=list(feature_names), fill_value=0)

    # Scale the user profile using the same scaler as training
    user_profile_scaled = scaler.transform(user_profile_df)

    # Predict obesity risk using the logistic regression model
    predicted_category = logreg_model.predict(user_profile_scaled)[0]
    predicted_label = target_le.inverse_transform([predicted_category])[0]
    print(f"\nPredicted Obesity Risk Category: {predicted_label}")

    # Cluster preferences and candidate sort columns per risk label (example logic).
    # NOTE(review): the cluster ids are hard-coded; confirm they still match the
    # intended meal groups whenever the KMeans clustering is refitted.
    risk_rules = {
        'Low': ([0, 1, 2, 3], ['Protein_g', 'Energy_kcal']),
        'Medium': ([4, 5, 6, 7], ['Energy_kcal', 'Fat_g']),
    }
    fallback_rule = ([8, 9, 0, 1], ['Energy_kcal', 'Carb_g'])  # 'High' and any other label
    preferred_clusters, sort_options = risk_rules.get(predicted_label, fallback_rule)

    # Use a private generator when seeded, otherwise the global `random` module
    rng = random if random_state is None else random.Random(random_state)
    sort_by = rng.choice(sort_options)       # Randomize sorting column
    ascending = rng.choice([True, False])    # Randomize order

    # Filter, shuffle and sort meals from the preferred clusters
    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    recommended_meals = recommended_meals.sample(frac=1, random_state=random_state).reset_index(drop=True)
    recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=ascending)

    print("\nRecommended Meals:")
    print(recommended_meals[['Descrip', 'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']].head(num_meals))

# Sample user profile for the recommender. Values are given in their raw,
# unencoded form; recommend_meals() encodes the string fields itself.
new_profile = {
    "Age": 35,
    "BMI": 28.5,
    "Physical_Activity": "Low",        # raw category string
    "Diet_Type": "High-Fat",           # raw category string
    "MC4R_Present": 1,
    "MC4R_Variant": "rs17782313_TT",
    "PPARG_Present": 0,
    "PPARG_Variant": "rs1801282_CG",
    "FTO_Present": 1,
    "FTO_Variant": "rs9939609_AT",
    "LEPR_Present": 0,
    "LEPR_Variant": "rs1137101_AG",
}

# Run the recommender with the logistic regression model trained above
recommend_meals(
    user_profile=new_profile,
    meal_df=meal_df,
    logreg_model=logreg_model,
    target_le=target_le,
    scaler=scaler,
    num_meals=5,
)

Logistic Regression Test Accuracy: 0.8075
Classification Report:
              precision    recall  f1-score   support

        High       0.89      0.94      0.92        88
         Low       0.75      1.00      0.85       182
      Medium       0.92      0.45      0.60       130

    accuracy                           0.81       400
   macro avg       0.85      0.80      0.79       400
weighted avg       0.83      0.81      0.79       400

Confusion Matrix:
[[ 83   0   5]
 [  0 182   0]
 [ 10  62  58]]

Predicted Obesity Risk Category: Medium

Recommended Meals:
                                                Descrip  Energy_kcal  \
2314     Egg, whole, dried, stabilized, glucose reduced        615.0   
364   Snacks, popcorn, oil-popped, microwave, regula...        583.0   
589                                   Egg, whole, dried        605.0   
1783          Puff pastry, frozen, ready-to-bake, baked        558.0   
2985          Candies, HERSHEY'S POT OF GOLD Almond Bar        577.0 