In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans

# Pipeline 1: Genetic Model
#############################

# Load genetic dataset (adjust sep='\t' if your file is tab-delimited)
genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
# Remove any extra whitespace from column names
genetic_df.columns = genetic_df.columns.str.strip()
print(genetic_df.columns.tolist())


# Fill missing values (if any)
genetic_df.fillna("None", inplace=True)

# Convert Obesity_Risk_Score into categories using thresholds
# (e.g. 0-0.3: Low, 0.3-0.6: Medium, 0.6-1: High)
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.3, 0.6, 1],
    labels=['Low', 'Medium', 'High']
)

# Initialize a dictionary to store label encoders (for future use)
label_encoders = {}

# Encode categorical variables (Diet_Type, Physical_Activity)
for col in ["Diet_Type", "Physical_Activity"]:
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Encode gene variant columns as strings (so that "None" is encoded too)
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]
for col in variant_columns:
    genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Define features and target for the genetic model
features = [
    "Age", "BMI", "Physical_Activity", "Diet_Type",
    "MC4R_Present", "MC4R_Variant",
    "PPARG_Present", "PPARG_Variant",
    "FTO_Present", "FTO_Variant",
    "LEPR_Present", "LEPR_Variant"
]
target = "Obesity_Risk_Category"

X_gen = genetic_df[features]
y_gen = genetic_df[target]

# Encode target labels (Low, Medium, High)
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y_gen)

# Split the genetic data
X_train, X_test, y_train, y_test = train_test_split(X_gen, y_encoded, test_size=0.2, random_state=42)

# Define and train an XGBoost classifier with regularization to help prevent overfitting
xgb_clf = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=3,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

xgb_clf.fit(X_train, y_train)

# Evaluate the classifier
y_pred = xgb_clf.predict(X_test)
print("Genetic Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_le.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

##################################
# Pipeline 2: Meal Recommendation
##################################

# Load the meal dataset (assumed to be comma-delimited)
meal_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\train.csv"
meal_df = pd.read_csv(meal_file_path)

# Preprocess nutritional features; here we assume these columns exist
nutritional_features = meal_df[['Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']]
scaler = StandardScaler()
nutritional_features_scaled = scaler.fit_transform(nutritional_features)

# Cluster meals using KMeans to create diverse groups (e.g., 10 clusters)
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
meal_df['Meal_Cluster'] = kmeans.fit_predict(nutritional_features_scaled)

# Define a meal recommendation function that uses the predicted obesity risk category
def recommend_meals(user_profile, meal_df, model, target_le, num_meals=5):
    """
    user_profile: dict with genetic feature values (unencoded, as originally provided)
    model: trained genetic model (xgb_clf)
    target_le: LabelEncoder for the target risk category
    """
    # Convert user_profile into DataFrame
    user_profile_df = pd.DataFrame([user_profile])
    
    # Encode the categorical features in the user profile using stored encoders
    for col in ["Diet_Type", "Physical_Activity"]:
        if col in user_profile_df.columns:
            le = label_encoders[col]
            user_profile_df[col] = le.transform(user_profile_df[col])
    for col in variant_columns:
        if col in user_profile_df.columns:
            le = label_encoders[col]
            user_profile_df[col] = le.transform(user_profile_df[col].astype(str))
    
    # Ensure the user profile has all required features; add missing ones as 0 if needed
    missing_cols = set(features) - set(user_profile_df.columns)
    for col in missing_cols:
        user_profile_df[col] = 0
    user_profile_df = user_profile_df[features]
    
    # Predict obesity risk category using the genetic model
    predicted_category = model.predict(user_profile_df)[0]
    predicted_label = target_le.inverse_transform([predicted_category])[0]
    print(f"\nPredicted Obesity Risk Category: {predicted_label}")
    
    # Define cluster preferences based on the predicted risk
    # (These cluster assignments are examples; you should adjust based on your meal data)
    if predicted_label == 'Low':
        # For low-risk individuals, recommend high-protein, balanced meals
        preferred_clusters = [0, 1, 2]
        sort_by = 'Protein_g'
        ascending = False
    elif predicted_label == 'Medium':
        # For medium-risk individuals, recommend moderate-calorie meals
        preferred_clusters = [3, 4, 5]
        sort_by = 'Energy_kcal'
        ascending = False
    else:
        # For high-risk individuals, recommend low-calorie, nutrient-dense meals
        preferred_clusters = [6, 7, 8, 9]
        sort_by = 'Energy_kcal'
        ascending = True
    
    # Filter and sort meals from the preferred clusters
    recommended_meals = meal_df[meal_df['Meal_Cluster'].isin(preferred_clusters)]
    recommended_meals = recommended_meals.sort_values(by=sort_by, ascending=ascending)
    
    print("\nRecommended Meals:")
    print(recommended_meals[['Descrip', 'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g']].head(num_meals))

# Example new genetic profile for meal recommendation (using original, unencoded values)
new_profile = {
    "Age": 35,
    "BMI": 28.5,
    "Physical_Activity": "Low",      # Original string value (will be encoded)
    "Diet_Type": "High-Fat",           # Original string value (will be encoded)
    "MC4R_Present": 1,
    "MC4R_Variant": "rs17782313_TT",    # Original string value
    "PPARG_Present": 1,
    "PPARG_Variant": "rs1801282_CG",    # Original string value
    "FTO_Present": 1,
    "FTO_Variant": "rs9939609_AT",      # Original string value
    "LEPR_Present": 1,
    "LEPR_Variant": "rs1137101_AG"       # Original string value
}

# Get meal recommendations based on the genetic model prediction
recommend_meals(new_profile, meal_df, xgb_clf, target_le, num_meals=5)


['Profile_ID', 'Age', 'BMI', 'Physical_Activity', 'Diet_Type', 'Obesity_Risk_Score', 'MC4R_Present', 'MC4R_Variant', 'PPARG_Present', 'PPARG_Variant', 'FTO_Present', 'FTO_Variant', 'LEPR_Present', 'LEPR_Variant']
Genetic Model Accuracy: 0.725
Classification Report:
              precision    recall  f1-score   support

        High       0.95      0.90      0.93       179
         Low       0.42      0.43      0.43        83
      Medium       0.64      0.67      0.65       138

    accuracy                           0.72       400
   macro avg       0.67      0.67      0.67       400
weighted avg       0.73      0.72      0.73       400

Confusion Matrix:
[[161  10   8]
 [  2  36  45]
 [  6  39  93]]


[WinError 2] The system cannot find the file specified
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\trejan\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1036, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                


Predicted Obesity Risk Category: High

Recommended Meals:
                                                Descrip  Energy_kcal  \
2301  Sweetener, herbal extract powder from Stevia leaf          0.0   
3281  Turkey, breast, from whole bird, enhanced, mea...        127.0   
3985  Turkey from whole, enhanced, light meat, meat ...        127.0   
5767  Turkey, wing, from whole bird, enhanced, meat ...        127.0   
4302         Game meat, buffalo, water, cooked, roasted        131.0   

      Protein_g  Fat_g  Carb_g  
2301       0.00   0.00   100.0  
3281      26.97   2.08     0.0  
3985      26.97   2.08     0.0  
5767      26.97   2.08     0.0  
4302      26.83   1.80     0.0  
