In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE

# Load datasets
genetic_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\genetic_profiles.csv")
meal_df = pd.read_csv("C:\\Users\\trejan\\Desktop\\Sem 2\\Machine Learning\\model\\train.csv")

# Handle missing values for numeric columns in genetic_df
numeric_cols = genetic_df.select_dtypes(include=['float64', 'int64']).columns
genetic_df[numeric_cols] = genetic_df[numeric_cols].fillna(genetic_df[numeric_cols].mean())

# Handle missing values for categorical columns in genetic_df
categorical_cols = genetic_df.select_dtypes(include=['object']).columns
genetic_df[categorical_cols] = genetic_df[categorical_cols].fillna('Unknown')

# Convert Obesity_Risk_Score into categories
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.3, 0.6, 1],
    labels=['Low', 'Medium', 'High']
)

# Select relevant features
features = ['BMI', 'Physical_Activity', 'Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant', 'Obesity_Risk_Score']
genetic_df = genetic_df[features + ['Obesity_Risk_Category']]

# One-hot encode categorical features
genetic_df = pd.get_dummies(genetic_df, columns=['Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)

# Prepare features and target
X = genetic_df.drop(['Obesity_Risk_Category'], axis=1)
y = genetic_df['Obesity_Risk_Category']

# Convert Obesity_Risk_Category to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Ensure all columns in X are numeric
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')
        X[col] = X[col].fillna(0)

# Scale the features (important for SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_encoded)

# Split resampled data
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['rbf', 'linear'],  # Kernel types
    'gamma': ['scale', 'auto', 0.1, 0.01],  # Kernel coefficient
    'class_weight': ['balanced', None]
}

# Use SVC for classification
svm = SVC(random_state=42)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train the model with the best parameters
best_svm = grid_search.best_estimator_
best_svm.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = best_svm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred) * 100
print(f'Validation Accuracy: {val_accuracy:.2f}%')

# Evaluate on the test set
y_test_pred = best_svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred) * 100
print(f'Test Accuracy: {test_accuracy:.2f}%')

# Evaluate on noisy data
X_noisy = X_resampled + np.random.normal(0, 0.1, X_resampled.shape)
cv_scores_noisy = cross_val_score(best_svm, X_noisy, y_resampled, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy (Noisy Data): {cv_scores_noisy.mean() * 100:.2f}%')

# Print classification report and confusion matrix
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

def recommend_meals(user_profile, meal_df, model, label_encoder, scaler, num_meals=5):
    # Convert the user profile to a DataFrame
    user_profile_df = pd.DataFrame([user_profile])
    
    # One-hot encode categorical features
    user_profile_df = pd.get_dummies(user_profile_df, columns=['Diet_Type', 'MC4R_Variant', 'PPARG_Variant', 'FTO_Variant', 'LEPR_Variant'], drop_first=True)
    
    # Ensure the user profile has the same columns as the training data
    missing_cols = set(X_train.columns) - set(user_profile_df.columns)
    for col in missing_cols:
        user_profile_df[col] = 0
    
    # Reorder columns to match the training data
    user_profile_df = user_profile_df[X_train.columns]
    
    # Scale the features using the same scaler
    user_profile_scaled = scaler.transform(user_profile_df)
    
    # Predict the obesity risk category
    predicted_risk_category = model.predict(user_profile_scaled)
    predicted_risk_category_label = label_encoder.inverse_transform(predicted_risk_category)[0]
    print(f'Predicted Obesity Risk Category: {predicted_risk_category_label}')
    
    # Rest of the meal recommendation logic remains the same...
    # (Previous meal clustering and recommendation code)

# Example usage
new_profile = {
    'BMI': 28.5,
    'Physical_Activity': 2,
    'Diet_Type': 'High-Fat',
    'MC4R_Variant': 'rs17782313_CT',
    'PPARG_Variant': 'rs1801282_CG',
    'FTO_Variant': 'rs9939609_AT',
    'LEPR_Variant': 'rs1137101_AG',
    'Obesity_Risk_Score': 0.45
}

# Recommend meals for the new profile
recommend_meals(new_profile, meal_df, best_svm, label_encoder, scaler)

Best Parameters: {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Validation Accuracy: 99.21%
Test Accuracy: 98.82%
Cross-Validation Accuracy (Noisy Data): 95.79%

Classification Report (Test Set):
              precision    recall  f1-score   support

        High       0.98      0.99      0.99       182
         Low       1.00      0.99      1.00       154
      Medium       0.98      0.98      0.98       173

    accuracy                           0.99       509
   macro avg       0.99      0.99      0.99       509
weighted avg       0.99      0.99      0.99       509


Confusion Matrix (Test Set):
[[180   0   2]
 [  0 153   1]
 [  3   0 170]]
Predicted Obesity Risk Category: Medium


