Included Libraries

In [29]:
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, log_loss
from sklearn.preprocessing import label_binarize

Reading Data

In [3]:
df = pd.read_csv("normalized.csv")
df.head()

Unnamed: 0,ID,Sex,Age,Height,Weight,Hypertension,Diabetes,BMI,Fitness Goal,Fitness Type,Exercises,Equipment,Diet,Recommendation,Level_Normal,Level_Obuse,Level_Overweight,Level_Underweight
0,1,0,-1.623821,-0.241008,-1.164277,0,0,-1.160781,0,0,"Squats, deadlifts, bench presses, and overhead...",Dumbbells and barbells,"Vegetables: (Carrots, Sweet Potato, and Lettuc...",Follow a regular exercise schedule. Adhere to ...,False,False,False,True
1,2,0,-1.623821,-0.241008,-1.164277,1,0,-1.160781,0,0,"Squats, deadlifts, bench presses, and overhead...","Light athletic shoes, resistance bands, and li...","Vegetables: (Tomatoes, Garlic, leafy greens, b...",Follow a regular exercise schedule. Adhere to ...,False,False,False,True
2,3,0,-1.623821,-0.241008,-1.164277,0,1,-1.160781,0,0,"Squats, yoga, deadlifts, bench presses, and ov...","Dumbbells, barbells and Blood glucose monitor","Vegetables: (Garlic, Roma Tomatoes, Capers and...",Follow a regular exercise schedule. Adhere to ...,False,False,False,True
3,4,0,-1.623821,-0.241008,-1.164277,1,1,-1.160781,0,0,"Squats, yoga, deadlifts, bench presses, and ov...","Light athletic shoes, resistance bands, light ...","Vegetables: (Garlic, Roma Tomatoes, Capers, Gr...",Follow a regular exercise schedule. Adhere to ...,False,False,False,True
4,5,0,-1.623821,-0.241008,-1.164277,0,0,-1.160781,0,0,"Squats, deadlifts, bench presses, and overhead...",Dumbbells and barbells,"Vegetables: (Carrots, Sweet Potato, Lettuce); ...",Follow a regular exercise schedule. Adhere to ...,False,False,False,True


Dropping unnecessary columns ( dimensionality reduction on a lower level )

In [4]:
columns_to_drop = ["BMI", "Recommendation", "ID"]

for column in columns_to_drop:
    if column in df.columns:
        df = df.drop(columns=[column])
        print(f'Dropped: {column}')
    else:
        print(f"Column '{column}' not found in the DataFrame !")


Dropped: BMI
Dropped: Recommendation
Dropped: ID


Functions for normalization into required categories

In [5]:
def preprocess_data(data):
    """
    Preprocess the data string by splitting it into individual items and cleaning the data.
    This handles commas, semi-colons, and extra spaces.

    Args:
    data (str): The data string to preprocess.

    Returns:
    list: A list of items found in the data string.
    """
    # Replace semi-colons with commas and split by commas
    items = data.replace(';', ',').split(',')

    # Clean up each item (strip spaces, convert to lowercase for consistency)
    items = [item.strip().lower() for item in items]

    return items

In [6]:

def categorize_row(data, categories_dict):
    """
    Categorize a single row of data based on predefined category lists.

    Args:
    data (str): The single data point (e.g., equipment or exercise item).
    categories_dict (dict): A dictionary where the keys are category names and values are lists of items belonging to each category.

    Returns:
    dict: A dictionary with category names as keys and binary indicators as values.
    """
    # Preprocess the data string
    items = preprocess_data(data)

    # Initialize the category columns as zeros
    category_columns = {category: 0 for category in categories_dict.keys()}

    # Check each category's list and update the columns based on whether items are found in the data
    for category, category_items in categories_dict.items():
        # Normalize category items and check if any match the preprocessed data
        category_items = [item.lower() for item in category_items]  # Normalize to lowercase
        if any(item in items for item in category_items):
            category_columns[category] = 1

    return category_columns

In [7]:
def categorize_data(data_column, categories_dict):
    """
    Apply categorization to each row in the data column using predefined category lists.

    Args:
    data_column (pd.Series): The column from the DataFrame containing data to be categorized.
    categories_dict (dict): A dictionary where the keys are category names and values are lists of items belonging to each category.

    Returns:
    pd.DataFrame: A DataFrame with separate columns for each category, with binary indicators.
    """
    # Apply the categorize_row function to each row in the column
    categorized_data = data_column.apply(lambda x: categorize_row(x, categories_dict))

    # Convert the list of dictionaries to a DataFrame
    categorized_df = pd.json_normalize(categorized_data)

    return categorized_df

Main for normalizing data

In [9]:
# Category lists
vegetables = ['carrots', 'sweet potato', 'lettuce', 'tomatoes', 'garlic', 'leafy greens', 'broccoli', 'bell peppers']

proteins = ['red meats', 'poultry', 'fish', 'eggs', 'dairy products', 'legumes', 'nuts', 'tofu', 'low-fat dairy products']

juices = ['fruit juice', 'watermelon juice', 'carrot juice', 'apple juice', 'mango juice', 'beetroot juice']

exercise = ['squats', 'deadlifts', 'bench presses', 'yoga', 'cycling', 'swimming', 'running', 'dancing', 'walking']

equipment = ['dumbbells', 'barbells', 'kettlebells', 'resistance bands', 'ellipticals', 'treadmills', 'rowing machine', 'light athletic shoes', 'yoga mat', 'blood glucose monitor']

# Category lists for different columns (diet, exercise, equipment)
diet_categories = {
    'vegetables': vegetables,
    'proteins': proteins,
    'juices': juices
}

exercise_categories = {
    'strength_exercise': ['squats', 'deadlifts', 'bench presses'],
    'cardio_exercise': ['cycling', 'running', 'swimming', 'dancing'],
    'flexibility_exercise': ['yoga', 'walking']
}

equipment_categories = {
    'strength_equipment': ['dumbbells', 'barbells', 'kettlebells', 'resistance bands'],
    'cardio_equipment': ['ellipticals', 'treadmills', 'rowing machine'],
    'flexibility_equipment': ['yoga mat', 'light athletic shoes', 'blood glucose monitor']
}

# Categorization for diet, exercise, and equipment
df_diet = categorize_data(df['Diet'], diet_categories)
df_exercise = categorize_data(df['Exercises'], exercise_categories)
df_equipment = categorize_data(df['Equipment'], equipment_categories)

# Merge the new columns into the original DataFrame
df = pd.concat([df, df_diet, df_exercise, df_equipment], axis=1)

# drop the original columns
df = df.drop(columns=['Diet', 'Exercises', 'Equipment'])

print(df)

       Sex       Age    Height    Weight  Hypertension  Diabetes  \
0        0 -1.623821 -0.241008 -1.164277             0         0   
1        0 -1.623821 -0.241008 -1.164277             1         0   
2        0 -1.623821 -0.241008 -1.164277             0         1   
3        0 -1.623821 -0.241008 -1.164277             1         1   
4        0 -1.623821 -0.241008 -1.164277             0         0   
...    ...       ...       ...       ...           ...       ...   
14342    0 -0.344694  1.876087  1.438140             0         0   
14343    0 -0.269452  1.981942  1.489673             0         0   
14344    0 -0.194209  2.087797  1.541206             0         0   
14345    0 -0.118966  2.193651  1.592739             0         0   
14346    0 -0.043723  2.299506  1.644272             0         0   

       Fitness Goal  Fitness Type  Level_Normal  Level_Obuse  ...  \
0                 0             0         False        False  ...   
1                 0             0         Fal

**PCA:**

No need to apply PCA or RFE as the number of significant columns has already been reduced to 12.

**Model**

In [25]:
def  train_and_evaluate_for_category(df, category):
    # Define the target columns based on the category
    if category == 'Exercises':
        target_columns = exercise_categories.keys()
    elif category == 'Diet':
        target_columns = diet_categories.keys()
    elif category == 'Equipment':
        target_columns = equipment_categories.keys()
    else:
        raise ValueError("Unsupported category: Choose 'exercise' or 'diet' or 'equipment' ")

    # Prepare the data for the target columns
    X = df.drop(columns=target_columns)  # Features
    y = df[target_columns]  # Target

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the model
    model = RandomForestClassifier(random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    return evaluate_model(y_test, y_pred)

Evaluation

In [30]:
def evaluate_model(y_test, y_pred, is_multiclass=False):
    """
    Evaluates the model using various classification metrics.

    Args:
    y_test (array-like): True values.
    y_pred (array-like): Predicted values.
    is_multiclass (bool): Whether the problem is multiclass (default is False).

    Returns:
    dict: A dictionary with evaluation metrics (accuracy, precision, recall, F1 score, confusion matrix, loss, and ROC/AUC).
    """
    # Flatten the true values and predictions to work with multiclass evaluation metrics
    y_test_flat = y_test.values.flatten()
    y_pred_flat = y_pred.flatten()

    # Accuracy, Precision, Recall, F1 Score
    accuracy = accuracy_score(y_test_flat, y_pred_flat)
    precision = precision_score(y_test_flat, y_pred_flat, average='weighted')
    recall = recall_score(y_test_flat, y_pred_flat, average='weighted')
    f1 = f1_score(y_test_flat, y_pred_flat, average='weighted')
    cm = confusion_matrix(y_test_flat, y_pred_flat)

    # Loss (using log loss for classification problems)
    try:
        loss = log_loss(y_test_flat, y_pred)
    except ValueError:
        loss = None  # If log_loss cannot be computed, return None

    # ROC/AUC (for binary classification or multiclass)
    if is_multiclass:
        # For multiclass, we need to binarize the true labels and predicted labels
        y_test_bin = label_binarize(y_test_flat, classes=np.unique(y_test_flat))
        roc_auc = roc_auc_score(y_test_bin, y_pred, average='weighted', multi_class='ovr')
    else:
        roc_auc = roc_auc_score(y_test_flat, y_pred_flat)

    # Collect all metrics in a dictionary
    evaluation_metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': cm,
        'Loss': loss,
        'ROC AUC': roc_auc
    }

    return evaluation_metrics

In [27]:
def print_evaluation_metrics(category_name, metrics):
    """
    Print the evaluation metrics for a category.

    Args:
    category_name (str): The name of the category (e.g., 'Exercise', 'Diet', 'Equipment').
    metrics (dict): The evaluation metrics (accuracy, precision, recall, etc.).
    """
    print(f"\n{category_name} Evaluation Metrics:")
    for key, value in metrics.items():
        print(f"{key}: {value}")

Main Part for prediction and model evaluation

In [31]:
# Example usage
exercise_metrics = train_and_evaluate_for_category(df, 'Exercises')
print_evaluation_metrics('Exercise', exercise_metrics)

equipment_metrics = train_and_evaluate_for_category(df, 'Equipment')
print_evaluation_metrics('Equipment', equipment_metrics)

diet_metrics = train_and_evaluate_for_category(df, 'Diet')
print_evaluation_metrics('Diet', diet_metrics)


Exercise Evaluation Metrics:
Accuracy: 0.9995354239256679
Precision: 0.9995354239256679
Recall: 0.9995354239256679
F1 Score: 0.9995354239256679
Confusion Matrix: [[5040    2]
 [   2 3566]]
Loss: None
ROC AUC: 0.9995213969472575

Equipment Evaluation Metrics:
Accuracy: 0.9922183507549361
Precision: 0.9922185599396804
Recall: 0.9922183507549361
F1 Score: 0.9922184418307293
Confusion Matrix: [[4701   34]
 [  33 3842]]
Loss: None
ROC AUC: 0.9921516503729946

Diet Evaluation Metrics:
Accuracy: 0.9881533101045297
Precision: 0.988151723458476
Recall: 0.9881533101045297
F1 Score: 0.9881524588878015
Confusion Matrix: [[5478   50]
 [  52 3030]]
Loss: None
ROC AUC: 0.9870414882737262
