In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Load data
df = pd.read_csv('../data/data.csv')

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['Gender', 'Workout_Type'], drop_first=True)

# Separate features and target
X = df.drop(columns=['Fat_Percentage'])
y = df['Fat_Percentage']

# Select top 6 features
selector = SelectKBest(score_func=f_regression, k=6)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print("Top features:", list(selected_features))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.25, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize model
gbr = GradientBoostingRegressor(random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 1.0],
    'min_samples_split': [2, 5]
}

# Grid search with 5-fold CV
grid_search = GridSearchCV(gbr, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")

# Best estimator
best_model = grid_search.best_estimator_

# Cross-validation on training set (using best_model params)
cv_r2 = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='r2')
cv_rmse = np.sqrt(-cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error'))

print(f"\nCross-Validation Training R2 (mean ± std): {cv_r2.mean():.4f} ± {cv_r2.std():.4f}")
print(f"Cross-Validation Training RMSE (mean ± std): {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")

# Fit best model on full train data
best_model.fit(X_train_scaled, y_train)

# Predictions
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)

# Metrics on train and test sets
train_r2 = r2_score(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred)

print(f"\nTraining R2 Score: {train_r2:.4f}")
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Testing R2 Score:  {test_r2:.4f}")
print(f"Testing RMSE:  {test_rmse:.4f}")


Top features: ['Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'Gender_Male']
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 300, 'subsample': 1.0}

Cross-Validation Training R2 (mean ± std): 0.7977 ± 0.0298
Cross-Validation Training RMSE (mean ± std): 2.7776 ± 0.0756

Training R2 Score: 0.8207
Training RMSE: 7.0178
Testing R2 Score:  0.8059
Testing RMSE:  7.5904


In [2]:
import joblib
import pandas as pd
import numpy as np

# Save the trained model, scaler, and selected features
joblib.dump(best_model, '../models/fat/fat-model.pkl')
joblib.dump(scaler, '../models/fat/fat-scaler.pkl')
joblib.dump(selected_features, '../models/fat/fat-selected_features.pkl')

print("Model, scaler, and selected features saved!")


Model, scaler, and selected features saved!


In [3]:
import pandas as pd

# Your selected features from the model
selected_features = ['Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 
                     'Workout_Frequency (days/week)', 'Experience_Level', 'Gender_Male']

# Print selected features
print("Selected Features:", selected_features)

# Define 5 custom test samples as dicts
custom_tests = [
    {'Session_Duration (hours)': 1.2, 'Calories_Burned': 800, 'Water_Intake (liters)': 2.5,
     'Workout_Frequency (days/week)': 4, 'Experience_Level': 2, 'Gender_Male': 1},

    {'Session_Duration (hours)': 0.8, 'Calories_Burned': 500, 'Water_Intake (liters)': 1.8,
     'Workout_Frequency (days/week)': 3, 'Experience_Level': 1, 'Gender_Male': 0},

    {'Session_Duration (hours)': 1.5, 'Calories_Burned': 1100, 'Water_Intake (liters)': 3.0,
     'Workout_Frequency (days/week)': 5, 'Experience_Level': 3, 'Gender_Male': 1},

    {'Session_Duration (hours)': 0.5, 'Calories_Burned': 400, 'Water_Intake (liters)': 2.0,
     'Workout_Frequency (days/week)': 2, 'Experience_Level': 1, 'Gender_Male': 0},

    {'Session_Duration (hours)': 2.0, 'Calories_Burned': 1400, 'Water_Intake (liters)': 3.5,
     'Workout_Frequency (days/week)': 6, 'Experience_Level': 3, 'Gender_Male': 1},
]

# Convert to DataFrame
custom_df = pd.DataFrame(custom_tests)

# Ensure columns order matches selected_features
custom_df = custom_df[selected_features]

# Scale features (using your previously fitted scaler)
custom_scaled = scaler.transform(custom_df)

# Predict using your trained model (best_model)
predictions = best_model.predict(custom_scaled)

# Print the results side-by-side
for i, row in custom_df.iterrows():
    print(f"\nTest Sample {i+1}:")
    print(row.to_dict())
    print(f"Predicted Fat Percentage: {predictions[i]:.2f}")


Selected Features: ['Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'Gender_Male']

Test Sample 1:
{'Session_Duration (hours)': 1.2, 'Calories_Burned': 800.0, 'Water_Intake (liters)': 2.5, 'Workout_Frequency (days/week)': 4.0, 'Experience_Level': 2.0, 'Gender_Male': 1.0}
Predicted Fat Percentage: 25.36

Test Sample 2:
{'Session_Duration (hours)': 0.8, 'Calories_Burned': 500.0, 'Water_Intake (liters)': 1.8, 'Workout_Frequency (days/week)': 3.0, 'Experience_Level': 1.0, 'Gender_Male': 0.0}
Predicted Fat Percentage: 30.14

Test Sample 3:
{'Session_Duration (hours)': 1.5, 'Calories_Burned': 1100.0, 'Water_Intake (liters)': 3.0, 'Workout_Frequency (days/week)': 5.0, 'Experience_Level': 3.0, 'Gender_Male': 1.0}
Predicted Fat Percentage: 20.55

Test Sample 4:
{'Session_Duration (hours)': 0.5, 'Calories_Burned': 400.0, 'Water_Intake (liters)': 2.0, 'Workout_Frequency (days/week)': 2.0, 'Experience_Level': 1.0, 'Gender

