In [1]:
import pandas as pd
import numpy as np

# Canonical goal associated with each workout type.
workout_to_goal = {
    'Cardio': 'Improve cardiovascular health',
    'HIIT': 'Lose weight',
    'Strength': 'Build muscle',
    'Yoga': 'Improve flexibility and balance'
}

# Read the CSV file
df = pd.read_csv('../data/data.csv')

# Seed the legacy global NumPy RNG so the same rows are selected on every run.
np.random.seed(42)
# True for rows that keep the canonical goal. Each row is an independent
# draw against 0.88, so the share of True rows is only approximately 88%.
mapping_mask = np.random.rand(len(df)) < 0.88

# Start with the canonical goal on every row, stored as an object-dtype column.
# Pre-filling the column with np.nan would create a float64 column, and writing
# strings into it afterwards triggers pandas' incompatible-dtype warning
# (silent upcasting on setitem is deprecated).
df['Goal'] = df['Workout_Type'].map(workout_to_goal).astype(object)

# Overwrite the remaining rows with a goal drawn uniformly from all four goals.
# The draw ignores the row's workout type, so it can coincide with the
# canonical goal; these rows are noisy, not guaranteed to be wrong.
incorrect_indices = df.index[~mapping_mask]
random_goals = np.random.choice(list(workout_to_goal.values()), size=len(incorrect_indices))
df.loc[incorrect_indices, 'Goal'] = random_goals

# Save the augmented dataframe to a new CSV (the input file is left untouched)
df.to_csv('../data/data_with_goals.csv', index=False)

 'Build muscle' 'Build muscle' 'Lose weight'
 'Improve cardiovascular health' 'Improve cardiovascular health'
 'Build muscle' 'Improve cardiovascular health' 'Lose weight'
 'Improve cardiovascular health' 'Lose weight'
 'Improve flexibility and balance' 'Build muscle' 'Lose weight'
 'Improve cardiovascular health' 'Build muscle' 'Lose weight'
 'Build muscle' 'Improve cardiovascular health'
 'Improve flexibility and balance' 'Improve flexibility and balance'
 'Lose weight' 'Improve flexibility and balance' 'Lose weight'
 'Build muscle' 'Lose weight' 'Lose weight' 'Lose weight'
 'Improve flexibility and balance' 'Lose weight' 'Build muscle'
 'Improve cardiovascular health' 'Build muscle' 'Build muscle'
 'Build muscle' 'Improve cardiovascular health'
 'Improve cardiovascular health' 'Improve flexibility and balance'
 'Lose weight' 'Build muscle' 'Improve flexibility and balance'
 'Improve cardiovascular health' 'Build muscle'
 'Improve cardiovascular health' 'Improve cardiovascular health

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [3]:
# Model inputs: the textual goal plus four numeric attributes.
# The goal is a string column, so it is integer-encoded before training.
target = 'Workout_Type'
features = ['Goal', 'Height (m)', 'Weight (kg)', 'Age', 'Experience_Level']

# Work on an independent copy restricted to the columns the model needs
df_model = df.loc[:, [*features, target]].copy()

# Report the per-column null counts before encoding
missing_per_column = df_model.isnull().sum()
print("Missing values in selected features:")
print(missing_per_column)

# Fit the encoder on the goal strings, then store the integer codes
label_encoder = LabelEncoder()
label_encoder.fit(df_model['Goal'])
df_model['Goal_encoded'] = label_encoder.transform(df_model['Goal'])

# Feature matrix (encoded goal first, then the numeric columns) and target
model_columns = ['Goal_encoded', 'Height (m)', 'Weight (kg)', 'Age', 'Experience_Level']
X = df_model[model_columns]
y = df_model[target]

print("\nFeature matrix shape:", X.shape)
print("Target vector shape:", y.shape)
print("\nFeature names:", X.columns.tolist())

# Show which integer code each goal string was assigned
print("\nGoal encoding mapping:")
for code, goal_name in enumerate(label_encoder.classes_):
    print(f"{code}: {goal_name}")

Missing values in selected features:
Goal                0
Height (m)          0
Weight (kg)         0
Age                 0
Experience_Level    0
Workout_Type        0
dtype: int64

Feature matrix shape: (973, 5)
Target vector shape: (973,)

Feature names: ['Goal_encoded', 'Height (m)', 'Weight (kg)', 'Age', 'Experience_Level']

Goal encoding mapping:
0: Build muscle
1: Improve cardiovascular health
2: Improve flexibility and balance
3: Lose weight


In [4]:
# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same class proportions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report absolute and relative partition sizes
total_rows = len(X)
partitions = (("Training", X_train), ("Testing", X_test))
for label, part in partitions:
    print(f"{label} set size: {part.shape[0]} samples")
for label, part in partitions:
    print(f"{label} set percentage: {part.shape[0] / total_rows * 100:.1f}%")

# Random Forest hyperparameters
rf_params = {
    'n_estimators': 100,      # Number of trees
    'random_state': 42,       # For reproducibility
    'max_depth': 10,          # Maximum depth of trees
    'min_samples_split': 5,   # Minimum samples required to split a node
    'min_samples_leaf': 2,    # Minimum samples required at a leaf node
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training partition only
print("\nTraining the Random Forest model...")
rf_model.fit(X_train, y_train)
print("Model training completed!")

Training set size: 778 samples
Testing set size: 195 samples
Training set percentage: 80.0%
Testing set percentage: 20.0%

Training the Random Forest model...
Model training completed!


In [5]:
# Predict on both partitions so train and test accuracy can be compared
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Fraction of exactly matching labels on each partition
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Summary banner
separator = "=" * 50
print(separator)
print("RANDOM FOREST MODEL RESULTS")
print(separator)
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(separator)

# Per-class precision / recall / F1 on the held-out partition
test_report = classification_report(y_test, y_test_pred)
print("\nDetailed Classification Report (Test Set):")
print(test_report)

RANDOM FOREST MODEL RESULTS
Training Accuracy: 0.9254 (92.54%)
Testing Accuracy:  0.8667 (86.67%)

Detailed Classification Report (Test Set):
              precision    recall  f1-score   support

      Cardio       0.90      0.84      0.87        51
        HIIT       0.81      0.89      0.85        44
    Strength       0.87      0.88      0.88        52
        Yoga       0.89      0.85      0.87        48

    accuracy                           0.87       195
   macro avg       0.87      0.87      0.87       195
weighted avg       0.87      0.87      0.87       195



In [None]:
import joblib
import os

# Destination paths for the two artifacts
model_filename = '../models/workout-type/workout_model.pkl'
encoder_filename = '../models/workout-type/label_encoder.pkl'

# Create the output directory if it is not there yet
os.makedirs('../models/workout-type', exist_ok=True)

# Persist the trained model first, then the goal label encoder
artifacts = (
    ("Model", rf_model, model_filename),
    ("Label encoder", label_encoder, encoder_filename),
)
for artifact_label, artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)
    print(f"{artifact_label} saved as '{artifact_path}'")

# Report the on-disk size of each artifact in KB
model_size_kb = os.path.getsize(model_filename) / 1024
encoder_size_kb = os.path.getsize(encoder_filename) / 1024
print("\nSaved files:")
print(f"Model file size: {model_size_kb:.2f} KB")
print(f"Encoder file size: {encoder_size_kb:.2f} KB")

# Round-trip check: reload both artifacts and show their types
loaded_model = joblib.load(model_filename)
loaded_encoder = joblib.load(encoder_filename)
print("\nModel and encoder loaded successfully!")
print(f"Model type: {type(loaded_model)}")
print(f"Encoder type: {type(loaded_encoder)}")


Model saved as '../models/workout-type/workout_model.pkl'
Label encoder saved as '../models/workout-type/label_encoder.pkl'

Saved files:
Model file size: 1432.35 KB
Encoder file size: 0.62 KB

Model and encoder loaded successfully!
Model type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Encoder type: <class 'sklearn.preprocessing._label.LabelEncoder'>


In [7]:
import pandas as pd
import joblib

# Load the trained model
model = joblib.load('../models/workout-type/workout_model.pkl')

# Load the LabelEncoder that was fitted on the 'Goal' column during training
goal_encoder = joblib.load('../models/workout-type/label_encoder.pkl')

# Column order the model was trained with (encoded goal first)
columns_order = ['Goal_encoded', 'Height (m)', 'Weight (kg)', 'Age', 'Experience_Level']

# Five hand-written samples.
# 'Height (m)' is in metres, matching the training column of the same name;
# centimetre values such as 175 are outside the unit the feature is named for
# and would make the predictions meaningless.
# NOTE(review): confirm the scale of 'Experience_Level' against the training
# data. Nothing in this notebook establishes its unit or valid range, so the
# values below may be out of range.
test_samples = pd.DataFrame([
    {
        'Goal': 'Lose weight',
        'Height (m)': 1.75,
        'Weight (kg)': 73,
        'Age': 20,
        'Experience_Level': 1
    },
    {
        'Goal': 'Build muscle',
        'Height (m)': 1.80,
        'Weight (kg)': 85,
        'Age': 25,
        'Experience_Level': 3
    },
    {
        'Goal': 'Improve cardiovascular health',
        'Height (m)': 1.68,
        'Weight (kg)': 60,
        'Age': 30,
        'Experience_Level': 5
    },
    {
        'Goal': 'Improve flexibility and balance',
        'Height (m)': 1.70,
        'Weight (kg)': 65,
        'Age': 35,
        'Experience_Level': 2
    },
    {
        'Goal': 'Lose weight',
        'Height (m)': 1.60,
        'Weight (kg)': 58,
        'Age': 28,
        'Experience_Level': 4
    }
], columns=['Goal', 'Height (m)', 'Weight (kg)', 'Age', 'Experience_Level'])

# Encode the 'Goal' column as 'Goal_encoded' to match training.
# transform() raises ValueError for a goal string the encoder never saw.
test_samples['Goal_encoded'] = goal_encoder.transform(test_samples['Goal'])

# Keep only the model's input columns, in training order
test_samples = test_samples[columns_order]

# Predict the workout type
predictions = model.predict(test_samples)

# Print the predicted workout types, numbering samples from 1
for i, workout_type in enumerate(predictions, 1):
    print(f"Sample {i} - Predicted Workout Type: {workout_type}")


Sample 1 - Predicted Workout Type: HIIT
Sample 2 - Predicted Workout Type: Strength
Sample 3 - Predicted Workout Type: Cardio
Sample 4 - Predicted Workout Type: Yoga
Sample 5 - Predicted Workout Type: HIIT
