In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, kstest, anderson


# ---------------------------------------------------------------------------
# Load the raw data and gather the numerical columns into one table
# ---------------------------------------------------------------------------
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs on a machine where this exact file exists.
dataset_as_given = pd.read_csv(r"d:\Desktop\Practice Python\21.gym_members_exercise_tracking.csv")

# Labels of every column found in the CSV.
column_names = dataset_as_given.columns

# Columns to keep in the numeric-only table, one per line.
desired = [
    "Age",
    "Weight (kg)",
    "Height (m)",
    "Max_BPM",
    "Avg_BPM",
    "Resting_BPM",
    "Session_Duration (hours)",
    "Calories_Burned",
    "Fat_Percentage",
    "Water_Intake (liters)",
    "Workout_Frequency (days/week)",
    "Experience_Level",
    "BMI",
]

# Sub-frame restricted to the columns listed in `desired`.
dataset_as_given_numeric = dataset_as_given[desired]



# ---------------------------------------------------------------------------
# Remove outliers with the IQR rule
# ---------------------------------------------------------------------------

# Width of the fences around the quartiles: a value is kept when it lies in
# [Q1 - IQR_MULTIPLIER * IQR, Q3 + IQR_MULTIPLIER * IQR].
IQR_MULTIPLIER = 1.5

# Work on a copy so the frame loaded from disk stays intact.
cleaned_df = dataset_as_given.copy()

# Select every numeric column regardless of bit width, so that int32/float32
# columns are not silently skipped the way an explicit int64/float64 list would.
numeric_cols = cleaned_df.select_dtypes(include="number").columns

# Columns are filtered one after another: each column's quartiles are computed
# on the rows that survived the filters of the columns before it, so the result
# depends on column order.
# NOTE(review): the rule is applied to every numeric column, including
# small-range integer columns such as Experience_Level - confirm this is intended.
for col in numeric_cols:
    Q1 = cleaned_df[col].quantile(0.25)
    Q3 = cleaned_df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR

    # between() is inclusive on both ends, matching `>= lower` and `<= upper`;
    # rows whose value is NaN evaluate to False and are dropped.
    cleaned_df = cleaned_df[cleaned_df[col].between(lower_bound, upper_bound)]


# Report how many rows the filtering removed.
rows_before = len(dataset_as_given)
rows_after = len(cleaned_df)
print("Original rows:", rows_before)
print("Cleaned rows:", rows_after)
print("Rows removed:", rows_before - rows_after)


Original rows: 973
Cleaned rows: 931
Rows removed: 42


Why binary encoding is NOT appropriate here<br>
Binary encoding (used in these notes to mean label encoding, i.e. mapping each category to a single integer) would assign Workout_Type numbers like:<br><br>

Yoga → 0<br><br>

HIIT → 1<br><br>

Cardio → 2<br><br>

Strength → 3<br><br>

This creates fake numerical relationships, such as:<br><br>

Strength (3) > Cardio (2)<br><br>

HIIT (1) > Yoga (0)<br><br>

But these comparisons don’t make sense — Strength isn’t “more” than Cardio, and HIIT isn’t “greater” than Yoga.<br><br>

If you used binary encoding:<br><br>

Linear models would assume a linear relationship between categories<br><br>

Distance‑based models (KNN, SVM) would treat the numeric gaps between the codes as meaningful distances<br><br>

Tree‑based models would still work, but splits become less interpretable <br><br>

Binary encoding is only appropriate when:<br><br>

The category is binary (Male/Female)<br><br>

The category is ordinal (Low < Medium < High)<br><br>

Workout_Type is neither.<br><br>

Gender works with binary encoding because it is binary<br>
Your Gender column has only two categories:<br><br>

Male<br><br>

Female<br><br>

A binary variable can be encoded as:<br><br>

Male → 1<br><br>

Female → 0<br><br>

This does not introduce any false ordering or magnitude, because:<br><br>

There are only two states<br><br>

The model only needs to know “same or different”<br><br>

There is no middle category<br><br>

There is no implied ranking<br><br>

Binary encoding is mathematically equivalent to one‑hot encoding for two categories: the second one‑hot column is always 1 minus the first, so it adds no information.<br><br>

With 2 categories:<br>
One‑hot encoding → 2 columns<br><br>

Binary encoding → 1 column<br><br>

Both represent the same information, but binary encoding is cleaner and avoids unnecessary dimensionality.

In [14]:
# Start from a fresh copy so re-running this cell always encodes the cleaned
# data rather than an already-encoded frame.
encoded_df = cleaned_df.copy()

# Gender has two categories, so a single 0/1 column is enough.
gender_codes = {
    'Male': 1,
    'Female': 0
}

# Series.map() turns any label missing from the mapping into NaN without
# complaint; fail loudly instead so a typo or new category cannot slip through.
unmapped_genders = set(encoded_df['Gender'].dropna().unique()) - set(gender_codes)
if unmapped_genders:
    raise ValueError(
        f"Unexpected Gender labels: {sorted(map(str, unmapped_genders))}"
    )

encoded_df['Gender'] = encoded_df['Gender'].map(gender_codes)

# Dummy-encode Workout_Type: the original column is replaced by Workout_<type>
# indicator columns appended at the end of the frame. drop_first=True drops the
# first category so it becomes the all-False baseline (Cardio in the displayed
# output, which shows only Workout_HIIT, Workout_Strength and Workout_Yoga).
encoded_df = pd.get_dummies(
    encoded_df,
    columns=['Workout_Type'],
    prefix='Workout',
    drop_first=True,
)



Gender → binary encoding<br>

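Workout_Type → one‑hot encoding with the first category (Cardio) dropped as the baseline<br><br>
Column order: Workout_HIIT, Workout_Strength, Workout_Yoga<br>
Yoga = 0, 0, 1 <br>
Cardio = 0, 0, 0 <br>
HIIT = 1, 0, 0 <br>
Strength = 0, 1, 0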

In [18]:
# Show the encoded frame as this cell's output to inspect the Gender codes and
# the Workout_* indicator columns.
encoded_df

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI,Workout_HIIT,Workout_Strength,Workout_Yoga
0,56,1,88.3,1.71,180,157,60,1.69,1313.0,12.6,3.5,4,3,30.20,False,False,True
1,46,0,74.9,1.53,179,151,66,1.30,883.0,33.9,2.1,4,2,32.00,True,False,False
2,32,0,68.1,1.66,167,122,54,1.11,677.0,33.4,2.3,4,2,24.71,False,False,False
3,25,1,53.2,1.70,190,164,56,0.59,532.0,28.8,2.1,3,1,18.41,False,True,False
4,38,1,46.1,1.79,188,158,68,0.64,556.0,29.2,2.8,3,1,14.39,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,24,1,87.1,1.74,187,158,67,1.57,1364.0,10.0,3.5,4,3,28.77,False,True,False
969,25,1,66.6,1.61,184,166,56,1.38,1260.0,25.0,3.0,2,1,25.69,False,True,False
970,59,0,60.4,1.76,194,120,53,1.72,929.0,18.8,2.7,5,3,19.50,False,False,False
971,32,1,126.4,1.83,198,146,62,1.10,883.0,28.2,2.1,3,2,37.74,True,False,False


In [19]:
# Per-column count of missing values in the outlier-filtered (pre-encoding) frame.
cleaned_df.isna().sum()


Age                              0
Gender                           0
Weight (kg)                      0
Height (m)                       0
Max_BPM                          0
Avg_BPM                          0
Resting_BPM                      0
Session_Duration (hours)         0
Calories_Burned                  0
Workout_Type                     0
Fat_Percentage                   0
Water_Intake (liters)            0
Workout_Frequency (days/week)    0
Experience_Level                 0
BMI                              0
dtype: int64

In [21]:
# Number of rows in the encoded frame that exactly repeat an earlier row.
duplicate_row_count = encoded_df.duplicated().sum()
print(duplicate_row_count)


0
