Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

Import the csv file

In [2]:
df = pd.read_csv("fitness_data.csv")

Data processing (Checking and replacing the missing values )

In [3]:
print(df.isnull().sum()) #printing out all the missing values

Age                              10
Gender                           71
Weight (kg)                      22
Height (m)                       26
Max_BPM                          21
Avg_BPM                          30
Resting_BPM                      19
Session_Duration (hours)         23
Calories_Burned                  23
Workout_Type                     61
Fat_Percentage                   16
Water_Intake (liters)            24
Workout_Frequency (days/week)    58
Experience_Level                 57
BMI                              30
dtype: int64


Handling Max_BPM specifically remove the \t and convert them to numeric

In [None]:
print("Missing values in 'Max_BPM':", df['Max_BPM'].isnull().sum())
# Remove tab characters
# df['Max_BPM'] = df['Max_BPM'].str.replace('\t', '', regex=False)
# Convert Max_BPM to string type
df['Max_BPM'] = df['Max_BPM'].astype(str)

# Then use the .str.replace() method to remove tabs
df['Max_BPM'] = df['Max_BPM'].str.replace('\t', '', regex=False)

# Check the cleaned column
print(df['Max_BPM'].head())




df['Max_BPM'] = pd.to_numeric(df['Max_BPM'], errors='coerce')
# Calculate the median (ignoring NaN values)
median_value = df['Max_BPM'].median()

# Fill missing values with the median
df['Max_BPM'].fillna(median_value, inplace=True)

Missing values in 'Max_BPM': 0
0    174.0
1    166.0
2    187.0
3    187.0
4    177.0
Name: Max_BPM, dtype: object


KeyError: 'Workout_Type'

Replacing missing values for all features.

In [5]:
# Replacing missing gender with the most frequent gender
mode_gender = df['Gender'].mode()[0]
df['Gender'].fillna(mode_gender, inplace=True)
print(df['Gender'].isnull().sum()) #check whether if it is replaced

#Replacing age with medain
med_age = df['Age'].median()
df['Age'].fillna(med_age, inplace=True)

# Impute numerical features with median
for col in ['Weight (kg)', 'Height (m)', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Fat_Percentage', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'BMI']:
    df[col].fillna(df[col].median(), inplace=True)


# Impute categorical features with mode
for col in ['Workout_Type', 'Experience_Level']:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Recalculate BMI
df['BMI'] = df['Weight (kg)'] / (df['Height (m)'] ** 2)

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(mode_gender, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(med_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [6]:
from scipy import stats  
import numpy as np  

z_scores = np.abs(stats.zscore(df['Weight (kg)']))  
outliers = df[z_scores > 3]  
print(f"Outliers in Weight: {len(outliers)}")  

Outliers in Weight: 19


Use label encoder to convert male and female to numerical values.

In [7]:
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

Check correlation between features


In [10]:
import seaborn as sns

# Remove specific characters like \n and \t
df['Workout_Type'] = df['Workout_Type'].replace({'\n': '', '\t': ''}, regex=True)
# One-hot encode the 'Workout_Type' column
df = pd.get_dummies(df, columns=['Workout_Type'], drop_first=True)

# Now you can compute the correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

KeyError: 'Workout_Type'