In [7]:
# The goal is to create fake but believable data for the purpose of testing
# The dataset will be based on a range of people using the gym, data points will include
# - Gender - Male / Female
# - Age - 18 - 65
# - Height - 5'0" - 6'6"
# - Weight - 100 - 300 lbs
# - BMI - Calculate from height and weight
# - Experience - 0 - 10 years - categorized into Beginner, Intermediate, Expert
# - Availability - 0 - 7 days a week
# - Goals - Weight Loss, Muscle Gain, Maintain Weight, Flexibility
# - Membership - Boolean - True / False

# The dataset will be based on an existing dataset from Kaggle
# https://www.kaggle.com/datasets/thedevastator/exploring-risk-factors-for-cardiovascular-diseas?resource=download
# The dataset is based on a range of people with cardiovascular disease


In [8]:
import pandas as pd
import numpy as np
import random


In [9]:
# To begin we want to filter out all the data that is not relevant to our dataset
# We will be using the following columns
# - age
# - height
# - weight
# - gender

# We will also be creating the following columns
# - BMI
# - Experience
# - Availability
# - Goals
# - Membership
# - User ID

In [13]:
# Exploring the data
# Load the source CSV into `df`; the relative path means the file must sit in
# the notebook's working directory.
df = pd.read_csv('heart_data.csv')
# Preview the first rows to check that the columns loaded as expected.
# NOTE(review): the output saved under this cell looks like a `describe()` of
# the `age` column rather than a `head()` preview — re-run the notebook top to
# bottom to refresh it.
df.head()



count    70000.000000
mean        53.339358
std          6.759594
min         29.583562
25%         48.394521
50%         53.980822
75%         58.430137
max         64.967123
Name: age, dtype: float64

In [11]:
# Drop columns that are not relevant to our dataset
def dropColumns(df):
    """Strip the source columns that the gym dataset does not use.

    The frame is modified in place and the very same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``unwanted`` below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now without the listed columns.

    Raises
    ------
    KeyError
        If any of the listed columns is absent from ``df``.
    """
    unwanted = [
        'id', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc',
        'smoke', 'alco', 'active', 'cardio',
    ]
    df.drop(columns=unwanted, inplace=True)
    return df

# Add all our new columns to the dataset, all data except for BMI will be random
def addColumns(df):
    """Create the placeholder columns that the later helpers fill in.

    Each new column is set to NaN for every row. The frame is modified in
    place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the placeholder columns appended in the
        order listed below (an existing column of the same name is
        overwritten with NaN).
    """
    placeholders = ('BMI', 'Experience', 'Availability', 'Goals', 'Membership', 'User ID')
    for column in placeholders:
        df[column] = np.nan
    return df

# Calculate BMI based on height and weight
def calculateBMI(df):
    """Fill the ``BMI`` column from the ``weight`` and ``height`` columns.

    BMI is computed as weight divided by the square of (height / 100).
    The division by 100 implies ``height`` is in centimetres; ``weight`` is
    presumably in kilograms — confirm against the source data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``height`` and ``weight`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``BMI`` written in place.
    """
    height_scaled = df['height'] / 100
    height_squared = height_scaled ** 2
    df['BMI'] = df['weight'] / height_squared
    return df

# Randomly assign a value to the Experience column: Beginner, Intermediate, Expert
def assignExperience(df):
    """Give every row a randomly chosen experience level.

    Each row independently receives one of 'Beginner', 'Intermediate' or
    'Expert', drawn with the module-level ``random`` generator. The
    ``Experience`` column must already exist; its current values are only
    used to drive the row-wise ``apply`` and are otherwise ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an existing ``Experience`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Experience`` overwritten in place.
    """
    levels = ['Beginner', 'Intermediate', 'Expert']

    def pick_level(_current):
        # The existing cell value is deliberately unused.
        return random.choice(levels)

    df['Experience'] = df['Experience'].apply(pick_level)
    return df

# Randomly assign a value to the Availability column, 0 - 7 days a week
def assignAvailability(df):
    """Give every row a random number of available days per week.

    Each row independently receives an integer from 0 to 7 inclusive, drawn
    with the module-level ``random`` generator. The ``Availability`` column
    must already exist; its current values are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an existing ``Availability`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Availability`` overwritten in place.
    """
    fewest_days, most_days = 0, 7

    def pick_days(_current):
        # randint includes both endpoints, so 0 and 7 are both possible.
        return random.randint(fewest_days, most_days)

    df['Availability'] = df['Availability'].apply(pick_days)
    return df

# Randomly assign a value to the Goals column: Weight Loss, Muscle Gain, Maintain Weight, Flexibility
def assignGoals(df):
    """Give every row a randomly chosen training goal.

    Each row independently receives one of the four goals named in the
    dataset plan at the top of the notebook: 'Weight Loss', 'Muscle Gain',
    'Maintain Weight' or 'Flexibility'. Values are drawn with the
    module-level ``random`` generator. The ``Goals`` column must already
    exist; its current values are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an existing ``Goals`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Goals`` overwritten in place.
    """
    # 'Maintain Weight' is part of the planned goal list and was previously
    # never generated.
    goals = ['Weight Loss', 'Muscle Gain', 'Maintain Weight', 'Flexibility']
    df['Goals'] = df['Goals'].apply(lambda _current: random.choice(goals))
    return df

# Randomly assign a value to the Membership column, True / False
def assignMembership(df):
    """Give every row a random membership flag.

    Each row independently receives ``True`` or ``False``, drawn with the
    module-level ``random`` generator. The ``Membership`` column must
    already exist; its current values are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an existing ``Membership`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Membership`` overwritten in place.
    """
    flags = [True, False]

    def pick_flag(_current):
        # The existing cell value is deliberately unused.
        return random.choice(flags)

    df['Membership'] = df['Membership'].apply(pick_flag)
    return df

# Randomly assign a value to the User ID column, randomly generated 6 digit number
def assignUserID(df):
    """Give every row a unique, randomly generated 6-digit user ID.

    IDs are drawn without replacement from 100000-999999, so no two rows
    share an ID. Drawing each ID independently would produce duplicates as
    soon as the frame has more than a few thousand rows. The ``User ID``
    column is created if it does not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; at most 900000 rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``User ID`` written in place.

    Raises
    ------
    ValueError
        If ``df`` has more rows than there are distinct 6-digit numbers.
    """
    id_pool = range(100000, 1000000)  # every 6-digit number
    if len(df) > len(id_pool):
        raise ValueError(
            'Cannot assign unique 6 digit user IDs: %d rows but only %d IDs available'
            % (len(df), len(id_pool))
        )
    # random.sample never repeats an element, which guarantees uniqueness.
    df['User ID'] = random.sample(id_pool, len(df))
    return df

# The age column is in days, we want to convert this to years and round to the nearest whole number
def convertDaysToYears(df):
    """Convert the ``age`` column from days to whole years.

    Every value is divided by 365 and rounded with Python's built-in
    ``round``, so exact halves go to the nearest even integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``age`` column holds ages in days.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``age`` overwritten in place.
    """
    days_per_year = 365

    def to_whole_years(days):
        return round(days / days_per_year)

    df['age'] = df['age'].apply(to_whole_years)
    return df

# Our current ages only range from 29 - 64, we want to expand this to 18 - 65
# We do this by picking random rows and overwriting the age of each one with
# a random age between 18 and 29
def expandAgeRange(df, replacements=1000):
    """Inject younger people by overwriting the age of randomly chosen rows.

    ``replacements`` distinct rows are picked at random and each gets a new
    age drawn uniformly from 18 to 29 inclusive; every other row is left
    untouched. Rows are chosen by position, so the frame's index labels do
    not matter. Values are drawn with the module-level ``random`` generator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``age`` column expressed in years.
    replacements : int, optional
        Number of rows to overwrite (default 1000). Capped at the number of
        rows in ``df``; zero or a negative value leaves ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``age`` updated in place.
    """
    total_rows = len(df)
    how_many = min(replacements, total_rows)
    if how_many <= 0:
        return df

    # Sampling positions without replacement means exactly `how_many`
    # different rows are rewritten.
    chosen_rows = random.sample(range(total_rows), how_many)

    # Work on a copy of the raw values and write the column back once, so no
    # row-by-row frame assignment is needed.
    ages = df['age'].to_numpy(copy=True)
    ages[chosen_rows] = [random.randint(18, 29) for _ in chosen_rows]
    df['age'] = ages
    return df



In [12]:
# Before running the above functions we need to remove outliers of height and weight based on true values
