In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Read the raw health/fitness records into the frame used by every later cell.
df = pd.read_csv('health_fitness_dataset.csv')

# Quick sanity check: the first rows followed by per-column summary statistics.
print(df.head(), df.describe(), sep="\n")

In [None]:
# Summary statistics of the numeric columns, followed by a separator line.
desc_stats = df.describe()
print(desc_stats, "------------------------\n", sep="\n")

# Number of rows per distinct age, a separator line, then the frame itself.
age_count = df["age"].value_counts()
print(age_count, "------------------------\n", df, sep="\n")

In [None]:
# making a function to cut age into bins
def age_cut(arr):
    """Assign each age to a numeric age-bin code.

    Bin codes and the (inclusive) age ranges they cover:

        1: 0  <= age <= 20
        2: 20 <  age <= 30
        3: 30 <  age <= 40
        4: 40 <  age <= 50
        5: 50 <  age <= 60
        6: everything else (age > 60, negative ages, NaN)

    The previous if/elif chain used half-open ranges such as ``x >= 21`` and
    ``x < 30``, so ages of exactly 20, 30, 40, 50 and 60 (and fractional ages
    like 20.5) matched no branch and were wrongly put in bin 6.

    Parameters
    ----------
    arr : 1-D array-like of numbers
        Ages to bin.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``arr`` holding the bin codes.
    """
    ages = np.asarray(arr, dtype=float)

    # Inclusive upper edge of bins 1..5; side='left' keeps an age that sits
    # exactly on an edge in the lower bin (e.g. 20 -> bin 1, 30 -> bin 2).
    upper_edges = np.array([20, 30, 40, 50, 60], dtype=float)
    bins = np.searchsorted(upper_edges, ages, side='left') + 1

    # Negative and NaN ages belong to no real range; as before they end up in
    # the catch-all bin 6. `~(ages >= 0)` is True for both cases.
    bins[~(ages >= 0)] = 6

    # Callers previously received a float array (np.empty default dtype).
    return bins.astype(float)

# lookup from numeric bin code (as produced by age_cut) to a readable label
conversion_dict = {1: '0 to 20',
                   2: '21 to 30',
                   3: '31 to 40',
                   4: '41 to 50',
                   5: '51 to 60',
                   6: '61+'}

# legend order, youngest to oldest; derived from the lookup so the two
# cannot drift apart (dicts preserve insertion order)
age_order = list(conversion_dict.values())

# numeric bin code for every row
binned_ages_numeric = age_cut(df['age'].to_numpy())

# readable label for every row
binned_ages_categorical = list(map(conversion_dict.get, binned_ages_numeric))

# Add both derived columns to df. DataFrame.insert raises ValueError when the
# column already exists, which made this cell crash on a second run; an
# existing column is therefore overwritten in place, and only a missing one is
# inserted at its original position (6 and 7).
for position, column, values in (
    (6, "binned_ages_numeric", binned_ages_numeric),
    # ordered categorical so that legends list the bins in age order
    (7, "binned_ages_categorical",
     pd.Categorical(binned_ages_categorical, categories=age_order, ordered=True)),
):
    if column in df.columns:
        df[column] = values
    else:
        df.insert(position, column, values)

# creating palette for scatterplot (one colour per age-bin label)
palette = {'0 to 20':'blue',
            '21 to 30':'green',
            '31 to 40':'yellow',
            '41 to 50':'orange',
            '51 to 60':'red',
            '61+':'darkred'}

print(df)

In [None]:
# creating scatterplot of steps per day vs. calories burned
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x='steps_per_day', y='calories_burned', hue='binned_ages_categorical',
                palette=palette, data=df, ax=ax)
# the legend is anchored outside the axes, to the right of the plot area
ax.legend(loc='center right', bbox_to_anchor=(1.27, 0.5))
ax.set_title('Scatter Plot of Steps per Day vs. Calories Burned')
ax.set_xlabel('Steps per Day')
ax.set_ylabel('Calories Burned')
# pad_inches only has an effect together with bbox_inches='tight'; without it
# the legend outside the axes was cropped from the saved file. The tight
# bounding box grows the saved area to include the legend.
fig.savefig('scatterplot.pdf', bbox_inches='tight')
plt.show()


# create a pairplot, first dropping unnecessary columns
# errors='ignore' keeps the cell re-runnable once the columns are already gone
df.drop(columns=['person_id', 'binned_ages_numeric'], inplace=True, errors='ignore')
sns.pairplot(df, hue='binned_ages_categorical', palette=palette)
# pairplot creates its own figure, which is the current figure at this point
plt.savefig('pairplot.png')
plt.show()