In [2]:
import pandas as pd

bodyfat2 = pd.read_csv('bodyfat2.csv')
bodyfat3 = pd.read_csv('bodyfat3.csv')

# Question 1a
print('Question 1a')

# Select columns from neck onwards till wrist (fat present in the body parts)
bodyfat2_compiled = bodyfat2.iloc[:, 5:15]

# Compute the mean, median, and sum, for each individual
bodyfat2_compiled_means = bodyfat2_compiled.mean(1) # 1 for columns
bodyfat2_compiled_medians = bodyfat2_compiled.median(1)
bodyfat2_compiled_sums = bodyfat2_compiled.sum(1)

# Create DataFrames for the top and bottom individuals
top_df = pd.DataFrame({
    'ID': [i for i in range(3)],
    'Mean': bodyfat2_compiled_means.head(3),
    'Median': bodyfat2_compiled_medians.head(3),
    'Sum': bodyfat2_compiled_sums.head(3)
})

bottom_df = pd.DataFrame({
    'ID': [i for i in range(47, 50)],
    'Mean': bodyfat2_compiled_means.tail(3),
    'Median': bodyfat2_compiled_medians.tail(3),
    'Sum': bodyfat2_compiled_sums.tail(3)
})

# Concatenate the DataFrames to fit within the 3 columns
bodyfat2_top_bottom_3 = pd.concat([top_df, bottom_df], ignore_index=True)

print(bodyfat2_top_bottom_3.to_string(header=True, index=False, col_space=10))
print()

Question 1a
        ID       Mean     Median        Sum
         0      50.37      36.75      503.7
         1      51.08      37.90      510.8
         2      51.00      36.45      510.0
        47      48.22      36.05      482.2
        48      47.79      34.30      477.9
        49      45.13      34.20      451.3



In [8]:
# Question 1b
print('Question 1b')
# Compute and store as series
bodyfat2_means = bodyfat2.mean()
bodyfat2_medians = bodyfat2.median()
bodyfat2_sums = bodyfat2.sum()

# Create data frames from the Series
means_df = pd.DataFrame({'feature': bodyfat2.columns, 'mean': bodyfat2_means})
medians_df = pd.DataFrame({'feature': bodyfat2.columns, 'median': bodyfat2_medians})
sums_df = pd.DataFrame({'feature': bodyfat2.columns, 'sum': bodyfat2_sums})

# Merge the data frames based on the 'feature' column
result_df = pd.merge(means_df, medians_df, on='feature')
result_df = pd.merge(result_df, sums_df, on='feature')

# Exclude unwanted features
excluded_features = ['density', 'bodyfat', 'age', 'weight', 'height']
result_df = result_df.query('feature not in @excluded_features')

print(result_df.to_string(header=True, index=False, col_space=10))
print()

Question 1b
   feature       mean     median        sum
      neck     38.016      38.10     1900.8
     chest    101.128     101.10     5056.4
   abdomen     92.080      89.15     4604.0
       hip    102.058     100.45     5102.9
     thigh     61.506      61.60     3075.3
      knee     38.828      38.70     1941.4
     ankle     23.586      23.10     1179.3
    biceps     32.798      32.45     1639.9
   forearm     28.852      29.00     1442.6
     wrist     18.118      18.20      905.9



In [4]:
# Question 1c
print('Question 1c')
import scipy.stats

# Compute geometric mean and harmonic mean for each body part
arithmetic_means = bodyfat2.iloc[:, 5:15].mean(axis=0)
geometric_means = scipy.stats.gmean(bodyfat2.iloc[:, 5:15], axis=0)
harmonic_means = scipy.stats.hmean(bodyfat2.iloc[:, 5:15], axis=0)

# Create a DataFrame to store the computed means
means_df = pd.DataFrame({
    'Body Part': bodyfat2.columns[5:15],
    'Arithmetic Mean': arithmetic_means,
    'Geometric Mean': geometric_means,
    'Harmonic Mean': harmonic_means
})

print(means_df.to_string(header=True, index=False, col_space=10))
print()


Question 1c
 Body Part  Arithmetic Mean  Geometric Mean  Harmonic Mean
      neck           38.016       37.883069      37.755200
     chest          101.128      100.606458     100.106007
   abdomen           92.080       91.065145      90.135062
       hip          102.058      101.577150     101.131966
     thigh           61.506       61.161311      60.827484
      knee           38.828       38.725157      38.625040
     ankle           23.586       23.490255      23.403851
    biceps           32.798       32.598184      32.403743
   forearm           28.852       28.769638      28.685546
     wrist           18.118       18.084399      18.051573



In [19]:
# Question 2a
print('Question 2a')
# Remove age, weight and height
bodyfat2_without_age_weight_height_cols = bodyfat2.drop(columns=['age', 'weight', 'height'])

# For each feature, and combine them into a single data frame
print("Max and Min values for all features and associated IDs")
bodyfat2_features_max_and_min_values = pd.concat([bodyfat2_without_age_weight_height_cols.max(),
                                                  bodyfat2_without_age_weight_height_cols.idxmax(),
                                                  bodyfat2_without_age_weight_height_cols.min(),
                                                  bodyfat2_without_age_weight_height_cols.idxmin()], axis=1).set_axis(['Max value', 'Max ID', 'Min value', 'Min ID'], axis='columns').rename_axis('Feature')

print(bodyfat2_features_max_and_min_values.to_string(header=True, index=True, col_space=10))
print()

Question 2a
Max and Min values for all features and associated IDs
            Max value     Max ID  Min value     Min ID
Feature                                               
density        1.0911         25     1.0101         35
bodyfat       40.1000         35     3.7000         25
neck          51.2000         38    31.5000         44
chest        136.2000         38    83.4000         49
abdomen      148.1000         38    70.4000         49
hip          147.7000         38    85.3000         26
thigh         87.3000         38    50.0000         44
knee          49.1000         38    34.4000         49
ankle         33.9000         30    20.6000         48
biceps        45.0000         38    26.1000         44
forearm       32.8000         21    23.1000         44
wrist         21.4000         38    16.1000         44



In [17]:
# Question 2b
print('Question 2b')

# Identify individuals appearing more than once under Max ID
max_id_duplicates = bodyfat2_features_max_and_min_values[
    bodyfat2_features_max_and_min_values['Max ID'].duplicated(keep=False)
].drop(columns=['Min value', 'Min ID'])

# Identify individuals appearing more than once under Min ID
min_id_duplicates = bodyfat2_features_max_and_min_values[
    bodyfat2_features_max_and_min_values['Min ID'].duplicated(keep=False)
].drop(columns=['Max value', 'Max ID'])

print("Individuals appearing more than once under Max ID:")
print(max_id_duplicates)
print("\nIndividuals appearing more than once under Min ID:")
print(min_id_duplicates)

Question 2b
Individuals appearing more than once under Max ID:
         Max value  Max ID
Feature                   
neck          51.2      38
chest        136.2      38
abdomen      148.1      38
hip          147.7      38
thigh         87.3      38
knee          49.1      38
biceps        45.0      38
wrist         21.4      38

Individuals appearing more than once under Min ID:
         Min value  Min ID
Feature                   
neck          31.5      44
chest         83.4      49
abdomen       70.4      49
thigh         50.0      44
knee          34.4      49
biceps        26.1      44
forearm       23.1      44
wrist         16.1      44


In [28]:
# Question 3
print('Question 3')
bodyfat2_stds = bodyfat2.std()
print("For each feature, the more values that lie within 10% of standard deviation from the centre, the tighter the spread")

within_10_percent_std_of_means = (bodyfat2 >= bodyfat2_means - 0.1 * bodyfat2_stds) & (bodyfat2 <= bodyfat2_means + 0.1 * bodyfat2_stds)
within_10_percent_std_of_medians = (bodyfat2 >= bodyfat2_medians - 0.1 * bodyfat2_stds) & (bodyfat2 <= bodyfat2_medians + 0.1 * bodyfat2_stds)

num_within_10_percent_std_of_means = within_10_percent_std_of_means.sum()
num_within_10_percent_std_of_medians = within_10_percent_std_of_medians.sum()

result_df = pd.DataFrame({
    'feature': bodyfat2.columns,  # Select the desired feature columns
    'number_mean': num_within_10_percent_std_of_means,  # Select the corresponding counts
    'number_median': num_within_10_percent_std_of_medians  # Select the corresponding counts
})

print(result_df.to_string(index=False))
print()

Question 3
For each feature, the more values that lie within 10% of standard deviation from the centre, the tighter the spread
feature  number_mean  number_median
density            1              3
bodyfat            1              3
    age            3              4
 weight            7              8
 height            3              5
   neck            4              6
  chest            7              7
abdomen            4              6
    hip            4              6
  thigh            2              2
   knee            6              5
  ankle            5              7
 biceps            5              6
forearm            4              3
  wrist            5              7



In [31]:
# Question 4
print('Question 4')
print('The higher the number of missing data, the more incomplete the data for that particular feature')
print('Possible remedy: Features with too many missing values should dropped')
print(pd.isna(bodyfat3).sum().to_string())
print()

Question 4
The higher the number of missing data, the more incomplete the data for that particular feature
Possible remedy: Features with too many missing values should dropped
density    0
bodyfat    4
age        0
weight     7
height     2
neck       3
chest      1
abdomen    0
hip        6
thigh      3
knee       1
ankle      2
biceps     4
forearm    0
wrist      2



In [38]:
# Question 5a
print('Question 5a')
# Calculate the means of the available values in bodyfat3 dataset
bodyfat3_means = bodyfat3.mean()

# Fill missing values in bodyfat3 dataset with the calculated means
filled_data = bodyfat3.fillna(value=bodyfat3_means)

# Calculate means again, although they should be similar to bodyfat3_means
filled_data_means = filled_data.mean()

# Calculate the absolute difference in mean values for each feature
mean_difference = abs(bodyfat2_means - filled_data_means)

print("Absolute difference in mean values for each feature compared to the original mean from bodyfat2 dataset:")
print(mean_difference.to_string())
print()

Question 5a
Absolute difference in mean values for each feature compared to the original mean from bodyfat2 dataset:
density    0.000000
bodyfat    0.536348
age        0.000000
weight     1.001302
height     0.036667
neck       0.139404
chest      0.017796
abdomen    0.000000
hip        0.410182
thigh      0.193234
knee       0.074939
ankle      0.034833
biceps     0.158522
forearm    0.000000
wrist      0.013250



In [40]:
# Question 5b
print('Question 5b')
# Calculate the medians of the available values in bodyfat3 dataset
bodyfat3_medians = bodyfat3.median()

# Fill missing values in bodyfat3 dataset with the calculated medians
filled_data_with_medians = bodyfat3.fillna(value=bodyfat3_medians)

# Calculate medians again, although they should be similar to bodyfat3_medians
filled_data_with_medians_medians = filled_data_with_medians.median()

# Calculate the absolute difference in median values for each feature
median_difference = abs(bodyfat2_medians - filled_data_with_medians_medians)

print("Absolute difference in median values for each feature compared to the original median from bodyfat2 dataset:")
print(median_difference.to_string())
print()


Question 5b
Absolute difference in median values for each feature compared to the original median from bodyfat2 dataset:
density    0.000
bodyfat    0.850
age        0.000
weight     0.875
height     0.000
neck       0.100
chest      0.200
abdomen    0.000
hip        1.100
thigh      1.500
knee       0.000
ankle      0.000
biceps     0.050
forearm    0.000
wrist      0.000



In [43]:
# Question 6a
print('Question 6a')
# Apply normalisation formula provided
bodyfat2_normalized = (bodyfat2 - bodyfat2_means) / bodyfat2_stds
print('Normalised features for first 3 individuals')
print(bodyfat2_normalized.head(3).to_string())
print('Normalised features for last 3 individuals')
print(bodyfat2_normalized.tail(3).to_string())

print()

Question 6a
Normalised features for first 3 individuals
    density   bodyfat       age    weight    height      neck     chest   abdomen       hip     thigh      knee     ankle    biceps   forearm     wrist
0  0.561613 -0.567159 -1.268721 -0.720556 -0.893031 -0.552493 -0.756337 -0.474192 -0.722223 -0.373159 -0.527376 -0.742220 -0.215020 -0.666231 -0.901998
1  1.217281 -1.197111 -1.387738 -0.248598  0.795473  0.147250 -0.709231 -0.625823 -0.320882 -0.417831 -0.527376 -0.081882 -0.619192  0.022024  0.072656
2 -0.767810  0.753706 -1.387738 -0.726766 -1.455866 -1.221814 -0.501964 -0.288099 -0.273103 -0.283815  0.024850  0.182253 -1.077254 -1.675671 -1.345022
Normalised features for last 3 individuals
     density   bodyfat       age    weight    height      neck     chest   abdomen       hip     thigh      knee     ankle    biceps   forearm     wrist
47  1.271543 -1.247913  0.635551 -0.863386  0.420250 -1.039272 -1.067238 -0.867054 -0.894227 -1.311268 -0.458348 -0.742220 -1.077254 -0.9415

In [54]:
# Question 6a
print('Question 6a')

# Define a normalization function using a lambda expression
normalize_function = lambda x: (x - bodyfat2_means) / bodyfat2_stds

# Apply the normalization function to the entire DataFrame
bodyfat2_normalized = bodyfat2.apply(normalize_function)

print('Normalised features for first 3 individuals:')
print(bodyfat2_normalized.head(3).to_string(index=False))

print('Normalised features for last 3 individuals:')
print(bodyfat2_normalized.tail(3).to_string(index=False))

print()


Question 6a
Normalised features for first 3 individuals:
 density  bodyfat  age  weight  height  neck  chest  abdomen  hip  thigh  knee  ankle  biceps  forearm  wrist
     NaN      NaN  NaN     NaN     NaN   NaN    NaN      NaN  NaN    NaN   NaN    NaN     NaN      NaN    NaN
     NaN      NaN  NaN     NaN     NaN   NaN    NaN      NaN  NaN    NaN   NaN    NaN     NaN      NaN    NaN
     NaN      NaN  NaN     NaN     NaN   NaN    NaN      NaN  NaN    NaN   NaN    NaN     NaN      NaN    NaN
Normalised features for last 3 individuals:
 density  bodyfat  age  weight  height  neck  chest  abdomen  hip  thigh  knee  ankle  biceps  forearm  wrist
     NaN      NaN  NaN     NaN     NaN   NaN    NaN      NaN  NaN    NaN   NaN    NaN     NaN      NaN    NaN
     NaN      NaN  NaN     NaN     NaN   NaN    NaN      NaN  NaN    NaN   NaN    NaN     NaN      NaN    NaN
     NaN      NaN  NaN     NaN     NaN   NaN    NaN      NaN  NaN    NaN   NaN    NaN     NaN      NaN    NaN



In [45]:
# Question 6b
print('Question 6b')
bodyfat2_normalized_means = bodyfat2_normalized.mean()
bodyfat2_normalized_greater_than_means = bodyfat2_normalized.gt(bodyfat2_normalized_means).sum()
print('Number of people who are above average for the feature')
print(bodyfat2_normalized_greater_than_means.to_string())

Question 6b
Number of people who are above average for the feature
density    27
bodyfat    23
age        21
weight     25
height     23
neck       26
chest      25
abdomen    21
hip        22
thigh      25
knee       23
ankle      21
biceps     22
forearm    27
wrist      27


In [51]:
# Question 6b
print('Question 6b')
bodyfat2_normalized_means = bodyfat2_normalized.mean()

# Count the number of people who are above average for each feature
bodyfat2_normalized_greater_than_means_count = (bodyfat2_normalized > bodyfat2_normalized_means).sum()

print('Individuals above average for each feature:')
print(bodyfat2_normalized_greater_than_means_count.to_string())


Question 6b
Individuals above average for each feature:
density    27
bodyfat    23
age        21
weight     25
height     23
neck       26
chest      25
abdomen    21
hip        22
thigh      25
knee       23
ankle      21
biceps     22
forearm    27
wrist      27
