In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the training measurements from the CSV file into a DataFrame
TRAIN_CSV_PATH = 'train.csv'
measurements = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows)
measurements.head()

In [None]:
# Count the missing values in each column of the dataframe
measurements.isna().sum()

In [None]:
# Pull out every row that has at least one missing value, to see what the
# nulls look like in context
has_any_null = measurements.isna().any(axis='columns')
null_rows = measurements.loc[has_any_null]
null_rows

In [None]:
# Drop only the rows in which every circumference column is empty
# (how='all'); rows holding at least one circumference value are kept.
circumference_cols = ['bust_circumference', 'waist_circumference', 'hip_circumference']
filtered_measurement = measurements.dropna(how='all', subset=circumference_cols)
# Null count per column that remains after the filter
filtered_measurement.isna().sum()

In [None]:
# Checking for outliers: summary statistics for every column
# (include='all' also summarises the non-numeric columns)
filtered_measurement.describe(include='all')

In [None]:
# Class balance: number of rows per gender value, as a small DataFrame
gender_counts = (
    filtered_measurement['gender']
    .value_counts()
    .reset_index()
)
gender_counts

In [None]:
# Pie chart of the gender split.
# Slice labels are read from the same rows as the counts. value_counts()
# orders its result by frequency, so a hard-coded ['Female', 'Male'] list
# mislabels the slices whenever the second label is actually the larger group,
# and fixed-length label/explode lists break if the column does not hold
# exactly two categories.
gender_labels = gender_counts['gender'].astype(str).tolist()

plt.figure(figsize=(8, 8))
plt.pie(gender_counts['count'], labels=gender_labels, autopct='%1.3f%%')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Checking the data types per column: info() prints each column's dtype
# together with its non-null count
filtered_measurement.info()

In [None]:
# Standardizing numeric data: work on a copy, strip the ',' characters from
# the measurement columns and convert them to floats.
std_measurements = filtered_measurement.copy()
cols_to_convert = ['height', 'bust_circumference', 'waist_circumference', 'hip_circumference']

for col in cols_to_convert:
    # Assign through std_measurements[col] rather than .loc[:, col]: since
    # pandas 2.0 the .loc form writes into the existing column in place and
    # keeps its previous dtype, so a column that was read as text would stay
    # `object` instead of becoming float64.
    std_measurements[col] = (
        std_measurements[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

std_measurements

In [None]:
# Age distribution per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='age').set_title("Age vs Gender")
plt.show()


In [None]:
# Height distribution per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='height').set_title("Height vs Gender")
plt.show()

In [None]:
# Weight distribution per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='weight').set_title("Weight vs Gender")
plt.show()

In [None]:
# Bust circumference per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='bust_circumference').set_title("Bust vs Gender")
plt.show()

In [None]:
# Waist circumference per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='waist_circumference').set_title("Waist vs Gender")
plt.show()

In [None]:
# Hip circumference per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='hip_circumference').set_title("Hip vs Gender")
plt.show()

In [None]:
# Derived ratio columns added to std_measurements, in this order:
#   whr = waist / hip, bhr = bust / hip, bwr = bust / waist
ratio_definitions = {
    'whr': ('waist_circumference', 'hip_circumference'),
    'bhr': ('bust_circumference', 'hip_circumference'),
    'bwr': ('bust_circumference', 'waist_circumference'),
}
for ratio_name, (numerator, denominator) in ratio_definitions.items():
    std_measurements[ratio_name] = std_measurements[numerator] / std_measurements[denominator]
std_measurements

In [None]:
# Waist-to-hip ratio per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='whr').set_title("WHR vs Gender")
plt.show()

In [None]:
# Bust-to-hip ratio per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='bhr').set_title("BHR vs Gender")
plt.show()

In [None]:
# Bust-to-waist ratio per gender as a box plot, to spot outliers in each group
sns.boxplot(data=std_measurements, x='gender', y='bwr').set_title("BWR vs Gender")
plt.show()