In [32]:
import pandas as pd
import numpy as np

# Load raw dataset
data = pd.read_csv('../data/raw/athletes.csv')

# Remove not relevant columns
required_cols = [
    'region','age','weight','height','howlong','gender','eat',
    'train','background','experience','schedule',
    'deadlift','candj','snatch','backsq'
]
data = data.dropna(subset=[col for col in required_cols if col in data.columns])

cols_to_drop = [
    'affiliate','team','name','athlete_id','fran','helen','grace',
    'filthy50','fgonebad','run400','run5k','pullups','train'
]
data = data.drop(columns=[col for col in cols_to_drop if col in data.columns])

# Remove Outliers
data = data[data['weight'] < 1500]
data = data[data['gender'] != '--']
data = data[data['age'] >= 18]
data = data[(data['height'] < 96) & (data['height'] > 48)]

data = data[((data['deadlift'] > 0) &
             (((data['gender'] != 'Female') & (data['deadlift'] <= 1105)) |
              ((data['gender'] == 'Female') & (data['deadlift'] <= 636))))]
data = data[(data['candj'] > 0) & (data['candj'] <= 395)]
data = data[(data['snatch'] > 0) & (data['snatch'] <= 496)]
data = data[(data['backsq'] > 0) & (data['backsq'] <= 1069)]

# Clean Survey Data
cols = ['eat','background','experience','schedule','howlong']
for c in cols:
    s = data[c].astype(str)
    s = s.str.replace(r'(?i)\s*\|\s*decline\s*to\s*answer\|\s*', '|', regex=True)
    s = s.str.replace(r'(?i)^\s*decline\s*to\s*answer\|\s*', '', regex=True)
    s = s.str.replace(r'(?i)\s*\|\s*decline\s*to\s*answer\s*$', '', regex=True)
    s = s.str.replace(r'\|{2,}', '|', regex=True)
    s = s.str.replace(r'^\|', '', regex=True)
    s = s.str.replace(r'\|$', '', regex=True)
    data[c] = s.str.strip()

data = data.dropna(subset=['background','experience','schedule','howlong','eat'])

# Save cleaned dataset (v2)
data.to_csv('../data/processed/athletes_clean.csv', index=False)

In [33]:
# Verify cleaning
v1 = pd.read_csv('../data/raw/athletes.csv')
v2 = pd.read_csv('../data/processed/athletes_clean.csv')

print("v1 shape:", v1.shape)
print("v2 shape:", v2.shape)
print("Minimum age:", v2['age'].min())
print("Height range:", v2['height'].min(), "to", v2['height'].max())
print("Unique genders:", v2['gender'].unique())
print("Decline to answer present?",
      v2.select_dtypes(include='object')
        .apply(lambda s: s.str.contains('Decline to answer', na=False, case=False))
        .any().any())
print("Columns:", list(v2.columns))

v1 shape: (423006, 27)
v2 shape: (30832, 14)
Minimum age: 18.0
Height range: 52.0 to 83.0
Unique genders: ['Male' 'Female']
Decline to answer present? False
Columns: ['region', 'gender', 'age', 'height', 'weight', 'candj', 'snatch', 'deadlift', 'backsq', 'eat', 'background', 'experience', 'schedule', 'howlong']
