In [1]:
# EDA and cleaning for calories, exercise_dataset, gym_members_exercise_tracking_synthetic_data
import pandas as pd
from pathlib import Path
import numpy as np
import json
import re

# Project root, plus the raw-input and processed-output directories under it.
BASE = Path("/Users/corin44/Documents/myofitness/myo_datas")
RAW = BASE.joinpath('data', 'raw')
PROCESSED = BASE.joinpath('data', 'processed')
# Create the output directory (and any missing parents) up front so the
# later to_csv / json writes do not fail on a missing folder.
PROCESSED.mkdir(parents=True, exist_ok=True)

# Helper functions

def standardize_gender(x):
    """Normalize a gender label to 'male' / 'female' where recognized.

    Missing values (as judged by ``pd.isna``) are returned as ``np.nan``.
    Anything else is stringified, trimmed and lower-cased; the aliases
    m / male / man map to 'male' and f / female / woman map to 'female'.
    Unrecognized labels are returned in their trimmed, lower-cased form.
    """
    if pd.isna(x):
        return np.nan
    label = str(x).strip().lower()
    aliases = {
        'm': 'male',
        'male': 'male',
        'man': 'male',
        'f': 'female',
        'female': 'female',
        'woman': 'female',
    }
    # Fall back to the normalized label itself when it is not a known alias.
    return aliases.get(label, label)


def to_numeric_safe(series, downcast=None):
    """Coerce a Series to numeric, tolerating stray non-numeric characters.

    Each value is stringified and every character other than digits, '.'
    and '-' is removed (so '12 kg' -> 12 and '1,234' -> 1234) before being
    handed to ``pd.to_numeric`` with ``errors='coerce'``; anything that
    still cannot be parsed becomes NaN.

    Values that are well-formed scientific notation (e.g. '1e3', or a float
    whose string form is '1e-05') are parsed as written. Stripping them
    would delete the 'e' / '+' and silently produce a different number
    ('1e3' -> 13) or NaN ('1e-05' -> '1-05').

    Parameters
    ----------
    series : pandas.Series
        Values to convert.
    downcast : str or None
        Forwarded unchanged to ``pd.to_numeric``.

    Returns
    -------
    pandas.Series
        Numeric Series with NaN where conversion failed.
    """
    text = series.astype(str).str.strip()
    stripped = text.str.replace(r"[^0-9\.\-]", "", regex=True)
    # Entries that are entirely a number in exponent form must bypass the
    # character stripping, otherwise their exponent marker is lost.
    is_sci = text.str.fullmatch(r"[+-]?(?:\d+\.?\d*|\.\d+)[eE][+-]?\d+").astype(bool)
    cleaned = text.where(is_sci, stripped)
    return pd.to_numeric(cleaned, errors='coerce', downcast=downcast)


def clean_common(df):
    """Trim whitespace in every object-dtype column of ``df``.

    Each object column is stringified and stripped of leading/trailing
    whitespace. Entries that were missing in the input (None, NaN, pd.NA)
    stay missing instead of turning into the literal text 'None' / '<NA>',
    and entries whose trimmed text is exactly 'nan' are also set to NaN.

    The frame is modified in place and the same object is returned, so
    ``df = clean_common(df)`` and a bare ``clean_common(df)`` both work.
    """
    # trim strings
    for c in df.select_dtypes(include=['object']).columns:
        original = df[c]
        stripped = original.astype(str).str.strip()
        # Record missing positions from the ORIGINAL column: after astype(str)
        # a None is indistinguishable from the real string 'None'.
        missing = original.isna() | (stripped == 'nan')
        df[c] = stripped.mask(missing, np.nan)
    return df

# 1) calories.csv
# Load the raw calories table, normalize its text, gender and numeric
# columns, derive BMI, and write the cleaned copy under PROCESSED.
calories_path = RAW / 'calories.csv'
df_cal = pd.read_csv(calories_path, low_memory=False)
df_cal = clean_common(df_cal)
# standardize genders
if 'Gender' in df_cal.columns:
    df_cal['Gender'] = df_cal['Gender'].apply(standardize_gender)
# numeric coercion (columns missing from the file are skipped)
for col in ['Age','Height','Weight','Duration','Heart_Rate','Body_Temp','Calories']:
    if col in df_cal.columns:
        df_cal[col] = to_numeric_safe(df_cal[col])
# compute BMI if possible; Height is divided by 100, i.e. treated as centimeters
if 'Height' in df_cal.columns and 'Weight' in df_cal.columns:
    df_cal['Height_m'] = df_cal['Height'] / 100.0
    # Vectorized instead of a per-row apply: NaN weight/height propagates to
    # NaN on its own, and .where() blanks rows whose height is not positive
    # (which also discards the inf produced by dividing by zero).
    df_cal['BMI'] = (df_cal['Weight'] / df_cal['Height_m'] ** 2).where(df_cal['Height_m'] > 0)
# save
cal_out = PROCESSED / 'calories_cleaned.csv'
df_cal.to_csv(cal_out, index=False)

# 2) exercise_dataset.csv
# Load the raw exercise table, trim its text columns, coerce the expected
# numeric columns, add a normalized exercise name, and save the result.
ed_path = RAW / 'exercise_dataset.csv'
df_ex = clean_common(pd.read_csv(ed_path, low_memory=False))
# numeric coercion for likely numeric columns
for col in ['Calories Burn','Dream Weight','Actual Weight','Age','Duration','Heart Rate','BMI','Exercise Intensity']:
    if col not in df_ex.columns:
        # Column not present in this file; nothing to convert.
        continue
    df_ex[col] = to_numeric_safe(df_ex[col])
# standardize Exercise names: lower-case, keep only a-z / 0-9 / space, trim
if 'Exercise' in df_ex.columns:
    df_ex['Exercise_clean'] = (
        df_ex['Exercise']
        .str.lower()
        .str.replace(r"[^a-z0-9 ]", "", regex=True)
        .str.strip()
    )
# save
ex_out = PROCESSED / 'exercise_dataset_cleaned.csv'
df_ex.to_csv(ex_out, index=False)

# 3) gym_members_exercise_tracking_synthetic_data.csv
# Load the raw gym-members table, normalize its text, numeric and gender
# columns, derive BMI, and write the cleaned copy under PROCESSED.
gm_path = RAW / 'gym_members_exercise_tracking_synthetic_data.csv'
df_gm = pd.read_csv(gm_path, low_memory=False)
df_gm = clean_common(df_gm)
# numeric coercion (columns missing from the file are skipped)
# NOTE(review): this list is identical to the one used for calories.csv;
# confirm it matches this file's actual headers, since unmatched names are
# skipped silently and the BMI step below would then not run.
for col in ['Age','Height','Weight','Duration','Heart_Rate','Body_Temp','Calories']:
    if col in df_gm.columns:
        df_gm[col] = to_numeric_safe(df_gm[col])
# standardize gender
if 'Gender' in df_gm.columns:
    df_gm['Gender'] = df_gm['Gender'].apply(standardize_gender)
# BMI if heights in cm
if 'Height' in df_gm.columns and 'Weight' in df_gm.columns:
    # try detect if Height already in meters (values < 3); anything else is
    # divided by 100. Done per value and vectorized: NaN fails the `< 3`
    # test and stays NaN after the division.
    df_gm['Height_m'] = df_gm['Height'].where(df_gm['Height'] < 3, df_gm['Height'] / 100.0)
    # Vectorized instead of a per-row apply: NaN inputs propagate, and
    # .where() blanks rows whose height is not positive (including the inf
    # produced by dividing by zero).
    df_gm['BMI'] = (df_gm['Weight'] / df_gm['Height_m'] ** 2).where(df_gm['Height_m'] > 0)
# save
gm_out = PROCESSED / 'gym_members_exercise_tracking_synthetic_data_cleaned.csv'
df_gm.to_csv(gm_out, index=False)

# Basic summary stats saved as json
# For each cleaned dataset: row count, column names, and the percentage of
# missing values per column (rounded to 2 decimals).
cleaned_frames = {
    'calories': df_cal,
    'exercise_dataset': df_ex,
    'gym_members': df_gm,
}
summary = {
    label: {
        'rows': len(frame),
        'columns': frame.columns.tolist(),
        'missing_pct': (frame.isna().mean() * 100).round(2).to_dict(),
    }
    for label, frame in cleaned_frames.items()
}
summary_path = PROCESSED / 'cleaning_summary.json'
with summary_path.open('w') as f:
    json.dump(summary, f, indent=2)

print('Saved cleaned files and summary')



Saved cleaned files and summary
