In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the body-fat dataset from an absolute path into the working frame `df`.
# NOTE(review): the BMI cell scales Weight by 0.453592 and Height by 0.0254, so the
# raw columns are presumably in pounds and inches — confirm against the data source.
df=pd.read_csv('/content/bodyfat.csv')

In [4]:
# Engineered predictors, appended to the frame in a single assign() call.
df = df.assign(
    # Weight scaled by 0.453592 (to kg) over squared Height scaled by 0.0254 (to meters)
    BMI=(df['Weight'] * 0.453592) / ((df['Height'] * 0.0254) ** 2),
    # Abdomen circumference relative to hip circumference
    Waist_Hip_Ratio=df['Abdomen'] / df['Hip'],
    # Simple multiplicative interaction between age and weight
    Age_Weight_Interaction=df['Age'] * df['Weight'],
    # Chest circumference relative to abdomen circumference
    Chest_Abdomen_Ratio=df['Chest'] / df['Abdomen'],
)

In [5]:
# Discard rows whose BodyFat value is implausible: keep only 2 <= BodyFat <= 45
# (between() is inclusive on both ends).
df = df[df['BodyFat'].between(2, 45)]
print(f"After body fat validation: {len(df)}")

After body fat validation: 249


In [6]:
# Anthropometric sanity ranges, applied as one combined row mask:
#   Height 58..84  (4'10" to 7'0")
#   Weight 100..400 (reasonable weight range)
df = df[df['Height'].between(58, 84) & df['Weight'].between(100, 400)]
print(f"After height/weight validation: {len(df)}")

After height/weight validation: 248


In [7]:
# Measure the skewness of every numeric column (run ahead of the outlier step).
skewness = df.select_dtypes(include='number').skew()
print("Skewness:\n", skewness)

Skewness:
 Density                  -0.005836
BodyFat                   0.080335
Age                       0.275528
Weight                    1.300520
Height                    0.130470
Neck                      0.573650
Chest                     0.752117
Abdomen                   0.885009
Hip                       1.612861
Thigh                     0.904387
Knee                      0.550756
Ankle                     2.286452
Biceps                    0.341797
Forearm                  -0.220027
Wrist                     0.282708
BMI                       1.609002
Waist_Hip_Ratio           0.189080
Age_Weight_Interaction    0.630114
Chest_Abdomen_Ratio       0.248679
dtype: float64


In [8]:
#Removing statistical outliers
def remove_outliers_iqr(df, columns, k=1.5):
    """Drop rows that fall outside the IQR fences of the given columns.

    For each column, in the order given, the fences are
    ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), where the
    quartiles are computed on the rows that survived the previous columns.
    The result therefore depends on the order of ``columns``.

    Rows whose value in a screened column is NaN are dropped as well, because
    NaN fails both fence comparisons; they are included in the printed count.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied and never modified.
    columns : iterable of str
        Labels of the columns to screen, processed sequentially.
    k : float, default 1.5
        Fence multiplier applied to the IQR. The default reproduces the
        conventional 1.5 * IQR rule; larger values remove fewer rows.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df``.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    KeyError
        If a label in ``columns`` is not a column of ``df``.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    df_clean = df.copy()
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - k * IQR
        upper_bound = Q3 + k * IQR

        before = len(df_clean)
        # between() is inclusive on both ends, i.e. lower_bound <= value <= upper_bound
        df_clean = df_clean[df_clean[col].between(lower_bound, upper_bound)]
        after = len(df_clean)

        # Report only the columns that actually caused rows to be dropped
        if before != after:
            print(f"  {col}: removed {before - after} outliers")
    return df_clean

# Apply IQR-based outlier removal to the target and three body-size columns.
# Columns are screened in the listed order, so each later column's quartiles are
# computed on the rows that survived the earlier columns.
df = remove_outliers_iqr(df, ['BodyFat', 'Weight', 'Abdomen', 'Hip'])

  Weight: removed 2 outliers
  Hip: removed 1 outliers


In [9]:
# Summarize how many rows the cleaning steps dropped overall.
# NOTE(review): 252 is hard-coded, presumably the row count of the freshly loaded
# CSV — confirm, or capture len(df) right after loading so this stays correct if
# the input data changes.
print(f"\nFinal dataset size: {len(df)}")
print(f"Total removed: {252 - len(df)} samples ({((252 - len(df))/252)*100:.1f}%)")


Final dataset size: 245
Total removed: 7 samples (2.8%)


In [10]:
# Target vector
y = df['BodyFat']
# Feature matrix: everything except the target itself and Density,
# which is dropped because it would leak the target.
X = df.drop(columns=['BodyFat', 'Density'])

In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fit the scaler on the training rows only, then apply the same
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Persist the processed splits for later use. `pickle` is imported in the
# notebook's import cell rather than here, so all dependencies are visible up front.
#
# Saved payload:
#   X_train / X_test : the *scaled* feature arrays
#   y_train / y_test : the matching target values
#   scaler           : the fitted StandardScaler, so new inputs can be scaled identically
#   feature_names    : column order of X_train, since the scaled arrays carry no labels
#
# NOTE(review): the destination lives under /content/drive, presumably a mounted
# Google Drive — confirm the drive is mounted and the folder exists before running.
with open('/content/drive/MyDrive/body fat predictor/data/processed_data.pkl', 'wb') as f:
    pickle.dump({
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'feature_names': X_train.columns.tolist()
    }, f)