In [3]:
# %% [markdown]
# 🤖 Body Measurement Prediction Model
# **Goal**: Predict missing measurements from height + 2 circumferences (Bust/Waist/Hip).
# Dataset: `augmented_measurements_v1.xlsx`

# %% [Step 1: Load and Standardize Data]
import pandas as pd
import numpy as np

# Load augmented data
try:
    df = pd.read_excel("../data/augmented_measurements_v1.xlsx")
except FileNotFoundError:
    print("ERROR: File not found. Check the path to augmented_measurements_v1.xlsx!")

# Standardize column names (spaces → underscores, lowercase)
df.columns = [
    col.replace(" ", "_").replace("(", "").replace(")", "").replace("CM", "cm").lower() 
    for col in df.columns
]
print("✅ Columns standardized:", df.columns.tolist()[:5])  # Preview first 5

# %% [Step 2: Define Inputs & Targets]
# --- Inputs (Height + any 2 of Bust/Chest/Waist/Hip) ---
input_cols = [
    'height_cm', 
    'chest_cm', 
    'bust_cm', 
    'waist_cm', 
    'hip_cm'
]

# --- Targets (All other measurements except metadata) ---
exclude = ['id', 'date_measured_yyyy-mm-dd'] + input_cols
target_cols = [col for col in df.columns if col not in exclude]

print(f"🎯 Targets ({len(target_cols)}):", target_cols[:5])  # Preview

# %% [Step 3: Simulate User Inputs (Random Missing Values)]
# Simulate users providing only height + 2 random circumferences
np.random.seed(42)
for idx in df.index:
    # Randomly select 2 circumferences to keep (other than height)
    keep = np.random.choice(input_cols[1:], 2, replace=False)  # Exclude height
    # Set others to NaN
    for col in input_cols[1:]:
        if col not in keep:
            df.loc[idx, col] = np.nan

# %% [Step 4: Train XGBoost Model]
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Prepare data
X = df[input_cols]
y = df[target_cols]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model (handles NaN inputs)
model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    missing=np.nan  # Critical for user input flexibility
)
model.fit(X_train, y_train)
print("✅ Model trained!")

# %% [Step 5: Evaluate Accuracy]
from sklearn.metrics import mean_absolute_error

# Predict and calculate MAE
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred, multioutput='raw_values')

# Print top 10 most accurate predictions
mae_per_target = pd.Series(mae, index=target_cols).sort_values()
print("📏 Top 10 Accurate Predictions:")
print(mae_per_target.head(10))

# %% [Step 6: Save Model & Metadata]
import joblib
import json
import os  # Add this import

# Create directories if they don't exist
os.makedirs("../models", exist_ok=True)  # Fixes FileNotFoundError
os.makedirs("../data", exist_ok=True)

# Save model
joblib.dump(model, "../models/fashion_measurement_predictor_v1.pkl")

# Save metadata
with open("../data/metadata.json", "w") as f:
    json.dump({"input_cols": input_cols, "target_cols": target_cols}, f)

print("💾 Model saved to ../models/ ✅")


✅ Columns standardized: ['id', 'date_measured_yyyy-mm-dd', 'height_cm', 'chest_cm', 'bust_cm']
🎯 Targets (45): ['waist_to_hip_cm', 'bust_height_cm', 'breast_distance_cm', 'bust_radius_cm', 'shoulde_to_underbust_cm']
✅ Model trained!
📏 Top 10 Accurate Predictions:
height_inseam_ratio     0.617611
height_z                0.727691
inseam_z                0.785695
bust_radius_cm          1.192644
wrist_cm                1.318584
breast_distance_cm      1.324368
hand_entry_cm           1.473523
bust_height_cm          1.525566
back_shoulder_cm        1.852041
back_waist_length_cm    1.936759
dtype: float32
💾 Model saved to ../models/ ✅


In [4]:
ls ../models  # Should show the .pkl file

Invalid switch - "models".
