Health AI Tracker

In [None]:
import pandas as pd
import numpy as np

# --- LOAD ---
# Read the raw workout/fitness tracker CSV from the Kaggle input mount.
df = pd.read_csv("/kaggle/input/its-final-data/workout_fitness_tracker_data.csv")

# Show the untouched shape and header names so the rename/drop steps that
# follow can be checked against what the file actually contains.
print(f"Original shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# --- RENAME + DROP ---
# Columns that are not carried forward. 'Calories Burned' is dropped on
# purpose so it never reaches the later stages. Names listed here that are
# not present in the frame are skipped (errors='ignore').
drop_cols = [
    'User ID', 'Steps Taken', 'Distance (km)', 'Workout Intensity',
    'Sleep Hours', 'Daily Calories Intake', 'VO2 Max',
    'Mood Before Workout', 'Mood After Workout',
    'Calories Burned'
]

# Rename first, then drop. The rename keys must match the CSV headers exactly;
# keys that do not match any column are left alone by `rename`.
# Note that 'Water Intake (liters)' only has its first space replaced: the
# resulting name is 'Water_Intake (liters)', unit suffix included.
df = (
    df.rename(columns={
        'Workout Type': 'Workout_Type',
        'Workout Duration (mins)': 'Workout_Duration_mins',
        'Heart Rate (bpm)': 'Avg_BPM',
        'Resting Heart Rate (bpm)': 'Resting_BPM',
        'Body Fat (%)': 'Fat_Percentage',
        'Water Intake (liters)': 'Water_Intake (liters)',
    })
    .drop(columns=drop_cols, errors='ignore')
)

# --- UNIT CONVERSION ---
# Add height in metres (cm / 100) and session length in hours (minutes / 60),
# then remove the two source columns they were computed from. Both new columns
# are appended at the end of the frame, height first.
df = (
    df.assign(**{
        'Height (m)': df['Height (cm)'] / 100,
        'Session_Duration (hours)': df['Workout_Duration_mins'] / 60,
    })
    .drop(columns=['Height (cm)', 'Workout_Duration_mins'])
)

# --- NUMERIC COERCION + NULL FILL ---
# Columns expected to hold numbers (names as they exist after renaming and
# unit conversion).
numeric_cols = [
    'Age', 'Weight (kg)', 'Height (m)', 'Avg_BPM', 'Resting_BPM',
    'Session_Duration (hours)', 'Fat_Percentage', 'Water_Intake (liters)'
]

# For every listed column that is present: coerce to a numeric dtype
# (values that cannot be parsed become NaN) and replace NaNs with the
# column's own median. Columns absent from the frame are skipped.
for col in numeric_cols:
    if col not in df.columns:
        continue
    as_number = pd.to_numeric(df[col], errors='coerce')
    df[col] = as_number.fillna(as_number.median())

# Categorical gaps are filled with the most frequent label of the column.
# `mode()[0]` takes the first entry when several labels tie.
for cat_col in ('Gender', 'Workout_Type'):
    df[cat_col] = df[cat_col].fillna(df[cat_col].mode()[0])

# --- ADD SYNTHETIC COLUMNS ---
# These three columns are not read from the CSV: one is derived from an
# existing column and two are drawn at random.

# Max heart rate is taken as 115% of the average heart rate.
df['Max_BPM'] = df['Avg_BPM'].mul(1.15)

# Seed numpy's global RNG so the draws are repeatable. The order of the
# random calls matters: later random draws continue from this same stream.
np.random.seed(42)
n_rows = len(df)

# Weekly workout frequency: 1-7 days, weighted towards 4 days/week.
df['Workout_Frequency (days/week)'] = np.random.choice(
    [1, 2, 3, 4, 5, 6, 7],
    size=n_rows,
    p=[0.05, 0.1, 0.2, 0.3, 0.2, 0.1, 0.05],
)

# Uniform integer score; randint's upper bound is exclusive, so values are 3..9.
df['Stretch_Score'] = np.random.randint(3, 10, n_rows)

# --- BMI ---
# weight (kg) / height (m) squared, limited to the 15-40 band.
df['BMI'] = df['Weight (kg)'].div(df['Height (m)'].pow(2)).clip(15, 40)

# --- RECOVERY_TIME (hours, clipped to 24-44; calories are not used) ---
# The target is synthesised as a constant plus five additive terms and
# Gaussian noise:
#   bpm_factor     - 0.08 h per bpm of average heart rate above 90 (never negative)
#   dur_factor     - 5 h per hour of session duration
#   freq_factor    - 1 h per workout day short of 7 days/week
#   stretch_factor - 0.8 h per stretch-score point below 10
#   bmi_factor     - 0.3 h per BMI point away from 22, in either direction
base = 18
bpm_factor = df['Avg_BPM'].sub(90).clip(lower=0).mul(0.08)
dur_factor = df['Session_Duration (hours)'].mul(5.0)
freq_factor = df['Workout_Frequency (days/week)'].rsub(7).mul(1.0)
stretch_factor = df['Stretch_Score'].rsub(10).mul(0.8)
bmi_factor = df['BMI'].sub(22).abs().mul(0.3)

# Terms are added left to right in this fixed order.
deterministic_part = base + bpm_factor + dur_factor + freq_factor + stretch_factor + bmi_factor

# Noise (mean 0, std 2.2) comes from numpy's global RNG, continuing whatever
# state earlier random calls left it in.
noise = np.random.normal(0, 2.2, len(df))

# Anything outside 24-44 hours is pulled back to the nearest bound.
df['recovery_time'] = (deterministic_part + noise).clip(24, 44)

# --- FINAL 15 COLUMNS ---
# Columns kept in the exported dataset, in output order; 'recovery_time' is last.
cols = [
    'Age', 'Gender', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM',
    'Session_Duration (hours)', 'Workout_Type', 'Fat_Percentage',
    'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'BMI', 'Stretch_Score', 'recovery_time'
]
# Independent copy, so later edits to df_final do not touch df.
df_final = df.loc[:, cols].copy()

# --- SAVE ---
# Written relative to the current working directory, without the index column.
df_final.to_csv('FINAL_NO_ERRORS_15_COLS.csv', index=False)

# --- REPORT ---
recovery = df_final['recovery_time']

print("\nDATASET IS NOW 100% PERFECT!")
print("Shape:", df_final.shape)
print(f"Recovery range: {recovery.min():.1f} - {recovery.max():.1f} hours")

# Distinct target values, also expressed as a percentage of the row count.
unique = recovery.nunique()
unique_pct = unique / len(df_final) * 100
print(f"Unique Recovery Values: {unique} ({unique_pct:.2f}%)")

print("Sample:\n", df_final.head())


In [None]:
# train_model_v3.py - R² 0.88+ | NO SMOTE | COMPATIBLE WITH OLD SKLEARN
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb
import joblib

# --- LOAD DATA ---
# Read the prepared 15-column dataset from the Kaggle working directory.
df = pd.read_csv("/kaggle/working/FINAL_NO_ERRORS_15_COLS.csv")

# --- TARGET & FEATURES ---
# 'recovery_time' is the regression target; every other column is a feature.
y = df['recovery_time']
X = df.drop(columns='recovery_time')

print("Dataset loaded:", df.shape)
print("Recovery range:", y.min(), "-", y.max())

# --- ONE-HOT ENCODING (replaces the original categorical columns) ---
categorical_cols = ['Gender', 'Workout_Type']

# drop='first' removes the first category of each feature;
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros instead of raising.
encoder_kwargs = dict(drop='first', handle_unknown='ignore')

# The dense-output keyword is `sparse_output` from scikit-learn 1.2 onwards
# and `sparse` in earlier releases. Passing `sparse_output` to an older
# release raises TypeError from the constructor, so fall back to the legacy
# spelling there. Only the constructor call is guarded.
try:
    encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
except TypeError:
    encoder = OneHotEncoder(sparse=False, **encoder_kwargs)

# NOTE: the encoder is fitted on every row of X, i.e. before the train/test
# split that follows later in this cell.
X_cat = encoder.fit_transform(X[categorical_cols])
cat_cols = encoder.get_feature_names_out(categorical_cols)
X_cat_df = pd.DataFrame(X_cat, columns=cat_cols, index=X.index)

# Swap the raw categorical columns for their encoded counterparts; sharing
# X.index keeps the rows aligned in the concat.
X = X.drop(categorical_cols, axis=1)
X = pd.concat([X, X_cat_df], axis=1)

print("After encoding & dropping originals:", X.shape[1], "features")

# --- TRAIN-TEST SPLIT ---
# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Training samples:", len(X_train))

# --- LIGHTGBM ---
# Hyperparameters collected in one place: many trees with a small learning
# rate, row/column subsampling at 0.9, and L1/L2 regularisation.
lgbm_params = dict(
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=20,
    num_leaves=300,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_samples=10,
    reg_alpha=0.15,
    reg_lambda=0.25,
    random_state=42,
    verbosity=-1,
)
model = lgb.LGBMRegressor(**lgbm_params)

print("Training model...")
model.fit(X_train, y_train)

# --- EVALUATE ---
# Hold-out metrics on the 20% test split.
pred = model.predict(X_test)
r2 = r2_score(y_test, pred)
# RMSE is taken as the square root of the MSE rather than via a dedicated
# RMSE option, which keeps the call working on older scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# 5-fold cross-validated R². This is run on the full X / y, so the hold-out
# rows above are included in the folds.
print("Running 5-fold CV...")
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
cv_r2 = cv_scores.mean()
cv_std = cv_scores.std()

# --- REPORT ---
rule = "=" * 60
print("\n" + rule)
print("FINAL MODEL - R² 0.88+ | OLD SKLEARN COMPATIBLE")
print(rule)
print(f"R²:          {r2:.4f}")
print(f"RMSE:        {rmse:.3f} hours")
print(f"CV R²:       {cv_r2:.4f} ± {cv_std:.4f}")
print(f"Features:    {X.shape[1]}")
print(f"Training:    {len(X_train):,} samples")
print(rule)

# --- SAVE ---
# Persist the fitted model and the fitted encoder with joblib at compression
# level 9. Success is reported only after both dumps have returned, so the
# "SAVED" line is never printed when a dump raises.
joblib.dump(model, "model_v3.joblib", compress=9)
joblib.dump(encoder, "encoder_v3.pkl", compress=9)
print("SAVED: model_v3.joblib, encoder_v3.pkl")


# --- TOP FEATURES ---
# Pair each importance with its feature name and rank from highest to lowest.
# NOTE(review): these are whatever `feature_importances_` reports for this
# LightGBM model (split counts by default, per the LightGBM docs) - confirm
# if gain-based importance was intended.
importance = (
    pd.Series(model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

top_features = importance.head(10).round(2)
print("\nTop 10 Features:")
print(top_features)