In [None]:
# Rank exercises by user profile using correlations and heuristics
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

# Filesystem layout: every input and output of this script lives under BASE.
BASE = Path('''/Users/corin44/Documents/myofitness/myo_datas''')
PROCESSED = BASE.joinpath('data', 'processed')  # parquet inputs are read from here
OUT = BASE.joinpath('outputs', 'exercise_recommendations')  # CSV outputs
FIGS = OUT.joinpath('figures')  # PNG charts

# Create the output tree up front; directories that already exist are left as-is.
OUT.mkdir(parents=True, exist_ok=True)
FIGS.mkdir(parents=True, exist_ok=True)

# Load the three cleaned parquet tables, in order, from PROCESSED.
# NOTE(review): `cal` is not referenced again in the visible part of this script.
cal, ex, gm = (
    pd.read_parquet(PROCESSED.joinpath(parquet_name))
    for parquet_name in (
        'calories_cleaned.parquet',
        'exercise_dataset_cleaned.parquet',
        'gym_members_exercise_tracking_synthetic_data_cleaned.parquet',
    )
)

# Basic idea:
# - compute per-exercise average calories burned, average duration, and intensity
# - join with user profile buckets (BMI groups, age groups)
# - compute exercise scores per profile using weighted metrics (calories, intensity, duration suitability)

# Build the grouping key for exercises: reuse the pre-cleaned name column when
# the dataset has one; otherwise derive it from the raw name by lower-casing,
# dropping every character that is not a-z, 0-9 or a space, and trimming.
ex['exercise_key'] = (
    ex['Exercise_clean']
    if 'Exercise_clean' in ex.columns
    else ex['Exercise'].str.lower().str.replace(r"[^a-z0-9 ]", "", regex=True).str.strip()
)

# Aggregate the exercise dataset to one row per exercise_key:
# row count, mean calories, mean duration and median intensity.
exercise_stats = (
    ex.groupby('exercise_key')
    .agg(
        count=pd.NamedAgg(column='exercise_key', aggfunc='size'),
        avg_calories=pd.NamedAgg(column='Calories Burn', aggfunc='mean'),
        avg_duration=pd.NamedAgg(column='Duration', aggfunc='mean'),
        median_intensity=pd.NamedAgg(column='Exercise Intensity', aggfunc='median'),
    )
    .reset_index()
)

# create user profile buckets (BMI group, age group) on both datasets
def _add_profile_buckets(df):
    """Add ``BMI_bucket`` and ``age_bucket`` columns to *df* in place.

    Bins are left-closed (``right=False``) so every value lands in the bucket
    its label names: age 18 -> '18-30' (not '<18'), BMI 25.0 -> 'overweight',
    BMI 30.0 -> 'obese'.  The last bin of each scale is open-ended, so an age
    above 100 is still '60+' rather than NaN.  Missing or negative values
    stay NaN.  When the source column is absent the bucket column is filled
    with the constant 'unknown'.

    Returns *df* so the call can be chained.
    """
    if 'BMI' in df.columns:
        df['BMI_bucket'] = pd.cut(
            df['BMI'],
            bins=[0, 18.5, 25, 30, np.inf],
            right=False,
            labels=['underweight', 'normal', 'overweight', 'obese'],
        )
    else:
        df['BMI_bucket'] = 'unknown'

    if 'Age' in df.columns:
        # Edges sit one past each label's upper bound because bins are
        # left-closed: [0,18) [18,31) [31,46) [46,61) [61,inf).
        df['age_bucket'] = pd.cut(
            df['Age'],
            bins=[0, 18, 31, 46, 61, np.inf],
            right=False,
            labels=['<18', '18-30', '31-45', '46-60', '60+'],
        )
    else:
        df['age_bucket'] = 'unknown'
    return df


# Bucket the gym-members table and the exercise table with identical rules so
# profile labels mean the same thing in both.
_add_profile_buckets(gm)

# compute exercise performance per profile directly from exercise dataset
_add_profile_buckets(ex)

# per-exercise per-profile stats (age_bucket x Gender x BMI_bucket)
# observed=True matters: the bucket columns produced by pd.cut are categorical,
# and without it pandas emits a row for every category combination, including
# ones that never occur in the data (count 0, NaN means). Those phantom rows
# would later be zero-filled and scored as if they were real observations.
profile_stats = ex.groupby(
    ['exercise_key', 'age_bucket', 'Gender', 'BMI_bucket'],
    observed=True,
).agg(
    count=('exercise_key', 'size'),
    avg_calories=('Calories Burn', 'mean'),
    avg_duration=('Duration', 'mean'),
    median_intensity=('Exercise Intensity', 'median'),
).reset_index()

# Normalize and score per group
from sklearn.preprocessing import MinMaxScaler

# One scaler instance is refit for every profile slice, so calories and
# intensity are min-max scaled relative to the other rows of the same profile.
scaler = MinMaxScaler()

for grp_cols in [('age_bucket', 'Gender'),]:
    # Score within each profile slice. Rows whose profile key is missing are
    # skipped (groupby drops NaN keys by default); sort=False keeps the slices
    # in order of first appearance.
    scores = []
    grouped = profile_stats.groupby(list(grp_cols), observed=True, sort=False)
    for profile_key, slice_df in grouped:
        slice_df = slice_df.copy()  # avoid writing into a view of profile_stats
        ss = slice_df[['avg_calories', 'median_intensity']].fillna(0)
        try:
            norm = scaler.fit_transform(ss)
        except (ValueError, TypeError) as err:
            # Best effort: keep the slice with zero scores, but say so instead
            # of swallowing the failure silently.
            print(f'Warning: could not scale profile {profile_key}: {err}; scores set to 0')
            norm = np.zeros((len(ss), 2))
        slice_df['cal_norm'] = norm[:, 0]
        slice_df['int_norm'] = norm[:, 1]
        # Weighted blend: 60% normalized calories, 40% normalized intensity.
        slice_df['score'] = 0.6 * slice_df['cal_norm'] + 0.4 * slice_df['int_norm']
        scores.append(slice_df)
    prof_scored = pd.concat(scores, ignore_index=True) if scores else pd.DataFrame()

# Fallback if no profile scoring produced: score the un-profiled per-exercise
# aggregates instead, using the same 60/40 calories/intensity blend.
if 'prof_scored' not in locals() or prof_scored.empty:
    ss = exercise_stats[['avg_calories', 'median_intensity']].fillna(0)
    if len(ss):
        norm = scaler.fit_transform(ss)
        exercise_stats['cal_norm'] = norm[:, 0]
        exercise_stats['int_norm'] = norm[:, 1]
    else:
        # Nothing to scale; still create the columns so the schema is stable.
        exercise_stats['cal_norm'] = 0.0
        exercise_stats['int_norm'] = 0.0
    exercise_stats['score'] = 0.6 * exercise_stats['cal_norm'] + 0.4 * exercise_stats['int_norm']
    # The per-profile export selects prof_scored by 'age_bucket' and 'Gender'.
    # Give the fallback frame a single catch-all profile so that step works
    # instead of failing with a KeyError on the missing columns.
    prof_scored = exercise_stats.assign(age_bucket='all', Gender='all')

# Export top exercises per profile (age x Gender): one CSV with the full
# ranking and one PNG bar chart of the first 10 rows for every profile.
# NOTE(review): a profile slice can contain several rows for the same
# exercise_key (e.g. one per BMI_bucket), in which case the chart shows fewer
# than 10 distinct exercises -- confirm whether that is intended.
_profile_cols = ['age_bucket', 'Gender']
if set(_profile_cols).issubset(prof_scored.columns):
    # Rows with a missing profile key are skipped (groupby drops NaN keys).
    _profile_groups = prof_scored.groupby(_profile_cols, observed=True, sort=False)
else:
    print('No profile columns in scored data; skipping per-profile export')
    _profile_groups = []

for (age_label, gender_label), slice_df in _profile_groups:
    slice_df = slice_df.sort_values('score', ascending=False)
    # A '/' in a label would otherwise be read as a directory separator, so
    # both output names get the same replacement.
    fname = f"top_exercises_age_{age_label}_gender_{gender_label}.csv".replace('/', '_')
    slice_df.to_csv(OUT / fname, index=False)
    # plot top 10
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x='score', y='exercise_key', data=slice_df.head(10), ax=ax)
        ax.set_title(f"Top 10 exercises - age {age_label} / gender {gender_label}")
        fig.tight_layout()
        safe_name = f"top10_age_{age_label}_gender_{gender_label}.png".replace('/', '_')
        fig.savefig(FIGS / safe_name)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Overall ranking: average each exercise's score over all of its scored rows,
# highest mean first.
top_overall = (
    prof_scored.groupby('exercise_key')['score']
    .mean()
    .reset_index()
    .sort_values('score', ascending=False)
)
top_overall.to_csv(OUT.joinpath('top_exercises_overall.csv'), index=False)

# Persist the per-exercise aggregates as well.
exercise_stats.to_csv(OUT.joinpath('exercise_stats.csv'), index=False)

print('Saved exercise rankings and figures to', OUT)

