In [None]:
# EDA visual report
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Project layout: cleaned inputs are read from <BASE>/data/processed and the
# generated plots are written to <BASE>/reports/figures. The figures directory
# (and any missing parents) is created up front; an existing one is reused.
BASE = Path('''/Users/corin44/Documents/myofitness/myo_datas''')
PROCESSED = BASE.joinpath('data', 'processed')
FIGS = BASE.joinpath('reports', 'figures')
FIGS.mkdir(parents=True, exist_ok=True)

# Load datasets
def _read_processed(filename):
    """Read one parquet file located in the PROCESSED directory."""
    return pd.read_parquet(PROCESSED / filename)


cal = _read_processed('calories_cleaned.parquet')
ex = _read_processed('exercise_dataset_cleaned.parquet')
gm = _read_processed('gym_members_exercise_tracking_synthetic_data_cleaned.parquet')

# 1) Histogram (50 bins, KDE overlay) of the non-null 'Calories' values,
#    drawn on an explicit Axes and saved to calories_distribution.png
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(cal['Calories'].dropna(), bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Calories burned (calories.csv)')
ax.set_xlabel('Calories')
fig.savefig(FIGS / 'calories_distribution.png')
plt.close(fig)

# 2) Scatter of 'Duration' (x) against 'Calories' (y), coloured by 'Gender';
#    points are semi-transparent (alpha=0.6)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=cal['Duration'],
    y=cal['Calories'],
    hue=cal['Gender'],
    alpha=0.6,
    ax=ax,
)
ax.set_title('Calories vs Duration colored by Gender')
ax.set_xlabel('Duration (min)')
ax.set_ylabel('Calories')
fig.savefig(FIGS / 'calories_vs_duration.png')
plt.close(fig)

# 3) Filled KDE of the non-null 'BMI' values; the whole plot is skipped when
#    the calories table has no 'BMI' column
if 'BMI' in cal:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.kdeplot(cal['BMI'].dropna(), fill=True, ax=ax)
    ax.set_title('BMI distribution (calories.csv)')
    fig.savefig(FIGS / 'calories_bmi_kde.png')
    plt.close(fig)

# 4) Box plot of 'Calories Burn' grouped by 'Exercise Intensity'
#    (exercise dataset), saved to exercise_intensity_calories_box.png
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=ex, x='Exercise Intensity', y='Calories Burn', ax=ax)
ax.set_title('Calories Burn by Exercise Intensity (exercise_dataset.csv)')
fig.savefig(FIGS / 'exercise_intensity_calories_box.png')
plt.close(fig)

# 5) Horizontal bar chart of the 20 most frequent 'Exercise_clean' values
fig, ax = plt.subplots(figsize=(10, 6))
exercise_counts = ex['Exercise_clean'].value_counts()
top_ex = exercise_counts.nlargest(20)
# Exercise names go on the y axis and their counts on the x axis.
sns.barplot(y=top_ex.index, x=top_ex.values, ax=ax)
ax.set_xlabel('Count')
ax.set_ylabel('Exercise')
ax.set_title('Top 20 exercises (exercise_dataset.csv)')
fig.savefig(FIGS / 'top20_exercises.png')
plt.close(fig)

# 6) Annotated heatmap of the pairwise correlations between the numeric
#    columns of the gym members dataset (cell labels use two decimals)
num_cols = gm.select_dtypes(include='number').columns
corr = gm.loc[:, num_cols].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation heatmap (gym members dataset)')
fig.savefig(FIGS / 'gym_members_corr_heatmap.png')
plt.close(fig)

# 7) Save a short summary CSV with key correlations.
#
# The correlation matrix is symmetric, so (a, b) and (b, a) carry the same
# value. Only pairs above the diagonal are collected, which lists every
# unordered pair of numeric columns exactly once and skips self-correlations.
# (A drop_duplicates on ['var1', 'var2'] cannot remove the mirrored rows,
# because the two orderings are distinct as ordered pairs.)
corr_pairs = [
    (c, d, corr.loc[c, d])
    for i, c in enumerate(num_cols)
    for d in num_cols[i + 1:]
]

# Rank by absolute correlation so the strongest relationships, positive or
# negative, come first.
corr_df = (
    pd.DataFrame(corr_pairs, columns=['var1', 'var2', 'corr'])
    .sort_values('corr', key=lambda s: s.abs(), ascending=False)
)
corr_df.to_csv(PROCESSED / 'numeric_correlations_summary.csv', index=False)

print('Saved figures to', FIGS)

