In [None]:
# Count plot for every categorical column, one figure per column.
categorical_cols = ['Gender', 'FitnessGoal', 'PreferredExercise', 'SuggestedSport']
for feature in categorical_cols:
    # countplot draws on the current axes and returns it, so the title can
    # be set on that axes object directly.
    axis = sns.countplot(data=df, x=feature)
    axis.set_title(f"Distribution of {feature}")
    # Slant the category labels so that long names do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.show()

# Relationships between features (examples)
# Both the pair plot and the correlation heatmap look at the same numeric
# columns, so select them once.
numeric_view = df[['Age', 'Height', 'Weight', 'RestingHR', 'WorkoutHR']]

# Pairwise scatter plots (with per-feature distributions on the diagonal).
sns.pairplot(numeric_view)
plt.show()

# Age distribution for each fitness goal.
sns.boxplot(data=df, x='FitnessGoal', y='Age')
plt.xticks(rotation=45, ha='right')
plt.show()

# Correlation matrix for numerical features, annotated with the coefficients.
corr_matrix = numeric_view.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

# 3. Data Preprocessing

# a. Handling Missing Values (if any - check your data)
# In this synthetic data example, there should be no missing values.
# But in real data, you would use methods like:
# df.dropna()  # Remove rows with missing values (if appropriate)
# df.fillna(df.mean(numeric_only=True))  # Fill numerical features with their column mean
# df.fillna(df.mode().iloc[0])  # Fill each column with its most frequent value
#   (df.mode() returns a DataFrame, so its first row is taken with .iloc[0];
#   df.mode()[0] would look up a column labelled 0 instead.)

# b. Encoding Categorical Features
# Fit a separate LabelEncoder for every column and keep all of them, so each
# column's integer codes can later be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Re-fitting a single shared
# encoder would discard the mapping of every column except the last one.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# values; the integer codes it assigns to feature columns imply an ordering.
# Consider OneHotEncoder/OrdinalEncoder for the feature columns.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f"Encoded {col}: {le.classes_}") # Print the classes for reference
# After the loop, `le` is still the encoder fitted on the last column of
# categorical_cols, exactly as before.

# c. Feature Scaling (for numerical features)
# The scaler is only created here. It is fitted further down, AFTER the
# train/test split, so that the test rows do not influence the mean and
# standard deviation used for scaling (fitting on the full dataset leaks
# test-set information into training).
numerical_cols = ['Age', 'Height', 'Weight', 'RestingHR', 'WorkoutHR']
scaler = StandardScaler()

# 4. Prepare Data for Machine Learning

X = df.drop('SuggestedSport', axis=1)  # Features
y = df['SuggestedSport']  # Target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test

# Work on explicit copies so the assignments below modify only the split
# frames and never write back into X / df.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the held-out rows.
# NOTE: df and X keep their original (unscaled) numerical values; only
# X_train and X_test are standardized.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
