In [None]:
pip install xgboost

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, kstest, anderson


#Grab data and put numerical vals in one table
#################################################################################################################
dataset_as_given = pd.read_csv(r"d:\Desktop\Practice Python\21.gym_members_exercise_tracking.csv")
#print(dataset_as_given)

# Get column names
column_names = dataset_as_given.columns
#print(column_names)

desired = ["Age", "Weight (kg)", "Height (m)", "Max_BPM", "Avg_BPM", "Resting_BPM", 
           "Session_Duration (hours)", "Calories_Burned", "Fat_Percentage", "Water_Intake (liters)", "Workout_Frequency (days/week)", "Experience_Level", "BMI"]

dataset_as_given_numeric = dataset_as_given[desired]



#Get Rid of Outliers
#################################################################################################################

# Copy the dataset so you keep the original intact
cleaned_df = dataset_as_given.copy()

# Select numeric columns
numeric_cols = cleaned_df.select_dtypes(include=['int64', 'float64']).columns

for col in numeric_cols:
    Q1 = cleaned_df[col].quantile(0.25)
    Q3 = cleaned_df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out outliers
    cleaned_df = cleaned_df[(cleaned_df[col] >= lower_bound) & (cleaned_df[col] <= upper_bound)]



print("Original rows:", dataset_as_given.shape[0])
print("Cleaned rows:", cleaned_df.shape[0])
print("Rows removed:", dataset_as_given.shape[0] - cleaned_df.shape[0])


#################################################################################################################

encoded_df = cleaned_df.copy()

encoded_df['Gender'] = encoded_df['Gender'].map({
    'Male': 1,
    'Female': 0
})

workout_dummies = pd.get_dummies(encoded_df['Workout_Type'], prefix='Workout', drop_first=True)
encoded_df = pd.concat([encoded_df.drop(columns=['Workout_Type']), workout_dummies], axis=1)




Original rows: 973
Cleaned rows: 931
Rows removed: 42


In [None]:
# ============================================================
#   TRAIN/TEST SPLIT + SCALING
#   This prepares your data for any ML model
# ============================================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------
# 1. Define X (features) and y (target)
# ------------------------------------------------------------
# Gender must already be encoded as 0/1 in encoded_df
X = encoded_df.drop(columns=['Gender'])
y = encoded_df['Gender']

# ------------------------------------------------------------
# 2. Train/Test Split
# ------------------------------------------------------------
# test_size=0.2 means 20% of data is held out for testing
# random_state=42 ensures reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------------------
# 3. Scale the numeric features
# ------------------------------------------------------------
# Scaling helps models like Logistic Regression, SVM, KNN, Neural Networks
# Tree-based models (Random Forest, XGBoost) do NOT require scaling,
# but scaling does not harm them.
scaler = StandardScaler()

# Fit on training data ONLY, then transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------------------------
# 4. Optional: Print shapes to confirm everything looks right
# ------------------------------------------------------------
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("\nScaled versions created: X_train_scaled, X_test_scaled")


Logistic Regression is a supervised machine learning algorithm used for classification problems. <br>
It predicts the the probability that an input belings to a specific class.<br>
In our case the classes would be male and female. <br>
Logistic Regression is used for binary classification (such as 0 or 1; True or False; Yes or No)<br><br>
Logistic Regression has 3 main types:<br>
1. Binomial Logistic Regression<br>
This type is used when the dependant variable (in our case Gender) has only two possible categories.<br>
It is the most common form of Logistic Regression and is used for binary classification problems.<br><br>

2. Multinomial Logistic Regression<br>
This type is used when the dependant variable (in our case Gender) has 3 or more possible categories that are not ordered.<br>
As an example of this: Cat VS Dog VS Sheet -> they do not have a specific order.<br><br>

3. Ordinal Logistic Regression<br>
This type of Logistic Regression is used when the dependant variable (in our case that is Gender) has 3 or more categories it can be sorted into - and these categories have a natural order or ranking. As an example of this: Low VS Medium VS High. This specific model takes the order of the categories into account when modeling.<br><br>

In order for the model to perfom well - or at least avoid certain issues that might stop it from performing well. There are some assumtions that must be met by the dataset before the model should be trained on it.<br><br>
The main assumptions of Logistic Regression are:<br><br>

1. Independent Observations -> this means that each of the columns we have should not be correlated to any of the other columns -> helps avoid multicolinearity.<br><br>
2. Binary Dependant Variables -> this assumption means that the dataset tries to preddict the data assuming that the dependant variable (which is Gender in our case) can only take 2 forms (in our case these 2 forms would be Male and Female)<br><br>
3. Linearity relationaship between independent variables and log odds -> the model assumes that a linear relationship exists between the independent variables and the log odds of the dependant variable -> meaning that the predictors affect the log odds of the predicted variable in a linear way.<br><br>
4. No outliers -> the model assumes that the dataset contains no outliers - or at least no extreme outliers; Outliers in a Logistic Regression model can distort the estimation of the logistic regression coefficients.<br><br>
5. Large sample size -> the models needs at least a certain amount of data to be able to be trained enough to give reasonable predictions -> otherwise it simply can not figure out the pattern of the data and be able to predict the dependant variable.<br><br>

Ligistic Regression uses a Sigmoid Function<br><br>
1. The sigmoid function is used to convert the raw output of the model into a probability value between 0 and 1.<br><br>
2. The Sigmoid function takes any real number and maps it iinto the range 0 to 1 forming an 'S' shaped curve called the sigmoid curve. Because probabilities must lie between 0 and 1, the sigmoid function is perfect for this purpose.<br><br>
3. In Logistic Regression, a threshold value (usually 0.5) is used to decide the class label.<br><br>
- If the sigmoid output is same or above the threshold, the input is classified as Class 1.<br>
- If it is below the threshold, the input is classified as Class 0.<br><br>
The approach of the Sigmoid Function helps to transform continous input values into meaningful class predictions.<br><br>

Terminologies used in Logistic Regression<br><br>
1. Independent Variables -> These are the values we use to make predictions with<br>
In our case we predict the Gender using vaiables such as Weight, Height, Water Intake, etc.<br><br>
2. Dependant Variable -> The dependant variable is the variable we are predicting AKA the variable that is dependant on the independent variables in order to be predicted<br><br>
3. Logistic Function -> This is the function that trasforms the Independent Variables (Weight, Height, etc.) into a probability between 0 and 1 which represents the likelihood that the Dependant Variable (Gender) is either 0 or 1.<br><br>
4. Odds -> This is the ratio of the probability of an event happening to the probability of that event not happening. It differs from probability because probability is the ratio of occurances to total possibilities.<br><br>
5. Log-Odds (Logit) -> The natural logarithm of the odds. In Logistic Regression, the log-odds are modeled as a linear combination of the independent variables and the intercept.<br><br>
6. Coefficient -> These variables show how much the dependant variable is affected by the independent variables.
7. Intercept -> The constant term in the Logistic Regression model which represents the log-odds when all independent variables are equal to 0.<br><br>
8. Maximum Likelihood Estimation (MLE) -> This method is used to estimate the coefficients of the logistic regression model by maximizing the likelihood of observing the given data.<br><br>

Implementation for Logistic Regression<br><br>

https://www.geeksforgeeks.org/machine-learning/understanding-logistic-regression/

In [None]:
# ============================================================
# LOGISTIC REGRESSION
# Good baseline model. Interpretable. Works well with scaling.
# Look for balanced precision/recall and stable accuracy.
# ============================================================

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

y_pred_lr = log_reg.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))


In [None]:
# ============================================================
#   LOGISTIC REGRESSION — FULL MODEL EVALUATION SUITE
#   This cell trains the model and evaluates it using:
#     ✔ Accuracy
#     ✔ Classification Report
#     ✔ Confusion Matrix
#     ✔ ROC Curve + AUC
#     ✔ Precision–Recall Curve
#     ✔ Coefficients (feature influence)
#   Every section includes comments explaining what to look for.
# ============================================================

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc, precision_recall_curve
)
import pandas as pd

# ------------------------------------------------------------
# 1. Train the Logistic Regression model
# ------------------------------------------------------------
# Logistic Regression works best with scaled data.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]  # needed for ROC/PR curves


# ------------------------------------------------------------
# 2. Accuracy Score
# ------------------------------------------------------------
# Accuracy = proportion of correct predictions.
# Good for balanced datasets. If classes are imbalanced,
# accuracy alone can be misleading.
accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy:.4f}")


# ------------------------------------------------------------
# 3. Classification Report
# ------------------------------------------------------------
# Shows precision, recall, and F1-score for each class.
# Precision = how many predicted positives were correct.
# Recall = how many actual positives were found.
# F1 = balance between precision and recall.
# Look for balanced values across both classes.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))


# ------------------------------------------------------------
# 4. Confusion Matrix
# ------------------------------------------------------------
# Shows EXACTLY where the model is making mistakes.
# Diagonal = correct predictions.
# Off-diagonal = errors.
cm = confusion_matrix(y_test, y_pred_lr)

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


# ------------------------------------------------------------
# 5. ROC Curve + AUC Score
# ------------------------------------------------------------
# ROC curve shows how well the model separates the two classes.
# AUC close to 1.0 = excellent.
# AUC around 0.5 = random guessing.
fpr, tpr, thresholds = roc_curve(y_test, y_prob_lr)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1], [0,1], linestyle='--', color='grey')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


# ------------------------------------------------------------
# 6. Precision–Recall Curve
# ------------------------------------------------------------
# Useful when one class is less common.
# Look for curves that stay high (good precision)
# and far right (good recall).
precision, recall, thresholds = precision_recall_curve(y_test, y_prob_lr)

plt.figure(figsize=(6,4))
plt.plot(recall, precision)
plt.title("Precision–Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()


# ------------------------------------------------------------
# 7. Logistic Regression Coefficients
# ------------------------------------------------------------
# Shows how each feature influences the prediction.
# Positive coefficient = pushes prediction toward class 1.
# Negative coefficient = pushes prediction toward class 0.
# Look for features with large absolute values.
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nLogistic Regression Coefficients (Feature Influence):")
print(coef_df)

plt.figure(figsize=(8,6))
sns.barplot(data=coef_df, x='Coefficient', y='Feature', palette='viridis')
plt.title("Feature Influence (Logistic Regression Coefficients)")
plt.show()
