<a href="https://colab.research.google.com/github/thaheshan/Breast_Cancer_Prediction_Model/blob/main/notebook03final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Title: Notebook 3 – Ensemble Classifier & Decision Tree Regression
# Author: Suresh Thaheshan
# Peer Reviewer: Ayman Jaleel
# Date: April 10, 2025


# ---------------IMPORT LIBRARIES -----------------------------
import pandas as pd
 # For data manipulation

import numpy as np
 # For numerical computations

import matplotlib.pyplot as plt
 # For plotting graphs

import seaborn as sns
 # For advanced plotting

import joblib
 # For saving and loading models in joblib format

import os
# For working with file paths

# Importing necessary functions from scikit-learn

from sklearn.model_selection import train_test_split, cross_val_score
 # For splitting data and cross-validation

from sklearn.ensemble import VotingClassifier, GradientBoostingRegressor
 # For the voting ensemble classifier and the gradient boosting regressor

from sklearn.tree import DecisionTreeRegressor, plot_tree
# For decision tree regression and visualization

from sklearn.metrics import (
    confusion_matrix, classification_report, roc_curve, auc,
    # For classification metrics
    mean_squared_error, mean_absolute_error, r2_score, accuracy_score

    # For regression and classification metrics
)


from sklearn.preprocessing import StandardScaler
 # For feature scaling

from sklearn.naive_bayes import GaussianNB
 # For Naive Bayes classifier

from sklearn.linear_model import LogisticRegression
 # For Logistic Regression classifier

from sklearn.neighbors import KNeighborsClassifier
 # For K-Nearest Neighbors classifier

from sklearn.impute import SimpleImputer
 # For handling missing values

from sklearn.pipeline import Pipeline
  # For pipeline creation


In [3]:
# Set visualization style
# The seaborn theme is applied first, then the matching matplotlib style sheet.
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

# Define save path
# Every dataset read and every figure/model written by this notebook lives in
# this folder. The default is the Google Drive location used on Colab; set the
# ML_COURSEWORK_DIR environment variable to run the notebook anywhere else
# without editing the code.
# NOTE(review): no visible cell mounts Google Drive. If the default path is
# used without a mounted Drive, makedirs below creates an empty local folder
# and the later CSV reads will not find their files - confirm the mount step.
save_path = os.environ.get(
    'ML_COURSEWORK_DIR',
    '/content/drive/MyDrive/Machine_Learning_CourseWork'
)
os.makedirs(save_path, exist_ok=True)



In [6]:


# ----------- LOAD THE PREPROCESSED DATA ---------------------
# Both CSV files are expected to sit directly inside `save_path`.
print("Loading processed datasets...")

classification_csv = os.path.join(save_path, 'classification_dataset.csv')
regression_csv = os.path.join(save_path, 'regression_dataset.csv')

classification_df = pd.read_csv(classification_csv)
regression_df = pd.read_csv(regression_csv)

# Quick sanity check: (rows, columns) of each frame
print(f"Classification dataset shape: {classification_df.shape}")
print(f"Regression dataset shape: {regression_df.shape}")

Loading processed datasets...
Classification dataset shape: (4019, 66)
Regression dataset shape: (4019, 67)


In [7]:
# ----------------- CHECK FOR MISSING VALUES -------------------
# Announce the start of the missing-value report
print("\n= ------------- CHECKING FOR MISSING VALUES --------------------")

# NaN count per column, computed once per dataset and reused below
class_na_per_column = classification_df.isna().sum()
reg_na_per_column = regression_df.isna().sum()

# Grand totals over all columns
print(f"Classification dataset missing values: {class_na_per_column.sum()}")
print(f"Regression dataset missing values: {reg_na_per_column.sum()}")

# Break the classification total down by column
print("\nColumns with missing values in classification dataset:")
# Keep only the columns that contain at least one NaN (original column order)
missing_cols = class_na_per_column[class_na_per_column > 0].index.tolist()

# One line per affected column: absolute count plus share of all rows
for col in missing_cols:
    missing_count = class_na_per_column[col]
    print(f"- {col}: {missing_count} missing values ({missing_count/len(classification_df)*100:.2f}%)")




= ------------- CHECKING FOR MISSING VALUES --------------------
Classification dataset missing values: 1
Regression dataset missing values: 1

Columns with missing values in classification dataset:
- Regional_Node_Examined: 1 missing values (0.02%)


In [9]:
# === 4. CLEAN UP MORTALITY STATUS VALUES ===
# Encode the 'Mortality_Status' column of both datasets as integers
# (0 = alive, 1 = dead).
print("\n ------------------ CLEANING MORTALITY STATUS VALUES -------------------------")

# Show the raw label distribution before any cleaning
print("\nMortality Status values:")
print(classification_df['Mortality_Status'].value_counts())

# Exact-match lookup for the spellings known to occur in the data
mortality_mapping = {
    0: 0, '0': 0, 'ALIVE': 0, 'alive': 0, 'ALive': 0, 'Alive': 0,
    1: 1, '1': 1, 'DEAD': 1, 'dead': 1, 'Dead': 1
}


def _encode_mortality_status(status):
    """Return `status` encoded as integers (0 = alive, 1 = dead).

    Labels are first looked up verbatim in `mortality_mapping`. Anything left
    unmatched is retried after trimming whitespace and lower-casing, so
    variants such as ' Alive ' or 'DeAd' are still recognised.

    Raises:
        ValueError: if any value (including a missing one) cannot be encoded.
            Without this check an unmapped label silently becomes NaN and the
            integer cast fails with an error that hides the offending label.
    """
    encoded = status.map(mortality_mapping)

    if encoded.isna().any():
        # Second chance: case- and whitespace-insensitive match
        normalized = status.astype(str).str.strip().str.lower()
        fallback = normalized.map({'0': 0, 'alive': 0, '1': 1, 'dead': 1})
        encoded = encoded.fillna(fallback)

    unknown = encoded.isna()
    if unknown.any():
        bad_labels = status[unknown].unique().tolist()
        raise ValueError(
            f"Unrecognised Mortality_Status value(s): {bad_labels!r}"
        )

    # Every value is now 0 or 1, so the integer cast is safe
    return encoded.astype(int)


# Apply the same encoding to both datasets
classification_df['Mortality_Status'] = _encode_mortality_status(classification_df['Mortality_Status'])
regression_df['Mortality_Status'] = _encode_mortality_status(regression_df['Mortality_Status'])

# Show the label distribution after cleaning
print("\nCleaned Mortality Status values:")
print(classification_df['Mortality_Status'].value_counts())

# Same distribution as proportions (fractions of 1) to expose class imbalance
print(classification_df['Mortality_Status'].value_counts(normalize=True))



 ------------------ CLEANING MORTALITY STATUS VALUES -------------------------

Mortality Status values:
Mortality_Status
0    3404
1     615
Name: count, dtype: int64

Cleaned Mortality Status values:
Mortality_Status
0    3404
1     615
Name: count, dtype: int64
Mortality_Status
0    0.846977
1    0.153023
Name: proportion, dtype: float64


In [10]:
# === CLASSIFICATION DATA PREPARATION ===
# Build the train/test matrices consumed by the ensemble classifier.
print("\n--------------------- CLASSIFICATION DATA PREPARATION ---------------------")

# === 5.1 SETUP FEATURES AND TARGET FOR CLASSIFICATION ===
# y: the 'Mortality_Status' labels; X: every remaining column
y_class = classification_df['Mortality_Status']
X_class = classification_df.drop(columns=['Mortality_Status'])

# === 5.2 TRAIN-TEST SPLITTING FOR CLASSIFICATION MODEL ===
# 80/20 split. `stratify` keeps the 0/1 label ratio the same in both parts;
# the fixed `random_state` makes the split repeatable.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    random_state=42,
    stratify=y_class,
)

# Report the size of each part
print(f"Classification training set shape: {X_train_class.shape}")
print(f"Classification testing set shape: {X_test_class.shape}")

# === 5.3 PIPELINE CREATION FOR IMPUTATION AND SCALING ===
# Step 1 fills missing entries with the per-feature median.
# Step 2 standardises every feature, which matters for distance-based learners.
preprocessing_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Learn the medians / means / variances on the training part only ...
X_train_class_processed = preprocessing_pipeline.fit_transform(X_train_class)

# ... and reuse those learned statistics on the test part (no re-fitting),
# so no information from the test set enters the preprocessing.
X_test_class_processed = preprocessing_pipeline.transform(X_test_class)



--------------------- CLASSIFICATION DATA PREPARATION ---------------------
Classification training set shape: (3215, 65)
Classification testing set shape: (804, 65)


In [12]:
# ---------------ENSEMBLE CLASSIFIER --------------------
# Section banner for the ensemble classifier
print("\n ...........ENSEMBLE CLASSIFIER........... ")

# An ensemble classifier merges the predictions of several different models
# (base learners) instead of trusting a single one. Here the merge is done by
# soft voting: the class probabilities of all learners are averaged and the
# class with the highest average wins.

#  DEFINE BASE LEARNERS
# Logistic Regression - linear model; class_weight='balanced' re-weights the
# classes to compensate for the label imbalance.
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Gaussian Naive Bayes - probabilistic model built on Bayes' theorem
naive_bayes = GaussianNB()

# K-Nearest Neighbours - votes among the 5 closest training samples
knn = KNeighborsClassifier(n_neighbors=5)

# === 6.2 CREATE A VOTING ENSEMBLE (SOFT VOTING) ===
# (name, estimator) pairs handed to the VotingClassifier
base_learners = [
    ('lr', log_reg),
    ('nb', naive_bayes),
    ('knn', knn),
]
ensemble_classifier = VotingClassifier(estimators=base_learners, voting='soft')

# === TRAIN THE ENSEMBLE CLASSIFIER ===
# Fitting the ensemble fits all three base learners on the preprocessed
# training data.
ensemble_classifier.fit(X_train_class_processed, y_train_class)



 ...........ENSEMBLE CLASSIFIER........... 


In [13]:
# .................. MAKE PREDICTIONS WITH THE ENSEMBLE MODEL ..........................
# Score the held-out test set with the fitted ensemble.

# Averaged class probabilities from soft voting; column index 1 is kept as the
# probability of class "1".
class_probabilities = ensemble_classifier.predict_proba(X_test_class_processed)
y_prob_ensemble = class_probabilities[:, 1]

# Hard 0/1 label predictions
y_pred_ensemble = ensemble_classifier.predict(X_test_class_processed)

# === EVALUATE THE ENSEMBLE CLASSIFIER ===
# NOTE(review): in the output stored with this notebook the accuracy (0.5647)
# is lower than the share of class 0 in the data (about 0.847), i.e. below an
# "always predict 0" baseline - worth investigating before relying on it.
print("\n ---------- Ensemble Classifier Evaluation ----------------------")

# Confusion matrix: rows are the actual labels, columns the predicted labels
ensemble_confusion = confusion_matrix(y_test_class, y_pred_ensemble)
print("\nConfusion Matrix:")
print(ensemble_confusion)

# Per-class precision / recall / f1 - more informative than accuracy alone
# when the classes are imbalanced
ensemble_report = classification_report(y_test_class, y_pred_ensemble)
print("\nClassification Report:")
print(ensemble_report)

# Accuracy = correct predictions / all predictions, shown with 4 decimals
print(f"Accuracy Score: {accuracy_score(y_test_class, y_pred_ensemble):.4f}")



 ---------- Ensemble Classifier Evaluation ----------------------

Confusion Matrix:
[[360 321]
 [ 29  94]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.53      0.67       681
           1       0.23      0.76      0.35       123

    accuracy                           0.56       804
   macro avg       0.58      0.65      0.51       804
weighted avg       0.82      0.56      0.62       804

Accuracy Score: 0.5647


In [14]:
# --------- ROC CURVE AND AUC ------------
# False-positive / true-positive rates of the ensemble across thresholds on
# its class-1 probability; the thresholds themselves are not needed.
fpr_ens, tpr_ens, _ = roc_curve(y_test_class, y_prob_ensemble)

# Area under the ROC curve
auc_ens = auc(fpr_ens, tpr_ens)

# Draw the ROC curve on its own 8x6 figure
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_ens, tpr_ens, label=f'Ensemble (AUC = {auc_ens:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal = random-classifier reference
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve - Ensemble Classifier')
roc_ax.legend()
roc_ax.grid(True)

# Write the figure to disk and close it; it is saved, not shown inline
roc_fig.savefig(os.path.join(save_path, 'ensemble_roc_curve.png'))
plt.close(roc_fig)

# ----------------- CONFUSION MATRIX VISUALIZATION-------------------
# Heatmap of the confusion matrix with the integer counts written in each cell
cm_fig, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test_class, y_pred_ensemble),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix - Ensemble Classifier')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')

# Write the figure to disk and close it
cm_fig.savefig(os.path.join(save_path, 'ensemble_confusion_matrix.png'))
plt.close(cm_fig)


In [15]:
# -----------------REGRESSION DATA PREPARATION -------------------------
# Build the train/test matrices consumed by the regression models.
print("\n --------------- REGRESSION DATA PREPARATION -------------------------------")

# Target: 'Survival_Months'.
# Features: every column except the target and 'Mortality_Status'.
y_reg = regression_df['Survival_Months']
X_reg = regression_df.drop(columns=['Survival_Months', 'Mortality_Status'])

# 7.2 Train-test split for regression
# 80% train / 20% test, repeatable through the fixed random_state
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Report the size of each part
print(f"Regression training set shape: {X_train_reg.shape}")
print(f"Regression testing set shape: {X_test_reg.shape}")

# 7.3 Preprocessing pipeline: median imputation only (no scaling step)
reg_preprocessing_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Medians are learned from the training part and then reused, unchanged,
# on the test part.
X_train_reg_processed = reg_preprocessing_pipeline.fit_transform(X_train_reg)
X_test_reg_processed = reg_preprocessing_pipeline.transform(X_test_reg)



 --------------- REGRESSION DATA PREPARATION -------------------------------
Regression training set shape: (3215, 65)
Regression testing set shape: (804, 65)


In [16]:
# .................. DECISION TREE REGRESSION MODELS ..........................
# Fit three regressors on the same training data and predict the test set.
print("\n ---------------- DECSION TREE REGRESSION MODELS-------------------------")

# Model 1: full decision tree - no depth or split constraints.
# random_state fixes the tree construction so reruns give the same model.
dt_reg_full = DecisionTreeRegressor(random_state=42).fit(X_train_reg_processed, y_train_reg)

# Model 2: pruned decision tree - limited to depth 4 and requiring at least
# 10 samples in a node before it may be split.
dt_reg_pruned = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=10,
    random_state=42,
).fit(X_train_reg_processed, y_train_reg)

# Additional model: gradient boosting with 100 estimators, learning rate 0.1
gb_reg = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_reg_processed, y_train_reg)

# Test-set predictions, one array per model
y_pred_dt_full = dt_reg_full.predict(X_test_reg_processed)
y_pred_dt_pruned = dt_reg_pruned.predict(X_test_reg_processed)
y_pred_gb = gb_reg.predict(X_test_reg_processed)



=== DECISION TREE REGRESSION MODELS ===


In [17]:
# ------------------ DECISION TREE REGRESSION EVALUATION ------------------------------
# Score both decision trees on the held-out test set with MSE, MAE and R².
print("\n------------ DECISION TREE REGRESSION EVALUATION ----------------------")

#------------------ Full decision tree -----------------------------
mse_dt_full, mae_dt_full, r2_dt_full = (
    mean_squared_error(y_test_reg, y_pred_dt_full),   # mean squared error
    mean_absolute_error(y_test_reg, y_pred_dt_full),  # mean absolute error
    r2_score(y_test_reg, y_pred_dt_full),             # coefficient of determination
)

print("\n--- Full Decison Tree Evaluation ---")
print(f"Mean Squared Error: {mse_dt_full:.2f}")
print(f"Mean Absolute Error: {mae_dt_full:.2f}")
print(f"R² Score: {r2_dt_full:.4f}")

# ------------- Pruned decision tree ----------------------
mse_dt_pruned, mae_dt_pruned, r2_dt_pruned = (
    mean_squared_error(y_test_reg, y_pred_dt_pruned),
    mean_absolute_error(y_test_reg, y_pred_dt_pruned),
    r2_score(y_test_reg, y_pred_dt_pruned),
)

print("\n--- Purned Decison Tree Evaluation ---")
print(f"Mean Squared Error: {mse_dt_pruned:.2f}")
print(f"Mean Absolute Error: {mae_dt_pruned:.2f}")
print(f"R² Score: {r2_dt_pruned:.4f}")



------------ DECISION TREE REGRESSION EVALUATION ----------------------

--- Full Decison Tree Evaluation ---
Mean Squared Error: 1081.16
Mean Absolute Error: 26.17
R² Score: -1.0208

--- Purned Decison Tree Evaluation ---
Mean Squared Error: 533.38
Mean Absolute Error: 18.61
R² Score: 0.0030


In [18]:
# ----------------- Evaluate Gradient Boosting ----------------------------------
# MSE: average squared difference between actual and predicted values
# (lower is better).
mse_gb = mean_squared_error(y_test_reg, y_pred_gb)

# MAE: average absolute difference; less sensitive to outliers than MSE.
mae_gb = mean_absolute_error(y_test_reg, y_pred_gb)

# R²: share of the target's variance explained by the model (higher is better).
r2_gb = r2_score(y_test_reg, y_pred_gb)

print("\n--- Gradient Boosting Evaluation ---")
print(f"Mean Squared Error: {mse_gb:.2f}")
print(f"Mean Absolute Error: {mae_gb:.2f}")
print(f"R² Score: {r2_gb:.4f}")

# 9.4 Visualize the pruned decision tree
# A wide canvas keeps the node boxes legible.
plt.figure(figsize=(20, 10))

plot_tree(dt_reg_pruned,
          # Passed as a plain list: scikit-learn 1.3.0 validated this argument
          # as a list and rejected a pandas Index, while a list is accepted by
          # every version.
          feature_names=list(X_train_reg.columns),
          # For a regressor the fill colour reflects the node's predicted
          # value, not gini/entropy (those apply to classifiers).
          filled=True,
          rounded=True,  # rounded node boxes
          # Only the top 3 levels are drawn; the fitted tree itself was built
          # with max_depth=4, so the title below refers to the displayed depth.
          max_depth=3)

plt.title("Pruned Decision Tree (Max Depth=3)")
# dpi=300 for a high-resolution PNG; bbox_inches='tight' trims the surrounding
# whitespace.
plt.savefig(os.path.join(save_path, 'decision_tree_visualization.png'), dpi=300, bbox_inches='tight')

# Close the figure: it is saved to disk rather than shown inline, and closing
# it releases its memory.
plt.close()



--- Gradient Boosting Evaluation ---
Mean Squared Error: 539.86
Mean Absolute Error: 18.67
R² Score: -0.0091


In [19]:
# Feature importance for Gradient Boosting
# One row per input feature, most important first.
feature_importance_gb = pd.DataFrame({
    'Feature': X_train_reg.columns,
    'Importance': gb_reg.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 20 most important features
fig_importance, ax_importance = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_gb.head(20), ax=ax_importance)
ax_importance.set_title('Top 20 Gradient Boosting Feature Importances')
fig_importance.tight_layout()  # avoid clipping long feature names
# The output file name keeps its original (misspelled) form on purpose so that
# anything already reading this file keeps working.
fig_importance.savefig(os.path.join(save_path, 'gb_feater_importnace.png'))
plt.close(fig_importance)

# Actual vs predicted comparison for all regression models
# Each entry: panel label, test-set predictions, extra scatter styling.
# The first panel passes no colour so it uses matplotlib's default.
comparison_panels = [
    ('Full Decision Tree', y_pred_dt_full, {}),
    ('Pruned Decision Tree', y_pred_dt_pruned, {'color': 'orange'}),
    ('Gradient Boosting', y_pred_gb, {'color': 'green'}),
]

# End points of the "perfect prediction" diagonal, shared by every panel
actual_range = [y_test_reg.min(), y_test_reg.max()]

fig_compare, axes_compare = plt.subplots(1, 3, figsize=(15, 6))
for panel_ax, (model_label, predictions, scatter_style) in zip(axes_compare, comparison_panels):
    panel_ax.scatter(y_test_reg, predictions, alpha=0.7, **scatter_style)
    panel_ax.plot(actual_range, actual_range, 'k--', lw=2)  # y = x reference
    panel_ax.set_title(f'{model_label}: Actual vs Predicted')
    panel_ax.set_xlabel('Actual Survival Months')
    panel_ax.set_ylabel('Predicted Survival Months')
    panel_ax.grid(True)

fig_compare.tight_layout()  # keep the three panels from overlapping
# File name intentionally unchanged (see note on the importance figure).
fig_compare.savefig(os.path.join(save_path, 'regression_models_comparsion.png'))
plt.close(fig_compare)


In [20]:

# ------------- MODEL COMPARISON AND SUMMARY -----------------------
# Recap of the headline metrics computed in the earlier cells.
print("\n=== MODEL COMPARISON SUMMARY ===")

# Classification summary (ensemble)
print("\nClassification Model (Ensemble):")
print(f"Accuracy Score: {accuracy_score(y_test_class, y_pred_ensemble):.4f}")
print(f"AUC: {auc_ens:.4f}")

# Regression summary (all three regressors)
print("\nRegression Models:")
print(f"Full Decision Tree - MSE: {mse_dt_full:.2f}, R²: {r2_dt_full:.4f}")
print(f"Pruned Decision Tree - MSE: {mse_dt_pruned:.2f}, R²: {r2_dt_pruned:.4f}")
print(f"Gradient Boosting - MSE: {mse_gb:.2f}, R²: {r2_gb:.4f}")

# Persist the fitted models together with their preprocessing pipelines as
# joblib files inside `save_path`.
# NOTE(review): the regressor written here is always gb_reg, although in the
# output stored with this notebook the pruned tree has the lower MSE - confirm
# this is the intended model to ship.
print("\n=== SAVING MODELS ===")
artifacts_to_save = [
    (ensemble_classifier, 'ensemble_classifier.joblib'),
    (preprocessing_pipeline, 'ensemble_preprocessing_pipeline.joblib'),
    (gb_reg, 'gradient_boosting_regressor.joblib'),
    (reg_preprocessing_pipeline, 'regression_preprocessing_pipeline.joblib'),
]
for fitted_object, file_name in artifacts_to_save:
    joblib.dump(fitted_object, os.path.join(save_path, file_name))

print("\nAll models saved successfully.")


=== MODEL COMPARISON SUMMARY ===

Classification Model (Ensemble):
Accuracy Score: 0.5647
AUC: 0.7237

Regression Models:
Full Decision Tree - MSE: 1081.16, R²: -1.0208
Pruned Decision Tree - MSE: 533.38, R²: 0.0030
Gradient Boosting - MSE: 539.86, R²: -0.0091

=== SAVING MODELS ===

All models saved successfully.
