<a href="https://colab.research.google.com/github/thaheshan/Breast_Cancer_Prediction_Model/blob/main/notebook03final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================================================
# Title: Final Python Notebook 3 – Ensemble Classifier & DT Regression
# Author: Suresh Thaheshan
# Peer Reviewer: [Peer Reviewer Name], Date: [Review Date]
# Reused from: Code Reuse Session 3
# =============================================================

# ============ IMPORT LIBRARIES ============
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import (confusion_matrix, classification_report, roc_curve, auc,
                             mean_squared_error, mean_absolute_error, r2_score)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import os

# ============ LOAD DATA ============
# Load preprocessed datasets
classification_df = pd.read_csv('/content/drive/MyDrive/Machine_Learning_CourseWork/classification_dataset.csv')
regression_df = pd.read_csv('/content/drive/MyDrive/Machine_Learning_CourseWork/regression_dataset2.csv')

# ============ PREPROCESSING ============

# For classification task: every column except the label is a feature.
X_class = classification_df.drop(columns=['Mortality_Status'])
y_class = classification_df['Mortality_Status']

# For regression task - ensure the 'Survival_Months' column exists.
# Every regression variable is reset first, so that re-running this cell in a
# live kernel can never leave X_reg / Xr_train / yr_train pointing at data from
# an earlier execution when the target column is missing.
X_reg = None
y_reg = None
Xr_train = Xr_test = yr_train = yr_test = None

if 'Survival_Months' in regression_df.columns:
    X_reg = regression_df.drop(columns=['Survival_Months'])
    y_reg = regression_df['Survival_Months']
else:
    print("Column 'Survival_Months' not found in the dataset.")

# Train-Test Split for Classification (stratified on the label)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_class, y_class, test_size=0.2, stratify=y_class, random_state=42)

# Train-Test Split for Regression (only if 'Survival_Months' column exists)
if y_reg is not None:
    Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# ============ ENSEMBLE CLASSIFIER ============

# Base learners: logistic regression and Gaussian naive Bayes.
lr = LogisticRegression(random_state=42, max_iter=1000)
nb = GaussianNB()

# Combine both learners with voting='soft' and fit on the classification
# training split.
base_learners = [('lr', lr), ('nb', nb)]
ensemble_model = VotingClassifier(estimators=base_learners, voting='soft')
ensemble_model.fit(Xc_train, yc_train)

# Hard labels feed the confusion matrix / report; the second predict_proba
# column supplies the scores for the ROC curve.
y_pred_ens = ensemble_model.predict(Xc_test)
y_prob_ens = ensemble_model.predict_proba(Xc_test)[:, 1]

# Text evaluation: print each heading followed by its metric output.
for heading, metric in (
    ("Confusion Matrix (Ensemble):", confusion_matrix),
    ("\nClassification Report (Ensemble):", classification_report),
):
    print(heading)
    print(metric(yc_test, y_pred_ens))

# ROC curve, with the chance diagonal drawn for reference.
fpr, tpr, _ = roc_curve(yc_test, y_prob_ens)
roc_label = f'ROC Curve (AUC = {auc(fpr, tpr):.2f})'
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('Ensemble ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()

# ============ REGRESSION – DECISION TREE MODELS ============

def evaluate_regression(model, X_test, y_test, model_name):
    """Print test-set regression metrics and plot actual vs. predicted values.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``; it is called on ``X_test``.
    X_test : features passed to ``model.predict``.
    y_test : true target values; must support ``.min()`` and ``.max()``,
        which are used to draw the reference diagonal.
    model_name : str
        Label used in the printed heading and the figure title.

    Returns
    -------
    None. The metrics are printed and the figure is shown.
    """
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{model_name} Evaluation:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")

    # Plotting actual vs predicted values; the red line is y = x, i.e. where
    # a perfect prediction would fall.
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', lw=2)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'{model_name} – Actual vs Predicted')
    plt.grid(True)
    plt.show()


# The regression train/test split is only created when the target column was
# found (y_reg is not None). The models are guarded by the same condition so a
# missing target skips this section instead of failing on undefined -- or, in a
# live kernel, stale -- Xr_train / yr_train.
if y_reg is not None:
    # Model 1: Fully grown Decision Tree
    dt1 = DecisionTreeRegressor(random_state=42)
    dt1.fit(Xr_train, yr_train)

    # Model 2: Pruned Decision Tree (max_depth=4)
    dt2 = DecisionTreeRegressor(max_depth=4, random_state=42)
    dt2.fit(Xr_train, yr_train)

    # ============ EVALUATION – REGRESSION MODELS ============
    evaluate_regression(dt1, Xr_test, yr_test, 'Fully Grown Decision Tree')
    evaluate_regression(dt2, Xr_test, yr_test, 'Pruned Decision Tree')
else:
    print("Skipping Decision Tree regression: 'Survival_Months' target is unavailable.")


Column 'Survival_Months' not found in the dataset.


ValueError: Input y contains NaN.

In [None]:
# =============================================================
# Title: Final Python Notebook 3 – Ensemble Classifier & DT Regression
# Author: Suresh Thaheshan
# Peer Reviewer: [Peer Reviewer Name], Date: [Review Date]
# Reused from: Code Reuse Session 3
# =============================================================

# ============ IMPORT LIBRARIES ============
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import (confusion_matrix, classification_report, roc_curve, auc,
                             mean_squared_error, mean_absolute_error, r2_score)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import os

# ============ LOAD DATA ============
# Load preprocessed datasets
classification_df = pd.read_csv('/content/drive/MyDrive/Machine_Learning_CourseWork/classification_dataset.csv')
regression_df = pd.read_csv('/content/drive/MyDrive/Machine_Learning_CourseWork/regression_dataset.csv')

# ============ PREPROCESSING ============
# For classification task: every column except the label is a feature.
X_class = classification_df.drop(columns=['Mortality_Status'])
y_class = classification_df['Mortality_Status']

# For regression task
# Stop immediately if the target column is missing. Merely printing a message
# and carrying on would make the split below run on undefined X_reg / y_reg, or
# on whatever an earlier cell execution left in the kernel.
if 'Survival_Months' not in regression_df.columns:
    raise KeyError(
        "Column 'Survival_Months' not found in the dataset. "
        f"Available columns: {list(regression_df.columns)}"
    )
X_reg = regression_df.drop(columns=['Survival_Months'])
y_reg = regression_df['Survival_Months']

# Train-Test Split for Classification (stratified on the label)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_class, y_class, test_size=0.2, stratify=y_class, random_state=42)

# Train-Test Split for Regression
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# ============ ENSEMBLE CLASSIFIER ============
# Base learners: logistic regression and Gaussian naive Bayes.
lr = LogisticRegression(random_state=42, max_iter=1000)
nb = GaussianNB()

# Combine both learners with voting='soft' and fit on the classification
# training split.
base_learners = [('lr', lr), ('nb', nb)]
ensemble_model = VotingClassifier(estimators=base_learners, voting='soft')
ensemble_model.fit(Xc_train, yc_train)

# Hard labels feed the confusion matrix / report; the second predict_proba
# column supplies the scores for the ROC curve.
y_pred_ens = ensemble_model.predict(Xc_test)
y_prob_ens = ensemble_model.predict_proba(Xc_test)[:, 1]

# Text evaluation: print each heading followed by its metric output.
for heading, metric in (
    ("Confusion Matrix (Ensemble):", confusion_matrix),
    ("\nClassification Report (Ensemble):", classification_report),
):
    print(heading)
    print(metric(yc_test, y_pred_ens))

# ROC curve, with the chance diagonal drawn for reference.
fpr, tpr, _ = roc_curve(yc_test, y_prob_ens)
roc_label = f'ROC Curve (AUC = {auc(fpr, tpr):.2f})'
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('Ensemble ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()

# ============ REGRESSION – DECISION TREE MODELS ============
# Model 1: decision tree with no depth limit
dt1 = DecisionTreeRegressor(random_state=42).fit(Xr_train, yr_train)

# Model 2: decision tree limited to max_depth=4
dt2 = DecisionTreeRegressor(random_state=42, max_depth=4).fit(Xr_train, yr_train)

# ============ EVALUATION – REGRESSION MODELS ============
def evaluate_regression(model, X_test, y_test, model_name):
    """Print MSE, MAE and R² of ``model`` on the given test data.

    ``model.predict`` is called on ``X_test`` and the predictions are scored
    against ``y_test``. ``model_name`` is used in the printed heading.

    Returns the tuple ``(mse, mae, r2)``.
    """
    predictions = model.predict(X_test)
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    scores = tuple(scorer(y_test, predictions) for scorer in scorers)

    print(f"\n{model_name} Evaluation:")
    for label, value in zip(("MSE", "MAE", "R² Score"), scores):
        print(f"{label}: {value:.2f}")
    return scores

mse1, mae1, r2_1 = evaluate_regression(
    model=dt1, X_test=Xr_test, y_test=yr_test, model_name="DT-1 Fully Grown"
)
mse2, mae2, r2_2 = evaluate_regression(
    model=dt2, X_test=Xr_test, y_test=yr_test, model_name="DT-2 Pruned (max_depth=4)"
)

# ============ VISUALIZATION OF TREES ============
# plot_tree is given a plain list of names rather than the pandas Index itself:
# some scikit-learn releases validate `feature_names` as a list and reject an
# Index, while a list is accepted everywhere.
tree_feature_names = list(Xr_train.columns)

# Draw DT-1 and DT-2 with identical settings, one figure per tree.
for tree_model, tree_title in (
    (dt1, "Fully Grown Decision Tree (DT-1)"),
    (dt2, "Pruned Decision Tree (DT-2)"),
):
    plt.figure(figsize=(20, 10))
    plot_tree(tree_model, filled=True, feature_names=tree_feature_names)
    plt.title(tree_title)
    plt.show()

# ============ INTERPRETATION – PATIENT PREDICTION ============
# Predict survival months for patient B002565 (manually construct input)
patient = pd.DataFrame({
    'Age': [29],
    'Tumor_Size': [41],
    'Regional_Node_Examined': [5],
    'Regional_Node_Positive': [1],
    'Sex_Male': [0],
    'T_Stage_T3': [1],
    'N_Stage_N1': [1],
    '6th_Stage_IIIC': [1],
    'Differentiated_Moderately differentiated': [1],
    'A_Stage_Regional': [1],
    'Estrogen_Status_Negative': [1],
    'Progesterone_Status_Positive': [1]
}, index=[0])  # All other columns default to 0

# reindex() below silently discards any hand-typed feature whose name is not a
# training column, so a misspelt name would drop out of the prediction without
# notice. List the discarded names so they can be checked; a name may also be
# legitimately absent (e.g. a category that was not encoded as its own column).
ignored_features = [name for name in patient.columns if name not in Xr_train.columns]
if ignored_features:
    print(f"Warning: patient features not in the training columns were ignored: {ignored_features}")

# Align with training data columns (same names, same order; missing ones -> 0)
patient = patient.reindex(columns=Xr_train.columns, fill_value=0)

# Predict with the depth-limited tree (dt2)
prediction_dt = dt2.predict(patient)
print(f"\nPredicted Survival Months for patient B002565: {prediction_dt[0]:.2f}")


Column 'Survival_Months' not found in the dataset.


ValueError: Input y contains NaN.

In [None]:
# =============================================================
# Title: Final Python Notebook 3 – Ensemble Classifier & DT Regression
# Author: Suresh Thaheshan
# Peer Reviewer: [Peer Reviewer Name], Date: [Review Date]
# Reused from: Code Reuse Session 3
# =============================================================

# ============ IMPORT LIBRARIES ============
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import (confusion_matrix, classification_report, roc_curve, auc,
                             mean_squared_error, mean_absolute_error, r2_score)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# ============ LOAD DATA ============
# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Machine_Learning_CourseWork/classification_dataset.csv')

# The raw label column can mix spellings (e.g. 0/1 alongside 'DEAD', 'alive',
# 'ALive'). Left as-is, `== 1` does not match string labels and the stratified
# split fails on spellings that occur only once. Normalise to a 0/1 integer
# column first (alive -> 0, dead -> 1) and fail loudly on anything unrecognised.
_status_text = df['Mortality_Status'].astype(str).str.strip().str.lower()
_status_binary = _status_text.map(
    {'0': 0, '0.0': 0, 'alive': 0, '1': 1, '1.0': 1, 'dead': 1}
)
if _status_binary.isna().any():
    _unknown_labels = sorted(_status_text[_status_binary.isna()].unique())
    raise ValueError(f"Unrecognised Mortality_Status values: {_unknown_labels}")
df = df.assign(Mortality_Status=_status_binary.astype(int))

# Create classification and regression datasets
df_class = df.drop(columns=['Survival_Months'])  # Classification dataset
df_reg = df[df['Mortality_Status'] == 1].drop(columns=['Mortality_Status'])  # Regression dataset (rows with status 1 only)

# ============ PREPROCESSING ============
# One-hot encode categorical features if necessary
X_class = pd.get_dummies(df_class.drop(columns=['Mortality_Status']), drop_first=True)
y_class = df_class['Mortality_Status']

X_reg = pd.get_dummies(df_reg.drop(columns=['Survival_Months']), drop_first=True)
y_reg = df_reg['Survival_Months']

# Train-Test Split
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_class, y_class, test_size=0.2, stratify=y_class, random_state=42)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# ============ ENSEMBLE CLASSIFIER ============
# Base learners: logistic regression and Gaussian naive Bayes.
lr = LogisticRegression(random_state=42, max_iter=1000)
nb = GaussianNB()
# You can swap one of the models for KNN if desired

# Combine both learners with voting='soft' and fit on the classification
# training split.
base_learners = [('lr', lr), ('nb', nb)]
ensemble_model = VotingClassifier(estimators=base_learners, voting='soft')
ensemble_model.fit(Xc_train, yc_train)

# Hard labels feed the confusion matrix / report; the second predict_proba
# column supplies the scores for the ROC curve.
y_pred_ens = ensemble_model.predict(Xc_test)
y_prob_ens = ensemble_model.predict_proba(Xc_test)[:, 1]

# Text evaluation: print each heading followed by its metric output.
for heading, metric in (
    ("Confusion Matrix (Ensemble):", confusion_matrix),
    ("\nClassification Report (Ensemble):", classification_report),
):
    print(heading)
    print(metric(yc_test, y_pred_ens))

# ROC curve, with the chance diagonal drawn for reference.
fpr, tpr, _ = roc_curve(yc_test, y_prob_ens)
roc_label = f'ROC Curve (AUC = {auc(fpr, tpr):.2f})'
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('Ensemble ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()

# ============ REGRESSION – DECISION TREE MODELS ============
# Model 1: decision tree with no depth limit
dt1 = DecisionTreeRegressor(random_state=42).fit(Xr_train, yr_train)

# Model 2: decision tree limited to max_depth=4
dt2 = DecisionTreeRegressor(random_state=42, max_depth=4).fit(Xr_train, yr_train)

# ============ EVALUATION – REGRESSION MODELS ============
def evaluate_regression(model, X_test, y_test, model_name):
    """Print MSE, MAE and R² of ``model`` on the given test data.

    ``model.predict`` is called on ``X_test`` and the predictions are scored
    against ``y_test``. ``model_name`` is used in the printed heading.

    Returns the tuple ``(mse, mae, r2)``.
    """
    predictions = model.predict(X_test)
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    scores = tuple(scorer(y_test, predictions) for scorer in scorers)

    print(f"\n{model_name} Evaluation:")
    for label, value in zip(("MSE", "MAE", "R² Score"), scores):
        print(f"{label}: {value:.2f}")
    return scores

mse1, mae1, r2_1 = evaluate_regression(
    model=dt1, X_test=Xr_test, y_test=yr_test, model_name="DT-1 Fully Grown"
)
mse2, mae2, r2_2 = evaluate_regression(
    model=dt2, X_test=Xr_test, y_test=yr_test, model_name="DT-2 Pruned (max_depth=4)"
)

# ============ VISUALIZATION OF TREES ============
# plot_tree is given a plain list of names rather than the pandas Index itself:
# some scikit-learn releases validate `feature_names` as a list and reject an
# Index, while a list is accepted everywhere.
tree_feature_names = list(Xr_train.columns)

# Draw DT-1 and DT-2 with identical settings, one figure per tree.
for tree_model, tree_title in (
    (dt1, "Fully Grown Decision Tree (DT-1)"),
    (dt2, "Pruned Decision Tree (DT-2)"),
):
    plt.figure(figsize=(20, 10))
    plot_tree(tree_model, filled=True, feature_names=tree_feature_names)
    plt.title(tree_title)
    plt.show()

# ============ INTERPRETATION – PATIENT PREDICTION ============
# Predict survival months for patient B002565 (manually construct input)
patient = pd.DataFrame({
    'Age': [29],
    'Tumour_Size': [41],
    'Regional_Node_Examined': [5],
    'Regional_Node_Positive': [1],
    'Month_of_Birth_July': [1],
    'Sex_Male': [0],
    'Occupation_15': [1],
    'T_Stage_T3': [1],
    'N_Stage_N1': [1],
    'Stage_6th_IIIC': [1],
    'Differentiated_Moderately differentiated': [1],
    'Grade_2': [1],
    'A_Stage_Regional': [1],
    'Estrogen_Status_Negative': [1],
    'Progesterone_Status_Positive': [1]
}, index=[0])  # All other columns default to 0

# reindex() below silently discards any hand-typed feature whose name is not a
# training column, so a misspelt name would drop out of the prediction without
# notice. List the discarded names so they can be checked; a name may also be
# legitimately absent (e.g. the baseline category removed by drop_first=True).
ignored_features = [name for name in patient.columns if name not in Xr_train.columns]
if ignored_features:
    print(f"Warning: patient features not in the training columns were ignored: {ignored_features}")

# Align with training data columns (same names, same order; missing ones -> 0)
patient = patient.reindex(columns=Xr_train.columns, fill_value=0)

# Predict with the depth-limited tree (dt2)
prediction_dt = dt2.predict(patient)
print(f"\nPredicted Survival Months for patient B002565: {prediction_dt[0]:.2f}")


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# =============================================================
# Title: Notebook 3 – Ensemble Classifier & Decision Tree Regression
# Author: Suresh Thaheshan
# Peer Reviewer: Ayman Jaleel
# Date: May 3, 2025
# =============================================================

# === 1. IMPORT LIBRARIES ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import VotingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_curve, auc,
    mean_squared_error, mean_absolute_error, r2_score, accuracy_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer  # Import imputer for handling missing values
from sklearn.pipeline import Pipeline

# Plot styling: seaborn whitegrid plus the matching matplotlib style sheet
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

# Folder used both for the input CSV files and for every saved figure/model
save_path = '/content/drive/MyDrive/Machine_Learning_CourseWork'
os.makedirs(save_path, exist_ok=True)

# === 2. LOAD DATA ===
print("Loading processed datasets...")
classification_csv = os.path.join(save_path, 'classification_dataset.csv')
regression_csv = os.path.join(save_path, 'regression_dataset.csv')
classification_df = pd.read_csv(classification_csv)
regression_df = pd.read_csv(regression_csv)

for dataset_label, dataset in (("Classification", classification_df), ("Regression", regression_df)):
    print(f"{dataset_label} dataset shape: {dataset.shape}")

# === 3. CHECK FOR MISSING VALUES ===
print("\n=== CHECKING FOR MISSING VALUES ===")
for dataset_label, dataset in (("Classification", classification_df), ("Regression", regression_df)):
    print(f"{dataset_label} dataset missing values: {dataset.isna().sum().sum()}")

# Per-column breakdown (classification dataset only): count and percentage
print("\nColumns with missing values in classification dataset:")
na_counts = classification_df.isna().sum()
missing_cols = na_counts[na_counts > 0].index.tolist()
for col in missing_cols:
    missing_count = na_counts[col]
    print(f"- {col}: {missing_count} missing values ({missing_count/len(classification_df)*100:.2f}%)")

# === 4. CLEAN UP MORTALITY STATUS VALUES ===
print("\n=== CLEANING MORTALITY STATUS VALUES ===")

# Check the current values
print("\nMortality Status values:")
print(classification_df['Mortality_Status'].value_counts())

# Canonical label -> binary code (alive = 0, dead = 1). String labels are
# lower-cased and stripped before lookup, so one entry per word covers every
# capitalisation ('DEAD', 'Dead', 'ALive', ...).
mortality_mapping = {
    0: 0, '0': 0, 'alive': 0,
    1: 1, '1': 1, 'dead': 1
}


def _to_binary_status(status, dataset_name):
    """Return ``status`` converted to an integer 0/1 Series.

    Parameters
    ----------
    status : pandas.Series of raw labels (numbers and/or strings).
    dataset_name : str, used only in the error message.

    Raises
    ------
    ValueError
        If any label (including a missing value) has no entry in
        ``mortality_mapping``. Without this check the unmapped entries become
        NaN and the integer cast fails with a message that does not name the
        offending labels.
    """
    normalised = status.map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    binary = normalised.map(mortality_mapping)
    unmapped = status[binary.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"Unrecognised Mortality_Status values in {dataset_name}: "
            f"{sorted(unmapped.astype(str).unique())}"
        )
    return binary.astype(int)


# Apply the same cleaning to both datasets
classification_df['Mortality_Status'] = _to_binary_status(
    classification_df['Mortality_Status'], 'classification dataset'
)
regression_df['Mortality_Status'] = _to_binary_status(
    regression_df['Mortality_Status'], 'regression dataset'
)

print("\nCleaned Mortality Status values:")
print(classification_df['Mortality_Status'].value_counts())
print(classification_df['Mortality_Status'].value_counts(normalize=True))

# === 5. CLASSIFICATION DATA PREPARATION ===
print("\n=== CLASSIFICATION DATA PREPARATION ===")

# 5.1 Target is 'Mortality_Status'; every other column is a feature
y_class = classification_df['Mortality_Status']
X_class = classification_df.drop(columns=['Mortality_Status'])

# 5.2 Stratified 80/20 train-test split
class_split = train_test_split(
    X_class, y_class, test_size=0.2, stratify=y_class, random_state=42
)
X_train_class, X_test_class, y_train_class, y_test_class = class_split

for split_label, split_features in (("training", X_train_class), ("testing", X_test_class)):
    print(f"Classification {split_label} set shape: {split_features.shape}")

# 5.3 Preprocessing: median imputation followed by standard scaling
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler()),  # Scale features
]
preprocessing_pipeline = Pipeline(preprocessing_steps)

# Fit on the training split only, then apply the fitted pipeline to the test split
X_train_class_processed = preprocessing_pipeline.fit_transform(X_train_class)
X_test_class_processed = preprocessing_pipeline.transform(X_test_class)

# === 6. ENSEMBLE CLASSIFIER ===
print("\n=== ENSEMBLE CLASSIFIER ===")

# 6.1 Base learners
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
naive_bayes = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)

# 6.2 Voting ensemble (voting='soft') over the three learners, fitted on the
# preprocessed training features
base_estimators = [('lr', log_reg), ('nb', naive_bayes), ('knn', knn)]
ensemble_classifier = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble_classifier.fit(X_train_class_processed, y_train_class)

# 6.3 Predictions: hard labels, plus the second predict_proba column as scores
y_pred_ensemble = ensemble_classifier.predict(X_test_class_processed)
y_prob_ensemble = ensemble_classifier.predict_proba(X_test_class_processed)[:, 1]

# 6.4 Text evaluation
print("\n--- Ensemble Classifier Evaluation ---")
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test_class, y_pred_ensemble))

print(f"Accuracy Score: {accuracy_score(y_test_class, y_pred_ensemble):.4f}")

# 6.5 ROC curve and its AUC, written to save_path (the figure is closed, not shown)
fpr_ens, tpr_ens, _ = roc_curve(y_test_class, y_prob_ensemble)
auc_ens = auc(fpr_ens, tpr_ens)
roc_png = os.path.join(save_path, 'ensemble_roc_curve.png')

plt.figure(figsize=(8, 6))
plt.plot(fpr_ens, tpr_ens, label=f'Ensemble (AUC = {auc_ens:.4f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve - Ensemble Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.savefig(roc_png)
plt.close()

# 6.6 Confusion matrix as an annotated heatmap, also written to save_path
confusion_png = os.path.join(save_path, 'ensemble_confusion_matrix.png')
ensemble_confusion = confusion_matrix(y_test_class, y_pred_ensemble)

plt.figure(figsize=(6, 4))
sns.heatmap(ensemble_confusion, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Ensemble Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig(confusion_png)
plt.close()

# === 7. REGRESSION DATA PREPARATION ===
print("\n=== REGRESSION DATA PREPARATION ===")

# 7.1 Target is 'Survival_Months'; the target and 'Mortality_Status' are both
# excluded from the feature matrix
non_feature_columns = ['Survival_Months', 'Mortality_Status']
y_reg = regression_df['Survival_Months']
X_reg = regression_df.drop(columns=non_feature_columns)

# 7.2 80/20 train-test split
reg_split = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = reg_split

for split_label, split_features in (("training", X_train_reg), ("testing", X_test_reg)):
    print(f"Regression {split_label} set shape: {split_features.shape}")

# 7.3 Preprocessing: median imputation only
reg_preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
]
reg_preprocessing_pipeline = Pipeline(reg_preprocessing_steps)

# Fit on the training split only, then apply the fitted pipeline to the test split
X_train_reg_processed = reg_preprocessing_pipeline.fit_transform(X_train_reg)
X_test_reg_processed = reg_preprocessing_pipeline.transform(X_test_reg)

# === 8. DECISION TREE REGRESSION MODELS ===
print("\n=== DECISION TREE REGRESSION MODELS ===")

# 8.1 Model 1: decision tree with default (unconstrained) settings
dt_reg_full = DecisionTreeRegressor(random_state=42)

# 8.2 Model 2: decision tree limited by max_depth and min_samples_split
dt_reg_pruned = DecisionTreeRegressor(random_state=42, max_depth=4, min_samples_split=10)

# 8.3 Additional model: gradient boosting regressor
gb_reg = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1)

# Fit all three on the same preprocessed training data, in the order above
for regressor in (dt_reg_full, dt_reg_pruned, gb_reg):
    regressor.fit(X_train_reg_processed, y_train_reg)

# 8.4 Test-set predictions for each model
y_pred_dt_full = dt_reg_full.predict(X_test_reg_processed)
y_pred_dt_pruned = dt_reg_pruned.predict(X_test_reg_processed)
y_pred_gb = gb_reg.predict(X_test_reg_processed)

# === 9. DECISION TREE REGRESSION EVALUATION ===
print("\n=== DECISION TREE REGRESSION EVALUATION ===")


def _report_regression_metrics(heading, y_true, y_pred):
    """Print MSE, MAE and R² for one model and return them.

    Parameters
    ----------
    heading : str, model label inserted into the ``--- <heading> Evaluation ---`` line.
    y_true : true target values.
    y_pred : predictions to score against ``y_true``.

    Returns
    -------
    tuple ``(mse, mae, r2)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n--- {heading} Evaluation ---")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"R² Score: {r2:.4f}")
    return mse, mae, r2


# 9.1 Evaluate Full Decision Tree
mse_dt_full, mae_dt_full, r2_dt_full = _report_regression_metrics(
    "Full Decision Tree", y_test_reg, y_pred_dt_full
)

# 9.2 Evaluate Pruned Decision Tree
mse_dt_pruned, mae_dt_pruned, r2_dt_pruned = _report_regression_metrics(
    "Pruned Decision Tree", y_test_reg, y_pred_dt_pruned
)

# 9.3 Evaluate Gradient Boosting
mse_gb, mae_gb, r2_gb = _report_regression_metrics(
    "Gradient Boosting", y_test_reg, y_pred_gb
)

# 9.4 Visualize Decision Tree (Pruned for better interpretability)
# The model itself was trained with max_depth=4; only the drawing is cut off at
# depth 3, so the title states both facts instead of implying a depth-3 model.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10))
plot_tree(dt_reg_pruned,
          feature_names=list(X_train_reg.columns),
          filled=True,
          rounded=True,
          max_depth=3)  # Limiting depth for visualization
plt.title("Pruned Decision Tree (max_depth=4, drawn to depth 3)")
plt.savefig(os.path.join(save_path, 'decision_tree_visualization.png'), dpi=300, bbox_inches='tight')
plt.close()

# 9.5 Gradient boosting feature importances, sorted from largest to smallest
importance_table = pd.DataFrame({
    'Feature': X_train_reg.columns,
    'Importance': gb_reg.feature_importances_
})
feature_importance_gb = importance_table.sort_values(by='Importance', ascending=False)

# Bar chart of the first 20 rows of the sorted table, written to save_path
top_importances = feature_importance_gb.head(20)
importance_png = os.path.join(save_path, 'gb_feature_importance.png')

plt.figure(figsize=(12, 8))
sns.barplot(data=top_importances, x='Importance', y='Feature')
plt.title('Top 20 Gradient Boosting Feature Importances')
plt.tight_layout()
plt.savefig(importance_png)
plt.close()

# 9.6 Actual vs Predicted comparison for all models
# One panel per model: (title prefix, predictions, extra scatter keyword args).
# The first panel passes no colour, so it uses matplotlib's default.
comparison_panels = (
    ('Full Decision Tree', y_pred_dt_full, {}),
    ('Pruned Decision Tree', y_pred_dt_pruned, {'color': 'orange'}),
    ('Gradient Boosting', y_pred_gb, {'color': 'green'}),
)

plt.figure(figsize=(15, 6))

for panel_number, (panel_name, panel_predictions, scatter_style) in enumerate(comparison_panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.scatter(y_test_reg, panel_predictions, alpha=0.7, **scatter_style)
    # Dashed y = x reference line spanning the range of the actual values
    plt.plot([y_test_reg.min(), y_test_reg.max()], [y_test_reg.min(), y_test_reg.max()], 'k--', lw=2)
    plt.title(f'{panel_name}: Actual vs Predicted')
    plt.xlabel('Actual Survival Months')
    plt.ylabel('Predicted Survival Months')
    plt.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(save_path, 'regression_models_comparison.png'))
plt.close()

# === 10. MODEL COMPARISON AND SUMMARY ===
print("\n=== MODEL COMPARISON SUMMARY ===")

# 10.1 Classification summary: accuracy is recomputed here, AUC reuses auc_ens
print("\nClassification Model (Ensemble):")
print(f"Accuracy Score: {accuracy_score(y_test_class, y_pred_ensemble):.4f}")
print(f"AUC: {auc_ens:.4f}")

# 10.2 Regression summary: one line per model with its MSE and R²
print("\nRegression Models:")
regression_summary = (
    ("Full Decision Tree", mse_dt_full, r2_dt_full),
    ("Pruned Decision Tree", mse_dt_pruned, r2_dt_pruned),
    ("Gradient Boosting", mse_gb, r2_gb),
)
for summary_label, summary_mse, summary_r2 in regression_summary:
    print(f"{summary_label} - MSE: {summary_mse:.2f}, R²: {summary_r2:.4f}")

# 10.3 Persist the ensemble, the gradient boosting regressor and the
# preprocessing pipeline fitted for each of them
print("\n=== SAVING MODELS ===")
saved_artifacts = (
    (ensemble_classifier, 'ensemble_classifier.joblib'),
    (preprocessing_pipeline, 'ensemble_preprocessing_pipeline.joblib'),
    (gb_reg, 'gradient_boosting_regressor.joblib'),
    (reg_preprocessing_pipeline, 'regression_preprocessing_pipeline.joblib'),
)
for artifact, artifact_filename in saved_artifacts:
    joblib.dump(artifact, os.path.join(save_path, artifact_filename))

print("\nAll models saved successfully.")

Loading processed datasets...
Classification dataset shape: (4019, 66)
Regression dataset shape: (4019, 67)

=== CHECKING FOR MISSING VALUES ===
Classification dataset missing values: 1
Regression dataset missing values: 1

Columns with missing values in classification dataset:
- Regional_Node_Examined: 1 missing values (0.02%)

=== CLEANING MORTALITY STATUS VALUES ===

Mortality Status values:
Mortality_Status
0        3395
1         597
DEAD       10
dead        8
ALIVE       5
alive       3
ALive       1
Name: count, dtype: int64

Cleaned Mortality Status values:
Mortality_Status
0    3404
1     615
Name: count, dtype: int64
Mortality_Status
0    0.846977
1    0.153023
Name: proportion, dtype: float64

=== CLASSIFICATION DATA PREPARATION ===
Classification training set shape: (3215, 65)
Classification testing set shape: (804, 65)

=== ENSEMBLE CLASSIFIER ===

--- Ensemble Classifier Evaluation ---

Confusion Matrix:
[[360 321]
 [ 29  94]]

Classification Report:
              precis