<a href="https://colab.research.google.com/github/thaheshan/Breast_Cancer_Prediction_Model/blob/main/Notebook2Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

# Title: Notebook 2 - Classification and Regression Models for Cancer Survival Analysis
# Author: Suresh Thaheshan
# Peer Reviewer: Ayman Jaleel
# Date: March,  11, 2025



# IMPORT LIBRARIES
# Import the libraries needed for preprocessing and for the classification and regression modelling.

# Imports reused from notebook 01.
# Import the pandas library,
# which is essential for loading the dataset, cleaning and manipulation.
# It is used to manipulate the dataset and analyze the survival predictions.
import pandas as pd

# Import the numpy library for numerical operations.
# It is useful for mathematical calculations.
import numpy as np

# Import matplotlib for creating visualizations.
# It presents the data analysis as graphs and tables.
import matplotlib.pyplot as plt

# Import seaborn, which is used together with the matplotlib library.
# It is helpful for statistical graphics and distribution plots.
import seaborn as sns
import os


# Importing joblib for saving and loading models.
# joblib helps in serializing and deserializing models, enabling saving of trained models.
import joblib


# Importing model selection tools for splitting data, cross-validation, and hyperparameter tuning.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Importing linear and ensemble models for classification and regression.
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Importing performance evaluation metrics.
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, roc_auc_score,
    roc_curve, mean_squared_error, r2_score, mean_absolute_error
)

# Importing data preprocessing tools.
from sklearn.preprocessing import StandardScaler

# Importing the Pipeline module to streamline preprocessing and model training.
from sklearn.pipeline import Pipeline

# Importing SimpleImputer for handling missing data.
from sklearn.impute import SimpleImputer


In [3]:
# ---------------- Plot styling and output location ----------------
# Apply seaborn's 'whitegrid' theme first, then the matching matplotlib style
# sheet, so that seaborn and plain-matplotlib figures share one appearance.
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

# Directory used for the plots written by this notebook and for the CSV files
# read in the loading cell.
# The default is the Google Drive location used on Colab. To run elsewhere, set
# the ML_COURSEWORK_DIR environment variable instead of editing this cell.
# NOTE(review): no Drive mount call is visible in this notebook -- confirm that
# Drive is mounted before relying on the default path on Colab.
DEFAULT_SAVE_PATH = '/content/drive/MyDrive/Machine_Learning_CourseWork'
# `or` (rather than a plain .get default) also covers an empty-string variable,
# which would otherwise make os.makedirs('') fail.
save_path = os.environ.get('ML_COURSEWORK_DIR') or DEFAULT_SAVE_PATH

# Create the directory if it is missing; exist_ok=True makes re-running a no-op.
os.makedirs(save_path, exist_ok=True)


In [6]:
# ---------------- Load the preprocessed datasets ----------------
# Both CSV files are read from save_path; per the banner below they were
# produced by notebook 01.
print("---------------Loading processed datasets from notebook 01----------")

classification_csv = os.path.join(save_path, 'classification_dataset.csv')
regression_csv = os.path.join(save_path, 'regression_dataset.csv')

classification_df = pd.read_csv(classification_csv)  # data for the classification task
regression_df = pd.read_csv(regression_csv)          # data for the regression task

# Report (rows, columns) for each frame.
for dataset_label, dataset_frame in (
    ("Classification", classification_df),
    ("Regression", regression_df),
):
    print(f"{dataset_label} dataset shape: {dataset_frame.shape}")

# List the column names of the classification frame.
print("\nClassification dataset columns:")
print(list(classification_df.columns))


---------------Loading processed datasets from notebook 01----------
Classification dataset shape: (4019, 66)
Regression dataset shape: (4019, 67)

Classification dataset columns:
['Month_of_Birth', 'Age', 'Tumor_Size', 'Regional_Node_Examined', 'Reginol_Node_Positive', 'Mortality_Status', 'Sex_Female', 'Occupation_Agriculture', 'Occupation_Arts', 'Occupation_Business', 'Occupation_Cleaning', 'Occupation_Construction', 'Occupation_Consultancy', 'Occupation_Design', 'Occupation_Development', 'Occupation_Driving', 'Occupation_Engineering', 'Occupation_Entertainment', 'Occupation_Finance', 'Occupation_Healthcare', 'Occupation_Hospitality', 'Occupation_House Person', 'Occupation_Human Resources', 'Occupation_Legal', 'Occupation_Leisure', 'Occupation_Maintenance', 'Occupation_Management', 'Occupation_Manufacturing', 'Occupation_Marketing', 'Occupation_Medical', 'Occupation_Military', 'Occupation_Multimedia', 'Occupation_Photography', 'Occupation_Planning', 'Occupation_Politics', 'Occupation

In [7]:
# ----------- CHECK FOR MISSING VALUES -----------------
# Reports the total number of missing cells in each dataset, then breaks the
# classification dataset down column by column.

print("\n ----- CHECKING FOR THE MISSING VALUES ---------")

# Per-column missing counts for the classification frame; summing them gives
# the dataset-wide total.
class_na_counts = classification_df.isna().sum()
print(f"Classification dataset missing values: {class_na_counts.sum()}")

# Same dataset-wide total for the regression frame.
print(f"Regression dataset missing values: {regression_df.isna().sum().sum()}")

# Names of the classification columns that contain at least one missing value.
print("\nColumns with missing values in classification dataset:")
missing_cols = class_na_counts[class_na_counts > 0].index.tolist()

# One line per affected column: absolute count and share of all rows.
total_rows = len(classification_df)
for col in missing_cols:
    missing_count = class_na_counts[col]
    print(f"- {col}: {missing_count} missing values ({missing_count/total_rows*100:.2f}%)")

# ---------------- CLEAN UP MORTALITY STATUS VALUES ----------------
# Standardises 'Mortality_Status' in both datasets to integers:
# 0 = alive, 1 = dead.

print("\n=== CLEANING MORTALITY STATUS VALUES ===")
print("\nInitial Mortality Status values:")
# Distribution of the raw labels before any cleaning.
print(classification_df['Mortality_Status'].value_counts())

# Known representations of the two classes. The original spelling variants are
# kept so the dictionary stays usable for exact-match lookups as before.
mortality_mapping = {
    0: 0, '0': 0, 'ALIVE': 0, 'alive': 0, 'ALive': 0, 'Alive': 0,
    1: 1, '1': 1, 'DEAD': 1, 'dead': 1, 'Dead': 1
}


def clean_mortality_status(status):
    """Return `status` recoded to integer 0 (alive) / 1 (dead).

    String labels are stripped and lower-cased before the lookup, so any
    capitalisation or stray whitespace (e.g. ' Dead ') is accepted rather than
    only the variants enumerated in `mortality_mapping`.

    Parameters
    ----------
    status : pandas.Series
        Raw 'Mortality_Status' column.

    Returns
    -------
    pandas.Series
        Integer series with the same index, containing only 0 and 1.

    Raises
    ------
    ValueError
        If any value (including a missing value) cannot be mapped. The message
        lists the offending values, instead of the opaque NaN-to-int cast error
        that a bare `.map(...).astype(int)` would produce.
    """
    normalised = status.map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    cleaned = normalised.map(mortality_mapping)

    # Anything the mapping did not recognise comes back as NaN.
    unmapped = status[cleaned.isna()]
    if not unmapped.empty:
        bad_values = sorted({repr(value) for value in unmapped})
        raise ValueError(
            f"Unrecognised Mortality_Status value(s): {', '.join(bad_values)}"
        )
    return cleaned.astype(int)


# Apply the same cleaning to both datasets so their targets stay consistent.
classification_df['Mortality_Status'] = clean_mortality_status(classification_df['Mortality_Status'])
regression_df['Mortality_Status'] = clean_mortality_status(regression_df['Mortality_Status'])

# Distribution after cleaning, as absolute counts and as proportions.
print("\nCleaned Mortality Status values:")
print(classification_df['Mortality_Status'].value_counts())
print(classification_df['Mortality_Status'].value_counts(normalize=True))



 ----- CHECKING FOR THE MISSING VALUES ---------
Classification dataset missing values: 1
Regression dataset missing values: 1

Columns with missing values in classification dataset:
- Regional_Node_Examined: 1 missing values (0.02%)

=== CLEANING MORTALITY STATUS VALUES ===

Initial Mortality Status values:
Mortality_Status
0        3395
1         597
DEAD       10
dead        8
ALIVE       5
alive       3
ALive       1
Name: count, dtype: int64

Cleaned Mortality Status values:
Mortality_Status
0    3404
1     615
Name: count, dtype: int64
Mortality_Status
0    0.846977
1    0.153023
Name: proportion, dtype: float64


In [8]:
# ----------- CLASSIFICATION PREPROCESSING -----------
# Builds the train/test split and the imputed + standardised feature matrices
# used by the classification models.

print("\n ----------- CLASSIFICATION MODEL PREPROCESSING ---------------------")

# Target is 'Mortality_Status'; every other column is a feature.
y_class = classification_df['Mortality_Status']
X_class = classification_df.drop(columns=['Mortality_Status'])

# 80/20 split. stratify keeps the class ratio equal in both parts and the fixed
# random_state makes the split reproducible.
class_split = train_test_split(
    X_class, y_class, test_size=0.2, stratify=y_class, random_state=42
)
X_train_class, X_test_class, y_train_class, y_test_class = class_split

# Preprocessing steps, applied in order:
#   1. fill missing values with the column median
#   2. standardise every feature to zero mean and unit variance
median_imputer = SimpleImputer(strategy='median')
standard_scaler = StandardScaler()
classification_pipeline = Pipeline([
    ('imputer', median_imputer),
    ('scaler', standard_scaler),
])

# Learn the medians and scaling statistics from the training split only ...
X_train_class_processed = classification_pipeline.fit_transform(X_train_class)

# ... and reuse them unchanged on the test split.
X_test_class_processed = classification_pipeline.transform(X_test_class)



=== CLASSIFICATION MODEL PREPROCESSING ===


In [10]:
# -------------- CLASSIFICATION MODELS: LOGISTIC REGRESSION --------------
# Trains a class-weighted logistic regression on the preprocessed training
# features, reports hold-out metrics, cross-validates on the training split and
# computes the ROC inputs (fpr/tpr/AUC) for later plotting.

print("\n=== LOGISTIC REGRESSION MODEL ===")

# Hyper-parameters shared by the fitted model and the cross-validated copy:
#   max_iter=1000            - more iterations so the solver can converge
#   random_state=42          - reproducibility
#   class_weight='balanced'  - re-weights classes to offset the imbalance
log_reg_params = dict(max_iter=1000, random_state=42, class_weight='balanced')

# Fit on the imputed + scaled training features.
log_model = LogisticRegression(**log_reg_params)
log_model.fit(X_train_class_processed, y_train_class)

# Hard class predictions for the hold-out set.
y_pred_log = log_model.predict(X_test_class_processed)

# Counts of correct and incorrect predictions per class.
print("\nConfusion Matrix:")
log_confusion = confusion_matrix(y_test_class, y_pred_log)
print(log_confusion)

# Precision, recall, f1-score and support per class.
print("\nClassification Report:")
log_report = classification_report(y_test_class, y_pred_log)
print(log_report)

# Overall hold-out accuracy.
print("Accuracy Score:", accuracy_score(y_test_class, y_pred_log))

# 5-fold cross-validation on the raw training split. Preprocessing sits inside
# the pipeline, so it is re-fitted within every fold.
cv_log_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(**log_reg_params)),
])
cv_scores_log = cross_val_score(cv_log_pipeline, X_train_class, y_train_class, cv=5)

# Per-fold scores followed by their mean.
print(f"\nCross-validation scores (Logistic Regression): {cv_scores_log}")
print(f"Average CV accuracy: {cv_scores_log.mean():.4f}")

# Probability of the positive class (column 1) for each hold-out row.
y_probs_log = log_model.predict_proba(X_test_class_processed)[:, 1]

# ROC curve points (the thresholds are not needed) and area under the curve.
fpr_log, tpr_log, _ = roc_curve(y_test_class, y_probs_log)
auc_log = roc_auc_score(y_test_class, y_probs_log)



=== LOGISTIC REGRESSION MODEL ===

Confusion Matrix:
[[469 212]
 [ 45  78]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.69      0.78       681
           1       0.27      0.63      0.38       123

    accuracy                           0.68       804
   macro avg       0.59      0.66      0.58       804
weighted avg       0.81      0.68      0.72       804

Accuracy Score: 0.6803482587064676

Cross-validation scores (Logistic Regression): [0.74183515 0.70606532 0.68429238 0.72939347 0.69984448]
Average CV accuracy: 0.7123


In [11]:
# ---------------- Logistic Regression: ROC curve ----------------
# Draws the ROC curve and saves it to save_path. The figure is closed
# immediately after saving, so it is written to disk but not displayed inline.
# Spelling of "Regression" in the legend and titles is corrected so the saved
# figures match the label used in the ROC comparison plot.

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fpr_log, tpr_log, label=f'Logistic Regression (AUC = {auc_log:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal = no-discrimination reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression')
ax.legend()
ax.grid(True)
fig.savefig(os.path.join(save_path, 'log_reg_roc_curve.png'))
plt.close(fig)  # release the figure's memory

# ---------------- Logistic Regression: confusion matrix ----------------
# Annotated heatmap of the hold-out confusion matrix (rows = actual class,
# columns = predicted class), saved to save_path.

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test_class, y_pred_log),
    annot=True,   # write the count in every cell
    fmt='d',      # integer formatting for the counts
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix - Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.savefig(os.path.join(save_path, 'log_reg_confusion_matrix.png'))
plt.close(fig)  # release the figure's memory


In [12]:
# ---------------- RANDOM FOREST CLASSIFIER ----------------
# Trains a class-weighted random forest inside an imputation pipeline, reports
# hold-out metrics and saves a bar chart of the 20 most important features.

print("\n ------------ RANDOM FOREST CLASSIFIER ------------------------------------")

# Median imputation followed by the forest (100 trees, fixed seed, balanced
# class weights). This pipeline has no scaling step.
rf_classifier = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
)
rf_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('model', rf_classifier),
])

# Train on the raw training split; the pipeline handles the imputation.
rf_pipeline.fit(X_train_class, y_train_class)

# Hard class predictions for the hold-out set.
y_pred_rf = rf_pipeline.predict(X_test_class)

# Handle on the fitted forest, needed for its feature importances.
rf_model = rf_pipeline.named_steps['model']

# Counts of correct and incorrect predictions per class.
print("\nConfusion Matrix:")
rf_confusion = confusion_matrix(y_test_class, y_pred_rf)
print(rf_confusion)

# Precision, recall, f1-score and support per class.
print("\nClassification Report:")
rf_report = classification_report(y_test_class, y_pred_rf)
print(rf_report)

# Overall hold-out accuracy.
print("Accuracy Score:", accuracy_score(y_test_class, y_pred_rf))

# Feature importances paired with their column names, most important first.
feature_importance_df = pd.DataFrame({
    'Feature': X_train_class.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features, written to save_path.
top_features = feature_importance_df.head(20)
plt.figure(figsize=(12, 8))
sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title('Top 20 Random Forest Feature Importances (Classification)')
plt.tight_layout()  # keep long feature names from being clipped
plt.savefig(os.path.join(save_path, 'rf_feature_importance.png'))
plt.close()  # release the figure's memory



=== RANDOM FOREST CLASSIFIER ===

Confusion Matrix:
[[665  16]
 [110  13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.98      0.91       681
           1       0.45      0.11      0.17       123

    accuracy                           0.84       804
   macro avg       0.65      0.54      0.54       804
weighted avg       0.80      0.84      0.80       804

Accuracy Score: 0.8432835820895522


In [13]:
# ---------------- ROC CURVE COMPARISON ----------------
# Computes the ROC curve and AUC of the random forest, then overlays it on the
# logistic regression curve and saves the combined figure to save_path.

# Probability of the positive class (column 1) from the random forest pipeline.
y_probs_rf = rf_pipeline.predict_proba(X_test_class)[:, 1]

# ROC curve points (the thresholds are not needed) and area under the curve.
fpr_rf, tpr_rf, _ = roc_curve(y_test_class, y_probs_rf)
auc_rf = roc_auc_score(y_test_class, y_probs_rf)

# One entry per model: (false positive rates, true positive rates, legend
# label, extra line styling). The forest is dashed to tell the curves apart.
roc_series = [
    (fpr_log, tpr_log, f'Logistic Regression (AUC = {auc_log:.2f})', {}),
    (fpr_rf, tpr_rf, f'Random Forest (AUC = {auc_rf:.2f})', {'linestyle': '--'}),
]

plt.figure(figsize=(8, 5))
for curve_fpr, curve_tpr, curve_label, curve_style in roc_series:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, **curve_style)

# Diagonal reference line: performance of a random classifier (AUC = 0.5).
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.grid(True)

# Write the figure to disk, then close it to release memory.
plt.savefig(os.path.join(save_path, 'roc_curve_comparison.png'))
plt.close()


In [14]:
# ---------------- REGRESSION PREPROCESSING ----------------
# Builds the train/test split and the imputed + standardised feature matrices
# used by the regression models.
print("\n=== REGRESSION MODEL PREPROCESSING ===")

# Target is 'Survival_Months'. 'Mortality_Status' is removed from the features
# as well.
# NOTE(review): presumably because it is an outcome rather than a predictor --
# confirm the intent.
y_reg = regression_df['Survival_Months']
X_reg = regression_df.drop(columns=['Survival_Months', 'Mortality_Status'])

# Summary statistics of the target, to show its range and spread.
# NOTE(review): the recorded output shows max = 760 months against a 75th
# percentile of 90 -- possibly a data-entry outlier; confirm before trusting
# the regression metrics.
print("\nSurvival Months Statistics:")
print(y_reg.describe())

# 80/20 split with a fixed seed for reproducibility.
reg_split = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = reg_split

# Preprocessing steps, applied in order:
#   1. fill missing values with the column median
#   2. standardise every feature to zero mean and unit variance
reg_imputer = SimpleImputer(strategy='median')
reg_scaler = StandardScaler()
regression_pipeline = Pipeline([
    ('imputer', reg_imputer),
    ('scaler', reg_scaler),
])

# Learn the medians and scaling statistics from the training split only ...
X_train_reg_processed = regression_pipeline.fit_transform(X_train_reg)

# ... and reuse them unchanged on the test split.
X_test_reg_processed = regression_pipeline.transform(X_test_reg)



=== REGRESSION MODEL PREPROCESSING ===

Survival Months Statistics:
count    4019.000000
mean       71.476487
std        25.367123
min         1.000000
25%        56.000000
50%        73.000000
75%        90.000000
max       760.000000
Name: Survival_Months, dtype: float64


In [15]:
# ---------------- REGRESSION MODELS: LINEAR REGRESSION ----------------
# Fits an ordinary linear regression on the preprocessed training features and
# reports MSE, MAE and R^2 on the hold-out set.
print("\n --------------- REGRESSION MODELS------------------------------ ")

print("\n    --- Linear Regresion Model ---")

# Create and train the model in one step (fit returns the estimator itself).
lin_reg_model = LinearRegression().fit(X_train_reg_processed, y_train_reg)

# Predicted survival months for the hold-out rows.
y_pred_lin = lin_reg_model.predict(X_test_reg_processed)

# Hold-out metrics:
#   mse_lin - mean squared error
#   mae_lin - mean absolute error
#   r2_lin  - coefficient of determination
r2_lin = r2_score(y_test_reg, y_pred_lin)
mae_lin = mean_absolute_error(y_test_reg, y_pred_lin)
mse_lin = mean_squared_error(y_test_reg, y_pred_lin)

# Report the metrics, rounded to two decimals.
print(f"Mean Squared Error: {mse_lin:.2f}")
print(f"Mean Absolute Error: {mae_lin:.2f}")
print(f"R^2 Score: {r2_lin:.2f}")



 --------------- REGRESSION MODELS------------------------------ 

    --- Linear Regresion Model ---
Mean Squared Error: 509.32
Mean Absolute Error: 18.54
R^2 Score: 0.05


In [16]:
# ---------------- Cross-validation for Linear Regression ----------------
# 5-fold cross-validated R^2 on the training split.
#
# The imputer and scaler are placed in a Pipeline together with the model, and
# the *raw* training features (X_train_reg) are passed in. Each fold therefore
# fits its preprocessing on that fold's training part only. Cross-validating on
# X_train_reg_processed would reuse medians and scaling statistics computed
# from the whole training split, including each fold's validation rows. This
# also matches how the logistic regression model is cross-validated.
cv_reg_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # median imputation, per fold
    ('scaler', StandardScaler()),                   # standardisation, per fold
    ('model', LinearRegression()),                  # model under evaluation
])

cv_scores_reg = cross_val_score(
    cv_reg_pipeline,
    X_train_reg,   # unprocessed training features
    y_train_reg,   # target: survival months
    scoring='r2',  # R^2 per fold
    cv=5,          # 5 folds
)

# Per-fold R^2 scores followed by their mean.
print(f"Cross-Validation R² Scores: {cv_scores_reg}")
print(f"Average R²: {cv_scores_reg.mean():.4f}")

# ---------------- Random Forest Regressor ----------------
# Fits a 100-tree random forest on the preprocessed training features and
# reports MSE, MAE and R^2 on the hold-out set.
print("\n--- Random Forst Regressor ---")

# Create and train the model in one step (fit returns the estimator itself);
# the fixed seed keeps the forest reproducible.
rf_reg_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_reg_processed, y_train_reg
)

# Predicted survival months for the hold-out rows.
y_pred_rf_reg = rf_reg_model.predict(X_test_reg_processed)

# Hold-out metrics:
#   mse_rf - mean squared error
#   mae_rf - mean absolute error
#   r2_rf  - coefficient of determination
r2_rf = r2_score(y_test_reg, y_pred_rf_reg)
mae_rf = mean_absolute_error(y_test_reg, y_pred_rf_reg)
mse_rf = mean_squared_error(y_test_reg, y_pred_rf_reg)

# Report the metrics, rounded to two decimals.
print(f"Random Forest - MSE: {mse_rf:.2f}")
print(f"Random Forest - MAE: {mae_rf:.2f}")
print(f"Random Forest - R^2 Score: {r2_rf:.2f}")

# ---------------- Feature importances (regression) ----------------
# Bar chart of the random forest regressor's most influential features, saved
# to save_path. Features are ranked by importance and limited to the top 20,
# mirroring the classification importance plot; plotting every column unsorted
# makes the chart hard to read.
TOP_N_REG_FEATURES = 20

reg_importance_df = pd.DataFrame({
    'Feature': X_train_reg.columns,                   # feature names
    'Importance': rf_reg_model.feature_importances_,  # importance per feature
}).sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x='Importance',
    y='Feature',
    data=reg_importance_df.head(TOP_N_REG_FEATURES),
    ax=ax,
)
# Title spelling corrected ('Forest', previously 'Forst').
ax.set_title(f"Top {TOP_N_REG_FEATURES} Feature Importances - Random Forest Regressor")
fig.tight_layout()  # keep long feature names from being clipped
fig.savefig(os.path.join(save_path, 'rf_reg_feature_importance.png'))
plt.close(fig)  # release the figure's memory

# ---------------- Predicted vs actual (regression) ----------------
# Scatter plot of actual against predicted survival months for the random
# forest regressor, saved to save_path.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test_reg, y=y_pred_rf_reg, alpha=0.6, ax=ax)
ax.set_xlabel('Actual Survival Months')
ax.set_ylabel('Predicted Survival Months')
ax.set_title('Actual vs Predicted - Random Forest Regressor')
ax.grid(True)
fig.savefig(os.path.join(save_path, 'rf_reg_actual_vs_predicted.png'))
plt.close(fig)  # release the figure's memory


Cross-Validation R² Scores: [ 0.00142027  0.02845832  0.01234979 -0.00081983  0.01798137]
Average R²: 0.0119

--- Random Forst Regressor ---
Random Forest - MSE: 544.69
Random Forest - MAE: 19.01
Random Forest - R^2 Score: -0.02
