In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                            roc_auc_score, roc_curve, precision_recall_curve, f1_score)
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

# ==================== 1. DATA LOADING AND EXPLORATION ====================

# Load the dataset
df = pd.read_csv('RS-A4_SEER Breast Cancer Dataset .csv')

print("="*80)
print("BREAST CANCER PROGNOSIS RECOMMENDATION SYSTEM")
print("="*80)
print("\n1. DATASET OVERVIEW")
print("-"*80)
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nStatistical Summary:")
print(df.describe())

# ==================== 2. DATA PREPROCESSING ====================

print("\n2. DATA PREPROCESSING")
print("-"*80)

# Create a copy for processing
df_processed = df.copy()

# Handle missing values
print("Handling missing values...")
# Drop columns that are entirely missing (like 'Unnamed: 3') as they provide no information
cols_to_drop_entirely = [col for col in df_processed.columns if df_processed[col].isnull().all()]
if cols_to_drop_entirely:
    print(f"Dropping entirely missing columns: {cols_to_drop_entirely}")
    df_processed.drop(columns=cols_to_drop_entirely, inplace=True)

# Fill remaining missing values
for col in df_processed.columns:
    if df_processed[col].isnull().sum() > 0:
        if df_processed[col].dtype in ['int64', 'float64']:
            df_processed[col].fillna(df_processed[col].median(), inplace=True)
        else:
            df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)

print("Missing values after handling:", df_processed.isnull().sum().sum())

# Encode categorical variables
print("\nEncoding categorical variables...")
label_encoders = {}
categorical_cols = df_processed.select_dtypes(include=['object']).columns

for col in categorical_cols:
    if col != 'Status':  # Keep Status for last
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le
        print(f"  - {col}: {len(le.classes_)} unique values")

# Encode target variable (Status: Alive=0, Dead=1)
if 'Status' in df_processed.columns:
    le_status = LabelEncoder()
    df_processed['Status'] = le_status.fit_transform(df_processed['Status'])
    label_encoders['Status'] = le_status
    print(f"\nTarget Variable (Status) encoded: {dict(enumerate(le_status.classes_))}")

# Check target distribution
print("\nTarget Distribution:")
print(df_processed['Status'].value_counts())
print(f"Class Balance: {df_processed['Status'].value_counts(normalize=True)}")

# ==================== 3. FEATURE ENGINEERING ====================

print("\n3. FEATURE ENGINEERING")
print("-"*80)

# Create risk score based on clinical features
if all(col in df_processed.columns for col in ['T Stage', 'N Stage', 'Grade']):
    df_processed['Risk_Score'] = (df_processed['T Stage'] +
                                  df_processed['N Stage'] +
                                  df_processed['Grade']) / 3
    print("Created Risk_Score feature")

# Create age groups
if 'Age' in df_processed.columns:
    df_processed['Age_Group'] = pd.cut(df_processed['Age'],
                                       bins=[0, 40, 50, 60, 70, 100],
                                       labels=[0, 1, 2, 3, 4])
    df_processed['Age_Group'] = df_processed['Age_Group'].astype(int)
    print("Created Age_Group feature")

# Tumor size category
if 'Tumor Size' in df_processed.columns:
    # Adjust bins to ensure all 'Tumor Size' values are covered
    # The max tumor size in the dataset is 140. We extend the last bin boundary.
    max_current_bin_edge = 100
    max_data_tumor_size = df_processed['Tumor Size'].max()
    adjusted_upper_bin_edge = max(max_current_bin_edge, max_data_tumor_size) # Use the larger of 100 or actual max

    df_processed['Tumor_Category'] = pd.cut(df_processed['Tumor Size'],
                                            bins=[0, 20, 50, adjusted_upper_bin_edge],
                                            labels=[0, 1, 2],
                                            right=True, # Intervals are (a, b]
                                            include_lowest=True) # Ensure values from the lowest bin boundary (like 0) are included

    df_processed['Tumor_Category'] = df_processed['Tumor_Category'].astype(int)
    print("Created Tumor_Category feature")

print(f"\nTotal features after engineering: {df_processed.shape[1]}")

# ==================== 4. FEATURE SELECTION ====================

print("\n4. FEATURE SELECTION")
print("-"*80)

# Separate features and target
X = df_processed.drop('Status', axis=1)
y = df_processed['Status']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures used: {list(X.columns)}")

# Feature correlation analysis
correlation_matrix = X.corrwith(y).abs().sort_values(ascending=False)
print("\nTop 10 Features Correlated with Target:")
print(correlation_matrix.head(10))

# ==================== 5. DATA SPLITTING AND SCALING ====================

print("\n5. DATA SPLITTING AND SCALING")
print("-"*80)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Training set class distribution:\n{y_train.value_counts()}")
print(f"Test set class distribution:\n{y_test.value_counts()}")

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures scaled using StandardScaler")

# ==================== 6. MODEL TRAINING ====================

print("\n6. MODEL TRAINING")
print("-"*80)

# Initialize only the Gradient Boosting model
models = {
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate models (will only run for Gradient Boosting)
results = {}
trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train model
    model.fit(X_train_scaled, y_train)
    trained_models[name] = model

    # Predictions
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Cross-validation
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

    results[name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  Cross-Val Mean: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# ==================== 7. MODEL EVALUATION ====================

#print("\n7. MODEL EVALUATION")
#print("-"*80)

# Best model is now explicitly Gradient Boosting
best_model_name = 'Gradient Boosting'
#best_model = trained_models[best_model_name]

#print(f"\nBest Model: {best_model_name}")
#print(f"Accuracy: {results[best_model_name]['accuracy']:.4f}")
#print(f"F1-Score: {results[best_model_name]['f1_score']:.4f}")

# Detailed classification report for best model
#print(f"\nClassification Report for {best_model_name}:")
#print(classification_report(y_test, results[best_model_name]['predictions'],
#                          target_names=['Alive', 'Dead']))

# Confusion Matrix
#print(f"\nConfusion Matrix for {best_model_name}:")
#cm = confusion_matrix(y_test, results[best_model_name]['predictions'])
#print(cm)

# ==================== 8. HYPERPARAMETER TUNING ====================

print("\n8. HYPERPARAMETER TUNING (Best Model)")
print("-"*80)

# Parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]
}
base_model = GradientBoostingClassifier(random_state=42)

print(f"Tuning {best_model_name}...")
grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Update best model to the tuned one
best_model = grid_search.best_estimator_

# Re-evaluate tuned model
y_pred_tuned = best_model.predict(X_test_scaled)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print(f"Tuned model test accuracy: {accuracy_tuned:.4f}")

# Update results for 'Gradient Boosting' with tuned model's predictions/probabilities
results[best_model_name]['predictions'] = y_pred_tuned
if hasattr(best_model, 'predict_proba'):
    results[best_model_name]['probabilities'] = best_model.predict_proba(X_test_scaled)[:, 1]
results[best_model_name]['accuracy'] = accuracy_tuned

# ==================== 9. FEATURE IMPORTANCE ====================

print("\n9. FEATURE IMPORTANCE ANALYSIS")
print("-"*80)

if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

# ==================== 10. ENSEMBLE MODEL ====================

print("\n10. ENSEMBLE MODEL (Not applicable - focusing on single model)")
print("-"*80)
print("Ensemble modeling section skipped as per user request to focus on Gradient Boosting.")
# Removed ensemble creation and evaluation for simplicity
ensemble_model = best_model # For compatibility with recommendation function

# ==================== 11. RECOMMENDATION FUNCTION ====================

print("\n11. PROGNOSIS RECOMMENDATION SYSTEM")
print("-"*80)

def predict_prognosis(patient_data, model, scaler, feature_names, encoders):
    """
    Predict breast cancer prognosis and provide recommendations

    Parameters:
    - patient_data: dict with patient information
    - model: trained ML model
    - scaler: fitted StandardScaler
    - feature_names: list of feature names
    - encoders: dict of label encoders

    Returns:
    - prediction, probability, risk_level, recommendations
    """
    # Prepare input data
    patient_df = pd.DataFrame([patient_data])

    # Encode categorical variables
    for col, encoder in encoders.items():
        if col in patient_df.columns and col != 'Status':
            try:
                patient_df[col] = encoder.transform(patient_df[col].astype(str))
            except:
                patient_df[col] = 0  # Default value if encoding fails

    # Ensure all features are present
    for col in feature_names:
        if col not in patient_df.columns:
            patient_df[col] = 0

    # Select and order features
    patient_df = patient_df[feature_names]

    # Scale features
    patient_scaled = scaler.transform(patient_df)

    # Predict
    prediction = model.predict(patient_scaled)[0]
    probability = model.predict_proba(patient_scaled)[0] if hasattr(model, 'predict_proba') else None

    # Determine risk level
    if probability is not None:
        death_prob = probability[1]
        if death_prob < 0.3:
            risk_level = "Low Risk"
        elif death_prob < 0.6:
            risk_level = "Moderate Risk"
        else:
            risk_level = "High Risk"
    else:
        risk_level = "Unable to determine"

    # Generate recommendations
    recommendations = []

    if prediction == 1 or (probability is not None and probability[1] > 0.5):
        recommendations.append("‚ö†Ô∏è Higher risk of poor prognosis detected")
        recommendations.append("üè• Recommend aggressive treatment approach")
        recommendations.append("üìÖ Frequent follow-up monitoring (every 3-6 months)")
        recommendations.append("üî¨ Consider additional diagnostic tests")
        recommendations.append("üë• Multidisciplinary team consultation recommended")
    else:
        recommendations.append("‚úÖ Lower risk prognosis indicated")
        recommendations.append("üíä Standard treatment protocol recommended")
        recommendations("üìÖ Regular follow-up monitoring (every 6-12 months)")
        recommendations.append("üèÉ Encourage healthy lifestyle modifications")

    # Add specific recommendations based on features
    if 'Tumor Size' in patient_data and patient_data['Tumor Size'] > 50:
        recommendations.append("‚öïÔ∏è Large tumor size - consider neoadjuvant therapy")

    if 'Grade' in patient_data and 'Poorly' in str(patient_data['Grade']):
        recommendations.append("üî¨ High-grade tumor - consider adjuvant chemotherapy")

    return {
        'prediction': 'Dead' if prediction == 1 else 'Alive',
        'probability': probability,
        'risk_level': risk_level,
        'recommendations': recommendations
    }

# Example prediction (using the tuned best_model, which is Gradient Boosting)
print("\nExample Patient Prognosis:")
example_patient = {
    'Age': 55,
    'Race': 'White',
    'Marital Status': 'Married (including common law)',
    'T Stage': 'T2',
    'N Stage': 'N1',
    '6th Stage': 'IIB',
    'Grade': 'Moderate',
    'A Stage': 'Regional',
    'Tumor Size': 35,
    'Estrogen Status': 'Positive',
    'Progesterone Status': 'Positive',
    'Regional Node Examined': 15,
    'Reginol Node Positive': 3,
    'Survival Months': 24
}

result = predict_prognosis(example_patient, best_model, scaler, # Use best_model here
                          X.columns.tolist(), label_encoders)

print(f"\nPrediction: {result['prediction']}")
print(f"Risk Level: {result['risk_level']}")
if result['probability'] is not None:
    print(f"Survival Probability: {result['probability'][0]:.2%}")
    print(f"Death Probability: {result['probability'][1]:.2%}")
print("\nRecommendations:")
for rec in result['recommendations']:
    print(f"  {rec}")

# ==================== 12. MODEL COMPARISON SUMMARY ====================

print("\n12. MODEL COMPARISON SUMMARY (Gradient Boosting Only)")
print("-"*80)
print(f"{'Model':<25} {'Accuracy':<12} {'F1-Score':<12} {'CV Mean':<12}")
print("-"*80)
# Only show Gradient Boosting results
metrics = results[best_model_name]
print(f"{best_model_name:<25} {metrics['accuracy'] if 'accuracy' in metrics else 'N/A':<12.4f} {metrics['f1_score'] if 'f1_score' in metrics else 'N/A':<12.4f} {metrics['cv_mean'] if 'cv_mean' in metrics else 'N/A':<12.4f}")

print("\n" + "="*80)
print("RECOMMENDATION SYSTEM COMPLETE")
print("="*80)

# ==================== VISUALIZATION CODE (Optional) ====================

# Create visualizations
# fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Model Comparison (now only for Gradient Boosting)
# model_names_single = [best_model_name]
# accuracies_single = [results[best_model_name]['accuracy']]
# axes[0, 0].bar(model_names_single, accuracies_single, color='skyblue')
# axes[0, 0].set_ylabel('Accuracy')
# axes[0, 0].set_title(f'Accuracy - {best_model_name}')
# axes[0, 0].set_ylim([0.6, 1.0])

# 2. Confusion Matrix for Best Model (Gradient Boosting)
# cm = confusion_matrix(y_test, results[best_model_name]['predictions'])
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 1],
#             xticklabels=['Alive', 'Dead'], yticklabels=['Alive', 'Dead'])
# axes[0, 1].set_title(f'Confusion Matrix - {best_model_name}')
# axes[0, 1].set_ylabel('True Label')
# axes[0, 1].set_xlabel('Predicted Label')

# 3. Feature Importance (if available for Gradient Boosting)
# if hasattr(best_model, 'feature_importances_'):
#     feature_importance = pd.DataFrame({
#         'Feature': X.columns,
#         'Importance': best_model.feature_importances_
#     }).sort_values('Importance', ascending=False)
#     top_features = feature_importance.head(10)
#     axes[1, 0].barh(range(len(top_features)), top_features['Importance'])
#     axes[1, 0].set_yticks(range(len(top_features)))
#     axes[1, 0].set_yticklabels(top_features['Feature'])
#     axes[1, 0].set_xlabel('Importance')
#     axes[1, 0].set_title('Top 10 Feature Importance (Gradient Boosting)')
#     axes[1, 0].invert_yaxis()

# 4. ROC Curve (for Gradient Boosting)
# if results[best_model_name]['probabilities'] is not None:
#     fpr, tpr, _ = roc_curve(y_test, results[best_model_name]['probabilities'])
#     auc_score = roc_auc_score(y_test, results[best_model_name]['probabilities'])
#     axes[1, 1].plot(fpr, tpr, label=f'{best_model_name} (AUC = {auc_score:.3f})')
#     axes[1, 1].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
#     axes[1, 1].set_xlabel('False Positive Rate')
#     axes[1, 1].set_ylabel('True Positive Rate')
#     axes[1, 1].set_title('ROC Curve (Gradient Boosting)')
#     axes[1, 1].legend()
#     axes[1, 1].grid(True, alpha=0.3)

# plt.tight_layout()
# plt.savefig('breast_cancer_ml_analysis.png', dpi=300, bbox_inches='tight')
# print("\nVisualizations saved to 'breast_cancer_ml_analysis.png'")

"""
==================================================================================
COMPREHENSIVE EXPLANATION OF THE BREAST CANCER PROGNOSIS RECOMMENDATION SYSTEM
(Focusing on Gradient Boosting Model)
==================================================================================

1. PROJECT OVERVIEW:
   - Develops an ML-based system to predict breast cancer patient survival (Alive/Dead)
   - Uses SEER (Surveillance, Epidemiology, and End Results) dataset
   - Provides risk assessment and clinical recommendations, specifically using the Gradient Boosting model.

2. DATA PREPROCESSING:
   - Handles missing values using median (numerical) and mode (categorical)
   - Encodes categorical variables (Race, Marital Status, T Stage, N Stage, etc.)
   - Target variable: Status (Alive=0, Dead=1).
   - Addresses class imbalance in dataset.

3. FEATURE ENGINEERING:
   - Risk_Score: Combination of T Stage, N Stage, and Grade.
   - Age_Group: Categorical age bins for age-related patterns.
   - Tumor_Category: Categorizes tumor size into small/medium/large.
   - These engineered features capture domain knowledge about cancer prognosis.

4. MODEL IMPLEMENTED:
   a) Gradient Boosting: A powerful ensemble model known for its high performance in classification tasks. It builds trees sequentially, with each new tree correcting errors made by previous ones.

5. MODEL EVALUATION METRICS:
   - Accuracy: Overall correct predictions.
   - F1-Score: Harmonic mean of precision and recall.
   - Cross-Validation: 5-fold CV for robust performance estimation.
   - ROC-AUC: Area under ROC curve for classification quality.
   - Confusion Matrix: True/False Positives and Negatives.

6. HYPERPARAMETER TUNING:
   - GridSearchCV was used to optimize the Gradient Boosting model's hyperparameters.
   - Parameters like `n_estimators`, `learning_rate`, `max_depth`, and `min_samples_split` were tuned to improve model generalization and performance.

7. FEATURE IMPORTANCE:
   - Identifies most influential features for predictions from the Gradient Boosting model.
   - Helps understand which clinical factors matter most and guides clinical decision-making.

8. RECOMMENDATION SYSTEM FUNCTIONALITY:
   - Takes patient data as input.
   - Predicts survival status (Alive/Dead) using the tuned Gradient Boosting model.
   - Calculates death probability.
   - Assigns risk level (Low/Moderate/High).
   - Generates personalized clinical recommendations based on the prediction and risk level:
     * Treatment intensity (standard vs aggressive).
     * Follow-up frequency (3-6 months vs 6-12 months).
     * Additional diagnostic tests.
     * Lifestyle modifications.
     * Specialist consultations.

9. CLINICAL RECOMMENDATIONS LOGIC:
   - High Risk (Death Prob > 60%):
     * Aggressive treatment approach.
     * Frequent monitoring every 3-6 months.
     * Multidisciplinary team consultation.
   - Low Risk (Death Prob < 30%):
     * Standard treatment protocol.
     * Regular monitoring every 6-12 months.
     * Lifestyle modifications.
   - Feature-specific recommendations:
     * Large tumors ‚Üí Neoadjuvant therapy.
     * High-grade tumors ‚Üí Adjuvant chemotherapy.

10. KEY FEATURES FROM DATASET:
    - Age: Patient age at diagnosis.
    - Tumor Size: Size in millimeters.
    - T Stage: Tumor stage (T1, T2, T3, T4).
    - N Stage: Lymph node involvement (N0, N1, N2, N3).
    - Grade: Cell differentiation (Well/Moderate/Poorly differentiated).
    - Hormone Status: Estrogen/Progesterone receptor status.
    - Regional Node Positive: Number of positive lymph nodes.
    - 6th Stage: Overall cancer stage (I, II, III, IV).

11. MODEL PERFORMANCE CONSIDERATIONS:
    - Training/Test Split: 80/20 with stratification.
    - Feature Scaling: StandardScaler for normalized features.
    - Cross-Validation: 5-fold to prevent overfitting.
    - Handles imbalanced classes appropriately.

12. PRACTICAL USAGE:
    - Input: Patient clinical data (demographics, tumor characteristics, treatment).
    - Output: Survival prediction, risk level, actionable recommendations.
    - Can be integrated into hospital information systems.
    - Supports clinical decision-making, not replacement.

13. LIMITATIONS & CONSIDERATIONS:
    - Model predictions are probabilistic, not deterministic.
    - Should be used alongside clinical expertise.
    - Regular retraining needed with new data.
    - Performance depends on data quality and representativeness.

14. VISUALIZATIONS CREATED (for Gradient Boosting model):
    - Model accuracy bar chart.
    - Confusion matrix heatmap.
    - Feature importance plot.
    - ROC curve.

15. FUTURE ENHANCEMENTS:
    - Deep learning models (Neural Networks).
    - Survival analysis (time-to-event modeling).
    - Integration with treatment response data.
    - Real-time prediction API.
    - Explainable AI techniques (SHAP, LIME).

16. MEDICAL SIGNIFICANCE:
    - Early identification of high-risk patients.
    - Personalized treatment planning.
    - Resource optimization in healthcare.
    - Improved patient outcomes through data-driven decisions.

==================================================================================
This system demonstrates the power of machine learning in healthcare, providing
evidence-based recommendations to support oncologists in breast cancer prognosis
and treatment planning, specifically utilizing the robust Gradient Boosting model.
==================================================================================
"""

BREAST CANCER PROGNOSIS RECOMMENDATION SYSTEM

1. DATASET OVERVIEW
--------------------------------------------------------------------------------
Dataset Shape: (4024, 16)

Columns: ['Age', 'Race ', 'Marital Status', 'Unnamed: 3', 'T Stage ', 'N Stage', '6th Stage', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status', 'Progesterone Status', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Status']

First 5 rows:
   Age                                              Race   \
0   43  Other (American Indian/AK Native, Asian/Pacifi...   
1   47  Other (American Indian/AK Native, Asian/Pacifi...   
2   67                                              White   
3   46                                              White   
4   63                                              White   

                   Marital Status  Unnamed: 3 T Stage  N Stage 6th Stage  \
0  Married (including common law)         NaN       T2      N3      IIIC   
1  Married (including common law)        



# Deep Dive: Concepts, Math, and Code Walkthrough

> Supervised classification for breast cancer prognosis with tuned Gradient Boosting and an interactive intake menu.

## Preprocessing
- Handle missing values; drop high‚Äëleakage columns when necessary.
- Label Encoding categorical vars; Standardization for numeric vars: $z = (x-\mu)/\sigma$.
- Train/Test split for generalization estimates.

## Gradient Boosting (classification)
Additive model minimizing a differentiable loss via stage‚Äëwise function approximation.
- Initialize with a constant model $F_0(x)$ (e.g., minimizing loss on y).
- For $m = 1,\dots,M$:
  1) Compute negative gradient (pseudo‚Äëresiduals) $r_{im} = -\left[\dfrac{\partial \ell(y_i, F(x_i))}{\partial F(x_i)}\right]_{F=F_{m-1}}$.
  2) Fit a weak learner $h_m(x)$ (shallow tree) to $\{(x_i, r_{im})\}$.
  3) Line search for step size $\gamma_m$.
  4) Update: $F_m(x) = F_{m-1}(x) + \nu\,\gamma_m\,h_m(x)$, with learning rate $\nu\in(0,1]$.

For logistic loss (binary):
- Probability: $p(y=1\mid x) = \sigma(F(x)) = \dfrac{1}{1+e^{-F(x)}}$.
- Optimize negative log‚Äëlikelihood; predictions via sign or probability threshold.

## Evaluation
- Accuracy: $\dfrac{TP+TN}{TP+TN+FP+FN}$
- Precision/Recall/F1 per class with macro/weighted averaging as appropriate.
- Cross‚Äëvalidated grid search (`GridSearchCV`) tunes `n_estimators`, `learning_rate`, `max_depth`, etc.

## Code mapping
- Cell 1: builds `df_processed`, encoders, scaler; trains and tunes Gradient Boosting; prints metrics and importance.
- `predict_prognosis(...)`: standardizes inputs with the trained scaler/encoders and returns class + probabilities.
- Cell 2 menu: guided intake; prints prediction, probabilities, risk level, and recommendations.

## Edge cases and tips
- Unseen category at prediction time: map to closest known or fall back safely.
- Imbalanced classes: consider class weights or threshold tuning.
- Calibration: for probability‚Äëcritical use cases, consider Platt/Isotonic calibration.

# Breast Cancer Prognosis ‚Äî Overview

> This notebook builds a supervised classification pipeline (tuned Gradient Boosting) on the SEER Breast Cancer dataset and includes an interactive prognosis menu.

## What‚Äôs inside
- Data loading and cleaning (selected features from SEER dataset)
- Preprocessing:
  - Handle missing values and drop leakage‚Äëprone columns
  - Label Encoding for categorical features
  - Standardization for numeric features
- Modeling:
  - Baseline models (KNN, SVM, RF, etc.)
  - Tuned Gradient Boosting (GridSearchCV) as the best model
- Evaluation:
  - Accuracy, F1, classification report
  - Feature importance (where applicable)
- Interactive Menu (Cell 2):
  - Enter patient attributes or use examples
  - Get prediction, class probabilities, risk level, and recommendations

## Quick pipeline
1. Load and preprocess data ‚Üí `df_processed`
2. Split into train/test; scale numeric features
3. Train multiple models; select/tune best (Gradient Boosting)
4. Evaluate on test set; inspect metrics and importance
5. Launch the menu to input a patient profile and get prognosis

## Inputs and outputs
- Inputs: patient‚Äëlevel categorical + numeric features
- Outputs:
  - Predicted class (e.g., survival/risk category)
  - Class probabilities
  - Human‚Äëreadable risk level + suggestions

## Interactive use
- Run Cell 1 to fit models and set up helpers
- Run Cell 2 to launch the interactive intake menu
- Use default/example profiles to sanity‚Äëcheck predictions

## Notes
- Encoders and scaler must match the training fit; the menu reuses them safely
- Robustness guards are added for missing/unknown categories
- Extendable to new features and alternative models

In [2]:
# Interactive Menu: Breast Cancer Prognosis Recommendation (RS_A4)
# Run cell 1 first. Then run this cell to launch the menu.

from IPython.display import display
import pandas as pd

# --- Helpers ---------------------------------------------------------------

def _pick_from_list(name, options, default=None):
    options = [str(o) for o in options if pd.notna(o)]
    options = sorted(list(dict.fromkeys(options)))  # unique + stable order
    if not options:
        return input(f"Enter {name}: ").strip() or (default or '')
    preview = ", ".join(options[:10]) + (" ..." if len(options) > 10 else "")
    while True:
        raw = input(f"{name} [{default if default is not None else ''}] (e.g., {preview}): ").strip()
        if raw:
            return raw
        if default is not None:
            return default


def _prompt_float(name, default):
    while True:
        raw = input(f"{name} [{default}]: ").strip()
        if raw == '':
            return float(default)
        try:
            return float(raw)
        except Exception:
            print("Enter a number.")


def _prompt_int(name, default):
    while True:
        raw = input(f"{name} [{default}]: ").strip()
        if raw == '':
            return int(default)
        try:
            return int(raw)
        except Exception:
            print("Enter an integer.")


def _defaults_from_data():
    # Prefer df_processed if available, else df
    src = None
    try:
        src = df_processed
    except NameError:
        try:
            src = df
        except NameError:
            src = None
    num_defaults = {}
    if isinstance(src, pd.DataFrame):
        for col in ['Age','Tumor Size','Regional Node Examined','Reginol Node Positive','Survival Months']:
            if col in src.columns:
                num_defaults[col] = float(src[col].median())
    # categorical defaults: first class from encoders if present
    cat_defaults = {}
    try:
        for col in ['Race','Marital Status','T Stage','N Stage','6th Stage','Grade','A Stage','Estrogen Status','Progesterone Status']:
            if col in label_encoders and hasattr(label_encoders[col], 'classes_'):
                cat_defaults[col] = label_encoders[col].classes_[0]
    except NameError:
        pass
    return num_defaults, cat_defaults


def _options_from_data(col):
    # Gather options to help the user pick
    if 'label_encoders' in globals() and col in label_encoders and hasattr(label_encoders[col], 'classes_'):
        return list(label_encoders[col].classes_)
    if 'df' in globals() and isinstance(df, pd.DataFrame) and col in df.columns:
        return sorted(df[col].dropna().astype(str).unique().tolist())
    return []


def _collect_patient_input():
    num_defaults, cat_defaults = _defaults_from_data()

    # Categorical choices
    race = _pick_from_list('Race', _options_from_data('Race'), cat_defaults.get('Race','White'))
    marital = _pick_from_list('Marital Status', _options_from_data('Marital Status'), cat_defaults.get('Marital Status','Married (including common law)'))
    t_stage = _pick_from_list('T Stage', _options_from_data('T Stage'), cat_defaults.get('T Stage','T1'))
    n_stage = _pick_from_list('N Stage', _options_from_data('N Stage'), cat_defaults.get('N Stage','N0'))
    sixth = _pick_from_list('6th Stage', _options_from_data('6th Stage'), cat_defaults.get('6th Stage','IIA'))
    grade = _pick_from_list('Grade', _options_from_data('Grade'), cat_defaults.get('Grade','Moderate'))
    a_stage = _pick_from_list('A Stage', _options_from_data('A Stage'), cat_defaults.get('A Stage','Regional'))
    er = _pick_from_list('Estrogen Status', _options_from_data('Estrogen Status'), cat_defaults.get('Estrogen Status','Positive'))
    pr = _pick_from_list('Progesterone Status', _options_from_data('Progesterone Status'), cat_defaults.get('Progesterone Status','Positive'))

    # Numeric entries
    age = _prompt_int('Age', int(num_defaults.get('Age', 55)))
    tumor = _prompt_int('Tumor Size (mm)', int(num_defaults.get('Tumor Size', 30)))
    nodes_exam = _prompt_int('Regional Node Examined', int(num_defaults.get('Regional Node Examined', 12)))
    nodes_pos = _prompt_int('Reginol Node Positive', int(num_defaults.get('Reginol Node Positive', 2)))
    surv_months = _prompt_int('Survival Months', int(num_defaults.get('Survival Months', 24)))

    patient = {
        'Age': age,
        'Race': race,
        'Marital Status': marital,
        'T Stage': t_stage,
        'N Stage': n_stage,
        '6th Stage': sixth,
        'Grade': grade,
        'A Stage': a_stage,
        'Tumor Size': tumor,
        'Estrogen Status': er,
        'Progesterone Status': pr,
        'Regional Node Examined': nodes_exam,
        'Reginol Node Positive': nodes_pos,
        'Survival Months': surv_months,
    }
    return patient


def _safe_predict(patient_data):
    # Use existing predict_prognosis if available; else fallback
    try:
        res = predict_prognosis(patient_data, best_model, scaler, X.columns.tolist(), label_encoders)
        return res
    except Exception as e:
        # Fallback: minimal pipeline using encoders and scaler
        local = pd.DataFrame([patient_data])
        # Encode via label_encoders
        if 'label_encoders' in globals():
            for col, enc in label_encoders.items():
                if col == 'Status':
                    continue
                if col in local.columns:
                    try:
                        local[col] = enc.transform(local[col].astype(str))
                    except Exception:
                        # unseen label -> use first class
                        try:
                            local[col] = enc.transform([enc.classes_[0]])[0]
                        except Exception:
                            local[col] = 0
        # Ensure all features
        for col in X.columns:
            if col not in local.columns:
                local[col] = 0
        local = local[X.columns]
        Xs = scaler.transform(local)
        pred = best_model.predict(Xs)[0]
        proba = best_model.predict_proba(Xs)[0] if hasattr(best_model,'predict_proba') else None
        risk = 'Unable to determine'
        if proba is not None:
            death_prob = float(proba[1])
            risk = 'Low Risk' if death_prob < 0.3 else ('Moderate Risk' if death_prob < 0.6 else 'High Risk')
        recs = []
        if proba is not None and proba[1] > 0.5 or pred == 1:
            recs += [
                'Higher risk detected; consider aggressive treatment',
                'Frequent follow-up (3-6 months)',
                'Additional diagnostic tests',
                'Multidisciplinary team consultation'
            ]
        else:
            recs += [
                'Lower risk indicated',
                'Standard treatment protocol',
                'Regular follow-up (6-12 months)',
                'Healthy lifestyle encouragement'
            ]
        if 'Tumor Size' in patient_data and patient_data['Tumor Size'] > 50:
            recs.append('Large tumor size - consider neoadjuvant therapy')
        return {
            'prediction': 'Dead' if int(pred) == 1 else 'Alive',
            'probability': proba,
            'risk_level': risk,
            'recommendations': recs
        }


def launch_prognosis_menu():
    print("="*80)
    print("BREAST CANCER PROGNOSIS - INTERACTIVE MENU")
    print("="*80)

    while True:
        print("\nOptions:")
        print("  1) Enter patient data and predict")
        print("  2) Use example: Lower risk profile")
        print("  3) Use example: Higher risk profile")
        print("  4) Exit")
        choice = input("Select [1-4]: ").strip() or '1'

        if choice == '1':
            patient = _collect_patient_input()
            res = _safe_predict(patient)
            print("\nPrediction Result:")
            print(f"Prediction: {res['prediction']}")
            if res.get('probability') is not None:
                print(f"Survival Probability: {res['probability'][0]:.2%}")
                print(f"Death Probability:    {res['probability'][1]:.2%}")
            print(f"Risk Level: {res['risk_level']}")
            print("Recommendations:")
            for r in res.get('recommendations', []):
                print(f"  - {r}")

        elif choice == '2':
            patient = {
                'Age': 48,
                'Race': 'White',
                'Marital Status': 'Married (including common law)',
                'T Stage': 'T1',
                'N Stage': 'N0',
                '6th Stage': 'IIA',
                'Grade': 'Moderate',
                'A Stage': 'Regional',
                'Tumor Size': 18,
                'Estrogen Status': 'Positive',
                'Progesterone Status': 'Positive',
                'Regional Node Examined': 12,
                'Reginol Node Positive': 0,
                'Survival Months': 36,
            }
            res = _safe_predict(patient)
            print("\nLower-risk Example Result:")
            print(f"Prediction: {res['prediction']}")
            if res.get('probability') is not None:
                print(f"Survival Probability: {res['probability'][0]:.2%}")
                print(f"Death Probability:    {res['probability'][1]:.2%}")
            print(f"Risk Level: {res['risk_level']}")
            print("Recommendations:")
            for r in res.get('recommendations', []):
                print(f"  - {r}")

        elif choice == '3':
            patient = {
                'Age': 67,
                'Race': 'Black',
                'Marital Status': 'Single',
                'T Stage': 'T3',
                'N Stage': 'N2',
                '6th Stage': 'IIIB',
                'Grade': 'Poorly',
                'A Stage': 'Regional',
                'Tumor Size': 75,
                'Estrogen Status': 'Negative',
                'Progesterone Status': 'Negative',
                'Regional Node Examined': 20,
                'Reginol Node Positive': 6,
                'Survival Months': 12,
            }
            res = _safe_predict(patient)
            print("\nHigher-risk Example Result:")
            print(f"Prediction: {res['prediction']}")
            if res.get('probability') is not None:
                print(f"Survival Probability: {res['probability'][0]:.2%}")
                print(f"Death Probability:    {res['probability'][1]:.2%}")
            print(f"Risk Level: {res['risk_level']}")
            print("Recommendations:")
            for r in res.get('recommendations', []):
                print(f"  - {r}")

        elif choice == '4':
            print("Exiting prognosis menu.")
            break
        else:
            print("Please select a valid option (1-4).")

# Auto-launch the menu when this cell runs (comment out to disable)
launch_prognosis_menu()


BREAST CANCER PROGNOSIS - INTERACTIVE MENU

Options:
  1) Enter patient data and predict
  2) Use example: Lower risk profile
  3) Use example: Higher risk profile
  4) Exit

Prediction Result:
Prediction: Alive
Survival Probability: 94.22%
Death Probability:    5.78%
Risk Level: Low Risk
Recommendations:
  - Lower risk indicated
  - Standard treatment protocol
  - Regular follow-up (6-12 months)
  - Healthy lifestyle encouragement

Options:
  1) Enter patient data and predict
  2) Use example: Lower risk profile
  3) Use example: Higher risk profile
  4) Exit

Prediction Result:
Prediction: Alive
Survival Probability: 94.22%
Death Probability:    5.78%
Risk Level: Low Risk
Recommendations:
  - Lower risk indicated
  - Standard treatment protocol
  - Regular follow-up (6-12 months)
  - Healthy lifestyle encouragement

Options:
  1) Enter patient data and predict
  2) Use example: Lower risk profile
  3) Use example: Higher risk profile
  4) Exit

Lower-risk Example Result:
Prediction: 