# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import joblib
import cv2
import traceback
import importlib.util
import os
from imblearn.over_sampling import SMOTE
import random

class XGBoostCancerClassifier:
    def __init__(self):
        """Create an untrained classifier with default helpers and empty state."""
        # Model state; filled in later by the training / loading methods.
        self.pipeline = None
        self.model = None
        self.best_params = None
        self.feature_importances = None
        self.feature_names = None
        self.learning_rate_schedule = None  # learning rate decay schedule

        # Preprocessing and class-balancing helpers.
        self.scaler = StandardScaler()
        self.smote = SMOTE(random_state=42)

        # Image prediction support.
        self.external_feature_extractor = None
        self.feature_extractor_instance = None
        self.segmentation_enabled = True

        # (label, accuracy) pairs used by the accuracy bar chart.
        self.accuracy_history = []

    def load_and_preprocess(self, csv_path):
        """Load and preprocess the feature CSV file"""
        print("Loading data...")

        self.data = pd.read_csv(csv_path)

        print("\nColumns in the dataset:")
        print(self.data.columns.tolist())

        feature_columns = self.data.select_dtypes(include=[np.number]).columns
        print(f"\nNumber of feature columns found: {len(feature_columns)}")
        print("Feature columns:", feature_columns.tolist())

        non_numeric_columns = self.data.select_dtypes(exclude=[np.number]).columns
        if len(non_numeric_columns) == 0:
            raise ValueError("No label column found in the dataset")

        label_column = non_numeric_columns[0]
        print(f"\nUsing '{label_column}' as the label column")

        self.X = self.data[feature_columns]
        self.y = self.data[label_column]

        print("\nUnique labels found:", self.y.unique().tolist())

        self.y = self.y.apply(lambda x: 1 if 'malignant' in x.lower() else 0)

        class_counts = self.y.value_counts()
        print("\nClass distribution:")
        for class_label, count in class_counts.items():
            class_name = "Malignant" if class_label == 1 else "Benign"
            print(f"- {class_name}: {count} ({count / len(self.y):.2%})")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        self.X_train_final, self.X_val, self.y_train_final, self.y_val = train_test_split(
            self.X_train, self.y_train, test_size=0.2, random_state=42, stratify=self.y_train
        )

        print("\nData preprocessing completed!")
        print(f"Training set size: {len(self.X_train_final)}")
        print(f"Validation set size: {len(self.X_val)}")
        print(f"Testing set size: {len(self.X_test)}")
        print(f"Number of features: {self.X_train.shape[1]}")

        self.feature_names = feature_columns.tolist()

    def find_best_parameters(self, X_train_resampled, y_train_resampled):
        """Grid-search XGBoost hyperparameters on the given training data.

        The winning parameter set is stored on ``self.best_params`` and a
        learning rate schedule derived from its ``learning_rate`` is stored on
        ``self.learning_rate_schedule``.

        Returns:
            The best parameter dictionary reported by the grid search.
        """
        print("\nFinding optimal hyperparameters for XGBoost...")

        search_space = {
            'max_depth': [3, 4, 5],
            'min_child_weight': [1, 3],
            'gamma': [0, 0.1],
            'subsample': [0.8, 0.9],
            'colsample_bytree': [0.8, 0.9],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [1, 1.5],
            'learning_rate': [0.01, 0.1]
        }

        base_estimator = XGBClassifier(
            objective='binary:logistic',
            n_estimators=100,
            random_state=42,
            eval_metric='logloss'
        )

        # 3 stratified, shuffled folds scored by accuracy.
        folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        searcher = GridSearchCV(
            estimator=base_estimator,
            param_grid=search_space,
            cv=folds,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1
        )

        print("\nPerforming grid search (this may take some time)...")
        searcher.fit(X_train_resampled, y_train_resampled)

        winner = searcher.best_params_
        print(f"\nBest parameters: {winner}")
        print(f"Best cross-validation score: {searcher.best_score_:.2%}")

        self.best_params = winner
        self.learning_rate_schedule = self._create_learning_rate_schedule(winner['learning_rate'])

        return winner

    def _create_learning_rate_schedule(self, initial_learning_rate):
        """Create a learning rate schedule for training"""
        num_boost_rounds = 1000
        decay_factor = 0.995
        lr_schedule = [initial_learning_rate * (decay_factor ** i) for i in range(num_boost_rounds)]
        return lr_schedule

    def train_model(self):
        """Train the XGBoost model with SMOTE and optimal hyperparameters"""
        print("Training XGBoost model with hyperparameter optimization and SMOTE...")

        print("\nApplying SMOTE to balance the classes...")
        original_class_dist = self.y_train_final.value_counts()
        print(f"Original training class distribution: {dict(original_class_dist)}")

        X_train_resampled, y_train_resampled = self.smote.fit_resample(self.X_train_final, self.y_train_final)

        new_class_dist = pd.Series(y_train_resampled).value_counts()
        print(f"After SMOTE class distribution: {dict(new_class_dist)}")

        best_params = self.find_best_parameters(X_train_resampled, y_train_resampled)

        xgb_model = XGBClassifier(
            objective='binary:logistic',
            n_estimators=1000,
            max_depth=best_params['max_depth'],
            min_child_weight=best_params['min_child_weight'],
            gamma=best_params['gamma'],
            subsample=best_params['subsample'],
            colsample_bytree=best_params['colsample_bytree'],
            reg_alpha=best_params['reg_alpha'],
            reg_lambda=best_params['reg_lambda'],
            learning_rate=best_params['learning_rate'],
            eval_metric='logloss',
            random_state=42
        )

        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('xgboost', xgb_model)
        ])

        print("\nPerforming Model Training...")

        with tqdm(total=100, desc="Training Progress") as pbar:
            self.pipeline.fit(X_train_resampled, y_train_resampled)
            for i in range(100):
                time.sleep(0.01)
                pbar.update(1)

        self.model = self.pipeline.named_steps['xgboost']
        self.feature_importances = self.model.feature_importances_

        print("\nModel training completed!")
        print(f"XGBoost trees: {self.model.get_booster().num_boosted_rounds()}")

        train_pred = self.pipeline.predict(self.X_train)
        train_accuracy = accuracy_score(self.y_train, train_pred)
        print(f"Training Accuracy: {train_accuracy:.2%}")

        val_pred = self.pipeline.predict(self.X_val)
        val_accuracy = accuracy_score(self.y_val, val_pred)
        print(f"Validation Accuracy: {val_accuracy:.2%}")

        self.accuracy_history.append(('Training', train_accuracy))
        self.accuracy_history.append(('Validation', val_accuracy))

        print("\nPerforming 5-fold cross-validation...")
        cv_scores = cross_val_score(
            self.pipeline, self.X, self.y,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='accuracy',
            n_jobs=-1
        )
        print(f"Cross-validation scores: {[f'{score:.2%}' for score in cv_scores]}")
        print(f"Mean cross-validation accuracy: {cv_scores.mean():.2%} ± {cv_scores.std():.2%}")

        self.accuracy_history.append(('Cross-validation', cv_scores.mean()))

        if cv_scores.mean() < 0.94:
            print("\nWarning: Mean cross-validation accuracy is below 94%. Target test accuracy (~95%) may not be achieved.")
            print("Suggestions to improve accuracy:")
            print("1. Increase n_estimators (e.g., 1500)")
            print("2. Expand GridSearchCV param_grid (e.g., more max_depth values)")
            print("3. Check dataset quality (e.g., outliers, feature distributions)")
        elif cv_scores.mean() > 0.96:
            print("\nNote: Mean cross-validation accuracy is above 96%. Model may be overfitting.")
            print("Suggestions to reduce overfitting:")
            print("1. Increase reg_alpha (e.g., 0.5)")
            print("2. Increase reg_lambda (e.g., 2)")
            print("3. Decrease max_depth (e.g., 3)")
        else:
            print("\nSuccess: Mean cross-validation accuracy is close to 95%. Test accuracy target is likely achievable.")

        if train_accuracy - val_accuracy > 0.05:
            print("\nWarning: Potential overfitting detected.")
            print(f"Training accuracy: {train_accuracy:.2%}, Validation accuracy: {val_accuracy:.2%}")
            print(f"Difference: {train_accuracy - val_accuracy:.2%}")
            print("\nRecommendations to reduce overfitting:")
            print("1. Increase regularization (reg_alpha, reg_lambda)")
            print("2. Decrease max_depth")
            print("3. Increase min_child_weight")
            print("4. Reduce learning_rate")
        else:
            print("\nNo significant overfitting detected!")

    def evaluate_model(self):
        """Evaluate the model and display metrics"""
        if self.pipeline is None:
            raise ValueError("Model not trained yet! Please train the model first.")

        y_pred = self.pipeline.predict(self.X_test)
        y_pred_prob = self.pipeline.predict_proba(self.X_test)[:,1]

        test_accuracy = accuracy_score(self.y_test, y_pred)
        print(f"\nTest Set Accuracy: {test_accuracy:.2%}")

        if test_accuracy < 0.94:
            print("\nWarning: Test accuracy is below 94%, missing the ~95% target.")
            print("Suggestions to improve accuracy:")
            print("1. Increase n_estimators (e.g., 1500)")
            print("2. Expand GridSearchCV param_grid (e.g., more max_depth values)")
            print("3. Check dataset quality (e.g., outliers, missing values)")
        elif test_accuracy > 0.96:
            print("\nNote: Test accuracy is above 96%. Model may be overfitting.")
            print("Suggestions to reduce overfitting:")
            print("1. Increase reg_alpha (e.g., 0.5)")
            print("2. Increase reg_lambda (e.g., 2)")
            print("3. Decrease max_depth (e.g., 3)")
        else:
            print("\nSuccess! Test accuracy is approximately 95% as targeted.")

        self.accuracy_history.append(('Test', test_accuracy))

        self.plot_accuracy_graph()

        train_pred = self.pipeline.predict(self.X_train)
        train_accuracy = accuracy_score(self.y_train, train_pred)

        val_pred = self.pipeline.predict(self.X_val)
        val_accuracy = accuracy_score(self.y_val, val_pred)

        train_test_diff = train_accuracy - test_accuracy
        print(f"\nOverfitting check - Training vs Test Accuracy difference: {train_test_diff:.2%}")

        train_val_diff = train_accuracy - val_accuracy
        print(f"Training vs Validation Accuracy difference: {train_val_diff:.2%}")

        if train_test_diff > 0.05 or train_val_diff > 0.05:
            print("Warning: Potential overfitting detected (>5% difference)")
            if train_test_diff > 0.1 or train_val_diff > 0.1:
                print("Significant overfitting! Consider adjusting model parameters.")
                print("Recommended actions:")
                print("1. Increase regularization (reg_alpha/reg_lambda)")
                print("2. Reduce max_depth")
                print("3. Increase min_child_weight")
                print("4. Decrease learning_rate")
                print("5. Adjust subsample or colsample_bytree")
        else:
            print("Good news! Model shows minimal overfitting.")

        self.plot_feature_importance()

        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(self.y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        fpr, tpr, _ = roc_curve(self.y_test, y_pred_prob)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.show()

        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))

        print("\nDetailed Performance Metrics:")
        print(f"True Negatives (Correct Benign): {cm[0,0]}")
        print(f"False Positives (Incorrect Malignant): {cm[0,1]}")
        print(f"False Negatives (Incorrect Benign): {cm[1,0]}")
        print(f"True Positives (Correct Malignant): {cm[1,1]}")

        print("\nMetrics for Imbalanced Classification:")
        sensitivity = cm[1,1] / (cm[1,1] + cm[1,0]) if (cm[1,1] + cm[1,0]) > 0 else 0
        specificity = cm[0,0] / (cm[0,0] + cm[0,1]) if (cm[0,0] + cm[0,1]) > 0 else 0
        print(f"Sensitivity (Recall): {sensitivity:.2%}")
        print(f"Specificity: {specificity:.2%}")
        print(f"AUC-ROC: {roc_auc:.4f}")

    def plot_feature_importance(self):
        """Plot feature importance from XGBoost"""
        if self.feature_importances is None:
            print("Feature importances not available. Train the model first.")
            return

        feature_importance_df = pd.DataFrame({
            'Feature': self.feature_names,
            'Importance': self.feature_importances
        })

        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')
        plt.title('Top 15 Feature Importance in XGBoost Model')
        plt.tight_layout()
        plt.show()

        print("\nTop 5 Most Important Features:")
        for i, (feature, importance) in enumerate(zip(feature_importance_df['Feature'].head(5),
                                                      feature_importance_df['Importance'].head(5))):
            print(f"{i+1}. {feature}: {importance:.4f}")

    def plot_accuracy_graph(self):
        """Plot the accuracy graph showing training, validation, cross-validation, and test accuracy"""
        if not self.accuracy_history:
            print("No accuracy data available for plotting.")
            return

        plt.figure(figsize=(10, 6))

        labels = [item[0] for item in self.accuracy_history]
        values = [item[1] for item in self.accuracy_history]

        colors = ['#4CAF50', '#FFC107', '#2196F3', '#9C27B0']
        bars = plt.bar(labels, values, color=colors[:len(labels)])

        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{height:.2%}', ha='center', va='bottom')

        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.title('XGBoost Classification Accuracy', fontsize=16)
        plt.ylabel('Accuracy', fontsize=14)
        plt.ylim(0, 1.1)
        plt.axhline(y=0.95, color='g', linestyle='-', alpha=0.5, label='95% Target')
        plt.axhline(y=0.5, color='r', linestyle='-', alpha=0.3, label='Random Chance')
        plt.yticks([i/10 for i in range(0, 11, 1)], [f'{i/10:.1%}' for i in range(0, 11, 1)])
        plt.legend()
        plt.tight_layout()
        plt.show()

    def plot_learning_curves(self):
        """Plot learning curves to help diagnose overfitting"""
        from sklearn.model_selection import learning_curve

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('xgboost', XGBClassifier(
                objective='binary:logistic',
                n_estimators=100,
                **self.best_params,
                random_state=42,
                verbosity=0
            ))
        ])

        train_sizes, train_scores, test_scores = learning_curve(
            pipeline, self.X, self.y,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            train_sizes=np.linspace(0.1, 1.0, 10),
            scoring='accuracy',
            n_jobs=-1
        )

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        plt.figure(figsize=(10, 6))
        plt.title('Learning Curves (XGBoost)', fontsize=16)
        plt.xlabel('Training Examples', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.grid(True)

        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                         alpha=0.1, color='blue')
        plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std,
                         alpha=0.1, color='orange')

        plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
        plt.plot(train_sizes, test_mean, 'o-', color='orange', label='Cross-validation score')

        plt.axhline(y=0.95, color='r', linestyle='--', label='95% Accuracy Target')
        plt.legend(loc='best')
        plt.ylim(0.5, 1.05)
        plt.tight_layout()
        plt.show()

        gap = train_mean[-1] - test_mean[-1]
        print("\nLearning Curve Analysis:")
        if gap > 0.1:
            print(f"High variance detected (gap: {gap:.2f}). Model may be overfitting.")
            print("Consider increasing regularization parameters.")
        elif train_mean[-1] < 0.95:
            print(f"High bias detected. Model may be underfitting (training score: {train_mean[-1]:.2f}).")
            print("Consider increasing model complexity or adding more features.")
        else:
            print(f"Good balance between bias and variance (gap: {gap:.2f}).")
            print(f"Final training score: {train_mean[-1]:.2f}")
            print(f"Final validation score: {test_mean[-1]:.2f}")

    def import_feature_extractor(self, file_path):
        """Import external feature extraction logic from a Python file"""
        try:
            print(f"\nImporting feature extractor from: {file_path}")

            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")

            module_name = os.path.splitext(os.path.basename(file_path))[0]
            spec = importlib.util.spec_from_file_location(module_name, file_path)
            if spec is None:
                raise ImportError(f"Could not import module from {file_path}")

            feature_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(feature_module)

            if hasattr(feature_module, 'BreastTumorFeatureExtractor'):
                print("Found BreastTumorFeatureExtractor class, creating instance...")
                self.feature_extractor_instance = feature_module.BreastTumorFeatureExtractor()

                def calculate_features_wrapper(image):
                    temp_image_path = 'temp_image.jpg'
                    cv2.imwrite(temp_image_path, image)
                    features_dict = self.feature_extractor_instance._extract_features(temp_image_path)
                    if os.path.exists(temp_image_path):
                        os.remove(temp_image_path)

                    if features_dict is None:
                        print("Feature extraction failed, using zeros")
                        feature_names = [
                            'mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness',
                            'mean_compactness', 'mean_concavity', 'mean_concave_points', 'mean_symmetry',
                            'mean_fractal_dimension', 'radius_error', 'texture_error', 'perimeter_error',
                            'area_error', 'smoothness_error', 'compactness_error', 'concavity_error',
                            'concave_points_error', 'symmetry_error', 'fractal_dimension_error',
                            'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area',
                            'worst_smoothness', 'worst_compactness', 'worst_concavity',
                            'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension'
                        ]
                        features = np.zeros(30)
                        features_dict = dict(zip(feature_names, features))

                    feature_keys = [
                        'mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness',
                        'mean_compactness', 'mean_concavity', 'mean_concave_points', 'mean_symmetry',
                        'mean_fractal_dimension', 'radius_error', 'texture_error', 'perimeter_error',
                        'area_error', 'smoothness_error', 'compactness_error', 'concavity_error',
                        'concave_points_error', 'symmetry_error', 'fractal_dimension_error',
                        'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area',
                        'worst_smoothness', 'worst_compactness', 'worst_concavity',
                        'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension'
                    ]

                    features_array = np.array([features_dict[key] for key in feature_keys])
                    return features_array, features_dict

                self.external_feature_extractor = calculate_features_wrapper
                print("Feature extractor adapter created successfully!")

            elif hasattr(feature_module, 'calculate_features'):
                print("Found calculate_features function")
                self.external_feature_extractor = feature_module.calculate_features
            else:
                raise AttributeError("The imported module must contain either a 'calculate_features(image)' function or a 'BreastTumorFeatureExtractor' class")

            print("Feature extractor imported successfully!")
            if hasattr(feature_module, '__doc__') and feature_module.__doc__:
                print(f"\nModule description: {feature_module.__doc__}")

            return True

        except Exception as e:
            print(f"Error importing feature extractor: {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")
            return False

    def segment_mammogram(self, image, min_area=100):
        """Segment the mammogram image to find candidate regions.

        Processing steps: grayscale conversion (for multi-channel input), CLAHE
        contrast enhancement, non-local-means denoising, Otsu thresholding,
        morphological opening then closing, and external contour extraction.

        Args:
            image: Image array; multi-channel input is converted with
                ``cv2.COLOR_BGR2GRAY``.
            min_area: Contours whose area is not larger than this value are
                discarded.

        Returns:
            List of contours with an area greater than ``min_area``.
        """
        if len(image.shape) > 2 and image.shape[2] > 1:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)

        # Threshold value 0 is a placeholder; THRESH_OTSU selects it automatically.
        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        kernel = np.ones((5, 5), np.uint8)
        opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=2)
        closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, kernel, iterations=2)

        contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        potential_masses = [contour for contour in contours if cv2.contourArea(contour) > min_area]

        return potential_masses

    def highlight_cancer_regions(self, image, contours, prediction):
        """Return a copy of ``image`` annotated with the prediction.

        For a "Malignant" prediction with at least one contour, all contours
        are outlined in red, the largest one is tinted with a translucent red
        fill and labelled near its centroid. The prediction text is always
        written in the top-left corner (red for malignant, green otherwise).
        """
        red = (0, 0, 255)
        annotated = image.copy()

        if prediction == "Malignant" and contours:
            cv2.drawContours(annotated, contours, -1, red, 2)
            biggest = max(contours, key=cv2.contourArea)

            # Filled copy that is blended back in below for a translucent tint.
            tinted = annotated.copy()
            cv2.drawContours(tinted, [biggest], -1, red, -1)

            moments = cv2.moments(biggest)
            if moments["m00"] != 0:
                centroid_x = int(moments["m10"] / moments["m00"])
                centroid_y = int(moments["m01"] / moments["m00"])
                cv2.putText(annotated, "Potential Cancer", (centroid_x - 20, centroid_y - 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

            fill_weight = 0.3
            cv2.addWeighted(tinted, fill_weight, annotated, 1 - fill_weight, 0, annotated)

        if prediction == "Malignant":
            banner_color = red
        else:
            banner_color = (0, 255, 0)
        cv2.putText(annotated, f"Prediction: {prediction}", (20, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, banner_color, 2)

        return annotated

    def predict_from_image(self, image_path, display_image=True, save_visual_output=False, output_path=None):
        """Predict cancer type directly from image and highlight cancer cells if detected"""
        if self.pipeline is None:
            raise ValueError("Model not trained yet! Please train the model first.")

        print(f"\nReading image from: {image_path}")
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not read image from path: {image_path}")

        print(f"Image loaded successfully. Shape: {image.shape}, Type: {image.dtype}")

        print("\nExtracting features...")
        with tqdm(total=100, desc="Feature Extraction") as pbar:
            try:
                if self.external_feature_extractor:
                    print("Using imported external feature extractor...")
                    features, feature_dict = self.external_feature_extractor(image)
                else:
                    raise NotImplementedError("No feature extractor provided! Please import a feature extractor first.")

                print(f"\nFeatures extracted. Shape: {features.shape}")
                pbar.update(100)
            except Exception as e:
                print(f"\nError in feature extraction: {str(e)}")
                raise

        print("\nProcessing features through pipeline...")
        features_reshaped = features.reshape(1, -1)

        prediction = self.pipeline.predict(features_reshaped)[0]
        probability = self.pipeline.predict_proba(features_reshaped)[0]
        confidence = probability[1] if prediction == 1 else probability[0]
        cancer_type = "Malignant" if prediction == 1 else "Benign"

        prob_benign = probability[0]
        prob_malignant = probability[1]

        print(f"\nPrediction complete: {cancer_type} with {confidence:.2%} confidence")

        feature_importance = {}
        for i, feature_name in enumerate(self.feature_names):
            feature_importance[feature_name] = self.feature_importances[i]

        sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
        top_features = dict(sorted_features[:5])

        cancer_regions = None
        result_image = None

        if self.segmentation_enabled:
            print("\nSegmenting image to detect potential cancer cells...")
            try:
                potential_regions = self.segment_mammogram(image)
                print(f"Found {len(potential_regions)} potential regions of interest")

                result_image = self.highlight_cancer_regions(image, potential_regions, cancer_type)

                if display_image and result_image is not None:
                    plt.figure(figsize=(10, 8))
                    plt.imshow(cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB))
                    plt.title(f"Analysis Result: {cancer_type} with {confidence:.2%} confidence")
                    plt.axis('off')
                    plt.show()

                if save_visual_output and output_path:
                    cv2.imwrite(output_path, result_image)
                    print(f"Analysis image saved to: {output_path}")

                cancer_regions = potential_regions

            except Exception as e:
                print(f"Error during image segmentation: {str(e)}")
                print("Continuing with classification result only...")

        result = {
            'cancer_type': cancer_type,
            'confidence': confidence,
            'probabilities': {
                'benign': prob_benign,
                'malignant': prob_malignant
            },
            'features': feature_dict,
            'top_features': top_features,
            'cancer_regions': cancer_regions,
            'result_image': result_image
        }

        return result

    def predict_random_image(self, feature_extractor_path):
        """Predict on a random mammogram image with cell detection"""
        # Placeholder list of image paths (replace with actual paths)
        sample_images = [
            "sample_mammogram.jpg",  # Replace with real image paths
            # Add more paths if available, e.g., "mammogram2.jpg", "mammogram3.jpg"
        ]

        if not sample_images:
            print("No sample images available. Please provide image paths.")
            return

        # Select a random image
        image_path = random.choice(sample_images)
        print(f"\nSelected random image: {image_path}")

        # Import feature extractor if not already loaded
        if not self.external_feature_extractor:
            try:
                self.import_feature_extractor(feature_extractor_path)
            except Exception as e:
                print(f"Error importing feature extractor: {str(e)}")
                return

        try:
            # Predict and display results
            result = self.predict_from_image(
                image_path,
                display_image=True,
                save_visual_output=False
            )

            print(f"\nRandom Image Prediction: {result['cancer_type']}")
            print(f"Confidence: {result['confidence']:.2%}")
            print("\nProbabilities:")
            print(f"- Benign: {result['probabilities']['benign']:.2%}")
            print(f"- Malignant: {result['probabilities']['malignant']:.2%}")
            print("\nTop contributing features:")
            for feature, importance in result['top_features'].items():
                print(f"- {feature}: {importance:.4f}")
            if result['cancer_regions'] is not None:
                print(f"\nDetected {len(result['cancer_regions'])} potential cancer regions")

            # Option to save the annotated image
            if result['result_image'] is not None:
                save_choice = input("\nDo you want to save this analyzed image? (y/n): ").lower()
                if save_choice == 'y' or save_choice == 'yes':
                    output_path = input("Enter path to save the analyzed image: ")
                    if output_path:
                        cv2.imwrite(output_path, result['result_image'])
                        print(f"Analysis image saved to: {output_path}")
                    else:
                        print("No path provided. Image not saved.")
                else:
                    print("Image not saved.")

        except Exception as e:
            print(f"Error processing random image: {str(e)}")
            print(traceback.format_exc())

    def save_model(self, path):
        """Save the trained model, scaler, pipeline and feature importances"""
        if self.pipeline is None:
            raise ValueError("No trained model to save!")

        model_data = {
            'pipeline': self.pipeline,
            'feature_names': self.feature_names,
            'accuracy_history': self.accuracy_history,
            'feature_importances': self.feature_importances,
            'best_params': self.best_params
        }
        joblib.dump(model_data, path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        """Restore a pipeline and its metadata from a file written by save_model.

        The pipeline, its 'xgboost' step and its 'scaler' step are always
        restored. Feature names, accuracy history, feature importances and
        best hyperparameters are restored only when present and not None;
        otherwise the current attribute values are kept.
        """
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        payload = joblib.load(path)

        self.pipeline = payload['pipeline']
        steps = self.pipeline.named_steps
        self.model = steps['xgboost']
        self.scaler = steps['scaler']

        optional_fields = ('feature_names', 'accuracy_history', 'feature_importances', 'best_params')
        for field in optional_fields:
            if field in payload and payload[field] is not None:
                setattr(self, field, payload[field])

        print(f"Model loaded successfully from {path}")
        print(f"XGBoost trees: {self.model.get_booster().num_boosted_rounds()}")

def _model_is_ready(classifier):
    """Return True if the classifier has a pipeline; otherwise print a hint and return False."""
    if classifier.pipeline is None:
        print("Please train the model first (Option 2)")
        return False
    return True


def _prompt_for_feature_extractor(classifier):
    """Ask for a feature-extraction module path and import it into ``classifier``.

    Returns:
        The entered path when the import succeeds, or None when it fails (the
        error is printed). Returning None on failure keeps callers from
        remembering a path that never loaded.
    """
    path = input("Enter the path to your feature extraction .py file: ")
    try:
        classifier.import_feature_extractor(path)
    except Exception as e:
        print(f"Error importing feature extraction logic: {str(e)}")
        return None
    return path


def _offer_to_save_image(image):
    """Ask whether the analyzed image should be saved and write it if requested."""
    save_choice = input("\nDo you want to save this analyzed image? (y/n): ").lower()
    if save_choice not in ('y', 'yes'):
        print("Image not saved.")
        return

    output_path = input("Enter path to save the analyzed image: ")
    if not output_path:
        print("No path provided. Image not saved.")
        return

    # cv2.imwrite reports a failed write by returning False, so the result
    # must be checked before claiming success.
    if cv2.imwrite(output_path, image):
        print(f"Analysis image saved to: {output_path}")
    else:
        print(f"Failed to save analysis image to: {output_path}")


def main():
    """Run the interactive text menu for the XGBoost cancer classifier.

    Loops until the user picks option 10. Each option delegates to one method
    of ``XGBoostCancerClassifier``; errors raised by those methods are printed
    and the menu is shown again rather than terminating the program.
    """
    classifier = XGBoostCancerClassifier()
    # Only ever holds a path whose import succeeded, so later options can
    # trust it without re-importing.
    feature_extractor_path = None

    menu_lines = (
        "\nEnhanced Cancer Classification System (XGBoost with SMOTE)",
        "1. Load and preprocess data",
        "2. Train model",
        "3. Evaluate model and view accuracy graph",
        "4. Predict from image (with cancer cell detection)",
        "5. Save model",
        "6. Load model",
        "7. Toggle cancer cell detection",
        "8. Plot learning curves",
        "9. Predict on random image",
        "10. Exit",
    )

    while True:
        for line in menu_lines:
            print(line)

        # Strip so that stray whitespace around the number is not rejected.
        choice = input("\nEnter your choice (1-10): ").strip()

        if choice == '1':
            csv_path = input("Enter the path to your feature CSV file: ")
            try:
                classifier.load_and_preprocess(csv_path)
            except Exception as e:
                print(f"Error loading data: {str(e)}")
                print("\nPlease make sure your CSV file:")
                print("1. Contains numeric feature columns")
                print("2. Has a label column (e.g., 'label' or 'diagnosis')")
                print("3. Is properly formatted and not corrupted")

        elif choice == '2':
            # X_train only exists on the classifier once data has been loaded.
            if not hasattr(classifier, 'X_train'):
                print("Please load and preprocess data first (Option 1)")
                continue
            try:
                classifier.train_model()
            except Exception as e:
                print(f"Error training model: {str(e)}")

        elif choice == '3':
            if not _model_is_ready(classifier):
                continue
            try:
                classifier.evaluate_model()
            except Exception as e:
                print(f"Error evaluating model: {str(e)}")

        elif choice == '4':
            if not _model_is_ready(classifier):
                continue
            image_path = input("Enter the path to the mammogram image file: ")

            if not classifier.external_feature_extractor:
                if feature_extractor_path:
                    try:
                        print(f"Using previously imported feature extractor from: {feature_extractor_path}")
                        classifier.import_feature_extractor(feature_extractor_path)
                    except Exception as e:
                        print(f"Error reusing feature extraction logic: {str(e)}")
                        feature_extractor_path = _prompt_for_feature_extractor(classifier)
                else:
                    feature_extractor_path = _prompt_for_feature_extractor(classifier)

                if feature_extractor_path is None:
                    continue

            try:
                result = classifier.predict_from_image(image_path, display_image=True, save_visual_output=False)

                print(f"\nPrediction: {result['cancer_type']}")
                print(f"Confidence: {result['confidence']:.2%}")
                print("\nProbabilities:")
                print(f"- Benign: {result['probabilities']['benign']:.2%}")
                print(f"- Malignant: {result['probabilities']['malignant']:.2%}")
                print("\nTop contributing features:")
                for feature, importance in result['top_features'].items():
                    print(f"- {feature}: {importance:.4f}")
                if result['cancer_regions'] is not None:
                    print(f"\nDetected {len(result['cancer_regions'])} potential cancer regions")

                if result['result_image'] is not None:
                    _offer_to_save_image(result['result_image'])

            except Exception as e:
                print(f"Error processing image: {str(e)}")
                print(traceback.format_exc())

        elif choice == '5':
            if not _model_is_ready(classifier):
                continue
            path = input("Enter path to save the model: ")
            try:
                classifier.save_model(path)
            except Exception as e:
                print(f"Error saving model: {str(e)}")

        elif choice == '6':
            path = input("Enter path to load the model: ")
            try:
                classifier.load_model(path)
            except Exception as e:
                print(f"Error loading model: {str(e)}")

        elif choice == '7':
            classifier.segmentation_enabled = not classifier.segmentation_enabled
            status = "ENABLED" if classifier.segmentation_enabled else "DISABLED"
            print(f"\nCancer cell detection is now {status}")

        elif choice == '8':
            if not _model_is_ready(classifier):
                continue
            try:
                classifier.plot_learning_curves()
            except Exception as e:
                print(f"Error plotting learning curves: {str(e)}")

        elif choice == '9':
            if not _model_is_ready(classifier):
                continue
            if not feature_extractor_path:
                feature_extractor_path = _prompt_for_feature_extractor(classifier)
                if feature_extractor_path is None:
                    continue
            try:
                classifier.predict_random_image(feature_extractor_path)
            except Exception as e:
                print(f"Error predicting random image: {str(e)}")

        elif choice == '10':
            print("Done!")
            break

        else:
            print("Invalid choice! Please try again.")

# Launch the interactive menu only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    main()



Enhanced Cancer Classification System (XGBoost with SMOTE)
1. Load and preprocess data
2. Train model
3. Evaluate model and view accuracy graph
4. Predict from image (with cancer cell detection)
5. Save model
6. Load model
7. Toggle cancer cell detection
8. Plot learning curves
9. Predict on random image
10. Exit
Loading data...

Columns in the dataset:
['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity', 'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension', 'radius_error', 'texture_error', 'perimeter_error', 'area_error', 'smoothness_error', 'compactness_error', 'concavity_error', 'concave_points_error', 'symmetry_error', 'fractal_dimension_error', 'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area', 'worst_smoothness', 'worst_compactness', 'worst_concavity', 'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension', 'image_path', 'label']

Number of feature columns found: 30
Feature

Training Progress: 100%|██████████| 100/100 [00:05<00:00, 19.15it/s]



Model training completed!
XGBoost trees: 1000
Training Accuracy: 98.52%
Validation Accuracy: 92.58%

Performing 5-fold cross-validation...
Cross-validation scores: ['93.55%', '93.60%', '93.39%', '93.08%', '93.44%']
Mean cross-validation accuracy: 93.41% ± 0.18%

Suggestions to improve accuracy:
1. Increase n_estimators (e.g., 1500)
2. Expand GridSearchCV param_grid (e.g., more max_depth values)
3. Check dataset quality (e.g., outliers, feature distributions)

Training accuracy: 98.52%, Validation accuracy: 92.58%
Difference: 5.94%

Recommendations to reduce overfitting:
1. Increase regularization (reg_alpha, reg_lambda)
2. Decrease max_depth
3. Increase min_child_weight
4. Reduce learning_rate

Enhanced Cancer Classification System (XGBoost with SMOTE)
1. Load and preprocess data
2. Train model
3. Evaluate model and view accuracy graph
4. Predict from image (with cancer cell detection)
5. Save model
6. Load model
7. Toggle cancer cell detection
8. Plot learning curves
9. Predict on ra