In [1]:
# install pandas and its dependencies
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

# Import the libraries again after reinstallation
import pandas as pd  # for data manipulation or analysis
import numpy as np  # for numeric calculation
import matplotlib.pyplot as plt  # for data visualization
import seaborn as sns  # for data visualization



In [3]:
import pandas as pd
# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Thesis/Dataset/data 1.csv')

# Drop ID column
df.drop(columns=['id'], inplace=True)

# Encode diagnosis column: M = 1, B = 0
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Separate features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Check basic info
print(df.info())
print("Missing values:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   radius_mean              569 non-null    float64
 1   texture_mean             569 non-null    float64
 2   perimeter_mean           569 non-null    float64
 3   area_mean                569 non-null    float64
 4   smoothness_mean          569 non-null    float64
 5   compactness_mean         569 non-null    float64
 6   concavity_mean           569 non-null    float64
 7   concave points_mean      569 non-null    float64
 8   symmetry_mean            569 non-null    float64
 9   fractal_dimension_mean   569 non-null    float64
 10  radius_se                569 non-null    float64
 11  texture_se               569 non-null    float64
 12  perimeter_se             569 non-null    float64
 13  area_se                  569 non-null    float64
 14  smoothness_se            5

In [4]:
print("Original class distribution:\n", y.value_counts())


Original class distribution:
 diagnosis
0    357
1    212
Name: count, dtype: int64


In [5]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.05, random_state=42)
y_pred_outliers = iso.fit_predict(X)
mask = y_pred_outliers != -1

X_clean = X[mask]
y_clean = y[mask]

print("After outlier removal:\n", y_clean.value_counts())


After outlier removal:
 diagnosis
0    350
1    190
Name: count, dtype: int64


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_clean, test_size=0.2, random_state=42, stratify=y_clean)


In [8]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:\n", Counter(y_train_resampled))


Class distribution after SMOTE:
 Counter({0: 280, 1: 280})


In [9]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Define the estimator
rf_estimator = RandomForestClassifier(n_estimators=100, random_state=42)

# Define cross-validation strategy
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Apply RFECV
rfecv = RFECV(estimator=rf_estimator, step=1, cv=cv_strategy, scoring='f1', n_jobs=-1)
rfecv.fit(X_train_resampled, y_train_resampled)

# Show optimal number of features
print("Optimal number of features:", rfecv.n_features_)

# Get selected features
X_train_rfe = rfecv.transform(X_train_resampled)
X_test_rfe = rfecv.transform(X_test)

# Optional: Show ranking of features
feature_ranking = rfecv.ranking_
print("Feature ranking (1 = selected):", feature_ranking)


Optimal number of features: 19
Feature ranking (1 = selected): [ 1  1  1  1  5  2  1  1  7 10  1  9  1  1  8  4  3 11 12  6  1  1  1  1
  1  1  1  1  1  1]


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Utility function to get evaluation metrics
def get_metrics(y_true, y_pred, y_prob, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_prob)
    }

results_rfecv = []

# --- Logistic Regression ---
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_rfe, y_train_resampled)
y_pred_lr = lr.predict(X_test_rfe)
y_prob_lr = lr.predict_proba(X_test_rfe)[:, 1]

print("📘 Logistic Regression:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_lr))
results_rfecv.append(get_metrics(y_test, y_pred_lr, y_prob_lr, 'Logistic Regression (RFECV)'))

# --- Random Forest ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_rfe, y_train_resampled)
y_pred_rf = rf.predict(X_test_rfe)
y_prob_rf = rf.predict_proba(X_test_rfe)[:, 1]

print("📗 Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_rf))
results_rfecv.append(get_metrics(y_test, y_pred_rf, y_prob_rf, 'Random Forest (RFECV)'))

# --- MLP Classifier ---
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp.fit(X_train_rfe, y_train_resampled)
y_pred_mlp = mlp.predict(X_test_rfe)
y_prob_mlp = mlp.predict_proba(X_test_rfe)[:, 1]

print("📙 MLP Classifier:")
print(confusion_matrix(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_mlp))
results_rfecv.append(get_metrics(y_test, y_pred_mlp, y_prob_mlp, 'MLP Classifier (RFECV)'))

# Final results table
results_rfecv_df = pd.DataFrame(results_rfecv)
print("\n📊 RFECV-Based Model Comparison:")
print(results_rfecv_df.round(4))


📘 Logistic Regression:
[[70  0]
 [ 0 38]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        38

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108

ROC AUC Score: 1.0
📗 Random Forest:
[[70  0]
 [ 2 36]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        70
           1       1.00      0.95      0.97        38

    accuracy                           0.98       108
   macro avg       0.99      0.97      0.98       108
weighted avg       0.98      0.98      0.98       108

ROC AUC Score: 0.9996240601503759
📙 MLP Classifier:
[[70  0]
 [ 0 38]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        38

    accuracy  

The same thing is done by directly implementing n_features_to_select=19,same result.

In [11]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Use Random Forest as estimator inside RFE
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=19)
X_train_rfe = rfe_selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_rfe = rfe_selector.transform(X_test)

In [12]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

In [13]:
# Logistic Regression
lr_rfe = LogisticRegression(max_iter=1000)
lr_rfe.fit(X_train_rfe, y_train_resampled)
y_pred_lr_rfe = lr_rfe.predict(X_test_rfe)
y_prob_lr_rfe = lr_rfe.predict_proba(X_test_rfe)[:, 1]

# Random Forest
rf_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
rf_rfe.fit(X_train_rfe, y_train_resampled)
y_pred_rf_rfe = rf_rfe.predict(X_test_rfe)
y_prob_rf_rfe = rf_rfe.predict_proba(X_test_rfe)[:, 1]

# MLP
mlp_rfe = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_rfe.fit(X_train_rfe, y_train_resampled)
y_pred_mlp_rfe = mlp_rfe.predict(X_test_rfe)
y_prob_mlp_rfe = mlp_rfe.predict_proba(X_test_rfe)[:, 1]


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
def get_metrics(y_true, y_pred, y_prob, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_prob)
    }
results_rfe = []
results_rfe.append(get_metrics(y_test, y_pred_lr_rfe, y_prob_lr_rfe, 'LR (RFE)'))
results_rfe.append(get_metrics(y_test, y_pred_rf_rfe, y_prob_rf_rfe, 'RF (RFE)'))
results_rfe.append(get_metrics(y_test, y_pred_mlp_rfe, y_prob_mlp_rfe, 'MLP (RFE)'))

# Create DataFrame
results_df = pd.DataFrame(results_rfe)

# Format and display
print("🔍 Model Performance Comparison:")
print(results_df.round(4))


🔍 Model Performance Comparison:
       Model  Accuracy  Precision  Recall  F1 Score  ROC AUC
0   LR (RFE)    1.0000        1.0  1.0000     1.000   1.0000
1   RF (RFE)    0.9815        1.0  0.9474     0.973   0.9996
2  MLP (RFE)    1.0000        1.0  1.0000     1.000   1.0000
