In [1]:
# install pandas and its dependencies
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

# Import the libraries again after reinstallation
import pandas as pd  # for data manipulation or analysis
import numpy as np  # for numeric calculation
import matplotlib.pyplot as plt  # for data visualization
import seaborn as sns  # for data visualization



In [2]:
import pandas as pd
# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Thesis/Dataset/data 1.csv')

# Drop ID column
df.drop(columns=['id'], inplace=True)

# Encode diagnosis column: M = 1, B = 0
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Separate features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Check basic info
print(df.info())
print("Missing values:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   radius_mean              569 non-null    float64
 1   texture_mean             569 non-null    float64
 2   perimeter_mean           569 non-null    float64
 3   area_mean                569 non-null    float64
 4   smoothness_mean          569 non-null    float64
 5   compactness_mean         569 non-null    float64
 6   concavity_mean           569 non-null    float64
 7   concave points_mean      569 non-null    float64
 8   symmetry_mean            569 non-null    float64
 9   fractal_dimension_mean   569 non-null    float64
 10  radius_se                569 non-null    float64
 11  texture_se               569 non-null    float64
 12  perimeter_se             569 non-null    float64
 13  area_se                  569 non-null    float64
 14  smoothness_se            5

In [3]:
print("Original class distribution:\n", y.value_counts())


Original class distribution:
 diagnosis
0    357
1    212
Name: count, dtype: int64


In [4]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.05, random_state=42)
y_pred_outliers = iso.fit_predict(X)
mask = y_pred_outliers != -1

X_clean = X[mask]
y_clean = y[mask]

print("After outlier removal:\n", y_clean.value_counts())


After outlier removal:
 diagnosis
0    350
1    190
Name: count, dtype: int64


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_clean, test_size=0.2, random_state=42, stratify=y_clean)


In [7]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:\n", Counter(y_train_resampled))


Class distribution after SMOTE:
 Counter({0: 280, 1: 280})


In [9]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
import numpy as np

# Initialize ExtraTrees classifier
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc.fit(X_train_resampled, y_train_resampled)

# Get feature importances and sort features by importance
importances = etc.feature_importances_
indices = np.argsort(importances)[::-1]

# Define a range of k values to test
k_values = list(range(5, X_train_resampled.shape[1]+1, 5))  # e.g., 5,10,15,... max features

# Dictionary to store mean CV scores for Logistic Regression (you can extend for others)
cv_scores = {}

for k in k_values:
    selected_indices = indices[:k]
    X_train_k = X_train_resampled[:, selected_indices]

    # Use Logistic Regression as example classifier for tuning k
    lr = LogisticRegression(max_iter=1000, random_state=42)

    # 5-fold CV to evaluate
    scores = cross_val_score(lr, X_train_k, y_train_resampled, cv=5, scoring='f1')
    cv_scores[k] = scores.mean()
    print(f"k={k}: Mean CV F1 score = {cv_scores[k]:.4f}")

# Find best k
best_k = max(cv_scores, key=cv_scores.get)
print(f"\nBest number of features by ExtraTrees top-k tuning: {best_k}")

# Select top-k features based on best_k
selected_indices = indices[:best_k]
X_train_etc = X_train_resampled[:, selected_indices]
X_test_etc = X_test[:, selected_indices]


k=5: Mean CV F1 score = 0.9416
k=10: Mean CV F1 score = 0.9567
k=15: Mean CV F1 score = 0.9641
k=20: Mean CV F1 score = 0.9765
k=25: Mean CV F1 score = 0.9803
k=30: Mean CV F1 score = 0.9785

Best number of features by ExtraTrees top-k tuning: 25


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
def get_metrics(y_true, y_pred, y_prob, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_prob)
    }
# Train and evaluate models on ExtraTrees top-k selected features
results_etc = []

# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_etc, y_train_resampled)
y_pred_lr = lr.predict(X_test_etc)
y_prob_lr = lr.predict_proba(X_test_etc)[:, 1]

print("📘 Logistic Regression (ExtraTrees top-k):")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_lr))
results_etc.append(get_metrics(y_test, y_pred_lr, y_prob_lr, 'Logistic Regression (ExtraTrees)'))

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_etc, y_train_resampled)
y_pred_rf = rf.predict(X_test_etc)
y_prob_rf = rf.predict_proba(X_test_etc)[:, 1]

print("📗 Random Forest (ExtraTrees top-k):")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_rf))
results_etc.append(get_metrics(y_test, y_pred_rf, y_prob_rf, 'Random Forest (ExtraTrees)'))

# MLP Classifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp.fit(X_train_etc, y_train_resampled)
y_pred_mlp = mlp.predict(X_test_etc)
y_prob_mlp = mlp.predict_proba(X_test_etc)[:, 1]

print("📙 MLP Classifier (ExtraTrees top-k):")
print(confusion_matrix(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_mlp))
results_etc.append(get_metrics(y_test, y_pred_mlp, y_prob_mlp, 'MLP Classifier (ExtraTrees)'))

# Summary DataFrame
results_etc_df = pd.DataFrame(results_etc)
print("\n📊 ExtraTrees Top-k Model Comparison:")
print(results_etc_df.round(4))


📘 Logistic Regression (ExtraTrees top-k):
[[69  1]
 [ 0 38]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        70
           1       0.97      1.00      0.99        38

    accuracy                           0.99       108
   macro avg       0.99      0.99      0.99       108
weighted avg       0.99      0.99      0.99       108

ROC AUC Score: 1.0
📗 Random Forest (ExtraTrees top-k):
[[70  0]
 [ 2 36]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        70
           1       1.00      0.95      0.97        38

    accuracy                           0.98       108
   macro avg       0.99      0.97      0.98       108
weighted avg       0.98      0.98      0.98       108

ROC AUC Score: 0.9992481203007519
📙 MLP Classifier (ExtraTrees top-k):
[[70  0]
 [ 0 38]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1

In [15]:
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np

etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc.fit(X_train_resampled, y_train_resampled)

# Get top 25 important features
importances = etc.feature_importances_
indices = np.argsort(importances)[::-1][:25]

X_train_etc = X_train_resampled[:, indices]
X_test_etc = X_test[:, indices]


In [16]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [17]:
# Logistic Regression
lr_etc = LogisticRegression(max_iter=1000)
lr_etc.fit(X_train_etc, y_train_resampled)
y_pred_lr_etc = lr_etc.predict(X_test_etc)
y_prob_lr_etc = lr_etc.predict_proba(X_test_etc)[:, 1]

# Random Forest
rf_etc = RandomForestClassifier(n_estimators=100, random_state=42)
rf_etc.fit(X_train_etc, y_train_resampled)
y_pred_rf_etc = rf_etc.predict(X_test_etc)
y_prob_rf_etc = rf_etc.predict_proba(X_test_etc)[:, 1]

# MLP
mlp_etc = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_etc.fit(X_train_etc, y_train_resampled)
y_pred_mlp_etc = mlp_etc.predict(X_test_etc)
y_prob_mlp_etc = mlp_etc.predict_proba(X_test_etc)[:, 1]


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
def get_metrics(y_true, y_pred, y_prob, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_prob)
    }
results_etc = []
results_etc.append(get_metrics(y_test, y_pred_lr_etc, y_prob_lr_etc, 'LR (ExtraTrees)'))
results_etc.append(get_metrics(y_test, y_pred_rf_etc, y_prob_rf_etc, 'RF (ExtraTrees)'))
results_etc.append(get_metrics(y_test, y_pred_mlp_etc, y_prob_mlp_etc, 'MLP (ExtraTrees)'))

# Create DataFrame
results_df = pd.DataFrame(results_etc)

# Format and display
print("🔍 Model Performance Comparison:")
print(results_df.round(4))


🔍 Model Performance Comparison:
              Model  Accuracy  Precision  Recall  F1 Score  ROC AUC
0   LR (ExtraTrees)    0.9907     0.9744  1.0000     0.987   1.0000
1   RF (ExtraTrees)    0.9815     1.0000  0.9474     0.973   0.9992
2  MLP (ExtraTrees)    1.0000     1.0000  1.0000     1.000   1.0000


OSError: Cannot save file into a non-existent directory: '/content/drive/MyDrive/Thesis/Code'