In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time

In [7]:
# Load the breast-cancer dataset bundle, then pull out the feature matrix
# (X) and the 0/1 class labels (y) in a single step.
data = load_breast_cancer()
X, y = data.data, data.target

In [8]:
# Summarise the dataset: overall size, class balance and a peek at the
# feature names. The lines are collected first and emitted in one call.
rule = "=" * 70
info_lines = [
    "\n" + rule,
    "DATASET INFORMATION",
    rule,
    f"Total samples: {X.shape[0]}",
    f"Number of features: {X.shape[1]}",
    f"Malignant cases (0): {np.sum(y == 0)}",
    f"Benign cases (1): {np.sum(y == 1)}",
    f"\nFeature names: {data.feature_names[:5]}... (showing first 5)",
]
print("\n".join(info_lines))


DATASET INFORMATION
Total samples: 569
Number of features: 30
Malignant cases (0): 212
Benign cases (1): 357

Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness']... (showing first 5)


In [9]:
# One table for exploration: the feature columns (named after
# data.feature_names) followed by the class label in a 'target' column.
df = pd.DataFrame(data=X, columns=data.feature_names).assign(target=y)

In [10]:
# Preview the table: the leading rows first, then per-column summary
# statistics. Each heading and its table are emitted by a single call,
# separated by a newline.
print("\nFirst few rows of the dataset:", df.head(), sep="\n")

print("\nDataset statistics:", df.describe(), sep="\n")


First few rows of the dataset:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimet

In [11]:
# Hold out 30% of the samples for testing (70/30 split, as per the paper).
# The split is stratified on y and uses a fixed random_state so it is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Standardization: X' = (X - μ) / σ
# μ and σ are estimated from the training partition only; the same fitted
# scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Training set size: 398 samples
Testing set size: 171 samples


In [12]:
# Sanity check: per-feature mean and standard deviation of the scaled
# training matrix (only the first three features are printed).
train_feature_means = X_train_scaled.mean(axis=0)
train_feature_stds = X_train_scaled.std(axis=0)

print("\nStandardization applied!")
print(f"Training data mean (after scaling): {train_feature_means[:3]}")
print(f"Training data std (after scaling): {train_feature_stds[:3]}")
print("(Showing first 3 features - all should be ~0 mean and ~1 std)")


Standardization applied!
Training data mean (after scaling): [-4.97480337e-15  2.74863884e-15  2.03912822e-15]
Training data std (after scaling): [1. 1. 1.]
(Showing first 3 features - all should be ~0 mean and ~1 std)


In [None]:
# ---- Model: 1-nearest-neighbour on the standardized features ----
# metric='euclidean' is the L2 norm: d_L2(p,q) = √(Σ(p_i - q_i)²)
knn_model = KNeighborsClassifier(n_neighbors=1, metric='euclidean', algorithm='auto')
knn_model.fit(X_train_scaled, y_train)

# ---- Predictions for both partitions ----
y_train_pred_knn = knn_model.predict(X_train_scaled)
y_test_pred_knn = knn_model.predict(X_test_scaled)

# ---- Scores ----
knn_train_acc = accuracy_score(y_train, y_train_pred_knn)
knn_test_acc = accuracy_score(y_test, y_test_pred_knn)

# Test-set confusion matrix: rows are true labels, columns are predicted
# labels, both ordered 0 (malignant) then 1 (benign).
cm_knn = confusion_matrix(y_test, y_test_pred_knn)

# Unpack the 2x2 matrix row by row. With this label order the "positive"
# class is label 1 (benign), so the sensitivity below is the recall of the
# benign class and the specificity is the recall of the malignant class.
(tn, fp), (fn, tp) = cm_knn
sensitivity = tp / (tp + fn)  # TPR (True Positive Rate)
specificity = tn / (tn + fp)  # TNR (True Negative Rate)
fpr = fp / (fp + tn)  # False Positive Rate
fnr = fn / (fn + tp)  # False Negative Rate

# ---- Report ----
print(f"\nKNN Results:")
print(f"Training accuracy: {knn_train_acc*100:.2f}%")
print(f"Testing accuracy: {knn_test_acc*100:.2f}%")
print(f"Expected test accuracy (from paper): 95.90%")

print(f"\nConfusion Matrix:")
print(cm_knn)

print(f"\nDetailed Classification Report:")
print(
    classification_report(
        y_test,
        y_test_pred_knn,
        target_names=['Malignant', 'Benign'],
    )
)

print(f"\nAdditional Metrics:")
print(f"Sensitivity (TPR): {sensitivity*100:.2f}%")
print(f"Specificity (TNR): {specificity*100:.2f}%")
print(f"False Positive Rate (FPR): {fpr*100:.2f}%")
print(f"False Negative Rate (FNR): {fnr*100:.2f}%")


Training KNN model...

KNN Results:
Training time: 0.0328 seconds
Training accuracy: 100.00%
Testing accuracy: 95.91%
Expected test accuracy (from paper): 95.90%

Confusion Matrix:
[[ 61   3]
 [  4 103]]

Detailed Classification Report:
              precision    recall  f1-score   support

   Malignant       0.94      0.95      0.95        64
      Benign       0.97      0.96      0.97       107

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171


Additional Metrics:
Sensitivity (TPR): 96.26%
Specificity (TNR): 95.31%
False Positive Rate (FPR): 4.69%
False Negative Rate (FNR): 3.74%


[WinError 2] Le fichier spécifié est introuvable
  File "c:\Users\LENOVO\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\LENOVO\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\LENOVO\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [None]:
# Fix the NumPy and TensorFlow seeds so repeated runs start from the same
# random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Hyper-parameters from paper
BATCH_SIZE = 128
LEARNING_RATE = 1e-2
EPOCHS = 3000
NUM_NODES = [500, 500, 500]  # units in each of the three hidden layers
NUM_CLASSES = 2
TEST_SIZE = 0.30

# Load data.
# The notebook imports `load_breast_cancer` directly
# (`from sklearn.datasets import load_breast_cancer`); no `datasets` name is
# in scope, so the function is called without a module prefix.
X, y = load_breast_cancer(return_X_y=True)

# Split (stratified). Same test fraction, seed and stratification as the
# earlier split cell, so this cell can also be run on its own.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=SEED
)

# Standardize features: fit on train only, then apply to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build model: three ReLU hidden layers followed by a softmax over the classes
mlp_model = keras.Sequential(
    [
        layers.Input(shape=(X_train_scaled.shape[1],)),
        layers.Dense(NUM_NODES[0], activation="relu", name="hidden_layer_1"),
        layers.Dense(NUM_NODES[1], activation="relu", name="hidden_layer_2"),
        layers.Dense(NUM_NODES[2], activation="relu", name="hidden_layer_3"),
        layers.Dense(NUM_CLASSES, activation="softmax", name="output_layer"),
    ],
    name="mlp_500_500_500",
)

# Compile: plain SGD with lr=1e-2 and no momentum argument.
# NOTE(review): confirm against the paper whether momentum was used.
# The sparse loss is used because y holds integer class ids, not one-hot rows.
optimizer = keras.optimizers.SGD(learning_rate=LEARNING_RATE)
mlp_model.compile(optimizer=optimizer,
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

mlp_model.summary()

# Train. The test split is passed as validation_data only to log per-epoch
# metrics; no callback is attached, so nothing here acts on those metrics.
# Do not use them for model selection or tuning.
start = time.time()
history = mlp_model.fit(
    X_train_scaled,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test_scaled, y_test),
    shuffle=True,
    verbose=1,
)
train_time = time.time() - start

# Evaluate the final weights on both partitions
train_loss, train_acc = mlp_model.evaluate(X_train_scaled, y_train, verbose=0)
test_loss, test_acc = mlp_model.evaluate(X_test_scaled, y_test, verbose=0)

print("\nMLP Results:")
print(f"Training time: {train_time:.2f} seconds")
print(f"Training accuracy: {train_acc*100:.4f}%")
print(f"Testing accuracy:  {test_acc*100:.4f}%")

MODEL 2: MULTILAYER PERCEPTRON (MLP)

Building MLP architecture: 500-500-500
Activation function: ReLU
Loss function: Cross Entropy
Optimizer: SGD (Stochastic Gradient Descent)

Model Architecture:



Training MLP model...
Epoch 1/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 0.5288 - loss: 0.6654 - val_accuracy: 0.8500 - val_loss: 0.6130
Epoch 2/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.8273 - loss: 0.6083 - val_accuracy: 0.9417 - val_loss: 0.5665
Epoch 3/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.9029 - loss: 0.5639 - val_accuracy: 0.9583 - val_loss: 0.5275
Epoch 4/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.9317 - loss: 0.5263 - val_accuracy: 0.9667 - val_loss: 0.4946
Epoch 5/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.9317 - loss: 0.4946 - val_accuracy: 0.9667 - val_loss: 0.4640
Epoch 6/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.9317 - loss: 0.4652 - val_accuracy: 0.9750 - val_loss: 0.4362
Epoch 7/300
[1m3