In [6]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [7]:
# Load the breast cancer dataset
X, y = load_breast_cancer(return_X_y=True)

In [8]:
# Split the data into features (X) and target (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Check for missing values
print("Missing values count:", np.sum(np.isnan(X)))

Missing values count: 0


In [10]:
# Impute missing values if any
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [11]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [12]:
# Print shapes to confirm preprocessing
print("Original shape:", X.shape)
print("Scaled shape:", X_scaled.shape)

Original shape: (569, 30)
Scaled shape: (569, 30)


In [13]:
if len(X_scaled) != len(y_train):
    print("Mismatch detected. Aligning sample sizes...")
    
    # Check which array has fewer samples
    if len(X_scaled) > len(y_train):
        X_scaled = X_scaled[:len(y_train)]
    elif len(X_scaled) < len(y_train):
        y_train = y_train[:len(X_scaled)]
    
    print(f"Adjusted shapes: X_scaled {X_scaled.shape}, y_train {y_train.shape}")
else:
    print("Sample sizes already aligned.")

Mismatch detected. Aligning sample sizes...
Adjusted shapes: X_scaled (455, 30), y_train (455,)


In [14]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_scaled, y_train)

In [15]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_scaled, y_train)

In [16]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_scaled, y_train)

In [17]:
# Support Vector Machine (SVM)
from sklearn.svm import SVC
svm = SVC(kernel='linear', probability=True, random_state=42)
svm.fit(X_scaled, y_train)

In [18]:
# k-Nearest Neighbors (k-NN)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_scaled, y_train)

In [19]:
from sklearn.metrics import accuracy_score

# Predictions
logreg_pred = logreg.predict(X_test)
dtree_pred = dtree.predict(X_test)
rf_pred = rf.predict(X_test)
svm_pred = svm.predict(X_test)
knn_pred = knn.predict(X_test)

In [20]:
# Accuracy scores
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("Decision Tree Accuracy:", accuracy_score(y_test, dtree_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Support Vector Machine Accuracy:", accuracy_score(y_test, svm_pred))
print("k-Nearest Neighbors Accuracy:", accuracy_score(y_test, knn_pred))

Logistic Regression Accuracy: 0.6228070175438597
Decision Tree Accuracy: 0.8070175438596491
Random Forest Accuracy: 0.8333333333333334
Support Vector Machine Accuracy: 0.6228070175438597
k-Nearest Neighbors Accuracy: 0.6052631578947368


In [21]:
# Sort models by accuracy
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Support Vector Machine', 'k-Nearest Neighbors']
accuracies = [
    accuracy_score(y_test, logreg_pred),
    accuracy_score(y_test, dtree_pred),
    accuracy_score(y_test, rf_pred),
    accuracy_score(y_test, svm_pred),
    accuracy_score(y_test, knn_pred)
]

best_model = models[np.argmax(accuracies)]
worst_model = models[np.argmin(accuracies)]

print("\nBest Performing Model:", best_model)
print("Worst Performing Model:", worst_model)


Best Performing Model: Random Forest
Worst Performing Model: k-Nearest Neighbors
