In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the breast-cancer dataset; the returned object exposes .data, .target,
# .feature_names and .target_names, all of which are used below.
data = load_breast_cancer()

# One column per feature, with the class label appended as a final 'target' column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

print("First 5 rows:")
print(df.head())

# Numbered (1-based) listing of every feature name.
print("Feature names:")
for position, featureName in enumerate(data.feature_names, start=1):
    print(f"{position:2d}. {featureName}")

print("\n")
print("Target variables:")
print(f"Target names: {data.target_names}")

# Number of samples per class label, as a plain {label: count} dict.
classCounts = pd.Series(data.target).value_counts().to_dict()
print(f"Target values: {classCounts}")


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
# NOTE(review): no `stratify` argument is passed, so the class balance of the two
# splits is whatever the shuffle happens to produce.
feaTrain, feaTest, tarTrain, tarTest = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the training features exactly as produced by the
# split (no scaling is applied before this point). fit() returns the estimator
# itself, so construction and training are chained.
logRegModel = LogisticRegression(max_iter=5000, random_state=42).fit(feaTrain, tarTrain)

# Hard class predictions for the held-out test set.
tarPred = logRegModel.predict(feaTest)
print(tarPred)

# Share of test samples assigned to each class, as a percentage.
for classLabel, className in enumerate(("Malignant", "Benign")):
    classShare = (tarPred == classLabel).mean() * 100
    print(f"Class {classLabel} ({className}): {classShare:.2f}%")

# Mean of the predicted labels, i.e. the fraction of samples predicted as class 1.
print(tarPred.mean())


In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# Confusion matrix of true vs. predicted test labels for logistic regression.
confMatrix = confusion_matrix(tarTest, tarPred)
print("Confusion matrix:")
print(confMatrix)

# Column 1 of predict_proba is used as the score for the ROC curve
# (the predicted probability of class 1).
tarPredProb = logRegModel.predict_proba(feaTest)[:, 1]
fpr, tpr, thresholds = roc_curve(tarTest, tarPredProb)
aucScore = auc(fpr, tpr)

print(f"\n AUC score: {aucScore:.4f}")

# ROC curve of the model plus the (0,0)-(1,1) diagonal as a reference line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blueviolet', lw=2, label=f'chad AUC:{aucScore:.4f}')
ax.plot([0, 1], [0, 1], color='deepskyblue', lw=2, linestyle='--', label='basic loser classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so the curve is not clipped at tpr == 1
ax.set_xlabel('False pos')
ax.set_ylabel('True pos')
ax.set_title('Logistic regression ROC curve')
ax.legend(loc="lower right")
plt.show()

In [27]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics are used for scaling.
scaler = StandardScaler().fit(feaTrain)

feaTrainScaled = scaler.transform(feaTrain)
feaTestScaled = scaler.transform(feaTest)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Search space: k from 1 to 21, both vote-weighting schemes, two distance metrics.
paramGrid = dict(
    n_neighbors=range(1, 22),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn = KNeighborsClassifier()

# 5-fold cross-validated grid search on the scaled training data, scored by accuracy.
gridSearch = GridSearchCV(knn, paramGrid, cv=5, scoring='accuracy').fit(feaTrainScaled, tarTrain)

# Winning hyperparameter combination and its mean cross-validation score.
bestParams, bestScore = gridSearch.best_params_, gridSearch.best_score_

print(
    "Best parameters:",
    bestParams,
    f"Best cross validation accuracy: {bestScore:.4f}",
    sep="\n",
)

In [None]:
# Build the final KNN model from the grid-search winner instead of hand-copied
# hyperparameters, so this cell cannot drift out of sync with the search results
# when the notebook is re-run (different data split, library version, or grid).
knnBest = KNeighborsClassifier(**bestParams)
knnBest.fit(feaTrainScaled, tarTrain)

# Hard predictions and class-1 scores on the scaled test set.
knnPred = knnBest.predict(feaTestScaled)
knnPredProb = knnBest.predict_proba(feaTestScaled)[:, 1]

# Confusion matrix of true vs. predicted test labels for KNN.
knnConfMatrix = confusion_matrix(tarTest, knnPred)
print("confusion matrix:")
print(knnConfMatrix)

# ROC curve points and the area under that curve.
knnFpr, knnTpr, knnThresholds = roc_curve(tarTest, knnPredProb)
knnAucScore = auc(knnFpr, knnTpr)

print(f"\nAUC score: {knnAucScore:.4f}")

# ROC curve of the model plus the (0,0)-(1,1) diagonal as a reference line.
plt.figure(figsize=(8, 6))
plt.plot(knnFpr, knnTpr, color='olivedrab', lw=2, label=f'KNN AUC: {knnAucScore:.4f}')
plt.plot([0, 1], [0, 1], color='lawngreen', lw=2, linestyle='-', label='Massive L')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the curve is not clipped at tpr == 1
plt.xlabel('False pos')
plt.ylabel('True pos')
plt.title('KNN ROC curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Side-by-side AUC for the two models.
print("comparison", f"LR AUC: {aucScore:.4f}", f"KNN AUC: {knnAucScore:.4f}", sep="\n")

# Raw confusion matrices for both models.
print("\nConfmatrix", "LR:", confMatrix, "\nKNN:", knnConfMatrix, sep="\n")

from sklearn.metrics import precision_score, recall_score, f1_score

# No pos_label is passed, so scikit-learn's default positive class (label 1) is
# used; per the labels printed earlier in this notebook, class 1 is "Benign".
metricFns = (precision_score, recall_score, f1_score)
lr_precision, lr_recall, lr_f1 = (metricFn(tarTest, tarPred) for metricFn in metricFns)
knn_precision, knn_recall, knn_f1 = (metricFn(tarTest, knnPred) for metricFn in metricFns)

# Fixed-width table: metric name, LR value, KNN value.
print(f"{'Metric':<15} {'LR':<20} {'KNN':<10}")
for metricName, lrValue, knnValue in (
    ("Precision", lr_precision, knn_precision),
    ("Recall", lr_recall, knn_recall),
    ("F1-Score", lr_f1, knn_f1),
):
    print(f"{metricName:<15} {lrValue:<20.4f} {knnValue:<10.4f}")

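Based on the ROC curves, the AUC scores, and the metrics printed above, logistic regression outperforms KNN on AUC, recall, and F1 score, while KNN has more true positives (41) and fewer false positives. Both models have strengths, but overall LR is the winner because of its higher AUC, F1, and recall, and because it has only 1 false negative compared with 3 missed cancer cases for KNN.

Caveat: the precision, recall, and F1 values above use scikit-learn's default positive class, label 1 (benign), so a "false negative" in those metrics is a benign tumour predicted as malignant. Genuinely missed cancer cases (malignant predicted as benign) are the top-right entry of each printed confusion matrix, so the counts quoted here should be checked against that cell.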