<a href="https://colab.research.google.com/github/vinny380/breast_cancer_knn_and_logistic_regression/blob/main/logistic_regression_knn_breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KNN From Scratch

### 1. Data Preprocessing
I'll first load the dataset, normalize it, and split it into training and testing sets.

In [28]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#loading the dataset
data = load_breast_cancer()
X = data.data  #features
y = data.target  #target (0 for benign, 1 for malignant)

#normaliziing the data
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

#splitting the dataset (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.3, random_state=42)


###2. KNN Implementation from Scratch
Now I'll implement the K-Nearest Neighbors algorithm. I'll manually calculate the Euclidean distance, select the k nearest neighbors, and classify based on the majority vote.

In [29]:
class KNN:
    def __init__(self, k: int = 3) -> None:
        self.k = k
        self.X_train = np.array([])
        self.y_train = np.array([])

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        self.X_train = X_train
        self.y_train = y_train

    def euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        predictions = [self._predict(x) for x in X_test]
        return np.array(predictions)

    def _predict(self, x: np.ndarray) -> np.int64:
        #computing distances between x and all samples in the training set
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        #sorting by distance and return indices of the first k neighbors
        k_indices: np.ndarray = np.argsort(distances)[:self.k]
        #extracting the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        #returns the most common class label
        return np.bincount(k_nearest_labels).argmax()


#instantiating KNN with k=3
knn = KNN(k=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

#now evaluate the model using accuracy
accuracy = np.sum(y_pred == y_test) / len(y_test)
print(f"KNN accuracy: {accuracy * 100:.2f}%")


KNN accuracy: 95.91%


### 3. Parameter Tuning:
To find the optimal value for k, I will experiment with different values and evaluate the accuracy of the model on the test set.

In [30]:
def find_best_k(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray) -> int:
    accuracies = []
    k_values = range(1, 20)

    for k in k_values:
        knn = KNN(k=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = np.sum(y_pred == y_test) / len(y_test)
        accuracies.append(accuracy)
        print(f"Accuracy for k={k}: {accuracy * 100:.2f}%")

    optimal_k = k_values[np.argmax(accuracies)]
    print(f"Optimal k: {optimal_k} with accuracy {max(accuracies) * 100:.2f}%")
    return optimal_k

# Find the optimal value for k
optimal_k = find_best_k(X_train, X_test, y_train, y_test)


Accuracy for k=1: 95.32%
Accuracy for k=2: 95.32%
Accuracy for k=3: 95.91%
Accuracy for k=4: 95.91%
Accuracy for k=5: 95.91%
Accuracy for k=6: 95.91%
Accuracy for k=7: 95.91%
Accuracy for k=8: 96.49%
Accuracy for k=9: 97.08%
Accuracy for k=10: 97.08%
Accuracy for k=11: 95.91%
Accuracy for k=12: 97.08%
Accuracy for k=13: 95.91%
Accuracy for k=14: 96.49%
Accuracy for k=15: 95.32%
Accuracy for k=16: 96.49%
Accuracy for k=17: 95.32%
Accuracy for k=18: 95.32%
Accuracy for k=19: 95.32%
Optimal k: 9 with accuracy 97.08%


### Evaluation Metrics:
Now I'll compute additional evaluation metrics like precision, recall, and F1-score. Since no model training is involved here, I'll use `scikit-learn`

In [31]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94        63
           1       0.96      0.97      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



# Logistic Regression With `Scikit-Learn`

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

#creating a Logistic Regression model
logreg = LogisticRegression(random_state=42, max_iter=10000)

#fitting the model on the training data
logreg.fit(X_train, y_train)

#making predictions on the test data
y_pred_logreg = logreg.predict(X_test)

#finally, evaluate the model
accuracy = accuracy_score(y_test, y_pred_logreg)
precision = precision_score(y_test, y_pred_logreg)
recall = recall_score(y_test, y_pred_logreg)
f1 = f1_score(y_test, y_pred_logreg)

print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Logistic Regression Accuracy: 98.25%
Precision: 99.07%
Recall: 98.15%
F1 Score: 98.60%


In [33]:
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98        63
           1       0.99      0.98      0.99       108

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

