<a href="https://colab.research.google.com/github/yousufcoxs/ML_Lab/blob/main/KNN_from_Scratch_Lab_report_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier as SklearnKNN

In [30]:
class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

    def predict(self, X_test):
        return [self._predict_one(x) for x in X_test]

    def _predict_one(self, x):
        distances = [np.linalg.norm(x - x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_labels).most_common(1)
        return most_common[0][0]

In [31]:
def accuracy_score(y_true, y_pred):
    return np.mean(np.array(y_true) == np.array(y_pred))

def confusion_matrix(y_true, y_pred, labels):
    matrix = np.zeros((len(labels), len(labels)), dtype=int)
    label_index = {label: i for i, label in enumerate(labels)}
    for t, p in zip(y_true, y_pred):
        matrix[label_index[t]][label_index[p]] += 1
    return matrix

def precision_recall_f1(cm):
    precision, recall, f1 = [], [], []
    for i in range(len(cm)):
        tp = cm[i][i]
        fp = sum(cm[:, i]) - tp
        fn = sum(cm[i, :]) - tp
        p = tp / (tp + fp) if (tp + fp) != 0 else 0
        r = tp / (tp + fn) if (tp + fn) != 0 else 0
        f = 2 * p * r / (p + r) if (p + r) != 0 else 0
        precision.append(p)
        recall.append(r)
        f1.append(f)
    return precision, recall, f1

In [32]:
def iris_experiment(k=3, test_size=0.2):
    iris = load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = KNNClassifier(k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n=== IRIS DATASET ===")
    print(f"Best k = {k}, test_size = {test_size}")
    print("Custom KNN Accuracy:", accuracy_score(y_test, y_pred))

In [33]:
def iris_experiment_with_evaluation(k=3, test_size=0.2):
    iris = load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = KNNClassifier(k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n=== IRIS DATASET ===")
    print(f"Best k = {k}, test_size = {test_size}")
    print("Custom KNN Accuracy:", accuracy_score(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
    print("Confusion Matrix:\n", cm)
    p, r, f = precision_recall_f1(cm)
    print("Precision:", p)
    print("Recall:", r)
    print("F1-score:", f)

    skl_model = SklearnKNN(n_neighbors=k)
    skl_model.fit(X_train, y_train)
    skl_pred = skl_model.predict(X_test)
    print("\nScikit-learn Classification Report:\n",
    classification_report(y_test, skl_pred))

# Run the combined experiment
iris_experiment_with_evaluation(k=3, test_size=0.2)


=== IRIS DATASET ===
Best k = 3, test_size = 0.2
Custom KNN Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Precision: [np.float64(1.0), np.float64(1.0), np.float64(1.0)]
Recall: [np.float64(1.0), np.float64(1.0), np.float64(1.0)]
F1-score: [np.float64(1.0), np.float64(1.0), np.float64(1.0)]

Scikit-learn Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [34]:
def news_experiment(k=3, test_size=0.2):
    categories = ['alt.atheism', 'sci.space', 'rec.sport.baseball']
    data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

    X_raw = data.data
    y_raw = data.target

    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(X_raw).toarray()
    y = y_raw

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = KNNClassifier(k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n=== NEWS DATASET ===")
    print(f"Best k = {k}, test_size = {test_size}")
    print("Custom KNN Accuracy:", accuracy_score(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
    print("Confusion Matrix:\n", cm)
    p, r, f = precision_recall_f1(cm)
    print("Precision:", p)
    print("Recall:", r)
    print("F1-score:", f)

In [35]:
def news_experiment_with_evaluation(k=3, test_size=0.2):
    categories = ['alt.atheism', 'sci.space', 'rec.sport.baseball']
    data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

    X_raw = data.data
    y_raw = data.target

    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(X_raw).toarray()
    y = y_raw

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = KNNClassifier(k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n=== NEWS DATASET ===")
    print(f"Best k = {k}, test_size = {test_size}")
    print("Custom KNN Accuracy:", accuracy_score(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
    print("Confusion Matrix:\n", cm)
    p, r, f = precision_recall_f1(cm)
    print("Precision:", p)
    print("Recall:", r)
    print("F1-score:", f)

    skl_model = SklearnKNN(n_neighbors=k)
    skl_model.fit(X_train, y_train)
    skl_pred = skl_model.predict(X_test)
    print("\nScikit-learn Classification Report:\n",
    classification_report(y_test, skl_pred))

# Run the combined experiment
news_experiment_with_evaluation(k=3, test_size=0.2)


=== NEWS DATASET ===
Best k = 3, test_size = 0.2
Custom KNN Accuracy: 0.5287769784172662
Confusion Matrix:
 [[ 61  78  15]
 [  9 164  25]
 [ 21 114  69]]
Precision: [np.float64(0.6703296703296703), np.float64(0.4606741573033708), np.float64(0.6330275229357798)]
Recall: [np.float64(0.3961038961038961), np.float64(0.8282828282828283), np.float64(0.3382352941176471)]
F1-score: [np.float64(0.4979591836734693), np.float64(0.592057761732852), np.float64(0.44089456869009586)]

Scikit-learn Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.33      0.46       154
           1       0.43      0.96      0.59       198
           2       0.98      0.21      0.34       204

    accuracy                           0.51       556
   macro avg       0.72      0.50      0.46       556
weighted avg       0.72      0.51      0.46       556



In [36]:
def tune_knn(dataset='iris'):
    best_k = None
    best_split = None
    best_score = 0

    k_values = range(1, 16)
    split_ratios = [0.1, 0.2, 0.3, 0.4]

    print(f"\nTUNING KNN for {dataset.upper()} dataset")
    for k in k_values:
        for split in split_ratios:
            if dataset == 'iris':
                iris = load_iris()
                X, y = iris.data, iris.target
            elif dataset == 'news':
                categories = ['alt.atheism', 'sci.space', 'rec.sport.baseball']
                data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
                X_raw = data.data
                y = data.target
                vectorizer = TfidfVectorizer(max_features=1000)
                X = vectorizer.fit_transform(X_raw).toarray()
            else:
                raise ValueError("Unknown dataset")

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)
            model = KNNClassifier(k)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            # Add print statement here to show progress during tuning
            print(f"k={k}, split={split:.1f}, accuracy={acc:.4f}")


            if acc > best_score:
                best_score = acc
                best_k = k
                best_split = split

    print(f"\nBest k: {best_k}")
    print(f"Best test_size: {best_split}")
    print(f"Best accuracy: {best_score}")

    return best_k, best_split

In [37]:
if __name__ == "__main__":
    # Auto-tune for Iris
    k_iris, split_iris = tune_knn('iris')
    iris_experiment_with_evaluation(k=k_iris, test_size=split_iris)

    # Auto-tune for News
    k_news, split_news = tune_knn('news')
    news_experiment_with_evaluation(k=k_news, test_size=split_news)


TUNING KNN for IRIS dataset
k=1, split=0.1, accuracy=1.0000
k=1, split=0.2, accuracy=1.0000
k=1, split=0.3, accuracy=1.0000
k=1, split=0.4, accuracy=0.9833
k=2, split=0.1, accuracy=1.0000
k=2, split=0.2, accuracy=1.0000
k=2, split=0.3, accuracy=1.0000
k=2, split=0.4, accuracy=0.9833
k=3, split=0.1, accuracy=1.0000
k=3, split=0.2, accuracy=1.0000
k=3, split=0.3, accuracy=1.0000
k=3, split=0.4, accuracy=0.9833
k=4, split=0.1, accuracy=1.0000
k=4, split=0.2, accuracy=1.0000
k=4, split=0.3, accuracy=1.0000
k=4, split=0.4, accuracy=0.9833
k=5, split=0.1, accuracy=1.0000
k=5, split=0.2, accuracy=1.0000
k=5, split=0.3, accuracy=1.0000
k=5, split=0.4, accuracy=0.9833
k=6, split=0.1, accuracy=1.0000
k=6, split=0.2, accuracy=1.0000
k=6, split=0.3, accuracy=1.0000
k=6, split=0.4, accuracy=0.9833
k=7, split=0.1, accuracy=0.9333
k=7, split=0.2, accuracy=0.9667
k=7, split=0.3, accuracy=1.0000
k=7, split=0.4, accuracy=0.9833
k=8, split=0.1, accuracy=1.0000
k=8, split=0.2, accuracy=1.0000
k=8, split=