<a href="https://colab.research.google.com/github/y4-5H0/Machine-Learning-Lab/blob/main/Lab_Report_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from collections import Counter

class CustomKNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The training data is simply memorised by ``fit``; every prediction scans
    the whole training set, so the cost of one prediction grows linearly with
    the number of training samples.

    Args:
        k: Number of neighbours that vote on each prediction. Must be a
            positive integer. If ``k`` exceeds the number of training
            samples, every training sample votes.

    Raises:
        ValueError: If ``k`` is not a positive integer.
    """

    def __init__(self, k=3):
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X_train, y_train):
        """Store copies of the training samples and their labels.

        Args:
            X_train: Array-like whose first axis indexes the samples.
            y_train: Array-like of labels, one per sample.

        Returns:
            The fitted instance, so calls can be chained.

        Raises:
            ValueError: If the two inputs have different lengths or are empty.
        """
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train has {len(X_train)} samples but y_train has {len(y_train)}"
            )
        if len(X_train) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X_train = X_train
        self.y_train = y_train
        return self

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Returns:
            A Python list with one predicted label per test sample.
        """
        X_test = np.array(X_test)
        return [self._predict(x) for x in X_test]

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        # One vectorized norm over all training rows instead of a Python-level
        # call per row. Flattening each row keeps 1-D features (|a - b|) and
        # N-D features (norm over all entries) working as before.
        offsets = (self.X_train - x).reshape(len(self.X_train), -1)
        distances = np.linalg.norm(offsets, axis=1)
        # A stable sort keeps equidistant neighbours in training order, which
        # makes the selection deterministic.
        nearest = np.argsort(distances, kind="stable")[:self.k]
        # Counter orders equal counts by first occurrence, so a tied vote goes
        # to the label of the closest tied neighbour.
        return Counter(self.y_train[nearest]).most_common(1)[0][0]


In [2]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true label.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        The share of matching positions as a NumPy float in [0, 1].

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Without this check a broadcastable mismatch (e.g. one prediction against
    # many labels) would be compared element-wise and yield a bogus score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return np.mean(y_true == y_pred)

def confusion_matrix(y_true, y_pred, labels=None):
    """Count (true label, predicted label) pairs in a square matrix.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, same length as ``y_true``.
        labels: Optional ordered collection of labels that fixes the row and
            column order. When omitted, the sorted union of the labels found
            in ``y_true`` and ``y_pred`` is used, so a class that is only
            ever predicted still gets its own row and column.

    Returns:
        Integer array of shape ``(len(labels), len(labels))`` where entry
        ``[i, j]`` is the number of samples with true label ``labels[i]``
        and predicted label ``labels[j]``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
        KeyError: If ``labels`` is given explicitly and a value in ``y_true``
            or ``y_pred`` is not among them.
    """
    # zip() below would otherwise drop the surplus of the longer input silently.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true has {len(y_true)} samples but y_pred has {len(y_pred)}"
        )
    if labels is None:
        observed = np.concatenate(
            [np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()]
        )
        labels = np.unique(observed)
    index_of = {label: i for i, label in enumerate(labels)}
    matrix = np.zeros((len(labels), len(labels)), dtype=int)
    for t, p in zip(y_true, y_pred):
        matrix[index_of[t], index_of[p]] += 1
    return matrix

def precision_recall_f1(y_true, y_pred):
    """Compute macro-averaged precision, recall and F1.

    Per-class scores are derived from the confusion matrix of ``y_true``
    versus ``y_pred`` and then averaged with equal weight per class. A score
    whose denominator is zero is counted as 0.

    Returns:
        Tuple ``(precision, recall, f1)`` of the per-class means.
    """
    cm = confusion_matrix(y_true, y_pred)
    class_precisions = []
    class_recalls = []
    class_f1s = []

    for idx, hits in enumerate(np.diag(cm)):
        # Column total = times this class was predicted (tp + fp);
        # row total = times this class was the true label (tp + fn).
        predicted_total = cm[:, idx].sum()
        actual_total = cm[idx, :].sum()

        if predicted_total != 0:
            p = hits / predicted_total
        else:
            p = 0

        if actual_total != 0:
            r = hits / actual_total
        else:
            r = 0

        if (p + r) != 0:
            harmonic = 2 * p * r / (p + r)
        else:
            harmonic = 0

        class_precisions.append(p)
        class_recalls.append(r)
        class_f1s.append(harmonic)

    return np.mean(class_precisions), np.mean(class_recalls), np.mean(class_f1s)


In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris dataset once; keep the whole bunch around and expose the
# feature matrix and the class targets under the short names used below.
iris = load_iris()
X = iris.data
y = iris.target


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


# Location of the labelled news CSV.
# NOTE(review): the path points into /content/drive, so this presumably needs
# Google Drive to be mounted first; no mount call is visible in this notebook.
NEWS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Data/labelled_newscatcher_dataset.csv'
# Upper bound on the TF-IDF vocabulary, i.e. on the number of feature columns.
NEWS_MAX_FEATURES = 500

# The file is semicolon-separated. Only two columns are used here:
# 'title' (the text to vectorize) and 'topic' (the class label).
news_df = pd.read_csv(NEWS_CSV_PATH, sep=';', engine='python')

# Rows without a title or topic cannot be vectorized or used as a label, so
# drop them up front; features and labels come from the same filtered frame
# and therefore stay aligned.
news_labelled = news_df.dropna(subset=['title', 'topic'])
X_news_raw = news_labelled['title']
y_news = news_labelled['topic']

# Dense array because CustomKNN subtracts rows with plain NumPy arithmetic.
vectorizer = TfidfVectorizer(max_features=NEWS_MAX_FEATURES)
X_news = vectorizer.fit_transform(X_news_raw).toarray()

In [5]:
def evaluate_model(X, y, k, split_ratio):
    """Train ``CustomKNN`` on one train/test split and score it on the test part.

    Args:
        X: Feature matrix.
        y: Labels aligned with ``X``.
        k: Number of neighbours for the classifier.
        split_ratio: Fraction of the data held out for testing.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` measured on the held-out
        part; the last three come from ``precision_recall_f1``.
    """
    # Fixed random_state so every (k, split_ratio) pair sees a repeatable split.
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=split_ratio, random_state=42
    )

    knn = CustomKNN(k=k)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)

    accuracy = accuracy_score(test_y, predictions)
    return (accuracy, *precision_recall_f1(test_y, predictions))


In [6]:
def find_best_k_split(X, y, k_values, split_ratios, evaluate=None):
    """Grid-search ``k`` and the test-split ratio for the highest accuracy.

    Every ``(k, split)`` combination is scored and the first combination that
    reaches the highest accuracy is kept (later ties do not replace it).

    Args:
        X: Feature matrix passed through to the evaluator.
        y: Labels passed through to the evaluator.
        k_values: Iterable of neighbour counts to try.
        split_ratios: Iterable of test-set fractions to try.
        evaluate: Optional callable ``(X, y, k, split) -> (accuracy,
            precision, recall, f1)``. Defaults to ``evaluate_model``.

    Returns:
        Dict with keys ``'accuracy'``, ``'k'``, ``'split'``, ``'precision'``,
        ``'recall'`` and ``'f1'`` describing the best combination.

    Raises:
        ValueError: If the grid is empty, i.e. nothing could be evaluated.

    NOTE(review): with the default evaluator the accuracy being maximised is
    measured on the held-out split itself, so the winning score is an
    optimistic estimate.
    """
    if evaluate is None:
        evaluate = evaluate_model

    # Materialise so a one-shot iterator is not exhausted after the first k.
    split_ratios = list(split_ratios)

    # Start with "nothing seen yet" rather than accuracy 0, so that the first
    # combination is always recorded even when every accuracy is 0.
    best_result = None
    for k in k_values:
        for split in split_ratios:
            acc, prec, rec, f1 = evaluate(X, y, k, split)
            if best_result is None or acc > best_result['accuracy']:
                best_result = {
                    'accuracy': acc,
                    'k': k,
                    'split': split,
                    'precision': prec,
                    'recall': rec,
                    'f1': f1,
                }

    if best_result is None:
        raise ValueError("k_values and split_ratios must both be non-empty")
    return best_result


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score as sk_acc, precision_score, recall_score, f1_score

def sklearn_knn_evaluation(X, y, k, split_ratio):
    """Score scikit-learn's ``KNeighborsClassifier`` on one train/test split.

    The split uses the same ``test_size``/``random_state`` arguments as
    ``evaluate_model``, so the numbers are comparable with the custom model.

    Args:
        X: Feature matrix.
        y: Labels aligned with ``X``.
        k: Number of neighbours.
        split_ratio: Fraction of the data held out for testing.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are
        macro-averaged.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=split_ratio, random_state=42
    )
    predicted = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y).predict(test_X)

    accuracy = sk_acc(test_y, predicted)
    # Same call shape for all three metrics, evaluated in this fixed order.
    macro_precision, macro_recall, macro_f1 = (
        metric(test_y, predicted, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )
    return accuracy, macro_precision, macro_recall, macro_f1

In [None]:
# Search grid: neighbour counts and test-set fractions to try.
k_vals = [1, 3, 5, 7, 9]
splits = [0.2, 0.3, 0.4]

# Iris: pick the best (k, split) for the custom classifier ...
best_iris = find_best_k_split(X, y, k_values=k_vals, split_ratios=splits)
print("Best Custom KNN for Iris:", best_iris)

# ... and score scikit-learn's KNN with that same configuration.
iris_sklearn_metrics = sklearn_knn_evaluation(
    X, y, k=best_iris['k'], split_ratio=best_iris['split']
)
print("Sklearn KNN for Iris:", iris_sklearn_metrics)

# News headlines: same procedure on the TF-IDF features.
best_news = find_best_k_split(X_news, y_news, k_values=k_vals, split_ratios=splits)
print("Best Custom KNN for News:", best_news)

# Reference run with scikit-learn on the winning news configuration.
news_sklearn_metrics = sklearn_knn_evaluation(
    X_news, y_news, k=best_news['k'], split_ratio=best_news['split']
)
print("Sklearn KNN for News:", news_sklearn_metrics)


Best Custom KNN for Iris: {'accuracy': np.float64(1.0), 'k': 1, 'split': 0.2, 'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1': np.float64(1.0)}
Sklearn KNN for Iris: (1.0, 1.0, 1.0, 1.0)
