## Problem 5: KNN

In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances, rbf_kernel, polynomial_kernel
from scipy.stats import mode

In [2]:
# load spambase:
data_spam = np.loadtxt("spambase/spambase.data", delimiter=",")
X_spam = data_spam[:, :-1]
y_spam = data_spam[:, -1]

scaler_spam = StandardScaler()
X_spam_norm = scaler_spam.fit_transform(X_spam)
X_train_spam, X_test_spam, y_train_spam, y_test_spam = train_test_split(
    X_spam_norm, y_spam, test_size=0.2, random_state=42)

# load digits:
X_train_digits = np.loadtxt("mnist_haar_bingyu/training_image.txt", delimiter=",")
y_train_digits = np.loadtxt("mnist_haar_bingyu/training_label.txt", delimiter=",")
X_test_digits = np.loadtxt("mnist_haar_bingyu/testing_image.txt", delimiter=",")
y_test_digits = np.loadtxt("mnist_haar_bingyu/testing_label.txt", delimiter=",")

scaler_digits = StandardScaler()
X_train_digits_norm = scaler_digits.fit_transform(X_train_digits)
X_test_digits_norm = scaler_digits.transform(X_test_digits)

## PART A:

In [3]:
def knn_classifier(X_train, y_train, X_test, k, metric = 'euclidean'):
    """
    k-NN classifier implementation

    Args:
        X_train: Training features
        y_train: Training labels
        X_test: Test features
        k: Number of nearest neighbors
        metric: Distance metric or kernel type

    Returns:
        predictions: Array of predicted labels for test set
    """
    n_test = X_test.shape[0]
    predictions = np.zeros(n_test)
    n_batches = (n_test + batch_size - 1) // batch_size

    
    
    # 1. calculate distances:
    if metric in ['euclidean', 'cosine']:
        distances = pairwise_distances(X_test, X_train, metric=metric)
    elif metric == 'gaussian':
        distances = -rbf_kernel(X_test, X_train, gamma=1.0)
    elif metric == 'polynomial':
        distances = -polynomial_kernel(X_test, X_train, degree=2, coef0=1)

     # Find k-nearest neighbors for each point in batch
        for i in range(distances.shape[0]):
            if k == 1:
                # Optimize for k=1
                predictions[start + i] = y_train[np.argmin(distances[i])]
            else:
                # Use argpartition for efficiency
                k_nearest_idx = np.argpartition(distances[i], k)[:k]
                k_nearest_labels = y_train[k_nearest_idx]
                predictions[start + i] = mode(k_nearest_labels, keepdims=True)[0][0]
        
        # Progress update
        if batch_idx % max(1, n_batches // 10) == 0:
            print(f"    Batch {batch_idx+1}/{n_batches} processed", end='\r')
    
    print()  
    return predictions

A1 : Implementation of Spambase : (Euclidean)

In [4]:
print("SPAMBASE RESULTS (Euclidean Distance):")
for k in [1, 3, 7]:
    train_pred = knn_classifier(X_train_spam, y_train_spam, X_train_spam, k, metric='euclidean')
    train_acc = np.mean(train_pred == y_train_spam) * 100

    test_pred = knn_classifier(X_train_spam, y_train_spam, X_test_spam, k, metric='euclidean')
    test_acc = np.mean(test_pred == y_test_spam) * 100

    print(f"k={k}: Train Accuracy: {train_acc:.2f}%, Test Accuracy: {test_acc:.2f}%")

SPAMBASE RESULTS (Euclidean Distance):



NameError: name 'predictions' is not defined

A2: Implementation of Digits: (Cosine)

In [5]:
print("\nDIGITS RESULTS (Cosine Distance):")
for k in [1, 3, 7]:
    train_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_train_digits_norm, k, metric='cosine')
    train_acc = np.mean(train_pred == y_train_digits) * 100

    test_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_test_digits_norm, k, metric='cosine')
    test_acc = np.mean(test_pred == y_test_digits) * 100

    print(f"k={k}: Train Accuracy: {train_acc:.2f}%, Test Accuracy: {test_acc:.2f}%")


DIGITS RESULTS (Cosine Distance):


KeyboardInterrupt: 

A3: Implementation of KNN using Digits: ( Gaussian / RBF ):

In [None]:
print("\nDIGITS RESULTS (Gaussian/RBF Kernel):")
for k in [1, 3, 7]:
    train_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_train_digits_norm, k, metric='gaussian')
    train_acc = np.mean(train_pred == y_train_digits) * 100

    test_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_test_digits_norm, k, metric='gaussian')
    test_acc = np.mean(test_pred == y_test_digits) * 100

    print(f"k={k}: Train Accuracy: {train_acc:.2f}%, Test Accuracy: {test_acc:.2f}%")

A4: Implementation of KNN on Digits:(Polynomial kernel)

In [None]:
print("\nDIGITS RESULTS (Polynomial Kernel):")
for k in [1, 3, 7]:
    train_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_train_digits_norm, k, metric='polynomial')
    train_acc = np.mean(train_pred == y_train_digits) * 100

    test_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_test_digits_norm, k, metric='polynomial')
    test_acc = np.mean(test_pred == y_test_digits) * 100

    print(f"k={k}: Train Accuracy: {train_acc:.2f}%, Test Accuracy: {test_acc:.2f}%")

## PART - B:

In [8]:
def knn_fixed_window(X_train, y_train, X_test, radius, metric='euclidean'):
    """
    k-NN with fixed window (radius) approach

    Args:
        X_train: Training features
        y_train: Training labels
        X_test: Test features
        radius: Maximum distance for neighbors to be included
        metric: Distance metric

    Returns:
        predictions: Array of predicted labels
    """

    distances = pairwise_distances(X_test, X_train, metric=metric)

    n_test = X_test.shape[0]
    predictions = np.zeros(n_test)

    for i in range(n_test):
        neighbors_mask = distances[i] <= radius
        neighbors_labels = y_train[neighbors_mask]

        if len(neighbors_labels) > 0:
            predictions[i] = mode(neighbors_labels, keepdims=True)[0][0]
        else:
            nearest_idx = np.argmin(distances[i])
            predictions[i] = y_train[nearest_idx]

    return predictions

In [None]:
print("PART B - FIXED WINDOW APPROACH")
print("="*50)
print("\nSPAMBASE RESULTS (Euclidean Distance):")

# Try different radius values to find optimal
radii_spam = [0.5, 1.0, 1.5, 2.0]

for radius in radii_spam:
    # Train predictions
    train_pred = knn_fixed_window(X_train_spam, y_train_spam,
                                  X_train_spam, radius, metric='euclidean')
    train_acc = np.mean(train_pred == y_train_spam) * 100

    # Test predictions
    test_pred = knn_fixed_window(X_train_spam, y_train_spam,
                                 X_test_spam, radius, metric='euclidean')
    test_acc = np.mean(test_pred == y_test_spam) * 100

    # Count how many test points had neighbors
    distances_test = pairwise_distances(X_test_spam, X_train_spam, metric='euclidean')
    points_with_neighbors = np.sum(np.any(distances_test <= radius, axis=1))

    print(f"Radius={radius}: Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
    print(f"  ({points_with_neighbors}/{len(X_test_spam)} test points had neighbors)")

# Part B: Digits with Cosine distance
print("\nDIGITS RESULTS (Cosine Distance):")

# Try different radius values
radii_digits = [0.05, 0.1, 0.15, 0.2]

for radius in radii_digits:
    # Train predictions
    train_pred = knn_fixed_window(X_train_digits_norm, y_train_digits,
                                  X_train_digits_norm, radius, metric='cosine')
    train_acc = np.mean(train_pred == y_train_digits) * 100

    # Test predictions
    test_pred = knn_fixed_window(X_train_digits_norm, y_train_digits,
                                 X_test_digits_norm, radius, metric='cosine')
    test_acc = np.mean(test_pred == y_test_digits) * 100

    # Count how many test points had neighbors
    distances_test = pairwise_distances(X_test_digits_norm, X_train_digits_norm, metric='cosine')
    points_with_neighbors = np.sum(np.any(distances_test <= radius, axis=1))

    print(f"Radius={radius}: Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
    print(f"  ({points_with_neighbors}/{len(X_test_digits_norm)} test points had neighbors)")

PART B - FIXED WINDOW APPROACH

SPAMBASE RESULTS (Euclidean Distance):
Radius=0.5: Train Acc: 99.59%, Test Acc: 89.79%
  (283/921 test points had neighbors)
Radius=1.0: Train Acc: 99.18%, Test Acc: 89.58%
  (332/921 test points had neighbors)
Radius=1.5: Train Acc: 98.56%, Test Acc: 89.47%
  (424/921 test points had neighbors)
Radius=2.0: Train Acc: 97.50%, Test Acc: 89.36%
  (523/921 test points had neighbors)

DIGITS RESULTS (Cosine Distance):


In [None]:
def find_optimal_radius(X_train, y_train, X_test, y_test,
                       radius_range, metric='euclidean'):
    """
    Find the optimal radius for fixed window k-NN
    """
    best_radius = None
    best_test_acc = 0

    for radius in radius_range:
        test_pred = knn_fixed_window(X_train, y_train, X_test, radius, metric)
        test_acc = np.mean(test_pred == y_test) * 100

        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_radius = radius

    return best_radius, best_test_acc

radius_range_spam = np.linspace(0.5, 2.0, 20)
best_r_spam, best_acc_spam = find_optimal_radius(
    X_train_spam, y_train_spam, X_test_spam, y_test_spam,
    radius_range_spam, metric='euclidean'
)
print(f"\nOptimal radius for Spambase: {best_r_spam:.3f} (Test Acc: {best_acc_spam:.2f}%)")

radius_range_digits = np.linspace(0.05, 0.2, 20)
best_r_digits, best_acc_digits = find_optimal_radius(
    X_train_digits_norm, y_train_digits, X_test_digits_norm, y_test_digits,
    radius_range_digits, metric='cosine'
)
print(f"Optimal radius for Digits: {best_r_digits:.3f} (Test Acc: {best_acc_digits:.2f}%)")

## PART - C:

In [6]:
def knn_kernel_density_estimation(X_train, y_train, X_test, kernel='gaussian', gamma=1.0):
    """
    k-NN using Kernel Density Estimation with Bayesian probability

    Args:
        X_train: Training features
        y_train: Training labels
        X_test: Test features
        kernel: Type of kernel ('gaussian' or 'polynomial')
        gamma: Parameter for RBF kernel

    Returns:
        predictions: Array of predicted labels
    """
    classes = np.unique(y_train)
    n_classes = len(classes)
    n_test = X_test.shape[0]

    # class priors P(C) = count(C) / total
    class_priors = {}
    for c in classes:
        class_priors[c] = np.sum(y_train == c) / len(y_train)

    if kernel == 'gaussian':
        K = rbf_kernel(X_test, X_train, gamma=gamma)
    elif kernel == 'polynomial':
        K = polynomial_kernel(X_test, X_train, degree=2, coef0=1)

    # P(z|C) for each test pt and class:
    log_posteriors = np.zeros((n_test, n_classes))

    for i, c in enumerate(classes):
        class_mask = (y_train == c)
        mc = np.sum(class_mask)

        if mc > 0:
            p_z_given_c = np.sum(K[:, class_mask], axis=1) / mc

            epsilon = 1e-10
            p_z_given_c = np.maximum(p_z_given_c, epsilon)

            # log P(C|z) = log P(C) + log P(z|C)
            log_posteriors[:, i] = np.log(class_priors[c]) + np.log(p_z_given_c)

        predictions = classes[np.argmax(log_posteriors, axis=1)]

        return predictions

In [7]:
print("PART C - KERNEL DENSITY ESTIMATION")
print("="*50)
print("\nSPAMBASE RESULTS (Gaussian/RBF Kernel):")

gammas = [0.001, 0.01, 0.1, 1.0]

for gamma in gammas:
    train_pred = knn_kernel_density_estimation(
        X_train_spam, y_train_spam, X_train_spam,
        kernel='gaussian', gamma=gamma
    )
    train_acc = np.mean(train_pred == y_train_spam) * 100

    test_pred = knn_kernel_density_estimation(
        X_train_spam, y_train_spam, X_test_spam,
        kernel='gaussian', gamma=gamma
    )
    test_acc = np.mean(test_pred == y_test_spam) * 100

    print(f"Gamma={gamma}: Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")

print("\nFinding optimal gamma...")
best_gamma = 0.01
train_pred = knn_kernel_density_estimation(
    X_train_spam, y_train_spam, X_train_spam,
    kernel='gaussian', gamma=best_gamma
)
train_acc = np.mean(train_pred == y_train_spam) * 100

test_pred = knn_kernel_density_estimation(
    X_train_spam, y_train_spam, X_test_spam,
    kernel='gaussian', gamma=best_gamma
)
test_acc = np.mean(test_pred == y_test_spam) * 100

print(f"\nBest results with gamma={best_gamma}:")
print(f"Train Accuracy: {train_acc:.2f}%")
print(f"Test Accuracy: {test_acc:.2f}%")

PART C - KERNEL DENSITY ESTIMATION

SPAMBASE RESULTS (Gaussian/RBF Kernel):
Gamma=0.001: Train Acc: 38.67%, Test Acc: 42.35%
Gamma=0.01: Train Acc: 38.67%, Test Acc: 42.35%
Gamma=0.1: Train Acc: 38.67%, Test Acc: 42.35%
Gamma=1.0: Train Acc: 38.67%, Test Acc: 42.35%

Finding optimal gamma...

Best results with gamma=0.01:
Train Accuracy: 38.67%
Test Accuracy: 42.35%
