## Problem 5: KNN

In [11]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances, rbf_kernel, polynomial_kernel
from scipy.stats import mode
import time

In [12]:
# load spambase:
data_spam = np.loadtxt("spambase/spambase.data", delimiter=",")
X_spam = data_spam[:, :-1]
y_spam = data_spam[:, -1]

scaler_spam = StandardScaler()
X_spam_norm = scaler_spam.fit_transform(X_spam)
X_train_spam, X_test_spam, y_train_spam, y_test_spam = train_test_split(
    X_spam_norm, y_spam, test_size=0.2, random_state=42)

# load digits:
X_train_digits = np.loadtxt("mnist_haar_bingyu/training_image.txt", delimiter=",")
y_train_digits = np.loadtxt("mnist_haar_bingyu/training_label.txt", delimiter=",")
X_test_digits = np.loadtxt("mnist_haar_bingyu/testing_image.txt", delimiter=",")
y_test_digits = np.loadtxt("mnist_haar_bingyu/testing_label.txt", delimiter=",")

scaler_digits = StandardScaler()
X_train_digits_norm = scaler_digits.fit_transform(X_train_digits)
X_test_digits_norm = scaler_digits.transform(X_test_digits)

## PART A:

In [15]:
def knn_classifier(X_train, y_train, X_test, k, metric='euclidean', batch_size=500):
    n_test = X_test.shape[0]
    predictions = np.zeros(n_test)
    n_batches = (n_test + batch_size - 1) // batch_size

    for batch_idx in range(n_batches):
        start = batch_idx*batch_size
        end = min(start + batch_size, n_test)
        batch = X_test[start:end]

        if metric in ['euclidean', 'cosine']:
            distances = pairwise_distances(batch, X_train, metric=metric)
        elif metric == 'gaussian' or metric == 'rbf':
            distances = -rbf_kernel(batch, X_train, gamma=1.0)
        elif metric == 'polynomial':
            distances = -polynomial_kernel(batch, X_train, degree=2, coef0=1)

        for i in range(distances.shape[0]):
            if k==1:
                predictions[start+i] = y_train[np.argmin(distances[i])]
            else:
                k_nearest_idx = np.argpartition(distances[i], k)[:k]
                k_nearest_labels = y_train[k_nearest_idx]
                predictions[start+i] = mode(k_nearest_labels, keepdims=True)[0][0]

        if batch_idx % max(1, n_batches // 10) == 0:
            print(f" Batch {batch_idx+1}/{n_batches} processed", end="\r")

    print()
    return predictions

A1 : Implementation of Spambase : (Euclidean)

In [20]:
print("\nA1: Spambase (Euclidean Distance)")
print("-"*40)
for k in [1, 3, 7]:
    print(f"\nk={k}:")
    
    # train
    start_time = time.time()
    train_pred = knn_classifier(X_train_spam, y_train_spam, X_train_spam, k, 
                          metric='euclidean', batch_size=500)
    train_acc = np.mean(train_pred == y_train_spam) * 100
    train_time = time.time() - start_time
    
    # test 
    start_time = time.time()
    test_pred = knn_classifier(X_train_spam, y_train_spam, X_test_spam, k, 
                          metric='euclidean', batch_size=500)
    test_acc = np.mean(test_pred == y_test_spam) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")


A1: Spambase (Euclidean Distance)
----------------------------------------

k=1:
 Batch 8/8 processed
 Batch 2/2 processed
  Train Accuracy: 99.95% (Time: 0.1s)
  Test Accuracy:  89.58% (Time: 0.0s)

k=3:
 Batch 8/8 processed
 Batch 2/2 processed
  Train Accuracy: 95.03% (Time: 0.7s)
  Test Accuracy:  89.36% (Time: 0.2s)

k=7:
 Batch 8/8 processed
 Batch 2/2 processed
  Train Accuracy: 92.83% (Time: 0.8s)
  Test Accuracy:  89.58% (Time: 0.2s)


A2: Implementation of Digits: (Cosine)

In [21]:
print("\nA2: Digits (Cosine Distance)")
print("-"*40)
for k in [1, 3, 7]:
    print(f"\nk={k}:")
    
    # Training accuracy
    start_time = time.time()
    train_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_train_digits_norm, k,
                          metric='cosine', batch_size=500)

    train_acc = np.mean(train_pred == y_train_digits) * 100
    train_time = time.time() - start_time
    
    # Test accuracy
    start_time = time.time()
    test_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_test_digits_norm, k, 
                         metric='cosine', batch_size=500)
    test_acc = np.mean(test_pred == y_test_digits) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")



A2: Digits (Cosine Distance)
----------------------------------------

k=1:
 Batch 271/300 processed
 Batch 46/50 processed
  Train Accuracy: 100.00% (Time: 22.9s)
  Test Accuracy:  92.74% (Time: 3.9s)

k=3:
 Batch 271/300 processed
 Batch 46/50 processed
  Train Accuracy: 96.43% (Time: 59.7s)
  Test Accuracy:  93.18% (Time: 10.1s)

k=7:
 Batch 271/300 processed
 Batch 46/50 processed
  Train Accuracy: 94.74% (Time: 60.0s)
  Test Accuracy:  93.26% (Time: 13.7s)


A3: Implementation of KNN using Digits: ( Gaussian / RBF ):

In [26]:
print("\nA3: Digits (Gaussian/RBF Kernel)")
print("-"*40)
for k in [1, 3, 7]:
    print(f"\nk={k}:")
    
    # Training accuracy 
    start_time = time.time()
    train_sample_idx = np.random.choice(len(X_train_digits_norm), 1000, replace=False)
    train_pred = knn_classifier(X_train_digits_norm, y_train_digits, 
                          X_train_digits_norm[train_sample_idx], k, 
                          metric='gaussian', batch_size=500)
    train_acc = np.mean(train_pred == y_train_digits[train_sample_idx]) * 100
    train_time = time.time() - start_time
    
    # Test accuracy
    start_time = time.time()
    test_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_test_digits_norm, k, 
                         metric='gaussian', batch_size=500)
    test_acc = np.mean(test_pred == y_test_digits) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s, sampled 1000)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")


A3: Digits (Gaussian/RBF Kernel)
----------------------------------------

k=1:
 Batch 2/2 processed
 Batch 19/20 processed
  Train Accuracy: 100.00% (Time: 0.5s, sampled 1000)
  Test Accuracy:  92.61% (Time: 5.1s)

k=3:
 Batch 2/2 processed
 Batch 19/20 processed
  Train Accuracy: 97.10% (Time: 1.0s, sampled 1000)
  Test Accuracy:  93.02% (Time: 10.1s)

k=7:
 Batch 2/2 processed
 Batch 19/20 processed
  Train Accuracy: 95.00% (Time: 1.0s, sampled 1000)
  Test Accuracy:  93.05% (Time: 10.2s)


A4: Implementation of KNN on Digits:(Polynomial kernel)

In [27]:
print("\nA4: Digits (Polynomial Kernel)")
print("-"*40)
for k in [1, 3, 7]:
    print(f"\nk={k}:")
    
    # Training accuracy 
    start_time = time.time()
    train_sample_idx = np.random.choice(len(X_train_digits_norm), 1000, replace=False)
    train_pred = knn_classifier(X_train_digits_norm, y_train_digits, 
                          X_train_digits_norm[train_sample_idx], k, 
                          metric='polynomial', batch_size=500)
    train_acc = np.mean(train_pred == y_train_digits[train_sample_idx]) * 100
    train_time = time.time() - start_time
    
    # Test accuracy
    start_time = time.time()
    test_pred = knn_classifier(X_train_digits_norm, y_train_digits, X_test_digits_norm, k, 
                         metric='polynomial', batch_size=500)
    test_acc = np.mean(test_pred == y_test_digits) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s, sampled 1000)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")


A4: Digits (Polynomial Kernel)
----------------------------------------

k=1:
 Batch 2/2 processed
 Batch 19/20 processed
  Train Accuracy: 95.10% (Time: 0.3s, sampled 1000)
  Test Accuracy:  86.67% (Time: 2.5s)

k=3:
 Batch 2/2 processed
 Batch 19/20 processed
  Train Accuracy: 92.10% (Time: 0.8s, sampled 1000)
  Test Accuracy:  88.53% (Time: 8.2s)

k=7:
 Batch 2/2 processed
 Batch 19/20 processed
  Train Accuracy: 90.70% (Time: 0.8s, sampled 1000)
  Test Accuracy:  89.24% (Time: 8.0s)


## PART - B:
Fixed Window with batch processing

In [30]:
def knn_fixed_window(X_train, y_train, X_test, radius, metric='euclidean', batch_size=500):
    n_test = X_test.shape[0]
    predictions = np.zeros(n_test)
    n_batches = (n_test + batch_size - 1) // batch_size

    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        end = min(start+batch_size, n_test)
        batch = X_test[start:end]

        distances = pairwise_distances(batch, X_train, metric=metric)

        for i in range(distances.shape[0]):
            neighbors_mask = distances[i] <= radius
            neighbors_labels = y_train[neighbors_mask]
            
            if len(neighbors_labels) > 0:
                predictions[start + i] = mode(neighbors_labels, keepdims=True)[0][0]
            else:
                predictions[start + i] = y_train[np.argmin(distances[i])]
    
    return predictions

In [33]:
print("\nB1: Spambase (Euclidean Distance)")
print("-"*40)
radii_spam = np.arange(0.5, 2.1, 0.1)
for radius in radii_spam:
    print(f"\nRadius={radius}:")
    
    # Training accuracy
    start_time = time.time()
    train_pred = knn_fixed_window(X_train_spam, y_train_spam, X_train_spam, 
                                        radius, metric='euclidean', batch_size=500)
    train_acc = np.mean(train_pred == y_train_spam) * 100
    train_time = time.time() - start_time
    
    # Test accuracy
    start_time = time.time()
    test_pred = knn_fixed_window(X_train_spam, y_train_spam, X_test_spam, 
                                       radius, metric='euclidean', batch_size=500)
    test_acc = np.mean(test_pred == y_test_spam) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")


B1: Spambase (Euclidean Distance)
----------------------------------------

Radius=0.5:
  Train Accuracy: 99.59% (Time: 0.5s)
  Test Accuracy:  89.79% (Time: 0.1s)

Radius=0.6:
  Train Accuracy: 99.54% (Time: 0.5s)
  Test Accuracy:  89.79% (Time: 0.1s)

Radius=0.7:
  Train Accuracy: 99.40% (Time: 0.5s)
  Test Accuracy:  89.47% (Time: 0.1s)

Radius=0.7999999999999999:
  Train Accuracy: 99.35% (Time: 0.5s)
  Test Accuracy:  89.47% (Time: 0.1s)

Radius=0.8999999999999999:
  Train Accuracy: 99.29% (Time: 0.6s)
  Test Accuracy:  89.58% (Time: 0.1s)

Radius=0.9999999999999999:
  Train Accuracy: 99.18% (Time: 0.6s)
  Test Accuracy:  89.58% (Time: 0.1s)

Radius=1.0999999999999999:
  Train Accuracy: 99.21% (Time: 0.5s)
  Test Accuracy:  89.47% (Time: 0.1s)

Radius=1.1999999999999997:
  Train Accuracy: 99.10% (Time: 0.5s)
  Test Accuracy:  89.47% (Time: 0.1s)

Radius=1.2999999999999998:
  Train Accuracy: 99.08% (Time: 0.5s)
  Test Accuracy:  90.01% (Time: 0.1s)

Radius=1.4:
  Train Accuracy: 98

In [35]:
print("\nB2: Digits (Cosine Distance)")
print("-"*40)
radii_digits = [0.05, 0.1, 0.15, 0.2]
for radius in radii_digits:
    print(f"\nRadius={radius}:")

    # train
    start_time = time.time()
    train_sample_idx = np.random.choice(len(X_train_digits_norm), 1000, replace=False)
    train_pred = knn_fixed_window(X_train_digits_norm, y_train_digits, 
                                        X_train_digits_norm[train_sample_idx], 
                                        radius, metric='cosine', batch_size=200)
    train_acc = np.mean(train_pred == y_train_digits[train_sample_idx]) * 100
    train_time = time.time() - start_time

    # test
    start_time = time.time()
    test_pred = knn_fixed_window(X_train_digits_norm, y_train_digits, X_test_digits_norm, 
                                       radius, metric='cosine', batch_size=200)
    test_acc = np.mean(test_pred == y_test_digits) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s, sampled 1000)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")


B2: Digits (Cosine Distance)
----------------------------------------

Radius=0.05:
  Train Accuracy: 100.00% (Time: 0.6s, sampled 1000)
  Test Accuracy:  92.75% (Time: 4.2s)

Radius=0.1:
  Train Accuracy: 99.00% (Time: 0.8s, sampled 1000)
  Test Accuracy:  92.97% (Time: 4.7s)

Radius=0.15:
  Train Accuracy: 96.60% (Time: 0.5s, sampled 1000)
  Test Accuracy:  92.93% (Time: 4.8s)

Radius=0.2:
  Train Accuracy: 94.80% (Time: 0.5s, sampled 1000)
  Test Accuracy:  92.44% (Time: 5.1s)


## PART - C:

In [36]:
def knn_kde(X_train, y_train, X_test, kernel='gaussian', gamma=0.01, batch_size=500):
    """
    Kernel Density Estimation k-NN with batch processing
    """
    classes = np.unique(y_train)
    n_test = X_test.shape[0]
    n_batches = (n_test + batch_size - 1) // batch_size
    predictions = np.zeros(n_test)

    class_masks = {}
    class_priors = {}
    for c in classes:
        mask = (y_train == c)
        class_masks[c] = np.where(mask)[0]
        class_priors[c] = len(class_masks[c]) / len(y_train)
    
    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, n_test)
        batch = X_test[start:end]
        batch_size_actual = end - start
        
        if kernel == 'gaussian' or kernel == 'rbf':
            K = rbf_kernel(batch, X_train, gamma=gamma)
        elif kernel == 'polynomial':
            K = polynomial_kernel(batch, X_train, degree=2, coef0=1)
        
        log_posteriors = np.zeros((batch_size_actual, len(classes)))
        
        for class_idx, c in enumerate(classes):
            class_indices = class_masks[c]
            mc = len(class_indices)
            
            if mc > 0:
                # P(z|C) for all points in batch
                p_z_given_c = np.sum(K[:, class_indices], axis=1) / mc
                p_z_given_c = np.maximum(p_z_given_c, 1e-10)
                
                # Log posterior
                log_posteriors[:, class_idx] = np.log(class_priors[c]) + np.log(p_z_given_c)
        
        predictions[start:end] = classes[np.argmax(log_posteriors, axis=1)]
    
    return predictions

In [40]:
print("\nC1: Spambase (Gaussian/RBF Kernel)")
print("-"*40)
gammas = np.arange(0.1, 1.1, 0.1)
for gamma in gammas:
    print(f"\nGamma={gamma}:")
    
    # Training accuracy
    start_time = time.time()
    train_pred = knn_kde(X_train_spam, y_train_spam, X_train_spam, 
                              kernel='gaussian', gamma=gamma, batch_size=500)
    train_acc = np.mean(train_pred == y_train_spam) * 100
    train_time = time.time() - start_time
    
    # Test accuracy
    start_time = time.time()
    test_pred = knn_kde(X_train_spam, y_train_spam, X_test_spam, 
                             kernel='gaussian', gamma=gamma, batch_size=500)
    test_acc = np.mean(test_pred == y_test_spam) * 100
    test_time = time.time() - start_time
    
    print(f"  Train Accuracy: {train_acc:.2f}% (Time: {train_time:.1f}s)")
    print(f"  Test Accuracy:  {test_acc:.2f}% (Time: {test_time:.1f}s)")


C1: Spambase (Gaussian/RBF Kernel)
----------------------------------------

Gamma=0.1:
  Train Accuracy: 80.82% (Time: 0.2s)
  Test Accuracy:  77.74% (Time: 0.0s)

Gamma=0.2:
  Train Accuracy: 88.64% (Time: 0.1s)
  Test Accuracy:  83.82% (Time: 0.0s)

Gamma=0.30000000000000004:
  Train Accuracy: 93.02% (Time: 0.1s)
  Test Accuracy:  86.86% (Time: 0.0s)

Gamma=0.4:
  Train Accuracy: 94.92% (Time: 0.1s)
  Test Accuracy:  88.38% (Time: 0.0s)

Gamma=0.5:
  Train Accuracy: 96.39% (Time: 0.1s)
  Test Accuracy:  89.14% (Time: 0.0s)

Gamma=0.6000000000000001:
  Train Accuracy: 97.17% (Time: 0.1s)
  Test Accuracy:  88.60% (Time: 0.0s)

Gamma=0.7000000000000001:
  Train Accuracy: 97.69% (Time: 0.1s)
  Test Accuracy:  88.27% (Time: 0.0s)

Gamma=0.8:
  Train Accuracy: 98.02% (Time: 0.1s)
  Test Accuracy:  88.27% (Time: 0.0s)

Gamma=0.9:
  Train Accuracy: 98.45% (Time: 0.1s)
  Test Accuracy:  87.62% (Time: 0.0s)

Gamma=1.0:
  Train Accuracy: 98.67% (Time: 0.1s)
  Test Accuracy:  87.73% (Time: 0.0

In [1]:
print("\nQUESTION 5 RESULTS SUMMARY")
print("="*70)

results = """
PART A - Standard k-NN:
-----------------------
Spambase + Euclidean:     k=1  → Train: 99.95%, Test: 89.58%
Digits + Cosine:          k=7  → Train: 94.74%, Test: 93.26%  [BEST]
Digits + Gaussian:        k=7  → Train: 95.00%, Test: 93.05%
Digits + Polynomial:      k=7  → Train: 90.70%, Test: 89.24%

PART B - Fixed Window:
----------------------
Spambase + Euclidean:     R=1.3 → Train: 99.08%, Test: 90.01%  [BEST]
Digits + Cosine:          R=0.1 → Train: 99.00%, Test: 92.97%

PART C - Kernel Density:
------------------------
Spambase + Gaussian:      γ=0.5 → Train: 96.39%, Test: 89.14%
"""

print(results)
print("="*70)


QUESTION 5 RESULTS SUMMARY

PART A - Standard k-NN:
-----------------------
Spambase + Euclidean:     k=1  → Train: 99.95%, Test: 89.58%
Digits + Cosine:          k=7  → Train: 94.74%, Test: 93.26%  [BEST]
Digits + Gaussian:        k=7  → Train: 95.00%, Test: 93.05%
Digits + Polynomial:      k=7  → Train: 90.70%, Test: 89.24%

PART B - Fixed Window:
----------------------
Spambase + Euclidean:     R=1.3 → Train: 99.08%, Test: 90.01%  [BEST]
Digits + Cosine:          R=0.1 → Train: 99.00%, Test: 92.97%

PART C - Kernel Density:
------------------------
Spambase + Gaussian:      γ=0.5 → Train: 96.39%, Test: 89.14%

