In [None]:
# Run the following cell if using Google Colab

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/cs340/project/models

!pip install git+https://github.com/openai/CLIP.git
!pip install cuml-cu12

In [11]:
import clip_feature_extractor
import numpy as np

from cuml.neighbors import KNeighborsClassifier
from cuml.metrics import accuracy_score
import cupy as cp
import cudf
from sklearn.model_selection import train_test_split

import time

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:

def train_knn(X_train, y_train, X_val, y_val, n_neighbors=5, algorithm='brute'):
    """
    Fit a cuML K-Nearest Neighbors classifier and score it on a validation set.

    Every input is turned into a CuPy array (unless it already is one) and cast:
    feature arrays to float32, label arrays to int32.

    Parameters:
    - X_train (np.ndarray or cp.ndarray): Training feature data.
    - y_train (np.ndarray or cp.ndarray): Training labels.
    - X_val (np.ndarray or cp.ndarray): Validation feature data.
    - y_val (np.ndarray or cp.ndarray): Validation labels.
    - n_neighbors (int): Number of neighbors to use (default: 5).
    - algorithm (str): Algorithm to compute the nearest neighbors (default: 'brute').

    Returns:
    - accuracy: Value returned by accuracy_score for the validation predictions.
    - model (cuml.neighbors.KNeighborsClassifier): The fitted classifier.
    """
    def to_device(array, target_dtype):
        """Return `array` as a CuPy array cast to `target_dtype`."""
        device_array = array if isinstance(array, cp.ndarray) else cp.asarray(array)
        return device_array.astype(target_dtype)

    # Features become float32 and labels int32 before being handed to cuML.
    train_features = to_device(X_train, cp.float32)
    train_labels = to_device(y_train, cp.int32)
    val_features = to_device(X_val, cp.float32)
    val_labels = to_device(y_val, cp.int32)

    # Build the classifier and fit it on the training portion.
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm)
    classifier.fit(train_features, train_labels)

    # Score the predictions made for the validation portion.
    val_predictions = classifier.predict(val_features)
    return accuracy_score(val_labels, val_predictions), classifier

def hyperparameter_search_knn(X_train, y_train, X_test, y_test, n_neighbors_list, algorithm='brute'):
    """
    Select n_neighbors for a KNN classifier on a validation split, then retrain
    on the full training data and evaluate on the held-out test set.

    The training data is split 80/20 (random_state=42). Every candidate in
    n_neighbors_list is trained on the 80% part and scored on the 20% part; the
    candidate with the highest validation accuracy wins, and the earliest
    candidate wins ties. The winner is then refit on all of X_train/y_train and
    scored on X_test/y_test.

    Parameters:
    - X_train (np.ndarray or cp.ndarray): Original training feature data.
    - y_train (np.ndarray or cp.ndarray): Original training labels.
    - X_test (np.ndarray or cp.ndarray): Testing feature data (held-out set).
    - y_test (np.ndarray or cp.ndarray): Testing labels (held-out set).
    - n_neighbors_list (iterable of int): n_neighbors values to test; must not be empty.
    - algorithm (str): Algorithm to compute the nearest neighbors (default: 'brute').

    Returns:
    - best_params (dict): {'n_neighbors': ..., 'algorithm': ...} of the winning candidate.
    - test_accuracy: Accuracy of the retrained model on the test set. The best
      validation accuracy is only printed, not returned.
    - best_model: Model retrained on the full training data with best_params.

    Raises:
    - ValueError: If n_neighbors_list contains no values.
    """
    # Materialise the candidates so len() works and one-shot iterables are
    # supported; fail early instead of hitting a ZeroDivisionError later on.
    candidates = list(n_neighbors_list)
    if not candidates:
        raise ValueError("n_neighbors_list must contain at least one value")

    # Create an 80/20 validation split from the training data
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    start = time.time()

    best_val_accuracy = None
    best_params = None

    for n_neighbors in candidates:
        print(f"Training with n_neighbors={n_neighbors}")
        # Only the score is needed here; the final model is refit below.
        accuracy, _ = train_knn(X_train_split, y_train_split, X_val, y_val,
                                n_neighbors=n_neighbors, algorithm=algorithm)
        print(f"Validation Accuracy: {accuracy:.4f}")

        # `best_params is None` guarantees a winner even if every candidate
        # scores 0.0; the strict '>' keeps the earliest candidate on ties.
        if best_params is None or accuracy > best_val_accuracy:
            best_val_accuracy = accuracy
            best_params = {'n_neighbors': n_neighbors, 'algorithm': algorithm}

    print("Best Hyperparameters:", best_params)
    print("Best Validation Accuracy:", best_val_accuracy)
    print("Time per model:", (time.time() - start) / len(candidates))

    # Refit the winning configuration on all training data and score it on the test set.
    test_accuracy, best_model = train_knn(X_train, y_train, X_test, y_test,
                                          n_neighbors=best_params['n_neighbors'],
                                          algorithm=best_params['algorithm'])
    print("Test Accuracy after retraining on full training data:", test_accuracy)

    return best_params, test_accuracy, best_model

# CIFAR10

In [7]:
# Fetch the CIFAR10 train/test arrays returned by the clip_feature_extractor module.
(
    X_train_CIFAR10,
    y_train_CIFAR10,
    X_test_CIFAR10,
    y_test_CIFAR10,
) = clip_feature_extractor.get_CIFAR10_features()

Files already downloaded and verified
Files already downloaded and verified
Loaded features from disk.


In [13]:
# Candidate k values run from 1 up to (but not including) floor(sqrt(n_train)),
# because the end of range() is exclusive.
k_max = int(np.sqrt(X_train_CIFAR10.shape[0]))
n_neighbors_list = range(1, k_max)

best_params_CIFAR10, best_accuracy_CIFAR10, best_model_CIFAR10 = hyperparameter_search_knn(
    X_train_CIFAR10,
    y_train_CIFAR10,
    X_test_CIFAR10,
    y_test_CIFAR10,
    n_neighbors_list,
)

Training with n_neighbors=1
Validation Accuracy: 0.9174
Training with n_neighbors=2
Validation Accuracy: 0.9126
Training with n_neighbors=3
Validation Accuracy: 0.9282
Training with n_neighbors=4
Validation Accuracy: 0.9291
Training with n_neighbors=5
Validation Accuracy: 0.9316
Training with n_neighbors=6
Validation Accuracy: 0.9312
Training with n_neighbors=7
Validation Accuracy: 0.9313
Training with n_neighbors=8
Validation Accuracy: 0.9329
Training with n_neighbors=9
Validation Accuracy: 0.9329
Training with n_neighbors=10
Validation Accuracy: 0.9324
Training with n_neighbors=11
Validation Accuracy: 0.9326
Training with n_neighbors=12
Validation Accuracy: 0.9329
Training with n_neighbors=13
Validation Accuracy: 0.9331
Training with n_neighbors=14
Validation Accuracy: 0.9318
Training with n_neighbors=15
Validation Accuracy: 0.9314
Training with n_neighbors=16
Validation Accuracy: 0.9311
Training with n_neighbors=17
Validation Accuracy: 0.9312
Training with n_neighbors=18
Validation 

# CIFAR100


In [14]:
# Fetch the CIFAR100 train/test arrays returned by the clip_feature_extractor module.
(
    X_train_CIFAR100,
    y_train_CIFAR100,
    X_test_CIFAR100,
    y_test_CIFAR100,
) = clip_feature_extractor.get_CIFAR100_features()


Files already downloaded and verified
Files already downloaded and verified
Extracting features from CIFAR100 dataset
Loaded previously extracted features from disk.


In [15]:
# Size the search from the CIFAR100 training set itself (the original read
# X_train_CIFAR10 here, which made this cell depend on the CIFAR10 cell).
# Candidate k values run from 1 up to (but not including) floor(sqrt(n_train)).
k_max = int(np.sqrt(X_train_CIFAR100.shape[0]))
n_neighbors_list = range(1, k_max)

best_params_CIFAR100, best_accuracy_CIFAR100, best_model_CIFAR100 = hyperparameter_search_knn(
    X_train_CIFAR100,
    y_train_CIFAR100,
    X_test_CIFAR100,
    y_test_CIFAR100,
    n_neighbors_list,
)

Training with n_neighbors=1
Validation Accuracy: 0.6636
Training with n_neighbors=2
Validation Accuracy: 0.6393
Training with n_neighbors=3
Validation Accuracy: 0.6713
Training with n_neighbors=4
Validation Accuracy: 0.6783
Training with n_neighbors=5
Validation Accuracy: 0.6874
Training with n_neighbors=6
Validation Accuracy: 0.6961
Training with n_neighbors=7
Validation Accuracy: 0.7021
Training with n_neighbors=8
Validation Accuracy: 0.7016
Training with n_neighbors=9
Validation Accuracy: 0.7075
Training with n_neighbors=10
Validation Accuracy: 0.7098
Training with n_neighbors=11
Validation Accuracy: 0.7083
Training with n_neighbors=12
Validation Accuracy: 0.7121
Training with n_neighbors=13
Validation Accuracy: 0.7134
Training with n_neighbors=14
Validation Accuracy: 0.7087
Training with n_neighbors=15
Validation Accuracy: 0.7114
Training with n_neighbors=16
Validation Accuracy: 0.7121
Training with n_neighbors=17
Validation Accuracy: 0.7114
Training with n_neighbors=18
Validation 

# MNIST

In [16]:
# Fetch the MNIST train/test arrays returned by the clip_feature_extractor module.
(
    X_train_MNIST,
    y_train_MNIST,
    X_test_MNIST,
    y_test_MNIST,
) = clip_feature_extractor.get_MNIST_features()

Using device: cuda


Extracting features: 100%|██████████| 600/600 [01:43<00:00,  5.79it/s]


Using device: cuda


Extracting features: 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


In [18]:
# Size the search from the MNIST training set itself (the original read
# X_train_CIFAR10 here, so the candidate range was derived from the wrong dataset).
# Candidate k values run from 1 up to (but not including) floor(sqrt(n_train)).
k_max = int(np.sqrt(X_train_MNIST.shape[0]))
n_neighbors_list = range(1, k_max)

best_params_MNIST, best_accuracy_MNIST, best_model_MNIST = hyperparameter_search_knn(
    X_train_MNIST,
    y_train_MNIST,
    X_test_MNIST,
    y_test_MNIST,
    n_neighbors_list,
)

Training with n_neighbors=1
Validation Accuracy: 0.9683
Training with n_neighbors=2
Validation Accuracy: 0.9633
Training with n_neighbors=3
Validation Accuracy: 0.9707
Training with n_neighbors=4
Validation Accuracy: 0.9697
Training with n_neighbors=5
Validation Accuracy: 0.9707
Training with n_neighbors=6
Validation Accuracy: 0.9713
Training with n_neighbors=7
Validation Accuracy: 0.9707
Training with n_neighbors=8
Validation Accuracy: 0.9700
Training with n_neighbors=9
Validation Accuracy: 0.9707
Training with n_neighbors=10
Validation Accuracy: 0.9694
Training with n_neighbors=11
Validation Accuracy: 0.9704
Training with n_neighbors=12
Validation Accuracy: 0.9697
Training with n_neighbors=13
Validation Accuracy: 0.9689
Training with n_neighbors=14
Validation Accuracy: 0.9683
Training with n_neighbors=15
Validation Accuracy: 0.9674
Training with n_neighbors=16
Validation Accuracy: 0.9668
Training with n_neighbors=17
Validation Accuracy: 0.9672
Training with n_neighbors=18
Validation 