In [None]:
# Run the following cell if using Google Colab

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/cs340/project/models

!pip install git+https://github.com/openai/CLIP.git
!pip install cuml-cu12

In [6]:
import clip_feature_extractor
import numpy as np

from cuml.neighbors import KNeighborsClassifier
from cuml.metrics import accuracy_score
import cupy as cp
import cudf

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Fetch the CIFAR-100 train/test features and labels from the project's
# feature-extraction helper (presumably CLIP embeddings, cached on disk
# after the first run -- verify in clip_feature_extractor).
(
    X_train_CIFAR100,
    y_train_CIFAR100,
    X_test_CIFAR100,
    y_test_CIFAR100,
) = clip_feature_extractor.get_CIFAR100_features()


Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to /root/.cache/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:03<00:00, 48.6MB/s]


Extracting /root/.cache/cifar-100-python.tar.gz to /root/.cache
Files already downloaded and verified
Extracting features from CIFAR100 dataset
Loaded previously extracted features from disk.


In [41]:
def train_knn(X_train, y_train, X_test, y_test, n_neighbors=5, algorithm='brute', verbose=True):
    """
    Train a K-Nearest Neighbors classifier using RAPIDS cuML with GPU acceleration.

    All four inputs are converted to CuPy arrays (features as float32, labels
    as int32), a cuML ``KNeighborsClassifier`` is fitted on the training split,
    and its predictions on the test split are scored with ``accuracy_score``.

    Parameters:
    - X_train (np.ndarray or cp.ndarray): Training feature data.
    - y_train (np.ndarray or cp.ndarray): Training labels.
    - X_test (np.ndarray or cp.ndarray): Testing feature data.
    - y_test (np.ndarray or cp.ndarray): Testing labels.
    - n_neighbors (int): Number of neighbors to use (default: 5).
    - algorithm (str): Algorithm to compute the nearest neighbors (default: 'brute').
    - verbose (bool): If True (default), print the shapes of the four arrays
      as a sanity check. Pass False to silence the output, e.g. when calling
      this function repeatedly in a hyperparameter sweep.

    Returns:
    - accuracy (float): Classification accuracy on the test set.
    - model (cuml.neighbors.KNeighborsClassifier): Trained KNN model.

    Raises:
    - ValueError: If a feature array and its label array contain a different
      number of samples, or if ``n_neighbors`` is smaller than 1 or larger
      than the number of training samples.
    """

    def convert_and_cast(data, dtype):
        """
        Return ``data`` as a CuPy array of the requested dtype.

        ``cp.asarray`` accepts host and device arrays alike and only copies or
        casts when it has to, so an input that is already a CuPy array of the
        right dtype is passed through without an extra device allocation (the
        result may then share memory with the caller's array).

        Parameters:
        - data (np.ndarray or cp.ndarray): Input data.
        - dtype (data-type): Desired data type.

        Returns:
        - cp.ndarray: Converted and casted CuPy array.
        """
        return cp.asarray(data, dtype=dtype)

    # Combine conversion and type casting for all datasets
    X_train = convert_and_cast(X_train, cp.float32)
    X_test = convert_and_cast(X_test, cp.float32)
    y_train = convert_and_cast(y_train, cp.int32)
    y_test = convert_and_cast(y_test, cp.int32)

    # Fail fast with a readable message instead of a low-level error from the
    # estimator when features and labels are misaligned.
    n_train = X_train.shape[0]
    if n_train != y_train.shape[0]:
        raise ValueError(
            f"X_train has {n_train} samples but y_train has {y_train.shape[0]}"
        )
    if X_test.shape[0] != y_test.shape[0]:
        raise ValueError(
            f"X_test has {X_test.shape[0]} samples but y_test has {y_test.shape[0]}"
        )
    # A query cannot have more neighbors than there are training points.
    if not 1 <= n_neighbors <= n_train:
        raise ValueError(
            "n_neighbors must be between 1 and the number of training samples "
            f"({n_train}); got {n_neighbors}"
        )

    if verbose:
        # print shape of data as a sanity check
        print("X_train shape:", X_train.shape)
        print("y_train shape:", y_train.shape)
        print("X_test shape:", X_test.shape)
        print("y_test shape:", y_test.shape)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    return accuracy, knn


In [42]:
# Fit a 25-nearest-neighbour classifier on the CIFAR-100 features and
# report its accuracy on the test split.
accuracy, knn_model = train_knn(
    X_train_CIFAR100,
    y_train_CIFAR100,
    X_test_CIFAR100,
    y_test_CIFAR100,
    n_neighbors=25,
)
print(f"KNN Accuracy: {accuracy}")

X_train shape: (50000, 512)
y_train shape: (50000,)
X_test shape: (10000, 512)
y_test shape: (10000,)
KNN Accuracy: 0.7153000235557556
