In [None]:
# Run the following cell if using Google Colab

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/cs340/project

!pip install git+https://github.com/openai/CLIP.git
!pip install cuml-cu12

In [None]:
import clip_feature_extractor
import numpy as np

from sklearn.model_selection import KFold
from cuml.linear_model import LogisticRegression as cuLogisticRegression
from cuml.preprocessing import StandardScaler as cuStandardScaler
import cupy as cp
import cudf

# Enable autoreload so edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2


stdout:



stderr:

Traceback (most recent call last):
  File "/home/tonyxdsu/.local/lib/python3.12/site-packages/numba/cuda/cudadrv/driver.py", line 254, in ensure_initialized
    self.cuInit(0)
  File "/home/tonyxdsu/.local/lib/python3.12/site-packages/numba/cuda/cudadrv/driver.py", line 327, in safe_cuda_api_call
    self._check_ctypes_error(fname, retcode)
  File "/home/tonyxdsu/.local/lib/python3.12/site-packages/numba/cuda/cudadrv/driver.py", line 395, in _check_ctypes_error
    raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [100] Call to cuInit results in CUDA_ERROR_NO_DEVICE

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/home/tonyxdsu/.local/lib/python3.12/site-packages/numba/cuda/cudadrv/driver.py", line 292, in __getattr__
    self.ensure_initialized()
  File "/home/tonyxdsu/.local/lib/python3.12/site-packages/numba/cuda/cudadrv/driver.py", line

In [2]:
def cuml_logreg_hyperpram_search(
    X_train, y_train, X_test, y_test,
    C_start=-4, C_stop=4, num_C=20, C_values=None,
    penalties=('l2', 'l1'), n_splits=2, max_iter=1000, verbose=False
):
    """
    Grid-search C and penalty for a RAPIDS cuML logistic regression with K-fold CV.

    Every (C, penalty) pair is scored by the mean of `model.score` over the
    validation folds. The best pair is then refit on the full training set and
    scored on the test set.

    Parameters:
    - X_train: Training features (NumPy or cuPy array)
    - y_train: Training labels (NumPy or cuPy array)
    - X_test: Testing features (NumPy or cuPy array)
    - y_test: Testing labels (NumPy or cuPy array)
    - C_start: Starting exponent for C values in logspace (default: -4)
    - C_stop: Ending exponent for C values in logspace (default: 4)
    - num_C: Number of C values to generate (default: 20)
    - C_values: Custom array of C values (overrides C_start, C_stop, num_C if provided)
    - penalties: Penalties to try; a single string is treated as one penalty
      (default: ('l2', 'l1'))
    - n_splits: Number of splits for cross-validation (default: 2)
    - max_iter: Maximum number of iterations for the solver (default: 1000)
    - verbose: Whether to print each candidate as it is trained (default: False)

    Returns:
    - best_model: The cuML LogisticRegression refit on the full (scaled) training set
      with the best hyperparameters
    - best_params: Dictionary with the best 'C', 'penalty' and 'max_iter'
    - best_score: Best mean cross-validation score, as a Python float
    - test_score: Score of the best model on the (scaled) test set

    Raises:
    - ValueError: if the grid contains no C values or no penalties, or if no
      candidate produced a comparable (non-NaN) cross-validation score.
    """
    # Move all inputs to cuPy arrays (cp.asarray returns cuPy inputs as-is).
    X_train, y_train, X_test, y_test = (
        cp.asarray(arr) for arr in (X_train, y_train, X_test, y_test)
    )

    # Scale the data using cuML's StandardScaler.
    # NOTE: the scaler is fit on the whole training set before the folds are cut,
    # so validation folds contribute to the scaling statistics used during CV.
    scaler = cuStandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    if C_values is None:
        C_values = cp.logspace(C_start, C_stop, num=num_C)
    # Bring the grid to the host once, as plain Python scalars, instead of
    # calling .item() on a 0-d device array for every candidate.
    C_grid = cp.asarray(C_values).ravel().tolist()

    # A bare string would otherwise be iterated character by character.
    penalties = [penalties] if isinstance(penalties, str) else list(penalties)

    if not C_grid or not penalties:
        raise ValueError(
            "Hyperparameter grid is empty: provide at least one C value and one penalty."
        )

    total_models = len(C_grid) * len(penalties) * n_splits
    print(f"Total number of models to train: {total_models}")

    best_score = float("-inf")
    best_params = None

    # random_state is fixed, so the split is identical for every candidate;
    # compute the fold indices once and reuse them.
    kf = KFold(n_splits, shuffle=True, random_state=42)
    folds = list(kf.split(X_train_scaled))

    for C in C_grid:
        for penalty in penalties:
            if verbose:
                print(f"  Training model with C={C}, penalty={penalty}")

            cv_scores = []
            for train_index, val_index in folds:
                # Train on the training part of the fold
                model = cuLogisticRegression(penalty=penalty, C=C, max_iter=max_iter)
                model.fit(X_train_scaled[train_index], y_train[train_index])

                # Evaluate on the held-out part of the fold
                cv_scores.append(model.score(X_train_scaled[val_index], y_train[val_index]))

            # float() so best_score is a host scalar rather than a 0-d cuPy array
            mean_cv_score = float(cp.mean(cp.array(cv_scores)))

            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_params = {'C': C, 'penalty': penalty, 'max_iter': max_iter}
                print(f">> New best score: {best_score}, with params: {best_params}")

    # NaN scores never compare greater than -inf, so best_params can still be unset.
    if best_params is None:
        raise ValueError("No candidate produced a valid (non-NaN) cross-validation score.")

    print(f"Best parameters found: {best_params}")
    print(f"Best cross-validation score: {best_score}")

    # Refit the best configuration on the entire training set, then score on the test set
    best_model = cuLogisticRegression(**best_params)
    best_model.fit(X_train_scaled, y_train)

    test_score = best_model.score(X_test_scaled, y_test)
    print(f"Test set accuracy: {test_score}")

    return best_model, best_params, best_score, test_score

# CIFAR10

In [4]:
# CIFAR10 train/test features and labels from the clip_feature_extractor helper module
(
    X_train_CIFAR10,
    y_train_CIFAR10,
    X_test_CIFAR10,
    y_test_CIFAR10,
) = clip_feature_extractor.get_CIFAR10_features()

Files already downloaded and verified
Files already downloaded and verified
Loaded features from disk.


# CIFAR100

In [None]:
# CIFAR100 train/test features and labels from the clip_feature_extractor helper module
(
    X_train_CIFAR100,
    y_train_CIFAR100,
    X_test_CIFAR100,
    y_test_CIFAR100,
) = clip_feature_extractor.get_CIFAR100_features()

In [9]:
# Coarse logistic-regression search on CIFAR100, using the function's default grid
best_model, best_params, best_score, test_score = cuml_logreg_hyperpram_search(
    X_train=X_train_CIFAR100,
    y_train=y_train_CIFAR100,
    X_test=X_test_CIFAR100,
    y_test=y_test_CIFAR100,
)

New best score: 0.7041399776935577, with params: {'C': 9.999999999999999e-05, 'penalty': 'l2', 'max_iter': 1000}
New best score: 0.7637400031089783, with params: {'C': 0.000774263682681127, 'penalty': 'l2', 'max_iter': 1000}
New best score: 0.7898200154304504, with params: {'C': 0.00599484250318941, 'penalty': 'l2', 'max_iter': 1000}
[W] [02:55:26.102354] QWL-QN: max iterations reached
[W] [02:55:26.103728] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [02:55:57.716029] QWL-QN: max iterations reached
[W] [02:55:57.718227] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [02:57:14.576602] QWL-QN: max iterations reached
[W] [02:57:14.577820] Maximum iterations reached before solver is converged. To increase model accuracy you

In [19]:
# Timing on a T4 GPU: about 2s per model when the solver converges quickly,
# and up to 18s when it hits max_iter = 1000 without converging.
#
# Result of the broad search:
#   Best parameters found: {'C': 0.00599484250318941, 'penalty': 'l2', 'max_iter': 1000}
#   Best cross-validation score: 0.7898200154304504
#   Test set accuracy: 0.8003000020980835
#
# Follow up with a finer grid around that value (note that we are biased towards
# C=0.316 with l2 regularization, the setting provided by OpenAI for CIFAR100).
best_model, best_params, best_score, test_score = cuml_logreg_hyperpram_search(
    X_train=X_train_CIFAR100,
    y_train=y_train_CIFAR100,
    X_test=X_test_CIFAR100,
    y_test=y_test_CIFAR100,
    C_start=-3,
    C_stop=0,
    num_C=40,
    C_values=None,
    penalties=['l2', 'l1'],
    n_splits=2,
    max_iter=1000,
    verbose=True,
)

Total number of models to train: 160
  Training model with C=0.001, penalty=l2
New best score: 0.7685000002384186, with params: {'C': 0.001, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.001, penalty=l1
  Training model with C=0.0011937766417144371, penalty=l2
New best score: 0.771340012550354, with params: {'C': 0.0011937766417144371, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.0011937766417144371, penalty=l1
  Training model with C=0.0014251026703029977, penalty=l2
New best score: 0.7743000090122223, with params: {'C': 0.0014251026703029977, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.0014251026703029977, penalty=l1
[W] [03:40:50.640331] QWL-QN stopped, because the line search failed to advance (step delta = 0.000000)
  Training model with C=0.0017012542798525892, penalty=l2
New best score: 0.776960015296936, with params: {'C': 0.0017012542798525892, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.0017012542798525892, pena

# MNIST

In [4]:
# MNIST train/test features and labels from the clip_feature_extractor helper module
(
    X_train_MNIST,
    y_train_MNIST,
    X_test_MNIST,
    y_test_MNIST,
) = clip_feature_extractor.get_MNIST_features()

Using device: cuda


Extracting features: 100%|██████████| 600/600 [02:54<00:00,  3.43it/s]


Using device: cuda


Extracting features: 100%|██████████| 100/100 [00:28<00:00,  3.55it/s]


In [8]:
# Coarse search on MNIST: default exponent range for C, sampled at 40 points
best_model, best_params, best_score, test_score = cuml_logreg_hyperpram_search(
    X_train=X_train_MNIST,
    y_train=y_train_MNIST,
    X_test=X_test_MNIST,
    y_test=y_test_MNIST,
    num_C=40,
    verbose=True,
)

Total number of models to train: 160
  Training model with C=9.999999999999999e-05, penalty=l2
>> New best score: 0.9599500000476837, with params: {'C': 9.999999999999999e-05, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=9.999999999999999e-05, penalty=l1
  Training model with C=0.0001603718743751331, penalty=l2
>> New best score: 0.964983344078064, with params: {'C': 0.0001603718743751331, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.0001603718743751331, penalty=l1
[W] [04:32:46.839278] QWL-QN stopped, because the line search failed to advance (step delta = 0.000000)
  Training model with C=0.00025719138090593444, penalty=l2
>> New best score: 0.9691666662693024, with params: {'C': 0.00025719138090593444, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.00025719138090593444, penalty=l1
  Training model with C=0.0004124626382901352, penalty=l2
>> New best score: 0.9724000096321106, with params: {'C': 0.0004124626382901352, 'penalty': 'l2', 'max_

In [15]:
# Fine search on MNIST around C=0.0464, l2 penalty only, with 4-fold cross-validation
best_model, best_params, best_score, test_score = cuml_logreg_hyperpram_search(
    X_train=X_train_MNIST,
    y_train=y_train_MNIST,
    X_test=X_test_MNIST,
    y_test=y_test_MNIST,
    C_start=-2,
    C_stop=-1,
    num_C=50,
    penalties=['l2'],
    n_splits=4,
    verbose=True,
)

Total number of models to train: 200
  Training model with C=0.01, penalty=l2
>> New best score: 0.9861833304166794, with params: {'C': 0.01, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.010481131341546858, penalty=l2
>> New best score: 0.9863499999046326, with params: {'C': 0.010481131341546858, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.010985411419875584, penalty=l2
>> New best score: 0.9865333288908005, with params: {'C': 0.010985411419875584, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.011513953993264476, penalty=l2
>> New best score: 0.9866166561841965, with params: {'C': 0.011513953993264476, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.01206792640639329, penalty=l2
>> New best score: 0.9866499900817871, with params: {'C': 0.01206792640639329, 'penalty': 'l2', 'max_iter': 1000}
  Training model with C=0.012648552168552958, penalty=l2
>> New best score: 0.9867333322763443, with params: {'C': 0.012648552168552958, 