In [None]:
# Run the following cell if using Google Colab

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/cs340/project/models

!pip install git+https://github.com/openai/CLIP.git
!pip install cuml-cu12

In [None]:
import clip_feature_extractor
import numpy as np

from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
import cupy as cp
import cudf

%load_ext autoreload
%autoreload 2

In [3]:
# Fetch the CIFAR-10 train/test features and labels from the project's
# clip_feature_extractor module (it reports when they come from a disk cache).
(
    X_train_CIFAR10,
    y_train_CIFAR10,
    X_test_CIFAR10,
    y_test_CIFAR10,
) = clip_feature_extractor.get_CIFAR10_features()


Files already downloaded and verified
Files already downloaded and verified
Loaded features from disk.


In [None]:
def train_random_forest(X_train, y_train, X_test, y_test,
                        n_estimators=100, max_depth=16, random_state=None):
    """
    Train a Random Forest classifier using RAPIDS cuML with GPU acceleration.

    Features are cast to float32 and labels to int32 on the GPU before the
    model is fit on the training split and scored on the test split.

    Parameters:
    - X_train (np.ndarray or cp.ndarray): Training feature data.
    - y_train (np.ndarray or cp.ndarray): Training labels.
    - X_test (np.ndarray or cp.ndarray): Testing feature data.
    - y_test (np.ndarray or cp.ndarray): Testing labels.
    - n_estimators (int): Number of trees in the forest (default: 100).
    - max_depth (int): Maximum depth of the trees (default: 16).
    - random_state (int or None): Optional seed forwarded to the cuML
      classifier. When None (default) the classifier is constructed exactly
      as before, i.e. unseeded. NOTE(review): cuML may still need
      n_streams=1 for fully repeatable forests -- confirm against its docs.

    Returns:
    - accuracy (float): Classification accuracy on the test set (0 to 1).
    - model (cuml.ensemble.RandomForestClassifier): Trained Random Forest model.

    Raises:
    - ValueError: If a feature array and its label array disagree on the
      number of samples.
    """

    def convert_and_cast(data, dtype):
        """
        Return `data` as a CuPy array of the requested dtype.

        Parameters:
        - data (np.ndarray or cp.ndarray): Input data.
        - dtype (data-type): Desired data type.

        Returns:
        - cp.ndarray: CuPy array with the requested dtype.
        """
        # Passing dtype straight to cp.asarray avoids the extra full-size
        # device allocation that "upload, then .astype()" creates, and it is
        # a no-op for inputs that are already CuPy arrays of the right dtype.
        return cp.asarray(data, dtype=dtype)

    # cuML's random forest is fed float32 features and int32 class labels.
    X_train = convert_and_cast(X_train, cp.float32)
    X_test = convert_and_cast(X_test, cp.float32)
    y_train = convert_and_cast(y_train, cp.int32)
    y_test = convert_and_cast(y_test, cp.int32)

    # Fail early with a readable message instead of an opaque GPU-side error.
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError(
            f"X_train has {X_train.shape[0]} samples but y_train has "
            f"{y_train.shape[0]} labels"
        )
    if X_test.shape[0] != y_test.shape[0]:
        raise ValueError(
            f"X_test has {X_test.shape[0]} samples but y_test has "
            f"{y_test.shape[0]} labels"
        )

    # Only forward random_state when the caller asked for it, so the default
    # call constructs the classifier exactly as it always has.
    rf_kwargs = {"n_estimators": n_estimators, "max_depth": max_depth}
    if random_state is not None:
        rf_kwargs["random_state"] = random_state

    rf = RandomForestClassifier(**rf_kwargs)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    return accuracy, rf

In [None]:
# Larger forest on the CIFAR-10 features. With n_estimators=1000 and
# max_depth=32 this run was observed to use about 2 GB of GPU memory.
accuracy, model = train_random_forest(
    X_train_CIFAR10,
    y_train_CIFAR10,
    X_test_CIFAR10,
    y_test_CIFAR10,
    n_estimators=1000,
    max_depth=32,
)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9330000281333923


In [None]:
# GPU memory stress test on synthetic data: find the point where training
# runs out of memory.
# Observed so far: n_classes = 100 crashes, and CIFAR100 crashes as well.

# Problem size
n_train = 50000
n_test = 10000
n_features = 500   # feature columns per sample
n_classes = 100    # number of distinct labels (multi-class, not binary)

# Seed NumPy's global RNG so the random draws below are repeatable
np.random.seed(42)

# Training split: standard-normal features, uniformly random integer labels
X_train = np.random.standard_normal(size=(n_train, n_features))
y_train = np.random.randint(0, n_classes, size=n_train)

# Test split: n_test samples drawn the same way
X_test = np.random.standard_normal(size=(n_test, n_features))
y_test = np.random.randint(0, n_classes, size=n_test)

# Fit with the function's default hyperparameters and report test accuracy
accuracy, model = train_random_forest(X_train, y_train, X_test, y_test)
print(f"Test accuracy: {accuracy:.4f}")

: 