In [2]:
# Install PyTorch with CUDA support (modify according to your system and CUDA version)
!pip install torch torchvision torchaudio

# Install Ray and Ray Tune
!pip install ray[tune]

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [7]:
import os
import ray
from ray import tune
from ray.tune import Tuner, TuneConfig
from ray.air import RunConfig
from ray.tune.schedulers import ASHAScheduler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

import wandb
from tqdm import tqdm

# Disable torchvision progress bars
# NOTE(review): confirm that the installed torchvision honors this variable.
os.environ["TORCHVISION_DISABLE_PROGRESS"] = "1"

# =============================================================================
#                           WEIGHTS & BIASES INTEGRATION
# =============================================================================

# Configuration and login for Weights & Biases.
# The API key is read from the WANDB_API_KEY environment variable instead of
# being hardcoded in the notebook. When the variable is unset the key is None
# and wandb falls back to its own credential lookup / interactive prompt.
# SECURITY: any key that was previously committed in this cell must be treated
# as compromised and rotated in the W&B account settings.
config_file = {
    "wandb_api_key": os.environ.get("WANDB_API_KEY")
}
wandb.login(key=config_file["wandb_api_key"])

# =============================================================================
#                           MODEL DEFINITION
# =============================================================================

class CustomModel(nn.Module):
    """Fully connected classifier with a single hidden layer.

    Architecture: Linear(784 -> layer_size) -> ReLU -> Dropout -> Linear(layer_size -> 10).

    Args:
        layer_size: Number of units in the hidden layer.
        dropout_rate: Drop probability applied after the hidden activation.
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, layer_size=128, dropout_rate=0.3, **kwargs):
        super().__init__(**kwargs)
        # Sub-modules are created in the order fc1 -> dropout -> fc2.
        self.fc1 = nn.Linear(in_features=784, out_features=layer_size)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(in_features=layer_size, out_features=10)

    def forward(self, x):
        """Return the 10 raw class scores (logits) for each sample in ``x``."""
        # Collapse everything after the batch dimension (28x28 image -> 784 values).
        flat = x.flatten(start_dim=1)
        hidden = self.dropout(self.fc1(flat).relu())
        return self.fc2(hidden)

# =============================================================================
#                         TRAINING FUNCTION
# =============================================================================

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and its
    parameters are updated after every batch; otherwise the model is put in
    evaluation mode and gradients are disabled.

    The returned loss is the per-sample mean: each batch loss is weighted by
    the batch size, so a smaller final batch does not skew the average.
    This assumes ``criterion`` returns the mean loss over the batch.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_count = labels.size(0)
            loss_sum += loss.item() * batch_count
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += batch_count

    return loss_sum / seen, correct / seen


def train_model(config):
    """
    Training function that computes training and validation metrics,
    logs them to W&B as regular charts, and reports metrics to Ray Tune.

    Args:
        config: Mapping with the keys ``batch_size``, ``layer_size``,
            ``dropout_rate``, ``optimizer`` (``"adam"`` selects Adam, any other
            value selects SGD), ``lr`` and ``epochs``.

    The W&B run and the progress bar are opened as context managers so that
    both are closed even when an exception is raised during training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize a wandb run for this trial; it is finished when the block exits.
    with wandb.init(project="mnist_ray_tune", config=config) as run:
        # Load datasets
        train_dataset = MNIST(root="data", train=True, transform=ToTensor(), download=True)
        train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
        test_dataset = MNIST(root="data", train=False, transform=ToTensor(), download=True)
        test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

        # Build the model and move it to the target device *before* creating
        # the optimizer, so the optimizer is constructed over the parameters
        # that are actually trained.
        model = CustomModel(layer_size=config["layer_size"], dropout_rate=config["dropout_rate"])
        model.to(device)
        if config["optimizer"] == "adam":
            optimizer = optim.Adam(model.parameters(), lr=config["lr"])
        else:
            optimizer = optim.SGD(model.parameters(), lr=config["lr"])
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bar (one tick per epoch)
        with tqdm(total=config["epochs"], desc="Training", unit="epoch") as pbar:
            for epoch in range(config["epochs"]):
                train_loss, train_accuracy = _run_epoch(
                    model, train_loader, criterion, device, optimizer
                )
                val_loss, val_accuracy = _run_epoch(model, test_loader, criterion, device)

                metrics = {
                    "loss": train_loss,
                    "accuracy": train_accuracy,
                    "val_loss": val_loss,
                    "val_accuracy": val_accuracy,
                    "epoch": epoch + 1
                }

                # Log metrics to Weights & Biases as regular charts
                run.log(metrics)

                # Report metrics to Ray Tune (a copy, so neither consumer can
                # observe changes made by the other)
                tune.report(dict(metrics))

                # Print only the epoch summary for this trial (PID)
                print(f"Epoch {epoch+1}/{config['epochs']} - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}")
                pbar.update(1)

# =============================================================================
#                         RAY TUNE INTEGRATION
# =============================================================================

def run_ray_tune(search_space, num_samples=1, num_cpus=8, cpus_per_trial=None, gpus_per_trial=None):
    """
    Initialize Ray, configure Tune, and run the experiments.

    Args:
        search_space: Parameter space handed to ``Tuner`` as ``param_space``.
            If it is a dict with an integer ``"epochs"`` entry, that value is
            used as the scheduler's ``max_t``; otherwise ``max_t`` is 3.
        num_samples: Number of configurations sampled from ``search_space``.
        num_cpus: CPU count passed to ``ray.init``.
        cpus_per_trial: CPUs requested for each trial; defaults to ``num_cpus``.
        gpus_per_trial: GPUs requested for each trial. When ``None``, one GPU
            is requested if Ray reports at least one in the cluster, else none.

    Returns:
        The object returned by ``Tuner.fit()``.
    """
    ray.shutdown()
    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level="ERROR", include_dashboard=False)
    print("Ray available resources:", ray.available_resources())

    if cpus_per_trial is None:
        cpus_per_trial = num_cpus
    if gpus_per_trial is None:
        # Request the GPU explicitly when one exists.
        # NOTE(review): a trial that does not request a GPU is typically not
        # given access to one by Ray, so training would silently run on CPU —
        # confirm against the installed Ray version.
        gpus_per_trial = 1 if ray.cluster_resources().get("GPU", 0) >= 1 else 0

    # Keep the scheduler's iteration budget in line with the configured epochs
    # so that raising "epochs" is not cut short by a stale max_t.
    epochs = search_space.get("epochs") if isinstance(search_space, dict) else None
    max_t = max(epochs, 1) if isinstance(epochs, int) else 3

    # NOTE(review): "loss" is the training loss reported by train_model;
    # consider scheduling on "val_loss" instead.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_t,
        grace_period=1,
        reduction_factor=2
    )
    train_with_resources = tune.with_resources(
        train_model,
        resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial}
    )
    tuner = Tuner(
        train_with_resources,
        param_space=search_space,
        tune_config=TuneConfig(scheduler=scheduler, num_samples=num_samples),
        run_config=RunConfig(name="mnist_ray_tune", verbose=0)
    )
    results = tuner.fit()
    return results

# =============================================================================
#                      HYPERPARAMETER SEARCH SPACE
# =============================================================================

# Hyperparameters handed to the Tuner. Entries built with tune.* are sampled
# by Ray Tune; "epochs" is a plain constant.
search_space = dict(
    lr=tune.loguniform(1e-5, 1e-2),             # learning rate, log-uniform
    batch_size=tune.choice([32, 64, 128]),      # training mini-batch size
    optimizer=tune.choice(["adam", "sgd"]),     # optimizer name read by train_model
    layer_size=tune.randint(64, 256),           # hidden layer width
    dropout_rate=tune.uniform(0.1, 0.5),        # dropout probability
    epochs=3,                                   # epochs trained per trial
)

# =============================================================================
#                              MAIN FUNCTION
# =============================================================================

def main():
    """Run the hyperparameter search and always shut Ray down afterwards.

    Returns:
        Whatever ``run_ray_tune`` returns for the module-level ``search_space``.
    """
    try:
        return run_ray_tune(search_space)
    finally:
        # Runs even when tuning raises, so no Ray instance is left behind
        # in the notebook kernel.
        ray.shutdown()

if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Ray available resources: {'node:__internal_head__': 1.0, 'CPU': 8.0, 'object_store_memory': 3987170918.0, 'node:172.28.0.12': 1.0, 'memory': 9303398810.0, 'GPU': 1.0, 'accelerator_type:T4': 1.0}
+----------------------------------------------------------+
| Configuration for experiment     mnist_ray_tune          |
+----------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator   |
| Scheduler                        AsyncHyperBandScheduler |
| Number of trials                 1                       |
+----------------------------------------------------------+

View detailed results here: /root/ray_results/mnist_ray_tune
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-03-30_21-02-05_000604_652/artifacts/2025-03-30_21-02-09/mnist_ray_tune/driver_artifacts`


[36m(train_model pid=11185)[0m wandb: Currently logged in as: toribio-diego23 (Cooper-Union) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
[36m(train_model pid=11185)[0m wandb: Tracking run with wandb version 0.19.8
[36m(train_model pid=11185)[0m wandb: Run data is saved locally in /tmp/ray/session_2025-03-30_21-02-05_000604_652/artifacts/2025-03-30_21-02-09/mnist_ray_tune/working_dirs/train_model_3e711_00000_0_batch_size=32,dropout_rate=0.4256,layer_size=71,lr=0.0000,optimizer=adam_2025-03-30_21-02-09/wandb/run-20250330_210216-95xxd8n8
[36m(train_model pid=11185)[0m wandb: Run `wandb offline` to turn off syncing.
[36m(train_model pid=11185)[0m wandb: Syncing run eager-bird-5
[36m(train_model pid=11185)[0m wandb: ⭐️ View project at https://wandb.ai/Cooper-Union/mnist_ray_tune
[36m(train_model pid=11185)[0m wandb: 🚀 View run at https://wandb.ai/Cooper-Union/mnist_ray_tune/runs/95xxd8n8
  0%|          | 0.00/9.91M [00:00<?, ?B/s]
  1%|          | 65.

[36m(train_model pid=11185)[0m Epoch 1/3 - Loss: 1.6143, Accuracy: 0.5940


[36m(train_model pid=11185)[0m Training:  33%|███▎      | 1/3 [00:11<00:23, 11.78s/epoch]


[36m(train_model pid=11185)[0m Epoch 2/3 - Loss: 0.8894, Accuracy: 0.7739


[36m(train_model pid=11185)[0m Training:  67%|██████▋   | 2/3 [00:23<00:11, 11.89s/epoch]



