In [4]:
!pip install torch torchvision torchaudio
!pip install ray[tune]
!pip install ray[default]
!pip install pyngrok

Collecting ray[tune]
  Downloading ray-2.44.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ray-2.44.1-cp311-cp311-manylinux2014_x86_64.whl (68.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.1/68.1 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX, ray
Successfully installed ray-2.44.1 tensorboardX-2.6.2.2
Collecting aiohttp_cors (from ray[default])
  Downloading aiohttp_cors-0.8.0-py3-none-any.whl.metadata (20 kB)
Collecting colorful (from ray[default])
  Downloading colorful-0.5.6-py2.py3-none-any.whl.metadata (16 kB)
Collecting py-spy>=0.2.0 (from ray[default])
  Downloading py_spy-0.4.0-py

In [5]:
import os
import json
import ray
from ray import tune
from ray.tune import Tuner, TuneConfig
from ray.air import RunConfig
from ray.tune.schedulers import ASHAScheduler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
import wandb
from tqdm import tqdm
from pyngrok import ngrok

# Single config dictionary with all parameters
config = {
    "wandb_api_key": "KEY",
    "ngrok_auth_token": "KEY",
    "resource": {
        "total_cpus": 12,
        "num_tasks": 12,
        "cpus_per_task": 1,
        "gpus_per_task": 0
    },
    "tune_params": {
        "lr": tune.loguniform(1e-5, 1e-2),
        "batch_size": tune.choice([32, 64, 128]),
        "optimizer": tune.choice(["adam", "sgd"]),
        "layer_size": tune.randint(64, 256),
        "dropout_rate": tune.uniform(0.1, 0.5),
        "epochs": 3
    }
}

wandb.login(key=config["wandb_api_key"])
ngrok.set_auth_token(config["ngrok_auth_token"])

class CustomModel(nn.Module):
    def __init__(self, layer_size=128, dropout_rate=0.3, **kwargs):
        super(CustomModel, self).__init__(**kwargs)
        self.fc1 = nn.Linear(784, layer_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(layer_size, 10)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

def train_model(tune_config):
    run = wandb.init(project="mnist_ray_tune", config=tune_config)
    train_dataset = MNIST(root="data", train=True, transform=ToTensor(), download=True)
    train_loader = DataLoader(train_dataset, batch_size=tune_config["batch_size"], shuffle=True)
    test_dataset = MNIST(root="data", train=False, transform=ToTensor(), download=True)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

    model = CustomModel(layer_size=tune_config["layer_size"], dropout_rate=tune_config["dropout_rate"])
    optimizer = optim.Adam(model.parameters(), lr=tune_config["lr"]) if tune_config["optimizer"] == "adam" else optim.SGD(model.parameters(), lr=tune_config["lr"])
    criterion = nn.CrossEntropyLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(tune_config["epochs"]):
        model.train()
        train_loss_sum, train_correct, train_total = 0.0, 0, 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss_sum += loss.item()
            _, predicted = torch.max(outputs, 1)
            train_correct += (predicted == labels).sum().item()
            train_total += labels.size(0)
        train_loss = train_loss_sum / len(train_loader)
        train_accuracy = train_correct / train_total

        model.eval()
        val_loss_sum, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss_val = criterion(outputs, labels)
                val_loss_sum += loss_val.item()
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)
        val_loss = val_loss_sum / len(test_loader)
        val_accuracy = val_correct / val_total

        wandb.log({
            "loss": train_loss,
            "accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "epoch": epoch + 1
        })
        tune.report({
            "loss": train_loss,
            "accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "epoch": epoch + 1
        })
    run.finish()

def run_ray_tune(cfg):
    ray.shutdown()
    ray.init(num_cpus=cfg["resource"]["total_cpus"],
             ignore_reinit_error=True,
             logging_level="ERROR",
             include_dashboard=True)
    public_url = ngrok.connect(8265, "http")
    print("Public URL for Ray Dashboard:", public_url)
    print("Ray available resources:", ray.available_resources())

    tuner = Tuner(
        tune.with_resources(train_model, resources={
            "cpu": cfg["resource"]["cpus_per_task"],
            "gpu": cfg["resource"]["gpus_per_task"]
        }),
        param_space=cfg["tune_params"],
        tune_config=TuneConfig(num_samples=cfg["resource"]["num_tasks"], scheduler=None),
        run_config=RunConfig(name="mnist_ray_tune", verbose=0)
    )
    results = tuner.fit()
    timeline_data = ray.timeline()
    with open("timeline.json", "w") as f:
        f.write(json.dumps(timeline_data, indent=2))
    return results

def main():
    run_ray_tune(config)
    ray.shutdown()

if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtoribio-diego23[0m ([33mCooper-Union[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Public URL for Ray Dashboard: NgrokTunnel: "https://748d-35-240-222-110.ngrok-free.app" -> "http://localhost:8265"
Ray available resources: {'accelerator_type:L4': 1.0, 'node:__internal_head__': 1.0, 'CPU': 12.0, 'memory': 39565255476.0, 'node:172.28.0.12': 1.0, 'object_store_memory': 16956538060.0, 'GPU': 1.0}
+--------------------------------------------------------+
| Configuration for experiment     mnist_ray_tune        |
+--------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator |
| Scheduler                        FIFOScheduler         |
| Number of trials                 12                    |
+--------------------------------------------------------+

View detailed results here: /root/ray_results/mnist_ray_tune
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-03-31_01-34-54_021914_1311/artifacts/2025-03-31_01-34-57/mnist_ray_tune/driver_artifacts`


[36m(train_model pid=4539)[0m wandb: Currently logged in as: toribio-diego23 (Cooper-Union) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
[36m(train_model pid=4539)[0m wandb: Tracking run with wandb version 0.19.8
[36m(train_model pid=4539)[0m wandb: Run data is saved locally in /tmp/ray/session_2025-03-31_01-34-54_021914_1311/artifacts/2025-03-31_01-34-57/mnist_ray_tune/working_dirs/train_model_5aa5c_00000_0_batch_size=128,dropout_rate=0.4329,layer_size=147,lr=0.0074,optimizer=adam_2025-03-31_01-34-57/wandb/run-20250331_013506-0eccc1w9
[36m(train_model pid=4539)[0m wandb: Run `wandb offline` to turn off syncing.
[36m(train_model pid=4539)[0m wandb: Syncing run feasible-microwave-61
[36m(train_model pid=4539)[0m wandb: ⭐️ View project at https://wandb.ai/Cooper-Union/mnist_ray_tune
[36m(train_model pid=4539)[0m wandb: 🚀 View run at https://wandb.ai/Cooper-Union/mnist_ray_tune/runs/0eccc1w9
[36m(train_model pid=4543)[0m wandb: 🚀 View run at https:




[36m(train_model pid=4542)[0m wandb: uploading output.log; uploading config.yaml[32m [repeated 2x across cluster][0m
[36m(train_model pid=4542)[0m wandb: Run history:[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb:     accuracy ▁▇█[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb:        epoch ▁▅█[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb:         loss █▂▁[32m [repeated 3x across cluster][0m
[36m(train_model pid=4550)[0m wandb: val_accuracy ▁▆█
[36m(train_model pid=4542)[0m wandb: Run summary:[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb:     accuracy 0.90555[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb:        epoch 3[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb:         loss 0.33301[32m [repeated 3x across cluster][0m
[36m(train_model pid=4542)[0m wandb: val_accuracy 0.9242[32m [repeated

In [4]:
!htop

/bin/bash: line 1: htop: command not found
