#### Applying Ray Tuner:

**Ray Tuner** here refers to **Ray Tune**, the hyperparameter-tuning library in the **Ray** ecosystem, which simplifies distributed hyperparameter tuning for machine learning models.

Ray itself is a framework for building and running distributed applications, and Ray Tune is specifically designed to optimize model performance by efficiently exploring hyperparameter spaces.

In [None]:
import tensorflow_datasets as tfds
import logging
from zipfile import ZipFile
import os
import sys
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [None]:
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from PIL import Image
import torch
from torchvision import transforms
import torchvision
import shutil

In [None]:
# Custom dataset class
class ImageDataset(Dataset):
    """Image-classification dataset backed by a CSV index file.

    The first CSV column is used as the image file name (joined onto
    ``img_dir``) and the second column as the integer class label.

    Args:
        csv_file: Path to the CSV index; loaded with ``pandas.read_csv``.
        img_dir: Directory that the file names from the CSV are joined onto.
        transform: Optional callable applied to the loaded PIL image before
            it is returned.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the CSV index."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair for row ``idx``.

        The image is always returned in RGB mode (before ``transform`` is
        applied), so every sample has three channels regardless of how the
        file is stored on disk.
        """
        img_path = os.path.join(self.img_dir, self.data_frame.iloc[idx, 0])

        # Image.open is lazy and keeps the file handle open. Loading inside a
        # context manager releases the handle; convert() returns a fully
        # loaded copy that stays valid after the file is closed.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        label = int(self.data_frame.iloc[idx, 1])

        if self.transform is not None:
            image = self.transform(image)

        return image, label


# Locations of the CSV split indexes and the image folder.
# Replace these with your own paths when running outside Kaggle.
train_csv_file = '/kaggle/input/deepweeds-image-classification/train_subset0.csv'
test_csv_file = '/kaggle/input/deepweeds-image-classification/test_subset0.csv'
val_csv_file = '/kaggle/input/deepweeds-image-classification/val_subset0.csv'
img_dir = '/kaggle/input/deepweeds-image-classification/images'


# Preprocessing applied to every image, in order:
#   1. resize to 128x128 (adjust as needed),
#   2. convert the image to a PyTorch tensor,
#   3. normalise each of the three channels as (x - 0.5) / 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)


# Build the train, test and validation datasets; all three share the same
# image directory and preprocessing pipeline.
train_dataset, test_dataset, val_dataset = (
    ImageDataset(csv_file=split_csv, img_dir=img_dir, transform=transform)
    for split_csv in (train_csv_file, test_csv_file, val_csv_file)
)


# Create DataLoaders for batching
def create_dataloader(loader_type, batch_size=16):
    """Build a ``DataLoader`` over one of the module-level dataset splits.

    Args:
        loader_type: ``'train'`` selects ``train_dataset`` (shuffled) and
            ``'val'`` selects ``val_dataset``. Any other value falls back to
            ``test_dataset``. Validation and test loaders are not shuffled.
        batch_size: Number of samples per batch. Defaults to 16, the value
            that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        The ``DataLoader`` for the requested split.
    """
    if loader_type == 'train':
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        print(f"len train_loader {len(train_loader)}")
        return train_loader
    if loader_type == 'val':
        return DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    # Everything else (including 'test') gets the test split.
    return DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Count the samples in each split and print one line per split.
num_train_samples, num_val_samples, num_test_samples = map(
    len, (train_dataset, val_dataset, test_dataset)
)
print(
    f"Number of training samples: {num_train_samples}",
    f"Number of validation samples: {num_val_samples}",
    f"Number of testing samples: {num_test_samples}",
    sep="\n",
)

Number of training samples: 10501
Number of validation samples: 3501
Number of testing samples: 3507


In [None]:
from torchvision import models
import torch.nn as nn
import torch.optim as optim


##### Example Workflow

In a typical Ray Tune setup:

* You define a search space for your model’s hyperparameters (e.g., learning rate, batch size).
* Specify a search algorithm (random search, grid search, etc.).
* Use a scheduler (e.g., ASHA) for early stopping or adaptive resource allocation.
* Ray Tune then launches trials, monitors their performance, and refines the search process to find the best set of hyperparameters.

Ray Tune is especially useful for models that are computationally intensive, such as deep learning models, or when tuning must be done at scale.

In [None]:
# Install Ray

!pip install ray



In [None]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler

2024-10-30 13:38:53,666	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-10-30 13:38:54,288	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
from ray import tune, train

# Training loop
def train_resnet50(config):
    """Ray Tune trainable: fine-tune a pretrained ResNet-50 and report metrics.

    After every epoch the mean training loss (``loss``), the mean validation
    loss (``val_loss``) and the validation accuracy (``val_accuracy``) are
    sent to Ray via ``train.report``.

    Args:
        config: Hyperparameter dict supplied by Ray Tune. Reads ``"lr"``
            (learning rate) and ``"optimizer"`` (``"Adam"`` or ``"SGD"``).
            An optional ``"epochs"`` entry overrides the default of 10
            training epochs.

    Raises:
        ValueError: If ``config["optimizer"]`` is not a supported name.
    """
    num_epochs = config.get("epochs", 10)

    model = models.resnet50(pretrained=True)
    # Replace the pretrained classification head with a 9-way output layer.
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 9)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    # Set up the loss function and the optimizer chosen by the search space.
    criterion = nn.CrossEntropyLoss()

    optimizer_name = config["optimizer"]
    if optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=config["lr"])
    elif optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=0.7)
    else:
        # Fail loudly instead of silently training with a different optimizer
        # than the one the trial claims to use.
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

    train_loader = create_dataloader(loader_type='train')
    val_loader = create_dataloader(loader_type='val')

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for i, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            if i % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

        train_loss = running_loss / len(train_loader)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                _, predicted = outputs.max(1)
                # The loss must be measured against the ground-truth labels,
                # not against the model's own predictions.
                loss = criterion(outputs, targets)

                val_loss += loss.item()
                val_total += targets.size(0)
                val_correct += predicted.eq(targets).sum().item()

        # Average the per-batch validation loss and compute accuracy.
        val_loss /= len(val_loader)
        val_accuracy = val_correct / val_total
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {val_accuracy:.4f}')

        # One report per epoch; each report counts as one training iteration
        # for Ray Tune.
        train.report({
            "loss": train_loss,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
        })


In [None]:
# Hyperparameter search space that Ray Tune samples from for each trial.
search_space = dict(
    # Learning rate: picked from exactly two candidates, 0.001 or 0.1
    # (a categorical choice, not a continuous distribution).
    lr=tune.choice([1e-3, 1e-1]),
    # Optimizer name, read by the trainable as config["optimizer"].
    optimizer=tune.choice(["Adam", "SGD"]),
)

# Shut down any Ray instance left over from an earlier cell run, then start
# a new one.
ray.shutdown()
ray.init()


2024-10-30 13:38:59,014	INFO worker.py:1753 -- Started a local Ray instance.


0,1
Python version:,3.10.14
Ray version:,2.24.0


[36m(train_resnet50 pid=372)[0m Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
  0%|          | 0.00/97.8M [00:00<?, ?B/s]
  7%|▋         | 6.88M/97.8M [00:00<00:01, 71.7MB/s]
 91%|█████████▏| 89.2M/97.8M [00:00<00:00, 166MB/s]
100%|██████████| 97.8M/97.8M [00:00<00:00, 151MB/s]
100%|██████████| 97.8M/97.8M [00:00<00:00, 163MB/s]


[36m(train_resnet50 pid=372)[0m len train_loader 657
[36m(train_resnet50 pid=372)[0m Epoch [1/10], Step [1/657], Loss: 2.2258
[36m(train_resnet50 pid=373)[0m len train_loader 657
[36m(train_resnet50 pid=372)[0m Epoch [1/10], Step [101/657], Loss: 1.7381[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(train_resnet50 pid=372)[0m Epoch [1/10], Step [201/657], Loss: 2.1544[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=373)[0m Epoch [1/10], Step [301/657], Loss: 0.8693[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=373)[0m Epoch [1/10], Step [401/657], Loss: 1.4604[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=373)[0m Epoch [1/10], Step [501/657], Loss: 0.7192[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=37

[36m(train_resnet50 pid=373)[0m Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
  0%|          | 0.00/97.8M [00:00<?, ?B/s]
 79%|███████▉  | 77.5M/97.8M [00:00<00:00, 166MB/s][32m [repeated 9x across cluster][0m


[36m(train_resnet50 pid=522)[0m len train_loader 657
[36m(train_resnet50 pid=373)[0m Epoch [2/10], Validation Accuracy: 0.6133
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [1/657], Loss: 2.2264
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [101/657], Loss: 1.4222[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [201/657], Loss: 1.8073[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [301/657], Loss: 1.3499[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [401/657], Loss: 1.4061[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [501/657], Loss: 1.0853[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=522)[0m Epoch [1/10], Step [601/657], Loss: 1.0242[32m [repeated 2x across cluster][0m
[36m(train_resnet50 pid=373)[0m Epoch [3/10], Validation Accuracy: 0.6718
[36m(train_resnet50 pid=373

In [None]:
# Set up the trial scheduler.
#
# ASHAScheduler stops poorly performing trials early so that resources go to
# the more promising configurations.
scheduler = ASHAScheduler(
    metric="loss",        # reported key used to rank trials (the mean training loss)
    mode="min",           # lower loss is better
    grace_period=1,       # every trial runs at least 1 reported epoch before it may be stopped
    reduction_factor=2,   # halving rate: roughly the top 1/2 of trials continue past each rung
    max_t=4,              # a trial is stopped once it has reported 4 epochs
)

In [None]:
# Launch the hyperparameter search with Ray Tune and keep the returned
# analysis object for inspecting the results afterwards.
analysis = tune.run(
    train_resnet50,                            # trainable executed once per trial
    config=search_space,                       # hyperparameters to sample from
    num_samples=3,                             # number of configurations to try
    scheduler=scheduler,                       # early-stopping policy
    resources_per_trial=dict(cpu=2, gpu=1),    # adjust according to your hardware
    progress_reporter=tune.CLIReporter(
        metric_columns=["loss", "val_loss", "val_accuracy"],
    ),
)

2024-10-30 13:39:00,360	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-10-30 13:39:08 (running for 00:00:00.28)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0/4 CPUs, 0/2 GPUs (0.0/1.0 accelerator_type:T4)
Result logdir: /tmp/ray/session_2024-10-30_13-38-54_743622_30/artifacts/2024-10-30_13-39-00/train_resnet50_2024-10-30_13-39-00/driver_artifacts
Number of trials: 3/3 (3 PENDING)
+----------------------------+----------+-------+-------+-------------+
| Trial name                 | status   | loc   |    lr | optimizer   |
|----------------------------+----------+-------+-------+-------------|
| train_resnet50_51ec7_00000 | PENDING  |       | 0.1   | SGD         |
| train_resnet50_51ec7_00001 | PENDING  |       | 0.001 | SGD         |
| train_resnet50_51ec7_00002 | PENDING  |       | 0.001 | SGD         |
+----------------------------+----------+-------+-------+-------------+


== Status ==
Current time: 2024-10-30 13:39:14 (running for 00:00:05.3

Trial name,loss,val_accuracy,val_loss
train_resnet50_51ec7_00000,1.56989,0.52128,0.694207
train_resnet50_51ec7_00001,0.783039,0.715795,0.313103
train_resnet50_51ec7_00002,0.954701,0.61411,0.416708


== Status ==
Current time: 2024-10-30 13:41:55 (running for 00:02:46.71)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: -1.5959437289012985
Logical resource usage: 4.0/4 CPUs, 2.0/2 GPUs (0.0/1.0 accelerator_type:T4)
Result logdir: /tmp/ray/session_2024-10-30_13-38-54_743622_30/artifacts/2024-10-30_13-39-00/train_resnet50_2024-10-30_13-39-00/driver_artifacts
Number of trials: 3/3 (1 PENDING, 2 RUNNING)
+----------------------------+----------+----------------+-------+-------------+---------+------------+----------------+
| Trial name                 | status   | loc            |    lr | optimizer   |    loss |   val_loss |   val_accuracy |
|----------------------------+----------+----------------+-------+-------------+---------+------------+----------------|
| train_resnet50_51ec7_00000 | RUNNING  | 172.19.2.2:372 | 0.1   | SGD         | 1.91762 |   0.493062 |       0.520137 |
| train_resnet50_51ec7_00001 | RUNNING  | 172.19.2.2:373 | 0.0

2024-10-30 13:49:37,565	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_resnet50_2024-10-30_13-39-00' in 0.0059s.
2024-10-30 13:49:37,570	INFO tune.py:1041 -- Total run time: 637.21 seconds (628.87 seconds for the tuning loop).


== Status ==
Current time: 2024-10-30 13:49:37 (running for 00:10:28.88)
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 4.000: -0.868869853788587 | Iter 2.000: -1.1685601514586939 | Iter 1.000: -1.38105472545827
Logical resource usage: 2.0/4 CPUs, 1.0/2 GPUs (0.0/1.0 accelerator_type:T4)
Result logdir: /tmp/ray/session_2024-10-30_13-38-54_743622_30/artifacts/2024-10-30_13-39-00/train_resnet50_2024-10-30_13-39-00/driver_artifacts
Number of trials: 3/3 (3 TERMINATED)
+----------------------------+------------+----------------+-------+-------------+----------+------------+----------------+
| Trial name                 | status     | loc            |    lr | optimizer   |     loss |   val_loss |   val_accuracy |
|----------------------------+------------+----------------+-------+-------------+----------+------------+----------------|
| train_resnet50_51ec7_00000 | TERMINATED | 172.19.2.2:372 | 0.1   | SGD         | 1.56989  |   0.694207 |       0.52128  |
| train_resnet50_51ec7_00001 | 

In [None]:
# Show the hyperparameters of the trial with the lowest reported "loss"
# (the mean training loss sent by train_resnet50).
analysis.get_best_config(metric="loss", mode="min")

{'lr': 0.001, 'optimizer': 'SGD'}