In [1]:
!pip install git+https://github.com/openai/CLIP.git
!pip uninstall -y -q pyarrow
!pip install -q -U ray[tune]
!pip install -q ray[debug]

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-twvhyxx8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-twvhyxx8
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 6.2.0 requires pyarrow<13,>=2, but you have pyarrow 14.0.1 which is incompatible.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 14.0.1 which is incompatible.[0m[31m
[0m

In [3]:
# from google.colab import files
# files.upload()

from google.colab import drive
drive.mount('/content/gdrive')
!unzip /content/gdrive/MyDrive/DL/100-bird-species.zip -d /content/100-bird-species

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torchvision

import clip
from PIL import Image
import numpy as np

import os
from functools import partial

from ray import train, tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [5]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernels synchronously so that errors are
# reported at the call that caused them.
# NOTE(review): this slows GPU work down; consider removing it once debugging is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Device used by the notebook-level cells; the bare name below displays it as the cell output.
device = get_default_device()
device

device(type='cuda')

In [6]:
# Load the CLIP ViT-B/32 model onto `device`. `clip.load` returns the model together with the image
# preprocessing callable that the evaluation/training code applies to every image.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [7]:
# Root of the unpacked dataset and its three split directories (each path keeps its trailing slash).
data_dir = '/content/100-bird-species/'
train_dir, valid_dir, test_dir = (
    data_dir + split for split in ('train/', 'valid/', 'test/')
)

In [8]:
# Dataset-level transforms applied by ImageFolder when an image is loaded.
# Both pipelines resize to 64x64 and convert to a tensor; only the training pipeline adds a random
# horizontal flip as augmentation.
train_transform = transforms.Compose([
  transforms.Resize((64, 64)),
  # transforms.Resize((32, 32)),
  transforms.RandomHorizontalFlip(),
  transforms.ToTensor()
])

# Validation pipeline: same resize as training, no random augmentation.
val_transform = transforms.Compose([
  transforms.Resize((64, 64)),
  # transforms.Resize((32, 32)),
  transforms.ToTensor()
])

In [9]:
def accuracy(outputs, labels):
    """Return the fraction of rows in `outputs` whose arg-max along dim 1 equals `labels`.

    The result is a 0-dim tensor built from a Python float, so it does not live on the
    device of `outputs`.
    """
    predicted = outputs.argmax(dim=1)
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

class ImageClassificationBase(nn.Module):
    """Mixin-style base class that adds loss/metric helpers to a classifier.

    Subclasses provide `forward`; the helpers here call the module on a batch and use
    cross-entropy as the loss.
    """

    def training_step(self, images, labels):
        """Run a forward pass on `images` and return the cross-entropy loss against `labels`."""
        logits = self(images)
        return F.cross_entropy(logits, labels)

    def validation_step(self, images, labels):
        """Return the detached loss and the accuracy for one validation batch."""
        logits = self(images)
        batch_loss = F.cross_entropy(logits, labels)
        batch_acc = accuracy(logits, labels)
        return {'val_loss': batch_loss.detach(), 'val_acc': batch_acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch dicts from `validation_step` into plain Python floats."""
        def _mean_of(key):
            # Unweighted mean over batches (every batch counts equally, whatever its size).
            return torch.stack([step[key] for step in outputs]).mean().item()

        return {'val_loss': _mean_of('val_loss'), 'val_acc': _mean_of('val_acc')}

    def epoch_end(self, epoch, result):
        """Print a one-line summary; `result` must hold 'lrs', 'train_loss', 'val_loss', 'val_acc'."""
        summary = "Epoch [{}], last_lr: {:.5f}, train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['lrs'][-1], result['train_loss'], result['val_loss'], result['val_acc'])
        print(summary)

In [10]:
# Number of output classes of the classification head.
num_classes = 525

class BirdSpeciesModel(ImageClassificationBase):
    """CLIP image encoder followed by a small MLP classification head.

    The encoder is a private float32 copy of `clip_model.visual`:
    * `clip.load` keeps the weights in float16 when the model is loaded on CUDA, which does not
      match the float32 image batches fed to `forward` nor the float32 head, and is poorly suited
      to optimizing with Adam. Converting to float32 keeps every tensor in one dtype.
    * Copying means fine-tuning one instance never mutates the shared global `clip_model`
      (or another `BirdSpeciesModel` instance).
    """

    def __init__(self):
        import copy  # local import: keeps this cell independent of where `copy` is imported

        super().__init__()
        self.clip = copy.deepcopy(clip_model.visual).float()
        # The head maps the 512-dim encoder output to `num_classes` logits.
        self.network = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes))

    def forward(self, xb):
        """Encode a batch of preprocessed images and return the class logits."""
        features = self.clip(xb)
        return self.network(features)

In [11]:
from tqdm import tqdm
import copy

# Converts an image tensor back into a PIL image; the code below applies it to each dataset tensor
# before handing the result to the CLIP `preprocess` callable.
transform = transforms.ToPILImage()

@torch.no_grad()
def evaluate(model, val_loader):
    """Evaluate `model` on every batch of `val_loader` with gradients disabled.

    Each image tensor is converted back to a PIL image and passed through the CLIP
    `preprocess` callable; the batch is then stacked and moved to `device` in one transfer
    (the previous version moved every single image to the device and back before stacking).

    Returns:
        The dict produced by `model.validation_epoch_end`.
    """
    model.eval()
    outputs = []
    for images, labels in tqdm(val_loader):
        batch = torch.stack([preprocess(transform(image)) for image in images]).to(device)
        labels = labels.to(device)
        outputs.append(model.validation_step(batch, labels))
    return model.validation_epoch_end(outputs)

In [18]:
def create_model():
    """Build a `BirdSpeciesModel`, place it on the best available device and return both.

    Returns:
        (model, device) where device is the string "cuda:0" when CUDA is available, else "cpu".
    """
    net = BirdSpeciesModel()
    use_cuda = torch.cuda.is_available()
    target = "cuda:0" if use_cuda else "cpu"
    if use_cuda and torch.cuda.device_count() > 1:
        # NOTE(review): nn.DataParallel wraps forward() only; callers that use custom methods such
        # as training_step/validation_step would need `model.module` - confirm before running
        # with more than one visible GPU.
        net = nn.DataParallel(net)
    net.to(target)
    return net, target

def load_checkpoint(model, optimizer, checkpoint_dir):
    """Restore `model` and `optimizer` from the checkpoint Ray hands to this trial, if any.

    The `checkpoint_dir` argument is not read: the directory always comes from
    `train.get_checkpoint()`. The file "checkpoint.pt" inside it must hold a
    `(model_state, optimizer_state)` tuple.
    """
    restored = train.get_checkpoint()
    if not restored:
        return
    with restored.as_directory() as restored_dir:
        state_file = os.path.join(restored_dir, "checkpoint.pt")
        model_state, optimizer_state = torch.load(state_file)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

def get_data_loaders(batch_size, num_workers=8):
    """Create the training and validation `DataLoader`s over the ImageFolder datasets.

    Args:
        batch_size: Number of images per batch for both loaders.
        num_workers: Worker processes per loader (default 8, the previously hard-coded value).

    Returns:
        (train_loader, valid_loader)
    """
    train_dataset = ImageFolder(train_dir, transform=train_transform)
    valid_dataset = ImageFolder(valid_dir, transform=val_transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # Validation is not shuffled: the metrics are an unweighted mean over batches, so a shuffled,
    # partially filled last batch would make the reported loss/accuracy vary from run to run.
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, valid_loader

def train_one_epoch(epoch, model, train_loader, optimizer, sched, preprocess, device, config):
    """Train `model` for one pass over `train_loader`.

    Args:
        epoch: Zero-based epoch index (only used in the progress message).
        model: Model exposing `training_step(images, labels)`.
        train_loader: Iterable of (images, labels) batches.
        optimizer: Optimizer stepped once per batch.
        sched: Learning-rate scheduler stepped once per batch.
        preprocess: Per-image preprocessing callable forwarded to `preprocess_image`.
        device: Device the batch tensors are moved to.
        config: Mapping of hyperparameters; a truthy "grad_clip" enables gradient value clipping.
    """
    model.train()
    running_loss = 0.0
    running_steps = 0
    grad_clip = config.get("grad_clip")

    for i, (images, labels) in enumerate(tqdm(train_loader)):
        new_images = np.asarray([preprocess_image(image, preprocess, device) for image in images])
        new_images = torch.from_numpy(new_images).to(device)
        # Labels must live on the same device as the model output for the loss computation.
        labels = labels.to(device)

        optimizer.zero_grad()

        loss = model.training_step(new_images, labels)
        loss.backward()

        if grad_clip:
            nn.utils.clip_grad_value_(model.parameters(), grad_clip)

        optimizer.step()
        sched.step()

        running_loss += loss.item()
        running_steps += 1

        if i % 2000 == 1999:
            print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / running_steps))
            # Reset both accumulators so the next message averages only the following window.
            running_loss = 0.0
            running_steps = 0
            train_loss = 0.0

def validate(model, valid_loader, preprocess, device):
    """Compute validation metrics for `model` over `valid_loader`.

    Runs in eval mode with gradient tracking disabled, so no autograd graph is built for the
    forward passes.

    Returns:
        The dict produced by `model.validation_epoch_end`.
    """
    model.eval()
    outputs = []

    with torch.no_grad():
        for images, labels in tqdm(valid_loader):
            new_images = np.asarray([preprocess_image(image, preprocess, device) for image in images])
            new_images = torch.from_numpy(new_images).to(device)
            labels = labels.to(device)

            outputs.append(model.validation_step(new_images, labels))

    return model.validation_epoch_end(outputs)

def train_BirdSpecies(config, epochs=10):
    """Ray Tune trainable: train a `BirdSpeciesModel` and report validation metrics every epoch.

    Args:
        config: Hyperparameters for this trial. Reads "max_lr", "weight_decay" and "batch_size"
            here; the mapping is also forwarded to `train_one_epoch`.
        epochs: Number of epochs to train; also sizes the one-cycle learning-rate schedule.

    After each epoch the model and optimizer state are saved as "checkpoint.pt" (the file name
    `load_checkpoint` reads) and reported to Ray together with the metrics "loss" and "accuracy".
    """
    import tempfile  # local import: only needed for the per-epoch checkpoint directory

    model, device = create_model()
    optimizer = torch.optim.Adam(model.parameters(), config["max_lr"], weight_decay=config["weight_decay"])

    # The directory argument is unused by load_checkpoint; the checkpoint comes from Ray.
    load_checkpoint(model, optimizer, None)

    # The loaders must exist before the scheduler is built, because the schedule length depends on
    # the number of batches per epoch.
    train_loader, valid_loader = get_data_loaders(int(config["batch_size"]))

    sched = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, config["max_lr"], epochs=epochs, steps_per_epoch=len(train_loader))

    for epoch in range(epochs):
        train_one_epoch(epoch, model, train_loader, optimizer, sched, preprocess, device, config)
        result = validate(model, valid_loader, preprocess, device)

        metrics = {"loss": result['val_loss'], "accuracy": result['val_acc']}
        # Write the checkpoint into a scratch directory and hand it to Ray with the metrics;
        # the report must happen while the directory still exists.
        with tempfile.TemporaryDirectory() as scratch_dir:
            torch.save((model.state_dict(), optimizer.state_dict()),
                       os.path.join(scratch_dir, "checkpoint.pt"))
            train.report(metrics, checkpoint=train.Checkpoint.from_directory(scratch_dir))

    print("Finished Training")


In [12]:
def preprocess_image(image, preprocess, device):
    """Convert one image tensor to a preprocessed NumPy array.

    The tensor is turned back into a PIL image, passed through `preprocess`, and returned as a
    NumPy array with singleton dimensions removed.

    Args:
        image: Image tensor accepted by the module-level `transform` (ToPILImage).
        preprocess: Callable mapping a PIL image to a tensor.
        device: Kept for backward compatibility and no longer used: the result is always a host
            array, so the previous move to `device` and straight back only added a transfer.
    """
    processed = preprocess(transform(image))
    # squeeze() matches the previous unsqueeze(0)/squeeze() pair; cpu() is a no-op for host tensors.
    return np.asarray(processed.squeeze().cpu())

# def train_BirdSpecies(config, epochs=10):
#     model = BirdSpeciesModel()

#     device = "cpu"
#     if torch.cuda.is_available():
#         device = "cuda:0"
#         if torch.cuda.device_count() > 1:
#             model = nn.DataParallel(model)
#     model.to(device)

#     train_dataset = ImageFolder(train_dir, transform=train_transform)
#     valid_dataset = ImageFolder(valid_dir,transform=val_transform)
#     # test_dataset = ImageFolder(test_dir,transform=val_transform)

#     train_loader = DataLoader(train_dataset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8)
#     valid_loader = DataLoader(valid_dataset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8)
#     # test_loader = DataLoader(test_dataset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8)

#     optimizer = torch.optim.Adam(model.parameters(), config["max_lr"], weight_decay=config["weight_decay"])

#     checkpoint: train.Checkpoint = train.get_checkpoint()
#     if checkpoint:
#         with checkpoint.as_directory() as checkpoint_dir:
#             model_state, optimizer_state = torch.load(
#                 os.path.join(checkpoint_dir, "checkpoint.pt"))
#             model.load_state_dict(model_state)
#             optimizer.load_state_dict(optimizer_state)

#     sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, config["max_lr"], epochs=epochs, steps_per_epoch=len(train_loader))

#     for epoch in range(epochs):  # loop over the dataset multiple times
#         # Training Phase
#         model.train()
#         train_loss = 0.0
#         epoch_steps = 0
#         for i, (images, labels) in enumerate(tqdm(train_loader)):
#             new_images = [preprocess_image(image, preprocess, device) for image in images]
#             new_images = np.asarray(new_images)
#             new_images = torch.from_numpy(new_images).to(device)

#             # zero the parameter gradients
#             optimizer.zero_grad()

#             # forward + backward + optimize
#             loss = model.training_step(new_images, labels)
#             loss.backward()
#             # Gradient clipping
#             if(config["grad_clip"]):
#                 nn.utils.clip_grad_value_(model.parameters(), config["grad_clip"])
#             optimizer.step()
#             # Update learning rate
#             sched.step()
#             train_loss += loss.item()
#             epoch_steps += 1
#             if i % 2000 == 1999:  # print every 2000 mini-batches
#                 print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
#                                                 train_loss / epoch_steps))
#                 train_loss = 0.0

#         # Validation phase
#         model.eval()
#         outputs = list()
#         for images, labels in tqdm(valid_loader):
#             new_images = [preprocess_image(image, preprocess, device) for image in images]
#             new_images = np.asarray(new_images)
#             new_images = torch.from_numpy(new_images).to(device)
#             labels = labels.to(device)

#             outputs.append(model.validation_step(new_images, labels))
#         result = model.validation_epoch_end(outputs)

#         with tune.checkpoint_dir(epoch) as checkpoint_dir:
#             path = os.path.join(checkpoint_dir, "checkpoint")
#             torch.save((model.state_dict(), optimizer.state_dict()), path)

#         tune.report(loss=result['val_loss'], accuracy=result['val_acc'])
#     print("Finished Training")

In [19]:
# Hyperparameter space passed to Ray Tune: the first two entries are sampled per trial,
# the last two are fixed values shared by every trial.
config = {
    "max_lr": tune.loguniform(1e-4, 1e-1),  # peak learning rate, sampled log-uniformly
    "batch_size": tune.choice([2, 4, 8, 16]),  # one of the listed batch sizes
    "grad_clip": 0.1,  # gradient value clipping threshold used in training
    "weight_decay": 1e-4  # weight decay passed to the optimizer
}

In [23]:
import ray
# Shut down any Ray instance left over from an earlier run of this cell, then start a fresh
# local instance so the cell can be re-run safely.
ray.shutdown()
ray.init()

2023-12-04 13:17:30,441	INFO worker.py:1673 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.8.1


In [None]:
# Search budget: number of sampled configurations, epochs per trial, and GPUs reserved per trial.
num_samples = 10
max_num_epochs = 10
gpus_per_trial = 1

# ASHA stops poorly performing trials early, judged by the reported validation "loss".
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2)

reporter = CLIReporter(
    metric_columns=["loss", "accuracy", "training_iteration"])

# A leftover placeholder call (`tune.run(partial(train_BirdSpecies, config_id=...), ...)`) was
# removed here: `train_BirdSpecies` has no `config_id` parameter and the literal `...` is not a
# valid argument, so the cell failed before the real search below could start.
result = tune.run(
    # Pass the epoch budget explicitly so the trainable and the scheduler's max_t stay in sync.
    partial(train_BirdSpecies, epochs=max_num_epochs),
    resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter)

# Pick the trial whose last reported validation loss is lowest and summarize it.
best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["accuracy"]))

In [None]:
# best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
# device = "cpu"
# if torch.cuda.is_available():
#     device = "cuda:0"
#     if gpus_per_trial > 1:
#         best_trained_model = nn.DataParallel(best_trained_model)
# best_trained_model.to(device)

# best_checkpoint_dir = best_trial.checkpoint.value
# model_state, optimizer_state = torch.load(os.path.join(
#     best_checkpoint_dir, "checkpoint"))
# best_trained_model.load_state_dict(model_state)

# test_acc = test_accuracy(best_trained_model, device)
# print("Best trial test set accuracy: {}".format(test_acc))