<a href="https://colab.research.google.com/github/thomaoc1/FineTuningViT/blob/main/DL_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install -U transformers
!pip install -U accelerate
!pip install optuna

In [None]:
from transformers import AutoImageProcessor

# Image processor matching the pretrained ViT checkpoint that is fine-tuned below.
MODEL_CHECKPOINT = 'google/vit-large-patch16-224-in21k'
processor = AutoImageProcessor.from_pretrained(MODEL_CHECKPOINT)

In [None]:
from datasets import load_dataset
# Oxford-IIIT Pet from the Hugging Face Hub; the recorded output shows a
# 3680-example train split and a 3669-example test split.
DATASET_NAME = "timm/oxford-iiit-pet"
ds = load_dataset(DATASET_NAME)

Downloading readme:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/413M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3680 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3669 [00:00<?, ? examples/s]

In [None]:
def transform(example_batch):
    """Preprocess a batch of dataset examples for the ViT model.

    Args:
        example_batch: Mapping with an ``'image'`` sequence and a ``'label'``
            sequence (images are presumably PIL images, since ``.convert('RGB')``
            is called on each -- confirm against the dataset features).

    Returns:
        The output of ``processor`` (requested as PyTorch tensors) with the
        batch's labels stored under the ``'label'`` key.
    """
    # Force three channels so every image has the same channel layout.
    rgb_images = [image.convert('RGB') for image in example_batch['image']]
    encoded = processor(rgb_images, return_tensors='pt')
    encoded['label'] = example_batch['label']
    return encoded

In [None]:
# Register `transform` on every split of the dataset.
prepared_ds = ds.with_transform(transform)

# Hold out 10% of the training split for validation, stratified by class label.
split_ratio = 0.1
# Fixed seed: without it the partition differs on every kernel restart, so the
# validation metrics (and the hyper-parameter search) are not reproducible.
split_seed = 42
train_test_split = prepared_ds['train'].train_test_split(
    test_size=split_ratio,
    stratify_by_column="label",
    seed=split_seed,
)

train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [None]:
# Sanity check: size of the full training split, before the validation hold-out.
len(prepared_ds["train"])

3680

In [None]:
# Sanity check: size of the validation hold-out (10% of the training split).
len(val_dataset)

368

In [None]:
import torch

def collate_fn(batch):
    """Collate a list of per-example dicts into one batch dict.

    Args:
        batch: Sequence of mappings, each with a ``'pixel_values'`` tensor and
            a ``'label'`` value.

    Returns:
        Dict with ``'pixel_values'`` (the example tensors stacked along a new
        leading dimension) and ``'labels'`` (a tensor built from the labels).
        Note the key is renamed from ``'label'`` to ``'labels'``.
    """
    pixel_tensors = []
    label_ids = []
    for example in batch:
        pixel_tensors.append(example['pixel_values'])
        label_ids.append(example['label'])

    return {
        'pixel_values': torch.stack(pixel_tensors),
        'labels': torch.tensor(label_ids),
    }

In [None]:
from transformers import ViTForImageClassification

# Class names of the dataset's label feature; their order defines the class ids
# used for `num_labels`, `id2label` and `label2id` when the model is created.
label_feature = ds['train'].features['label']
labels = label_feature.names


In [None]:
import numpy as np


def compute_metrics(p):
    """Compute top-1 accuracy for an evaluation pass.

    Implemented directly with numpy: the notebook never imported ``np`` (so the
    previous version raised ``NameError`` on a fresh kernel), and
    ``datasets.load_metric`` is deprecated and unavailable in newer ``datasets``
    releases.

    Args:
        p: Object exposing ``predictions`` (per-class scores with classes on
            axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": float}`` -- the fraction of rows whose arg-max class
        equals the reference label.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    reference_ids = np.asarray(p.label_ids)
    return {"accuracy": float(np.mean(predicted_ids == reference_ids))}

In [None]:
import optuna
from transformers import Trainer, TrainingArguments, get_cosine_schedule_with_warmup
from torch.optim import SGD

def objective(trial: optuna.Trial):
    """Optuna objective: fine-tune ViT with one sampled learning rate and score it.

    A fresh ``ViTForImageClassification`` is loaded on every call so trials do
    not share weights. The model is trained with SGD (momentum 0.9) on
    ``train_dataset`` and then evaluated on ``val_dataset``.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` from
            ``[0.001, 0.003, 0.01, 0.03]``.

    Returns:
        The validation loss (``eval_loss``) after training. Lower is better,
        which matches a study created with ``direction='minimize'``.
    """
    model = ViTForImageClassification.from_pretrained(
        "google/vit-large-patch16-224-in21k",
        num_labels=len(labels),
        id2label={str(i): c for i, c in enumerate(labels)},
        label2id={c: str(i) for i, c in enumerate(labels)}
    )

    # With batch size 16 and 32 accumulation steps the effective batch is 512, so
    # 4 epochs over ~3312 training examples is only about two dozen optimizer
    # steps on a single device. The previous hand-built schedule assumed 46
    # warm-up steps out of 460, which meant every trial finished while still
    # warming up. The schedule is therefore configured here (cosine decay, 10%
    # warm-up -- the same 46/460 ratio) and built by the Trainer from the step
    # count it actually computes.
    # NOTE(review): with this few steps, eval_steps/save_steps=100 never trigger
    # during a trial; the final-training cell also trains without gradient
    # accumulation, so confirm the search regime is the intended one.
    training_args = TrainingArguments(
      output_dir="./vit-base-beans",
      per_device_train_batch_size=16,
      gradient_accumulation_steps=32,
      evaluation_strategy="steps",
      num_train_epochs=4,
      fp16=True,
      save_steps=100,
      eval_steps=100,
      logging_steps=10,
      lr_scheduler_type="cosine",
      warmup_ratio=0.1,
      save_total_limit=2,
      remove_unused_columns=False,
      push_to_hub=False,
      report_to='tensorboard',
      load_best_model_at_end=True,
    )

    learning_rate = trial.suggest_categorical("learning_rate", [0.001, 0.003, 0.01, 0.03])

    optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.9)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=processor,
        # Scheduler slot left as None so the Trainer creates it from
        # `training_args` with the correct number of training steps.
        optimizers=(optimizer, None),
    )

    trainer.train()

    # Score the trial on held-out data rather than on the training loss, so the
    # selected learning rate is the one that generalizes best.
    eval_metrics = trainer.evaluate()
    return eval_metrics["eval_loss"]

In [None]:
# The search space is four discrete learning rates, so evaluate each exactly once
# with a grid sampler. Random sampling over 12 trials re-ran identical
# configurations (see the duplicated trial values in the recorded log) at roughly
# 3.5 minutes per trial.
# Keep this list in sync with the choices passed to `trial.suggest_categorical`
# inside `objective`.
search_space = {"learning_rate": [0.001, 0.003, 0.01, 0.03]}

study = optuna.create_study(
    study_name='hyper-parameter-search',
    direction='minimize',
    sampler=optuna.samplers.GridSampler(search_space),
)

study.optimize(func=objective, n_trials=len(search_space["learning_rate"]))

print(study.best_value)
print(study.best_params)
print(study.best_trial)

[I 2024-04-13 13:50:32,040] A new study created in memory with name: hyper-parameter-search
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 13:54:07,286] Trial 0 finished with value: 3.084793527921041 and parameters: {'learning_rate': 0.03}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 13:57:38,926] Trial 1 finished with value: 3.5736211140950522 and parameters: {'learning_rate': 0.001}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:01:09,749] Trial 2 finished with value: 3.5736211140950522 and parameters: {'learning_rate': 0.001}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:04:34,366] Trial 3 finished with value: 3.5736211140950522 and parameters: {'learning_rate': 0.001}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:07:59,202] Trial 4 finished with value: 3.539794921875 and parameters: {'learning_rate': 0.003}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:11:27,779] Trial 5 finished with value: 3.084793527921041 and parameters: {'learning_rate': 0.03}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:14:54,526] Trial 6 finished with value: 3.084793527921041 and parameters: {'learning_rate': 0.03}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:18:19,846] Trial 7 finished with value: 3.419179916381836 and parameters: {'learning_rate': 0.01}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:21:45,299] Trial 8 finished with value: 3.419179916381836 and parameters: {'learning_rate': 0.01}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:25:11,939] Trial 9 finished with value: 3.419179916381836 and parameters: {'learning_rate': 0.01}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:28:38,739] Trial 10 finished with value: 3.084793527921041 and parameters: {'learning_rate': 0.03}. Best is trial 0 with value: 3.084793527921041.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


[I 2024-04-13 14:32:05,108] Trial 11 finished with value: 3.084793527921041 and parameters: {'learning_rate': 0.03}. Best is trial 0 with value: 3.084793527921041.


3.084793527921041
{'learning_rate': 0.03}
FrozenTrial(number=0, state=TrialState.COMPLETE, values=[3.084793527921041], datetime_start=datetime.datetime(2024, 4, 13, 13, 50, 32, 43959), datetime_complete=datetime.datetime(2024, 4, 13, 13, 54, 7, 286431), params={'learning_rate': 0.03}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': CategoricalDistribution(choices=(0.001, 0.003, 0.01, 0.03))}, trial_id=0, value=None)


In [None]:
# Learning rate selected by the search; the final training run below reads the same value.
print(study.best_params["learning_rate"])

0.03


In [None]:
from transformers import EarlyStoppingCallback

# Final run: start again from the pretrained checkpoint and train with the
# learning rate chosen by the Optuna study.
model = ViTForImageClassification.from_pretrained(
    "google/vit-large-patch16-224-in21k",
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

training_args = TrainingArguments(
  output_dir="./vit-base-beans-final",
  per_device_train_batch_size=16,
  # gradient_accumulation_steps=32,
  evaluation_strategy="steps",
  # NOTE: `max_steps` below takes precedence over `num_train_epochs`; the recorded
  # run stopped at epoch ~2.42, i.e. after 500 steps rather than 4 epochs.
  num_train_epochs=4,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  max_steps=500,
  logging_steps=10,
  # learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)

learning_rate = study.best_params["learning_rate"]

# Fraction of the run spent on linear warm-up (same 46/460 ratio as before).
WARMUP_RATIO = 0.1


def optimizers():
    """Build the SGD optimizer and cosine-with-warm-up scheduler for the final run.

    The schedule length is taken from ``training_args.max_steps`` so it always
    matches the number of steps the Trainer performs. It was previously
    hard-coded to 460 while training ran for 500 steps, so the last 40 steps ran
    past the end of the cosine schedule.

    Returns:
        Tuple ``(optimizer, scheduler)`` in the form expected by the
        ``optimizers`` argument of ``Trainer``.
    """
    total_steps = training_args.max_steps
    optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(WARMUP_RATIO * total_steps),
        num_training_steps=total_steps
    )
    return optimizer, scheduler


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor,
    optimizers=optimizers(),
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run the fine-tuning loop; the returned object carries the training metrics used below.
train_results = trainer.train()
# Persist the model held by the trainer (no explicit path is passed here).
trainer.save_model()
# Report the training metrics, write them out, then save the trainer state.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

Step,Training Loss,Validation Loss,Accuracy
100,0.421,0.417164,0.872283
200,0.2779,0.288428,0.923913
300,0.1412,0.237523,0.923913
400,0.0563,0.205017,0.9375


In [None]:
!zip -r vit-base-beans.zip vit-base-beans

In [None]:
# Final evaluation on the dataset's test split, which is not used for training
# or for the validation hold-out.
metrics = trainer.evaluate(prepared_ds['test'])
# Print the evaluation metrics and write them out under the "eval" name.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       2.42
  eval_accuracy           =     0.9357
  eval_loss               =     0.2153
  eval_runtime            = 0:00:43.93
  eval_samples_per_second =     83.515
  eval_steps_per_second   =     10.448
